author     Dave Kleikamp <shaggy@austin.ibm.com>  2006-01-24 15:34:47 -0500
committer  Dave Kleikamp <shaggy@austin.ibm.com>  2006-01-24 15:34:47 -0500
commit     0a0fc0ddbe732779366ab6b1b879f62195e65967 (patch)
tree       7b42490a676cf39ae0691b6859ecf7fd410f229b /fs
parent     4d5dbd0945d9e0833dd7964a3d6ee33157f7cc7a (diff)
parent     3ee68c4af3fd7228c1be63254b9f884614f9ebb2 (diff)
Merge with /home/shaggy/git/linus-clean/
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/9p.c | 314
-rw-r--r--  fs/9p/9p.h | 77
-rw-r--r--  fs/9p/Makefile | 11
-rw-r--r--  fs/9p/conv.c | 906
-rw-r--r--  fs/9p/conv.h | 35
-rw-r--r--  fs/9p/debug.h | 23
-rw-r--r--  fs/9p/error.c | 10
-rw-r--r--  fs/9p/error.h | 2
-rw-r--r--  fs/9p/fid.c | 5
-rw-r--r--  fs/9p/mux.c | 1145
-rw-r--r--  fs/9p/mux.h | 41
-rw-r--r--  fs/9p/trans_fd.c | 53
-rw-r--r--  fs/9p/trans_sock.c | 161
-rw-r--r--  fs/9p/transport.h | 4
-rw-r--r--  fs/9p/v9fs.c | 59
-rw-r--r--  fs/9p/v9fs.h | 17
-rw-r--r--  fs/9p/v9fs_vfs.h | 6
-rw-r--r--  fs/9p/vfs_addr.c | 109
-rw-r--r--  fs/9p/vfs_dentry.c | 15
-rw-r--r--  fs/9p/vfs_dir.c | 47
-rw-r--r--  fs/9p/vfs_file.c | 32
-rw-r--r--  fs/9p/vfs_inode.c | 622
-rw-r--r--  fs/9p/vfs_super.c | 14
-rw-r--r--  fs/Kconfig | 68
-rw-r--r--  fs/Kconfig.binfmt | 2
-rw-r--r--  fs/Makefile | 6
-rw-r--r--  fs/affs/inode.c | 4
-rw-r--r--  fs/afs/callback.c | 1
-rw-r--r--  fs/afs/cmservice.c | 2
-rw-r--r--  fs/afs/dir.c | 2
-rw-r--r--  fs/afs/volume.h | 4
-rw-r--r--  fs/aio.c | 47
-rw-r--r--  fs/attr.c | 26
-rw-r--r--  fs/autofs/root.c | 5
-rw-r--r--  fs/autofs4/autofs_i.h | 2
-rw-r--r--  fs/autofs4/expire.c | 12
-rw-r--r--  fs/autofs4/inode.c | 4
-rw-r--r--  fs/autofs4/root.c | 28
-rw-r--r--  fs/befs/attribute.c | 117
-rw-r--r--  fs/binfmt_aout.c | 2
-rw-r--r--  fs/binfmt_elf.c | 30
-rw-r--r--  fs/binfmt_elf_fdpic.c | 2
-rw-r--r--  fs/binfmt_flat.c | 19
-rw-r--r--  fs/binfmt_misc.c | 14
-rw-r--r--  fs/bio.c | 67
-rw-r--r--  fs/block_dev.c | 4
-rw-r--r--  fs/buffer.c | 91
-rw-r--r--  fs/char_dev.c | 96
-rw-r--r--  fs/cifs/CHANGES | 16
-rw-r--r--  fs/cifs/README | 42
-rw-r--r--  fs/cifs/TODO | 4
-rw-r--r--  fs/cifs/cifs_debug.c | 51
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 5
-rw-r--r--  fs/cifs/cifs_unicode.c | 13
-rw-r--r--  fs/cifs/cifs_unicode.h | 6
-rw-r--r--  fs/cifs/cifs_uniupr.h | 2
-rw-r--r--  fs/cifs/cifsacl.h | 38
-rw-r--r--  fs/cifs/cifsencrypt.c | 57
-rw-r--r--  fs/cifs/cifsfs.c | 158
-rw-r--r--  fs/cifs/cifsfs.h | 4
-rw-r--r--  fs/cifs/cifsglob.h | 18
-rw-r--r--  fs/cifs/cifspdu.h | 107
-rw-r--r--  fs/cifs/cifsproto.h | 23
-rw-r--r--  fs/cifs/cifssmb.c | 360
-rw-r--r--  fs/cifs/connect.c | 180
-rw-r--r--  fs/cifs/dir.c | 74
-rw-r--r--  fs/cifs/file.c | 81
-rw-r--r--  fs/cifs/inode.c | 268
-rw-r--r--  fs/cifs/misc.c | 36
-rw-r--r--  fs/cifs/netmisc.c | 4
-rw-r--r--  fs/cifs/readdir.c | 60
-rw-r--r--  fs/cifs/rfc1002pdu.h | 4
-rw-r--r--  fs/cifs/transport.c | 43
-rw-r--r--  fs/cifs/xattr.c | 22
-rw-r--r--  fs/coda/cache.c | 2
-rw-r--r--  fs/coda/dir.c | 4
-rw-r--r--  fs/coda/file.c | 8
-rw-r--r--  fs/compat.c | 359
-rw-r--r--  fs/compat_ioctl.c | 498
-rw-r--r--  fs/configfs/Makefile | 7
-rw-r--r--  fs/configfs/configfs_internal.h | 142
-rw-r--r--  fs/configfs/dir.c | 1102
-rw-r--r--  fs/configfs/file.c | 360
-rw-r--r--  fs/configfs/inode.c | 162
-rw-r--r--  fs/configfs/item.c | 227
-rw-r--r--  fs/configfs/mount.c | 159
-rw-r--r--  fs/configfs/symlink.c | 281
-rw-r--r--  fs/dcache.c | 43
-rw-r--r--  fs/dcookies.c | 1
-rw-r--r--  fs/debugfs/inode.c | 8
-rw-r--r--  fs/devfs/base.c | 22
-rw-r--r--  fs/devpts/inode.c | 8
-rw-r--r--  fs/direct-io.c | 30
-rw-r--r--  fs/dquot.c | 23
-rw-r--r--  fs/drop_caches.c | 68
-rw-r--r--  fs/efs/super.c | 5
-rw-r--r--  fs/exec.c | 44
-rw-r--r--  fs/exportfs/expfs.c | 91
-rw-r--r--  fs/ext2/acl.c | 11
-rw-r--r--  fs/ext2/balloc.c | 1
-rw-r--r--  fs/ext2/bitmap.c | 7
-rw-r--r--  fs/ext2/dir.c | 2
-rw-r--r--  fs/ext2/ext2.h | 2
-rw-r--r--  fs/ext2/ioctl.c | 1
-rw-r--r--  fs/ext2/namei.c | 5
-rw-r--r--  fs/ext2/super.c | 6
-rw-r--r--  fs/ext2/xattr.c | 6
-rw-r--r--  fs/ext2/xattr_trusted.c | 5
-rw-r--r--  fs/ext2/xattr_user.c | 14
-rw-r--r--  fs/ext3/acl.c | 11
-rw-r--r--  fs/ext3/balloc.c | 3
-rw-r--r--  fs/ext3/bitmap.c | 8
-rw-r--r--  fs/ext3/bitmap.h | 8
-rw-r--r--  fs/ext3/ialloc.c | 7
-rw-r--r--  fs/ext3/inode.c | 4
-rw-r--r--  fs/ext3/ioctl.c | 1
-rw-r--r--  fs/ext3/namei.c | 7
-rw-r--r--  fs/ext3/resize.c | 33
-rw-r--r--  fs/ext3/super.c | 60
-rw-r--r--  fs/ext3/xattr.c | 6
-rw-r--r--  fs/ext3/xattr_trusted.c | 5
-rw-r--r--  fs/ext3/xattr_user.c | 15
-rw-r--r--  fs/fat/cache.c | 14
-rw-r--r--  fs/fat/dir.c | 28
-rw-r--r--  fs/fat/fatent.c | 10
-rw-r--r--  fs/fat/file.c | 38
-rw-r--r--  fs/fat/inode.c | 119
-rw-r--r--  fs/fat/misc.c | 8
-rw-r--r--  fs/fcntl.c | 9
-rw-r--r--  fs/fifo.c | 6
-rw-r--r--  fs/file_table.c | 9
-rw-r--r--  fs/freevxfs/vxfs_bmap.c | 1
-rw-r--r--  fs/freevxfs/vxfs_immed.c | 4
-rw-r--r--  fs/freevxfs/vxfs_olt.c | 1
-rw-r--r--  fs/fuse/dev.c | 237
-rw-r--r--  fs/fuse/dir.c | 315
-rw-r--r--  fs/fuse/file.c | 134
-rw-r--r--  fs/fuse/fuse_i.h | 106
-rw-r--r--  fs/fuse/inode.c | 282
-rw-r--r--  fs/hfs/bfind.c | 3
-rw-r--r--  fs/hfs/bnode.c | 6
-rw-r--r--  fs/hfs/brec.c | 2
-rw-r--r--  fs/hfs/btree.c | 10
-rw-r--r--  fs/hfs/catalog.c | 2
-rw-r--r--  fs/hfs/dir.c | 12
-rw-r--r--  fs/hfs/hfs_fs.h | 3
-rw-r--r--  fs/hfs/inode.c | 5
-rw-r--r--  fs/hfs/mdb.c | 22
-rw-r--r--  fs/hfs/super.c | 40
-rw-r--r--  fs/hfsplus/bfind.c | 3
-rw-r--r--  fs/hfsplus/bitmap.c | 8
-rw-r--r--  fs/hfsplus/bnode.c | 11
-rw-r--r--  fs/hfsplus/brec.c | 2
-rw-r--r--  fs/hfsplus/btree.c | 37
-rw-r--r--  fs/hfsplus/catalog.c | 44
-rw-r--r--  fs/hfsplus/dir.c | 47
-rw-r--r--  fs/hfsplus/extents.c | 6
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 16
-rw-r--r--  fs/hfsplus/hfsplus_raw.h | 25
-rw-r--r--  fs/hfsplus/inode.c | 25
-rw-r--r--  fs/hfsplus/ioctl.c | 1
-rw-r--r--  fs/hfsplus/options.c | 24
-rw-r--r--  fs/hfsplus/super.c | 87
-rw-r--r--  fs/hfsplus/unicode.c | 30
-rw-r--r--  fs/hfsplus/wrapper.c | 17
-rw-r--r--  fs/hostfs/hostfs_kern.c | 9
-rw-r--r--  fs/hpfs/dir.c | 6
-rw-r--r--  fs/hppfs/hppfs_kern.c | 6
-rw-r--r--  fs/hugetlbfs/inode.c | 22
-rw-r--r--  fs/inode.c | 57
-rw-r--r--  fs/inotify.c | 14
-rw-r--r--  fs/ioctl.c | 1
-rw-r--r--  fs/ioprio.c | 2
-rw-r--r--  fs/isofs/namei.c | 5
-rw-r--r--  fs/jbd/checkpoint.c | 418
-rw-r--r--  fs/jbd/commit.c | 3
-rw-r--r--  fs/jffs/inode-v23.c | 6
-rw-r--r--  fs/jffs/jffs_fm.c | 1
-rw-r--r--  fs/jffs2/build.c | 2
-rw-r--r--  fs/jffs2/debug.h | 8
-rw-r--r--  fs/jffs2/fs.c | 3
-rw-r--r--  fs/jffs2/nodelist.c | 4
-rw-r--r--  fs/jffs2/scan.c | 3
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/jfs/jfs_dmap.c | 3
-rw-r--r--  fs/jfs/jfs_imap.c | 6
-rw-r--r--  fs/jfs/jfs_incore.h | 4
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 6
-rw-r--r--  fs/jfs/jfs_umount.c | 6
-rw-r--r--  fs/jfs/resize.c | 3
-rw-r--r--  fs/jfs/super.c | 3
-rw-r--r--  fs/jfs/xattr.c | 67
-rw-r--r--  fs/libfs.c | 20
-rw-r--r--  fs/lockd/clntlock.c | 4
-rw-r--r--  fs/lockd/clntproc.c | 41
-rw-r--r--  fs/lockd/host.c | 4
-rw-r--r--  fs/lockd/mon.c | 1
-rw-r--r--  fs/lockd/svc.c | 4
-rw-r--r--  fs/lockd/svc4proc.c | 15
-rw-r--r--  fs/lockd/svclock.c | 42
-rw-r--r--  fs/lockd/svcproc.c | 14
-rw-r--r--  fs/lockd/xdr.c | 6
-rw-r--r--  fs/lockd/xdr4.c | 4
-rw-r--r--  fs/locks.c | 30
-rw-r--r--  fs/mbcache.c | 6
-rw-r--r--  fs/mpage.c | 6
-rw-r--r--  fs/namei.c | 271
-rw-r--r--  fs/namespace.c | 36
-rw-r--r--  fs/ncpfs/dir.c | 2
-rw-r--r--  fs/ncpfs/file.c | 2
-rw-r--r--  fs/ncpfs/inode.c | 19
-rw-r--r--  fs/ncpfs/ioctl.c | 21
-rw-r--r--  fs/ncpfs/ncplib_kernel.h | 4
-rw-r--r--  fs/nfs/Makefile | 1
-rw-r--r--  fs/nfs/callback.c | 6
-rw-r--r--  fs/nfs/callback.h | 1
-rw-r--r--  fs/nfs/callback_proc.c | 4
-rw-r--r--  fs/nfs/delegation.c | 47
-rw-r--r--  fs/nfs/delegation.h | 2
-rw-r--r--  fs/nfs/dir.c | 17
-rw-r--r--  fs/nfs/direct.c | 81
-rw-r--r--  fs/nfs/file.c | 26
-rw-r--r--  fs/nfs/idmap.c | 9
-rw-r--r--  fs/nfs/inode.c | 304
-rw-r--r--  fs/nfs/mount_clnt.c | 1
-rw-r--r--  fs/nfs/nfs2xdr.c | 21
-rw-r--r--  fs/nfs/nfs3proc.c | 70
-rw-r--r--  fs/nfs/nfs3xdr.c | 2
-rw-r--r--  fs/nfs/nfs4_fs.h | 20
-rw-r--r--  fs/nfs/nfs4proc.c | 1497
-rw-r--r--  fs/nfs/nfs4renewd.c | 14
-rw-r--r--  fs/nfs/nfs4state.c | 203
-rw-r--r--  fs/nfs/nfs4xdr.c | 187
-rw-r--r--  fs/nfs/nfsroot.c | 8
-rw-r--r--  fs/nfs/proc.c | 32
-rw-r--r--  fs/nfs/read.c | 16
-rw-r--r--  fs/nfs/sysctl.c | 84
-rw-r--r--  fs/nfs/unlink.c | 30
-rw-r--r--  fs/nfs/write.c | 95
-rw-r--r--  fs/nfsctl.c | 1
-rw-r--r--  fs/nfsd/nfs2acl.c | 2
-rw-r--r--  fs/nfsd/nfs3acl.c | 2
-rw-r--r--  fs/nfsd/nfs3proc.c | 11
-rw-r--r--  fs/nfsd/nfs3xdr.c | 47
-rw-r--r--  fs/nfsd/nfs4callback.c | 11
-rw-r--r--  fs/nfsd/nfs4proc.c | 55
-rw-r--r--  fs/nfsd/nfs4recover.c | 30
-rw-r--r--  fs/nfsd/nfs4state.c | 220
-rw-r--r--  fs/nfsd/nfs4xdr.c | 10
-rw-r--r--  fs/nfsd/nfsproc.c | 37
-rw-r--r--  fs/nfsd/nfsxdr.c | 52
-rw-r--r--  fs/nfsd/vfs.c | 250
-rw-r--r--  fs/ntfs/ChangeLog | 2
-rw-r--r--  fs/ntfs/attrib.c | 4
-rw-r--r--  fs/ntfs/dir.c | 8
-rw-r--r--  fs/ntfs/file.c | 20
-rw-r--r--  fs/ntfs/index.c | 6
-rw-r--r--  fs/ntfs/inode.c | 28
-rw-r--r--  fs/ntfs/namei.c | 6
-rw-r--r--  fs/ntfs/quota.c | 6
-rw-r--r--  fs/ntfs/super.c | 44
-rw-r--r--  fs/ocfs2/Makefile | 33
-rw-r--r--  fs/ocfs2/alloc.c | 2040
-rw-r--r--  fs/ocfs2/alloc.h | 82
-rw-r--r--  fs/ocfs2/aops.c | 643
-rw-r--r--  fs/ocfs2/aops.h | 41
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 232
-rw-r--r--  fs/ocfs2/buffer_head_io.h | 73
-rw-r--r--  fs/ocfs2/cluster/Makefile | 4
-rw-r--r--  fs/ocfs2/cluster/endian.h | 30
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 1797
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 82
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 166
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 274
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 791
-rw-r--r--  fs/ocfs2/cluster/nodemanager.h | 64
-rw-r--r--  fs/ocfs2/cluster/ocfs2_heartbeat.h | 37
-rw-r--r--  fs/ocfs2/cluster/ocfs2_nodemanager.h | 39
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 315
-rw-r--r--  fs/ocfs2/cluster/quorum.h | 36
-rw-r--r--  fs/ocfs2/cluster/sys.c | 124
-rw-r--r--  fs/ocfs2/cluster/sys.h | 33
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 1829
-rw-r--r--  fs/ocfs2/cluster/tcp.h | 113
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 174
-rw-r--r--  fs/ocfs2/cluster/ver.c | 42
-rw-r--r--  fs/ocfs2/cluster/ver.h | 31
-rw-r--r--  fs/ocfs2/dcache.c | 91
-rw-r--r--  fs/ocfs2/dcache.h | 31
-rw-r--r--  fs/ocfs2/dir.c | 618
-rw-r--r--  fs/ocfs2/dir.h | 54
-rw-r--r--  fs/ocfs2/dlm/Makefile | 8
-rw-r--r--  fs/ocfs2/dlm/dlmapi.h | 214
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 466
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 884
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 530
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.h | 35
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 246
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.h | 30
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 1469
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.h | 36
-rw-r--r--  fs/ocfs2/dlm/dlmfs.c | 640
-rw-r--r--  fs/ocfs2/dlm/dlmfsver.c | 42
-rw-r--r--  fs/ocfs2/dlm/dlmfsver.h | 31
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 676
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 2664
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 2132
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 692
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 672
-rw-r--r--  fs/ocfs2/dlm/dlmver.c | 42
-rw-r--r--  fs/ocfs2/dlm/dlmver.h | 31
-rw-r--r--  fs/ocfs2/dlm/userdlm.c | 658
-rw-r--r--  fs/ocfs2/dlm/userdlm.h | 111
-rw-r--r--  fs/ocfs2/dlmglue.c | 2904
-rw-r--r--  fs/ocfs2/dlmglue.h | 111
-rw-r--r--  fs/ocfs2/endian.h | 45
-rw-r--r--  fs/ocfs2/export.c | 248
-rw-r--r--  fs/ocfs2/export.h | 31
-rw-r--r--  fs/ocfs2/extent_map.c | 994
-rw-r--r--  fs/ocfs2/extent_map.h | 46
-rw-r--r--  fs/ocfs2/file.c | 1238
-rw-r--r--  fs/ocfs2/file.h | 57
-rw-r--r--  fs/ocfs2/heartbeat.c | 378
-rw-r--r--  fs/ocfs2/heartbeat.h | 67
-rw-r--r--  fs/ocfs2/inode.c | 1140
-rw-r--r--  fs/ocfs2/inode.h | 145
-rw-r--r--  fs/ocfs2/journal.c | 1652
-rw-r--r--  fs/ocfs2/journal.h | 457
-rw-r--r--  fs/ocfs2/localalloc.c | 983
-rw-r--r--  fs/ocfs2/localalloc.h | 56
-rw-r--r--  fs/ocfs2/mmap.c | 98
-rw-r--r--  fs/ocfs2/mmap.h | 6
-rw-r--r--  fs/ocfs2/namei.c | 2264
-rw-r--r--  fs/ocfs2/namei.h | 58
-rw-r--r--  fs/ocfs2/ocfs1_fs_compat.h | 109
-rw-r--r--  fs/ocfs2/ocfs2.h | 464
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 638
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h | 73
-rw-r--r--  fs/ocfs2/slot_map.c | 303
-rw-r--r--  fs/ocfs2/slot_map.h | 66
-rw-r--r--  fs/ocfs2/suballoc.c | 1651
-rw-r--r--  fs/ocfs2/suballoc.h | 132
-rw-r--r--  fs/ocfs2/super.c | 1733
-rw-r--r--  fs/ocfs2/super.h | 44
-rw-r--r--  fs/ocfs2/symlink.c | 180
-rw-r--r--  fs/ocfs2/symlink.h | 42
-rw-r--r--  fs/ocfs2/sysfile.c | 131
-rw-r--r--  fs/ocfs2/sysfile.h | 33
-rw-r--r--  fs/ocfs2/uptodate.c | 544
-rw-r--r--  fs/ocfs2/uptodate.h | 44
-rw-r--r--  fs/ocfs2/ver.c | 43
-rw-r--r--  fs/ocfs2/ver.h | 31
-rw-r--r--  fs/ocfs2/vote.c | 1202
-rw-r--r--  fs/ocfs2/vote.h | 56
-rw-r--r--  fs/open.c | 123
-rw-r--r--  fs/partitions/Kconfig | 31
-rw-r--r--  fs/partitions/Makefile | 1
-rw-r--r--  fs/partitions/check.c | 37
-rw-r--r--  fs/partitions/ibm.c | 30
-rw-r--r--  fs/partitions/karma.c | 57
-rw-r--r--  fs/partitions/karma.h | 8
-rw-r--r--  fs/partitions/ultrix.c | 1
-rw-r--r--  fs/pipe.c | 50
-rw-r--r--  fs/pnode.c | 2
-rw-r--r--  fs/proc/array.c | 8
-rw-r--r--  fs/proc/base.c | 1
-rw-r--r--  fs/proc/generic.c | 49
-rw-r--r--  fs/proc/inode.c | 2
-rw-r--r--  fs/proc/internal.h | 4
-rw-r--r--  fs/proc/kcore.c | 1
-rw-r--r--  fs/proc/proc_devtree.c | 24
-rw-r--r--  fs/proc/proc_misc.c | 164
-rw-r--r--  fs/proc/root.c | 3
-rw-r--r--  fs/proc/task_mmu.c | 130
-rw-r--r--  fs/proc/vmcore.c | 6
-rw-r--r--  fs/qnx4/bitmap.c | 2
-rw-r--r--  fs/quota.c | 7
-rw-r--r--  fs/quota_v2.c | 3
-rw-r--r--  fs/ramfs/Makefile | 4
-rw-r--r--  fs/ramfs/file-mmu.c | 57
-rw-r--r--  fs/ramfs/file-nommu.c | 292
-rw-r--r--  fs/ramfs/inode.c | 22
-rw-r--r--  fs/ramfs/internal.h | 15
-rw-r--r--  fs/read_write.c | 38
-rw-r--r--  fs/readdir.c | 4
-rw-r--r--  fs/reiserfs/file.c | 12
-rw-r--r--  fs/reiserfs/hashes.c | 1
-rw-r--r--  fs/reiserfs/inode.c | 42
-rw-r--r--  fs/reiserfs/ioctl.c | 5
-rw-r--r--  fs/reiserfs/journal.c | 32
-rw-r--r--  fs/reiserfs/namei.c | 6
-rw-r--r--  fs/reiserfs/super.c | 4
-rw-r--r--  fs/reiserfs/tail_conversion.c | 2
-rw-r--r--  fs/reiserfs/xattr.c | 60
-rw-r--r--  fs/reiserfs/xattr_acl.c | 7
-rw-r--r--  fs/reiserfs/xattr_trusted.c | 1
-rw-r--r--  fs/reiserfs/xattr_user.c | 30
-rw-r--r--  fs/relayfs/buffers.c | 3
-rw-r--r--  fs/relayfs/inode.c | 226
-rw-r--r--  fs/relayfs/relay.c | 77
-rw-r--r--  fs/relayfs/relay.h | 4
-rw-r--r--  fs/romfs/inode.c | 6
-rw-r--r--  fs/select.c | 348
-rw-r--r--  fs/smbfs/Makefile | 1
-rw-r--r--  fs/smbfs/cache.c | 4
-rw-r--r--  fs/smbfs/file.c | 7
-rw-r--r--  fs/smbfs/inode.c | 35
-rw-r--r--  fs/smbfs/proc.c | 3
-rw-r--r--  fs/smbfs/request.c | 13
-rw-r--r--  fs/stat.c | 66
-rw-r--r--  fs/super.c | 20
-rw-r--r--  fs/sysfs/dir.c | 37
-rw-r--r--  fs/sysfs/file.c | 17
-rw-r--r--  fs/sysfs/inode.c | 9
-rw-r--r--  fs/sysfs/symlink.c | 5
-rw-r--r--  fs/sysv/ChangeLog | 2
-rw-r--r--  fs/sysv/dir.c | 4
-rw-r--r--  fs/udf/balloc.c | 7
-rw-r--r--  fs/udf/crc.c | 5
-rw-r--r--  fs/udf/dir.c | 5
-rw-r--r--  fs/udf/directory.c | 5
-rw-r--r--  fs/udf/file.c | 6
-rw-r--r--  fs/udf/fsync.c | 5
-rw-r--r--  fs/udf/ialloc.c | 5
-rw-r--r--  fs/udf/inode.c | 10
-rw-r--r--  fs/udf/lowlevel.c | 5
-rw-r--r--  fs/udf/misc.c | 5
-rw-r--r--  fs/udf/namei.c | 5
-rw-r--r--  fs/udf/partition.c | 5
-rw-r--r--  fs/udf/super.c | 5
-rw-r--r--  fs/udf/symlink.c | 5
-rw-r--r--  fs/udf/truncate.c | 5
-rw-r--r--  fs/udf/unicode.c | 5
-rw-r--r--  fs/ufs/balloc.c | 20
-rw-r--r--  fs/ufs/dir.c | 2
-rw-r--r--  fs/ufs/ialloc.c | 4
-rw-r--r--  fs/ufs/inode.c | 11
-rw-r--r--  fs/ufs/super.c | 65
-rw-r--r--  fs/ufs/util.h | 28
-rw-r--r--  fs/xattr.c | 201
-rw-r--r--  fs/xfs/Kbuild | 6
-rw-r--r--  fs/xfs/linux-2.6/mutex.h | 18
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 1124
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.h | 10
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 1373
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 696
-rw-r--r--  fs/xfs/linux-2.6/xfs_cred.h | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 16
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 137
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.h | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c | 76
-rw-r--r--  fs/xfs/linux-2.6/xfs_stats.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_stats.h | 18
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 19
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.h | 19
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 4
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c | 4
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 146
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 11
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h | 2
-rw-r--r--  fs/xfs/support/debug.c | 60
-rw-r--r--  fs/xfs/support/debug.h | 25
-rw-r--r--  fs/xfs/support/uuid.c | 29
-rw-r--r--  fs/xfs/xfs_acl.c | 1
-rw-r--r--  fs/xfs/xfs_arch.h | 22
-rw-r--r--  fs/xfs/xfs_attr.c | 32
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 15
-rw-r--r--  fs/xfs/xfs_attr_leaf.h | 79
-rw-r--r--  fs/xfs/xfs_bmap.c | 412
-rw-r--r--  fs/xfs/xfs_bmap.h | 7
-rw-r--r--  fs/xfs/xfs_clnt.h | 2
-rw-r--r--  fs/xfs/xfs_dfrag.c | 16
-rw-r--r--  fs/xfs/xfs_dinode.h | 22
-rw-r--r--  fs/xfs/xfs_dir.c | 2
-rw-r--r--  fs/xfs/xfs_dir.h | 2
-rw-r--r--  fs/xfs/xfs_dir2.h | 3
-rw-r--r--  fs/xfs/xfs_dir_leaf.c | 96
-rw-r--r--  fs/xfs/xfs_dir_leaf.h | 64
-rw-r--r--  fs/xfs/xfs_dmapi.h | 14
-rw-r--r--  fs/xfs/xfs_error.c | 1
-rw-r--r--  fs/xfs/xfs_error.h | 8
-rw-r--r--  fs/xfs/xfs_fs.h | 10
-rw-r--r--  fs/xfs/xfs_fsops.c | 32
-rw-r--r--  fs/xfs/xfs_fsops.h | 1
-rw-r--r--  fs/xfs/xfs_iget.c | 5
-rw-r--r--  fs/xfs/xfs_inode.c | 61
-rw-r--r--  fs/xfs/xfs_inode.h | 4
-rw-r--r--  fs/xfs/xfs_inode_item.c | 9
-rw-r--r--  fs/xfs/xfs_iomap.c | 426
-rw-r--r--  fs/xfs/xfs_iomap.h | 2
-rw-r--r--  fs/xfs/xfs_itable.c | 5
-rw-r--r--  fs/xfs/xfs_log.c | 175
-rw-r--r--  fs/xfs/xfs_log.h | 19
-rw-r--r--  fs/xfs/xfs_log_priv.h | 89
-rw-r--r--  fs/xfs/xfs_log_recover.c | 12
-rw-r--r--  fs/xfs/xfs_mount.c | 73
-rw-r--r--  fs/xfs/xfs_mount.h | 5
-rw-r--r--  fs/xfs/xfs_rename.c | 7
-rw-r--r--  fs/xfs/xfs_rw.c | 9
-rw-r--r--  fs/xfs/xfs_sb.h | 17
-rw-r--r--  fs/xfs/xfs_trans.c | 14
-rw-r--r--  fs/xfs/xfs_trans.h | 1
-rw-r--r--  fs/xfs/xfs_trans_item.c | 22
-rw-r--r--  fs/xfs/xfs_utils.c | 9
-rw-r--r--  fs/xfs/xfs_vfsops.c | 50
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 260
513 files changed, 58733 insertions, 9198 deletions
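[Editor's note] The fs/9p portion of this merge rewrites every synchronous wrapper in 9p.c around one calling convention: marshal a transmit fcall with a v9fs_create_t*() constructor, hand it to the connection's multiplexer, then free it. A minimal sketch of that shape, assuming the constructors return ERR_PTR-encoded pointers on failure as the diff below shows; v9fs_t_foo()/v9fs_create_tfoo() are hypothetical stand-ins, not functions from the patch:

	/* Sketch only: v9fs_create_tfoo() stands in for the real
	 * v9fs_create_tversion()/v9fs_create_tattach()/... constructors. */
	static int v9fs_t_foo(struct v9fs_session_info *v9ses, u32 fid,
			      struct v9fs_fcall **rcp)
	{
		struct v9fs_fcall *tc;
		int ret;

		tc = v9fs_create_tfoo(fid);	/* marshal the T-message */
		if (IS_ERR(tc))
			return PTR_ERR(tc);	/* allocation/encoding failed */

		ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);	/* blocking RPC */
		kfree(tc);	/* the request buffer stays caller-owned */
		return ret;
	}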
diff --git a/fs/9p/9p.c b/fs/9p/9p.c
index e847f504a47..1a6d08761f3 100644
--- a/fs/9p/9p.c
+++ b/fs/9p/9p.c
@@ -1,8 +1,9 @@
 /*
  * linux/fs/9p/9p.c
  *
- * This file contains functions 9P2000 functions
+ * This file contains functions to perform synchronous 9P calls
  *
+ * Copyright (C) 2004 by Latchesar Ionkov <lucho@ionkov.net>
  * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
  * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
  *
@@ -33,6 +34,7 @@
 #include "debug.h"
 #include "v9fs.h"
 #include "9p.h"
+#include "conv.h"
 #include "mux.h"
 
 /**
@@ -46,16 +48,21 @@
 
 int
 v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
-	       char *version, struct v9fs_fcall **fcall)
+	       char *version, struct v9fs_fcall **rcp)
 {
-	struct v9fs_fcall msg;
+	int ret;
+	struct v9fs_fcall *tc;
 
 	dprintk(DEBUG_9P, "msize: %d version: %s\n", msize, version);
-	msg.id = TVERSION;
-	msg.params.tversion.msize = msize;
-	msg.params.tversion.version = version;
+	tc = v9fs_create_tversion(msize, version);
 
-	return v9fs_mux_rpc(v9ses, &msg, fcall);
+	if (!IS_ERR(tc)) {
+		ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
+		kfree(tc);
+	} else
+		ret = PTR_ERR(tc);
+
+	return ret;
 }
 
 /**
@@ -71,19 +78,45 @@ v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
 
 int
 v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
-	      u32 fid, u32 afid, struct v9fs_fcall **fcall)
+	      u32 fid, u32 afid, struct v9fs_fcall **rcp)
 {
-	struct v9fs_fcall msg;
+	int ret;
+	struct v9fs_fcall* tc;
 
 	dprintk(DEBUG_9P, "uname '%s' aname '%s' fid %d afid %d\n", uname,
 		aname, fid, afid);
-	msg.id = TATTACH;
-	msg.params.tattach.fid = fid;
-	msg.params.tattach.afid = afid;
-	msg.params.tattach.uname = uname;
-	msg.params.tattach.aname = aname;
 
-	return v9fs_mux_rpc(v9ses, &msg, fcall);
+	tc = v9fs_create_tattach(fid, afid, uname, aname);
+	if (!IS_ERR(tc)) {
+		ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
+		kfree(tc);
+	} else
+		ret = PTR_ERR(tc);
+
+	return ret;
+}
+
+static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc,
+	struct v9fs_fcall *rc, int err)
+{
+	int fid;
+	struct v9fs_session_info *v9ses;
+
+	if (err)
+		return;
+
+	fid = tc->params.tclunk.fid;
+	kfree(tc);
+
+	if (!rc)
+		return;
+
+	dprintk(DEBUG_9P, "tcall id %d rcall id %d\n", tc->id, rc->id);
+	v9ses = a;
+	if (rc->id == RCLUNK)
+		v9fs_put_idpool(fid, &v9ses->fidpool);
+
+	kfree(rc);
 }
 
 /**
@@ -95,16 +128,25 @@ v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
  */
 
 int
-v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid,
-	     struct v9fs_fcall **fcall)
+v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid)
 {
-	struct v9fs_fcall msg;
+	int ret;
+	struct v9fs_fcall *tc, *rc;
 
 	dprintk(DEBUG_9P, "fid %d\n", fid);
-	msg.id = TCLUNK;
-	msg.params.tclunk.fid = fid;
 
-	return v9fs_mux_rpc(v9ses, &msg, fcall);
+	rc = NULL;
+	tc = v9fs_create_tclunk(fid);
+	if (!IS_ERR(tc))
+		ret = v9fs_mux_rpc(v9ses->mux, tc, &rc);
+	else
+		ret = PTR_ERR(tc);
+
+	if (ret)
+		dprintk(DEBUG_ERROR, "failed fid %d err %d\n", fid, ret);
+
+	v9fs_t_clunk_cb(v9ses, tc, rc, ret);
+	return ret;
 }
 
 /**
@@ -114,14 +156,21 @@ v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid,
  *
  */
 
-int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag)
+int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag)
 {
-	struct v9fs_fcall msg;
+	int ret;
+	struct v9fs_fcall *tc;
+
+	dprintk(DEBUG_9P, "oldtag %d\n", oldtag);
+
+	tc = v9fs_create_tflush(oldtag);
+	if (!IS_ERR(tc)) {
+		ret = v9fs_mux_rpc(v9ses->mux, tc, NULL);
+		kfree(tc);
+	} else
+		ret = PTR_ERR(tc);
 
-	dprintk(DEBUG_9P, "oldtag %d\n", tag);
-	msg.id = TFLUSH;
-	msg.params.tflush.oldtag = tag;
-	return v9fs_mux_rpc(v9ses, &msg, NULL);
+	return ret;
 }
 
 /**
@@ -133,17 +182,22 @@ int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag)
  */
 
 int
-v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall)
+v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **rcp)
 {
-	struct v9fs_fcall msg;
+	int ret;
+	struct v9fs_fcall *tc;
 
 	dprintk(DEBUG_9P, "fid %d\n", fid);
-	if (fcall)
-		*fcall = NULL;
 
-	msg.id = TSTAT;
-	msg.params.tstat.fid = fid;
-	return v9fs_mux_rpc(v9ses, &msg, fcall);
+	ret = -ENOMEM;
+	tc = v9fs_create_tstat(fid);
+	if (!IS_ERR(tc)) {
+		ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
+		kfree(tc);
+	} else
+		ret = PTR_ERR(tc);
+
+	return ret;
 }
 
 /**
@@ -157,16 +211,21 @@ v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall)
 
 int
 v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid,
-	     struct v9fs_stat *stat, struct v9fs_fcall **fcall)
+	     struct v9fs_wstat *wstat, struct v9fs_fcall **rcp)
 {
-	struct v9fs_fcall msg;
+	int ret;
+	struct v9fs_fcall *tc;
+
+	dprintk(DEBUG_9P, "fid %d\n", fid);
 
-	dprintk(DEBUG_9P, "fid %d length %d\n", fid, (int)stat->length);
-	msg.id = TWSTAT;
-	msg.params.twstat.fid = fid;
-	msg.params.twstat.stat = stat;
+	tc = v9fs_create_twstat(fid, wstat, v9ses->extended);
+	if (!IS_ERR(tc)) {
+		ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
+		kfree(tc);
+	} else
+		ret = PTR_ERR(tc);
 
-	return v9fs_mux_rpc(v9ses, &msg, fcall);
+	return ret;
 }
 
 /**
@@ -183,23 +242,27 @@ v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid,
 
 int
 v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid,
-	    char *name, struct v9fs_fcall **fcall)
+	    char *name, struct v9fs_fcall **rcp)
 {
-	struct v9fs_fcall msg;
+	int ret;
+	struct v9fs_fcall *tc;
+	int nwname;
 
 	dprintk(DEBUG_9P, "fid %d newfid %d wname '%s'\n", fid, newfid, name);
-	msg.id = TWALK;
-	msg.params.twalk.fid = fid;
-	msg.params.twalk.newfid = newfid;
-
-	if (name) {
-		msg.params.twalk.nwname = 1;
-		msg.params.twalk.wnames = &name;
-	} else {
-		msg.params.twalk.nwname = 0;
-	}
-
-	return v9fs_mux_rpc(v9ses, &msg, fcall);
+
+	if (name)
+		nwname = 1;
+	else
+		nwname = 0;
+
+	tc = v9fs_create_twalk(fid, newfid, nwname, &name);
+	if (!IS_ERR(tc)) {
+		ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
+		kfree(tc);
+	} else
+		ret = PTR_ERR(tc);
+
+	return ret;
 }
 
 /**
@@ -214,19 +277,21 @@ v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid,
 
 int
 v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode,
-	    struct v9fs_fcall **fcall)
+	    struct v9fs_fcall **rcp)
 {
-	struct v9fs_fcall msg;
-	long errorno = -1;
+	int ret;
+	struct v9fs_fcall *tc;
 
 	dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode);
-	msg.id = TOPEN;
-	msg.params.topen.fid = fid;
-	msg.params.topen.mode = mode;
 
-	errorno = v9fs_mux_rpc(v9ses, &msg, fcall);
+	tc = v9fs_create_topen(fid, mode);
+	if (!IS_ERR(tc)) {
+		ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
+		kfree(tc);
+	} else
+		ret = PTR_ERR(tc);
 
-	return errorno;
+	return ret;
 }
 
 /**
@@ -239,14 +304,21 @@ v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode,
 
 int
 v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid,
-	      struct v9fs_fcall **fcall)
+	      struct v9fs_fcall **rcp)
 {
-	struct v9fs_fcall msg;
+	int ret;
+	struct v9fs_fcall *tc;
 
 	dprintk(DEBUG_9P, "fid %d\n", fid);
-	msg.id = TREMOVE;
-	msg.params.tremove.fid = fid;
-	return v9fs_mux_rpc(v9ses, &msg, fcall);
+
+	tc = v9fs_create_tremove(fid);
+	if (!IS_ERR(tc)) {
+		ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
+		kfree(tc);
+	} else
+		ret = PTR_ERR(tc);
+
+	return ret;
 }
 
 /**
@@ -262,20 +334,22 @@ v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid,
 
 int
 v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name,
-	      u32 perm, u8 mode, struct v9fs_fcall **fcall)
+	      u32 perm, u8 mode, struct v9fs_fcall **rcp)
 {
-	struct v9fs_fcall msg;
+	int ret;
+	struct v9fs_fcall *tc;
 
 	dprintk(DEBUG_9P, "fid %d name '%s' perm %x mode %d\n",
 		fid, name, perm, mode);
 
-	msg.id = TCREATE;
-	msg.params.tcreate.fid = fid;
-	msg.params.tcreate.name = name;
-	msg.params.tcreate.perm = perm;
-	msg.params.tcreate.mode = mode;
+	tc = v9fs_create_tcreate(fid, name, perm, mode);
+	if (!IS_ERR(tc)) {
+		ret = v9fs_mux_rpc(v9ses->mux, tc, rcp);
+		kfree(tc);
+	} else
+		ret = PTR_ERR(tc);
 
-	return v9fs_mux_rpc(v9ses, &msg, fcall);
+	return ret;
 }
 
 /**
@@ -290,31 +364,29 @@ v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name,
 
 int
 v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset,
-	    u32 count, struct v9fs_fcall **fcall)
+	    u32 count, struct v9fs_fcall **rcp)
 {
-	struct v9fs_fcall msg;
-	struct v9fs_fcall *rc = NULL;
-	long errorno = -1;
-
-	dprintk(DEBUG_9P, "fid %d offset 0x%lx count 0x%x\n", fid,
-		(long unsigned int)offset, count);
-	msg.id = TREAD;
-	msg.params.tread.fid = fid;
-	msg.params.tread.offset = offset;
-	msg.params.tread.count = count;
-	errorno = v9fs_mux_rpc(v9ses, &msg, &rc);
-
-	if (!errorno) {
-		errorno = rc->params.rread.count;
-		dump_data(rc->params.rread.data, rc->params.rread.count);
-	}
-
-	if (fcall)
-		*fcall = rc;
-	else
-		kfree(rc);
-
-	return errorno;
+	int ret;
+	struct v9fs_fcall *tc, *rc;
+
+	dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid,
+		(long long unsigned) offset, count);
+
+	tc = v9fs_create_tread(fid, offset, count);
+	if (!IS_ERR(tc)) {
+		ret = v9fs_mux_rpc(v9ses->mux, tc, &rc);
+		if (!ret)
+			ret = rc->params.rread.count;
+		if (rcp)
+			*rcp = rc;
+		else
+			kfree(rc);
+
+		kfree(tc);
+	} else
+		ret = PTR_ERR(tc);
+
+	return ret;
 }
 
 /**
@@ -328,32 +400,30 @@ v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset,
  */
 
 int
-v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid,
-	     u64 offset, u32 count, void *data, struct v9fs_fcall **fcall)
+v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, u32 count,
+	     const char __user *data, struct v9fs_fcall **rcp)
 {
-	struct v9fs_fcall msg;
-	struct v9fs_fcall *rc = NULL;
-	long errorno = -1;
+	int ret;
+	struct v9fs_fcall *tc, *rc;
 
-	dprintk(DEBUG_9P, "fid %d offset 0x%llx count 0x%x\n", fid,
-		(unsigned long long)offset, count);
-	dump_data(data, count);
+	dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid,
		(long long unsigned) offset, count);
 
-	msg.id = TWRITE;
-	msg.params.twrite.fid = fid;
-	msg.params.twrite.offset = offset;
-	msg.params.twrite.count = count;
-	msg.params.twrite.data = data;
+	tc = v9fs_create_twrite(fid, offset, count, data);
+	if (!IS_ERR(tc)) {
+		ret = v9fs_mux_rpc(v9ses->mux, tc, &rc);
 
-	errorno = v9fs_mux_rpc(v9ses, &msg, &rc);
+		if (!ret)
+			ret = rc->params.rwrite.count;
+		if (rcp)
+			*rcp = rc;
+		else
+			kfree(rc);
 
-	if (!errorno)
-		errorno = rc->params.rwrite.count;
+		kfree(tc);
+	} else
+		ret = PTR_ERR(tc);
 
-	if (fcall)
-		*fcall = rc;
-	else
-		kfree(rc);
-
-	return errorno;
+	return ret;
 }
+
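[Editor's note] Note the changed contract visible above: v9fs_t_read() and v9fs_t_write() now return the server's byte count on success (a negative errno on failure), and the reply fcall, which owns the payload, is handed back through rcp for the caller to free. An illustrative caller, under those assumptions only; read_chunk() is not part of the patch:

	/* Copy one Rread's payload out of the reply before releasing it. */
	static int read_chunk(struct v9fs_session_info *v9ses, u32 fid,
			      u64 offset, char *to, u32 count)
	{
		struct v9fs_fcall *rcall = NULL;
		int n;

		n = v9fs_t_read(v9ses, fid, offset, count, &rcall);
		if (n >= 0) {
			/* rread.data points into the reply; invalid after kfree */
			memcpy(to, rcall->params.rread.data, n);
			kfree(rcall);
		}
		return n;
	}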
diff --git a/fs/9p/9p.h b/fs/9p/9p.h
index f55424216be..0cd374d9471 100644
--- a/fs/9p/9p.h
+++ b/fs/9p/9p.h
@@ -3,6 +3,7 @@
  *
  * 9P protocol definitions.
  *
+ * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
  * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
  * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
  *
@@ -100,9 +101,18 @@ enum {
 	V9FS_QTFILE = 0x00,
 };
 
+#define V9FS_NOTAG	(u16)(~0)
+#define V9FS_NOFID	(u32)(~0)
+#define V9FS_MAXWELEM	16
+
 /* ample room for Twrite/Rread header (iounit) */
 #define V9FS_IOHDRSZ	24
 
+struct v9fs_str {
+	u16 len;
+	char *str;
+};
+
 /* qids are the unique ID for a file (like an inode */
 struct v9fs_qid {
 	u8 type;
@@ -120,6 +130,29 @@ struct v9fs_stat {
 	u32 atime;
 	u32 mtime;
 	u64 length;
+	struct v9fs_str name;
+	struct v9fs_str uid;
+	struct v9fs_str gid;
+	struct v9fs_str muid;
+	struct v9fs_str extension;	/* 9p2000.u extensions */
+	u32 n_uid;		/* 9p2000.u extensions */
+	u32 n_gid;		/* 9p2000.u extensions */
+	u32 n_muid;		/* 9p2000.u extensions */
+};
+
+/* file metadata (stat) structure used to create Twstat message
+   The is similar to v9fs_stat, but the strings don't point to
+   the same memory block and should be freed separately
+*/
+struct v9fs_wstat {
+	u16 size;
+	u16 type;
+	u32 dev;
+	struct v9fs_qid qid;
+	u32 mode;
+	u32 atime;
+	u32 mtime;
+	u64 length;
 	char *name;
 	char *uid;
 	char *gid;
@@ -128,25 +161,24 @@ struct v9fs_stat {
 	u32 n_uid;		/* 9p2000.u extensions */
 	u32 n_gid;		/* 9p2000.u extensions */
 	u32 n_muid;		/* 9p2000.u extensions */
-	char data[0];
 };
 
 /* Structures for Protocol Operations */
 
 struct Tversion {
 	u32 msize;
-	char *version;
+	struct v9fs_str version;
 };
 
 struct Rversion {
 	u32 msize;
-	char *version;
+	struct v9fs_str version;
 };
 
 struct Tauth {
 	u32 afid;
-	char *uname;
-	char *aname;
+	struct v9fs_str uname;
+	struct v9fs_str aname;
 };
 
 struct Rauth {
@@ -154,12 +186,12 @@ struct Rauth {
 };
 
 struct Rerror {
-	char *error;
+	struct v9fs_str error;
 	u32 errno;		/* 9p2000.u extension */
 };
 
 struct Tflush {
-	u32 oldtag;
+	u16 oldtag;
 };
 
 struct Rflush {
@@ -168,8 +200,8 @@ struct Rflush {
 struct Tattach {
 	u32 fid;
 	u32 afid;
-	char *uname;
-	char *aname;
+	struct v9fs_str uname;
+	struct v9fs_str aname;
 };
 
 struct Rattach {
@@ -179,13 +211,13 @@ struct Rattach {
 struct Twalk {
 	u32 fid;
 	u32 newfid;
-	u32 nwname;
-	char **wnames;
+	u16 nwname;
+	struct v9fs_str wnames[16];
 };
 
 struct Rwalk {
-	u32 nwqid;
-	struct v9fs_qid *wqids;
+	u16 nwqid;
+	struct v9fs_qid wqids[16];
 };
 
 struct Topen {
@@ -200,7 +232,7 @@ struct Ropen {
 
 struct Tcreate {
 	u32 fid;
-	char *name;
+	struct v9fs_str name;
 	u32 perm;
 	u8 mode;
 };
@@ -251,12 +283,12 @@ struct Tstat {
 };
 
 struct Rstat {
-	struct v9fs_stat *stat;
+	struct v9fs_stat stat;
 };
 
 struct Twstat {
 	u32 fid;
-	struct v9fs_stat *stat;
+	struct v9fs_stat stat;
 };
 
 struct Rwstat {
@@ -271,6 +303,7 @@ struct v9fs_fcall {
 	u32 size;
 	u8 id;
 	u16 tag;
+	void *sdata;
 
 	union {
 		struct Tversion tversion;
@@ -303,7 +336,9 @@ struct v9fs_fcall {
 	} params;
 };
 
-#define FCALL_ERROR(fcall) (fcall ? fcall->params.rerror.error : "")
+#define PRINT_FCALL_ERROR(s, fcall) dprintk(DEBUG_ERROR, "%s: %.*s\n", s, \
+	fcall?fcall->params.rerror.error.len:0, \
+	fcall?fcall->params.rerror.error.str:"");
 
 int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
 		   char *version, struct v9fs_fcall **rcall);
@@ -311,8 +346,7 @@ int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
 int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
 		  u32 fid, u32 afid, struct v9fs_fcall **rcall);
 
-int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid,
-		 struct v9fs_fcall **rcall);
+int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid);
 
 int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag);
 
@@ -320,7 +354,7 @@ int v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid,
 		struct v9fs_fcall **rcall);
 
 int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid,
-		 struct v9fs_stat *stat, struct v9fs_fcall **rcall);
+		 struct v9fs_wstat *wstat, struct v9fs_fcall **rcall);
 
 int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid,
 		char *name, struct v9fs_fcall **rcall);
@@ -338,4 +372,5 @@ int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid,
 		u64 offset, u32 count, struct v9fs_fcall **rcall);
 
 int v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset,
-		 u32 count, void *data, struct v9fs_fcall **rcall);
+		 u32 count, const char __user * data,
+		 struct v9fs_fcall **rcall);
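[Editor's note] The header change above replaces NUL-terminated char * fields in the wire structures with struct v9fs_str, a length-counted slice that points into the packet's sdata buffer rather than into a private copy; this is why the new PRINT_FCALL_ERROR macro prints with %.*s. A short sketch of the convention, assuming (per the new comment on v9fs_deserialize_stat) the strings stay valid only as long as the enclosing fcall:

	/* Sketch: v9fs_str is counted, not NUL-terminated, so it must be
	 * printed length-bounded and never passed to strlen()/strcpy(). */
	static void print_server_version(struct v9fs_fcall *rc)
	{
		struct v9fs_str *vs = &rc->params.rversion.version;

		/* vs->str points into rc->sdata; invalid after kfree(rc) */
		printk("9P server version: %.*s\n", vs->len, vs->str);
	}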
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index e4e4ffe5a7d..2f4ce43f7b6 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -1,17 +1,18 @@
 obj-$(CONFIG_9P_FS) := 9p2000.o
 
 9p2000-objs := \
+	trans_fd.o \
+	trans_sock.o \
+	mux.o \
+	9p.o \
+	conv.o \
 	vfs_super.o \
 	vfs_inode.o \
+	vfs_addr.o \
 	vfs_file.o \
 	vfs_dir.o \
 	vfs_dentry.o \
 	error.o \
-	mux.o \
-	trans_fd.o \
-	trans_sock.o \
-	9p.o \
-	conv.o \
 	v9fs.o \
 	fid.o
 
diff --git a/fs/9p/conv.c b/fs/9p/conv.c
index 18121af99d3..32a9f99154e 100644
--- a/fs/9p/conv.c
+++ b/fs/9p/conv.c
@@ -30,7 +30,7 @@
30#include <linux/errno.h> 30#include <linux/errno.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/idr.h> 32#include <linux/idr.h>
33 33#include <asm/uaccess.h>
34#include "debug.h" 34#include "debug.h"
35#include "v9fs.h" 35#include "v9fs.h"
36#include "9p.h" 36#include "9p.h"
@@ -56,20 +56,23 @@ static inline int buf_check_overflow(struct cbuf *buf)
56 return buf->p > buf->ep; 56 return buf->p > buf->ep;
57} 57}
58 58
59static inline int buf_check_size(struct cbuf *buf, int len) 59static int buf_check_size(struct cbuf *buf, int len)
60{ 60{
61 if (buf->p+len > buf->ep) { 61 if (buf->p + len > buf->ep) {
62 if (buf->p < buf->ep) { 62 if (buf->p < buf->ep) {
63 eprintk(KERN_ERR, "buffer overflow\n"); 63 eprintk(KERN_ERR, "buffer overflow: want %d has %d\n",
64 len, (int)(buf->ep - buf->p));
65 dump_stack();
64 buf->p = buf->ep + 1; 66 buf->p = buf->ep + 1;
65 return 0;
66 } 67 }
68
69 return 0;
67 } 70 }
68 71
69 return 1; 72 return 1;
70} 73}
71 74
72static inline void *buf_alloc(struct cbuf *buf, int len) 75static void *buf_alloc(struct cbuf *buf, int len)
73{ 76{
74 void *ret = NULL; 77 void *ret = NULL;
75 78
@@ -81,7 +84,7 @@ static inline void *buf_alloc(struct cbuf *buf, int len)
81 return ret; 84 return ret;
82} 85}
83 86
84static inline void buf_put_int8(struct cbuf *buf, u8 val) 87static void buf_put_int8(struct cbuf *buf, u8 val)
85{ 88{
86 if (buf_check_size(buf, 1)) { 89 if (buf_check_size(buf, 1)) {
87 buf->p[0] = val; 90 buf->p[0] = val;
@@ -89,7 +92,7 @@ static inline void buf_put_int8(struct cbuf *buf, u8 val)
89 } 92 }
90} 93}
91 94
92static inline void buf_put_int16(struct cbuf *buf, u16 val) 95static void buf_put_int16(struct cbuf *buf, u16 val)
93{ 96{
94 if (buf_check_size(buf, 2)) { 97 if (buf_check_size(buf, 2)) {
95 *(__le16 *) buf->p = cpu_to_le16(val); 98 *(__le16 *) buf->p = cpu_to_le16(val);
@@ -97,7 +100,7 @@ static inline void buf_put_int16(struct cbuf *buf, u16 val)
97 } 100 }
98} 101}
99 102
100static inline void buf_put_int32(struct cbuf *buf, u32 val) 103static void buf_put_int32(struct cbuf *buf, u32 val)
101{ 104{
102 if (buf_check_size(buf, 4)) { 105 if (buf_check_size(buf, 4)) {
103 *(__le32 *)buf->p = cpu_to_le32(val); 106 *(__le32 *)buf->p = cpu_to_le32(val);
@@ -105,7 +108,7 @@ static inline void buf_put_int32(struct cbuf *buf, u32 val)
105 } 108 }
106} 109}
107 110
108static inline void buf_put_int64(struct cbuf *buf, u64 val) 111static void buf_put_int64(struct cbuf *buf, u64 val)
109{ 112{
110 if (buf_check_size(buf, 8)) { 113 if (buf_check_size(buf, 8)) {
111 *(__le64 *)buf->p = cpu_to_le64(val); 114 *(__le64 *)buf->p = cpu_to_le64(val);
@@ -113,7 +116,7 @@ static inline void buf_put_int64(struct cbuf *buf, u64 val)
113 } 116 }
114} 117}
115 118
116static inline void buf_put_stringn(struct cbuf *buf, const char *s, u16 slen) 119static void buf_put_stringn(struct cbuf *buf, const char *s, u16 slen)
117{ 120{
118 if (buf_check_size(buf, slen + 2)) { 121 if (buf_check_size(buf, slen + 2)) {
119 buf_put_int16(buf, slen); 122 buf_put_int16(buf, slen);
@@ -127,15 +130,7 @@ static inline void buf_put_string(struct cbuf *buf, const char *s)
127 buf_put_stringn(buf, s, strlen(s)); 130 buf_put_stringn(buf, s, strlen(s));
128} 131}
129 132
130static inline void buf_put_data(struct cbuf *buf, void *data, u32 datalen) 133static u8 buf_get_int8(struct cbuf *buf)
131{
132 if (buf_check_size(buf, datalen)) {
133 memcpy(buf->p, data, datalen);
134 buf->p += datalen;
135 }
136}
137
138static inline u8 buf_get_int8(struct cbuf *buf)
139{ 134{
140 u8 ret = 0; 135 u8 ret = 0;
141 136
@@ -147,7 +142,7 @@ static inline u8 buf_get_int8(struct cbuf *buf)
147 return ret; 142 return ret;
148} 143}
149 144
150static inline u16 buf_get_int16(struct cbuf *buf) 145static u16 buf_get_int16(struct cbuf *buf)
151{ 146{
152 u16 ret = 0; 147 u16 ret = 0;
153 148
@@ -159,7 +154,7 @@ static inline u16 buf_get_int16(struct cbuf *buf)
159 return ret; 154 return ret;
160} 155}
161 156
162static inline u32 buf_get_int32(struct cbuf *buf) 157static u32 buf_get_int32(struct cbuf *buf)
163{ 158{
164 u32 ret = 0; 159 u32 ret = 0;
165 160
@@ -171,7 +166,7 @@ static inline u32 buf_get_int32(struct cbuf *buf)
171 return ret; 166 return ret;
172} 167}
173 168
174static inline u64 buf_get_int64(struct cbuf *buf) 169static u64 buf_get_int64(struct cbuf *buf)
175{ 170{
176 u64 ret = 0; 171 u64 ret = 0;
177 172
@@ -183,86 +178,37 @@ static inline u64 buf_get_int64(struct cbuf *buf)
183 return ret; 178 return ret;
184} 179}
185 180
186static inline int 181static void buf_get_str(struct cbuf *buf, struct v9fs_str *vstr)
187buf_get_string(struct cbuf *buf, char *data, unsigned int datalen)
188{
189 u16 len = 0;
190
191 len = buf_get_int16(buf);
192 if (!buf_check_overflow(buf) && buf_check_size(buf, len) && len+1>datalen) {
193 memcpy(data, buf->p, len);
194 data[len] = 0;
195 buf->p += len;
196 len++;
197 }
198
199 return len;
200}
201
202static inline char *buf_get_stringb(struct cbuf *buf, struct cbuf *sbuf)
203{
204 char *ret;
205 u16 len;
206
207 ret = NULL;
208 len = buf_get_int16(buf);
209
210 if (!buf_check_overflow(buf) && buf_check_size(buf, len) &&
211 buf_check_size(sbuf, len+1)) {
212
213 memcpy(sbuf->p, buf->p, len);
214 sbuf->p[len] = 0;
215 ret = sbuf->p;
216 buf->p += len;
217 sbuf->p += len + 1;
218 }
219
220 return ret;
221}
222
223static inline int buf_get_data(struct cbuf *buf, void *data, int datalen)
224{ 182{
225 int ret = 0; 183 vstr->len = buf_get_int16(buf);
226 184 if (!buf_check_overflow(buf) && buf_check_size(buf, vstr->len)) {
227 if (buf_check_size(buf, datalen)) { 185 vstr->str = buf->p;
228 memcpy(data, buf->p, datalen); 186 buf->p += vstr->len;
229 buf->p += datalen; 187 } else {
230 ret = datalen; 188 vstr->len = 0;
189 vstr->str = NULL;
231 } 190 }
232
233 return ret;
234} 191}
235 192
236static inline void *buf_get_datab(struct cbuf *buf, struct cbuf *dbuf, 193static void buf_get_qid(struct cbuf *bufp, struct v9fs_qid *qid)
237 int datalen)
238{ 194{
239 char *ret = NULL; 195 qid->type = buf_get_int8(bufp);
240 int n = 0; 196 qid->version = buf_get_int32(bufp);
241 197 qid->path = buf_get_int64(bufp);
242 if (buf_check_size(dbuf, datalen)) {
243 n = buf_get_data(buf, dbuf->p, datalen);
244 if (n > 0) {
245 ret = dbuf->p;
246 dbuf->p += n;
247 }
248 }
249
250 return ret;
251} 198}
252 199
253/** 200/**
254 * v9fs_size_stat - calculate the size of a variable length stat struct 201 * v9fs_size_wstat - calculate the size of a variable length stat struct
255 * @v9ses: session information
256 * @stat: metadata (stat) structure 202 * @stat: metadata (stat) structure
203 * @extended: non-zero if 9P2000.u
257 * 204 *
258 */ 205 */
259 206
260static int v9fs_size_stat(struct v9fs_session_info *v9ses, 207static int v9fs_size_wstat(struct v9fs_wstat *wstat, int extended)
261 struct v9fs_stat *stat)
262{ 208{
263 int size = 0; 209 int size = 0;
264 210
265 if (stat == NULL) { 211 if (wstat == NULL) {
266 eprintk(KERN_ERR, "v9fs_size_stat: got a NULL stat pointer\n"); 212 eprintk(KERN_ERR, "v9fs_size_stat: got a NULL stat pointer\n");
267 return 0; 213 return 0;
268 } 214 }
@@ -279,82 +225,38 @@ static int v9fs_size_stat(struct v9fs_session_info *v9ses,
279 8 + /* length[8] */ 225 8 + /* length[8] */
280 8; /* minimum sum of string lengths */ 226 8; /* minimum sum of string lengths */
281 227
282 if (stat->name) 228 if (wstat->name)
283 size += strlen(stat->name); 229 size += strlen(wstat->name);
284 if (stat->uid) 230 if (wstat->uid)
285 size += strlen(stat->uid); 231 size += strlen(wstat->uid);
286 if (stat->gid) 232 if (wstat->gid)
287 size += strlen(stat->gid); 233 size += strlen(wstat->gid);
288 if (stat->muid) 234 if (wstat->muid)
289 size += strlen(stat->muid); 235 size += strlen(wstat->muid);
290 236
291 if (v9ses->extended) { 237 if (extended) {
292 size += 4 + /* n_uid[4] */ 238 size += 4 + /* n_uid[4] */
293 4 + /* n_gid[4] */ 239 4 + /* n_gid[4] */
294 4 + /* n_muid[4] */ 240 4 + /* n_muid[4] */
295 2; /* string length of extension[4] */ 241 2; /* string length of extension[4] */
296 if (stat->extension) 242 if (wstat->extension)
297 size += strlen(stat->extension); 243 size += strlen(wstat->extension);
298 } 244 }
299 245
300 return size; 246 return size;
301} 247}
302 248
303/** 249/**
304 * serialize_stat - safely format a stat structure for transmission 250 * buf_get_stat - safely decode a recieved metadata (stat) structure
305 * @v9ses: session info
306 * @stat: metadata (stat) structure
307 * @bufp: buffer to serialize structure into
308 *
309 */
310
311static int
312serialize_stat(struct v9fs_session_info *v9ses, struct v9fs_stat *stat,
313 struct cbuf *bufp)
314{
315 buf_put_int16(bufp, stat->size);
316 buf_put_int16(bufp, stat->type);
317 buf_put_int32(bufp, stat->dev);
318 buf_put_int8(bufp, stat->qid.type);
319 buf_put_int32(bufp, stat->qid.version);
320 buf_put_int64(bufp, stat->qid.path);
321 buf_put_int32(bufp, stat->mode);
322 buf_put_int32(bufp, stat->atime);
323 buf_put_int32(bufp, stat->mtime);
324 buf_put_int64(bufp, stat->length);
325
326 buf_put_string(bufp, stat->name);
327 buf_put_string(bufp, stat->uid);
328 buf_put_string(bufp, stat->gid);
329 buf_put_string(bufp, stat->muid);
330
331 if (v9ses->extended) {
332 buf_put_string(bufp, stat->extension);
333 buf_put_int32(bufp, stat->n_uid);
334 buf_put_int32(bufp, stat->n_gid);
335 buf_put_int32(bufp, stat->n_muid);
336 }
337
338 if (buf_check_overflow(bufp))
339 return 0;
340
341 return stat->size;
342}
343
344/**
345 * deserialize_stat - safely decode a recieved metadata (stat) structure
346 * @v9ses: session info
347 * @bufp: buffer to deserialize 251 * @bufp: buffer to deserialize
348 * @stat: metadata (stat) structure 252 * @stat: metadata (stat) structure
349 * @dbufp: buffer to deserialize variable strings into 253 * @extended: non-zero if 9P2000.u
350 * 254 *
351 */ 255 */
352 256
353static inline int 257static void
354deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp, 258buf_get_stat(struct cbuf *bufp, struct v9fs_stat *stat, int extended)
355 struct v9fs_stat *stat, struct cbuf *dbufp)
356{ 259{
357
358 stat->size = buf_get_int16(bufp); 260 stat->size = buf_get_int16(bufp);
359 stat->type = buf_get_int16(bufp); 261 stat->type = buf_get_int16(bufp);
360 stat->dev = buf_get_int32(bufp); 262 stat->dev = buf_get_int32(bufp);
@@ -365,282 +267,82 @@ deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp,
365 stat->atime = buf_get_int32(bufp); 267 stat->atime = buf_get_int32(bufp);
366 stat->mtime = buf_get_int32(bufp); 268 stat->mtime = buf_get_int32(bufp);
367 stat->length = buf_get_int64(bufp); 269 stat->length = buf_get_int64(bufp);
368 stat->name = buf_get_stringb(bufp, dbufp); 270 buf_get_str(bufp, &stat->name);
369 stat->uid = buf_get_stringb(bufp, dbufp); 271 buf_get_str(bufp, &stat->uid);
370 stat->gid = buf_get_stringb(bufp, dbufp); 272 buf_get_str(bufp, &stat->gid);
371 stat->muid = buf_get_stringb(bufp, dbufp); 273 buf_get_str(bufp, &stat->muid);
372 274
373 if (v9ses->extended) { 275 if (extended) {
374 stat->extension = buf_get_stringb(bufp, dbufp); 276 buf_get_str(bufp, &stat->extension);
375 stat->n_uid = buf_get_int32(bufp); 277 stat->n_uid = buf_get_int32(bufp);
376 stat->n_gid = buf_get_int32(bufp); 278 stat->n_gid = buf_get_int32(bufp);
377 stat->n_muid = buf_get_int32(bufp); 279 stat->n_muid = buf_get_int32(bufp);
378 } 280 }
379
380 if (buf_check_overflow(bufp) || buf_check_overflow(dbufp))
381 return 0;
382
383 return stat->size + 2;
384}
385
386/**
387 * deserialize_statb - wrapper for decoding a received metadata structure
388 * @v9ses: session info
389 * @bufp: buffer to deserialize
390 * @dbufp: buffer to deserialize variable strings into
391 *
392 */
393
394static inline struct v9fs_stat *deserialize_statb(struct v9fs_session_info
395 *v9ses, struct cbuf *bufp,
396 struct cbuf *dbufp)
397{
398 struct v9fs_stat *ret = buf_alloc(dbufp, sizeof(struct v9fs_stat));
399
400 if (ret) {
401 int n = deserialize_stat(v9ses, bufp, ret, dbufp);
402 if (n <= 0)
403 return NULL;
404 }
405
406 return ret;
407} 281}
408 282
409/** 283/**
410 * v9fs_deserialize_stat - decode a received metadata structure 284 * v9fs_deserialize_stat - decode a received metadata structure
411 * @v9ses: session info
412 * @buf: buffer to deserialize 285 * @buf: buffer to deserialize
413 * @buflen: length of received buffer 286 * @buflen: length of received buffer
414 * @stat: metadata structure to decode into 287 * @stat: metadata structure to decode into
415 * @statlen: length of destination metadata structure 288 * @extended: non-zero if 9P2000.u
416 * 289 *
290 * Note: stat will point to the buf region.
417 */ 291 */
418 292
419int 293int
420v9fs_deserialize_stat(struct v9fs_session_info *v9ses, void *buf, 294v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat,
421 u32 buflen, struct v9fs_stat *stat, u32 statlen) 295 int extended)
422{ 296{
423 struct cbuf buffer; 297 struct cbuf buffer;
424 struct cbuf *bufp = &buffer; 298 struct cbuf *bufp = &buffer;
425 struct cbuf dbuffer; 299 unsigned char *p;
426 struct cbuf *dbufp = &dbuffer;
427 300
428 buf_init(bufp, buf, buflen); 301 buf_init(bufp, buf, buflen);
429 buf_init(dbufp, (char *)stat + sizeof(struct v9fs_stat), 302 p = bufp->p;
430 statlen - sizeof(struct v9fs_stat)); 303 buf_get_stat(bufp, stat, extended);
431
432 return deserialize_stat(v9ses, bufp, stat, dbufp);
433}
434
435static inline int
436v9fs_size_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall)
437{
438 int size = 4 + 1 + 2; /* size[4] msg[1] tag[2] */
439 int i = 0;
440
441 switch (fcall->id) {
442 default:
443 eprintk(KERN_ERR, "bad msg type %d\n", fcall->id);
444 return 0;
445 case TVERSION: /* msize[4] version[s] */
446 size += 4 + 2 + strlen(fcall->params.tversion.version);
447 break;
448 case TAUTH: /* afid[4] uname[s] aname[s] */
449 size += 4 + 2 + strlen(fcall->params.tauth.uname) +
450 2 + strlen(fcall->params.tauth.aname);
451 break;
452 case TFLUSH: /* oldtag[2] */
453 size += 2;
454 break;
455 case TATTACH: /* fid[4] afid[4] uname[s] aname[s] */
456 size += 4 + 4 + 2 + strlen(fcall->params.tattach.uname) +
457 2 + strlen(fcall->params.tattach.aname);
458 break;
459 case TWALK: /* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */
460 size += 4 + 4 + 2;
461 /* now compute total for the array of names */
462 for (i = 0; i < fcall->params.twalk.nwname; i++)
463 size += 2 + strlen(fcall->params.twalk.wnames[i]);
464 break;
465 case TOPEN: /* fid[4] mode[1] */
466 size += 4 + 1;
467 break;
468 case TCREATE: /* fid[4] name[s] perm[4] mode[1] */
469 size += 4 + 2 + strlen(fcall->params.tcreate.name) + 4 + 1;
470 break;
471 case TREAD: /* fid[4] offset[8] count[4] */
472 size += 4 + 8 + 4;
473 break;
474 case TWRITE: /* fid[4] offset[8] count[4] data[count] */
475 size += 4 + 8 + 4 + fcall->params.twrite.count;
476 break;
477 case TCLUNK: /* fid[4] */
478 size += 4;
479 break;
480 case TREMOVE: /* fid[4] */
481 size += 4;
482 break;
483 case TSTAT: /* fid[4] */
484 size += 4;
485 break;
486 case TWSTAT: /* fid[4] stat[n] */
487 fcall->params.twstat.stat->size =
488 v9fs_size_stat(v9ses, fcall->params.twstat.stat);
489 size += 4 + 2 + 2 + fcall->params.twstat.stat->size;
490 }
491 return size;
492}
493
494/*
495 * v9fs_serialize_fcall - marshall fcall struct into a packet
496 * @v9ses: session information
497 * @fcall: structure to convert
498 * @data: buffer to serialize fcall into
499 * @datalen: length of buffer to serialize fcall into
500 *
501 */
502
503int
504v9fs_serialize_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall,
505 void *data, u32 datalen)
506{
507 int i = 0;
508 struct v9fs_stat *stat = NULL;
509 struct cbuf buffer;
510 struct cbuf *bufp = &buffer;
511
512 buf_init(bufp, data, datalen);
513
514 if (!fcall) {
515 eprintk(KERN_ERR, "no fcall\n");
516 return -EINVAL;
517 }
518
519 fcall->size = v9fs_size_fcall(v9ses, fcall);
520
521 buf_put_int32(bufp, fcall->size);
522 buf_put_int8(bufp, fcall->id);
523 buf_put_int16(bufp, fcall->tag);
524
525 dprintk(DEBUG_CONV, "size %d id %d tag %d\n", fcall->size, fcall->id,
526 fcall->tag);
527
528 /* now encode it */
529 switch (fcall->id) {
530 default:
531 eprintk(KERN_ERR, "bad msg type: %d\n", fcall->id);
532 return -EPROTO;
533 case TVERSION:
534 buf_put_int32(bufp, fcall->params.tversion.msize);
535 buf_put_string(bufp, fcall->params.tversion.version);
536 break;
537 case TAUTH:
538 buf_put_int32(bufp, fcall->params.tauth.afid);
539 buf_put_string(bufp, fcall->params.tauth.uname);
540 buf_put_string(bufp, fcall->params.tauth.aname);
541 break;
542 case TFLUSH:
543 buf_put_int16(bufp, fcall->params.tflush.oldtag);
544 break;
545 case TATTACH:
546 buf_put_int32(bufp, fcall->params.tattach.fid);
547 buf_put_int32(bufp, fcall->params.tattach.afid);
548 buf_put_string(bufp, fcall->params.tattach.uname);
549 buf_put_string(bufp, fcall->params.tattach.aname);
550 break;
551 case TWALK:
552 buf_put_int32(bufp, fcall->params.twalk.fid);
553 buf_put_int32(bufp, fcall->params.twalk.newfid);
554 buf_put_int16(bufp, fcall->params.twalk.nwname);
555 for (i = 0; i < fcall->params.twalk.nwname; i++)
556 buf_put_string(bufp, fcall->params.twalk.wnames[i]);
557 break;
558 case TOPEN:
559 buf_put_int32(bufp, fcall->params.topen.fid);
560 buf_put_int8(bufp, fcall->params.topen.mode);
561 break;
562 case TCREATE:
563 buf_put_int32(bufp, fcall->params.tcreate.fid);
564 buf_put_string(bufp, fcall->params.tcreate.name);
565 buf_put_int32(bufp, fcall->params.tcreate.perm);
566 buf_put_int8(bufp, fcall->params.tcreate.mode);
567 break;
568 case TREAD:
569 buf_put_int32(bufp, fcall->params.tread.fid);
570 buf_put_int64(bufp, fcall->params.tread.offset);
571 buf_put_int32(bufp, fcall->params.tread.count);
572 break;
573 case TWRITE:
574 buf_put_int32(bufp, fcall->params.twrite.fid);
575 buf_put_int64(bufp, fcall->params.twrite.offset);
576 buf_put_int32(bufp, fcall->params.twrite.count);
577 buf_put_data(bufp, fcall->params.twrite.data,
578 fcall->params.twrite.count);
579 break;
580 case TCLUNK:
581 buf_put_int32(bufp, fcall->params.tclunk.fid);
582 break;
583 case TREMOVE:
584 buf_put_int32(bufp, fcall->params.tremove.fid);
585 break;
586 case TSTAT:
587 buf_put_int32(bufp, fcall->params.tstat.fid);
588 break;
589 case TWSTAT:
590 buf_put_int32(bufp, fcall->params.twstat.fid);
591 stat = fcall->params.twstat.stat;
592
593 buf_put_int16(bufp, stat->size + 2);
594 serialize_stat(v9ses, stat, bufp);
595 break;
596 }
597 304
598 if (buf_check_overflow(bufp)) 305 if (buf_check_overflow(bufp))
599 return -EIO; 306 return 0;
600 307 else
601 return fcall->size; 308 return bufp->p - p;
602} 309}
603 310
604/** 311/**
605 * deserialize_fcall - unmarshal a response 312 * deserialize_fcall - unmarshal a response
606 * @v9ses: session information
607 * @msgsize: size of rcall message
608 * @buf: recieved buffer 313 * @buf: recieved buffer
609 * @buflen: length of received buffer 314 * @buflen: length of received buffer
610 * @rcall: fcall structure to populate 315 * @rcall: fcall structure to populate
611 * @rcalllen: length of fcall structure to populate 316 * @rcalllen: length of fcall structure to populate
317 * @extended: non-zero if 9P2000.u
612 * 318 *
613 */ 319 */
614 320
615int 321int
616v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, 322v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall,
617 void *buf, u32 buflen, struct v9fs_fcall *rcall, 323 int extended)
618 int rcalllen)
619{ 324{
620 325
621 struct cbuf buffer; 326 struct cbuf buffer;
622 struct cbuf *bufp = &buffer; 327 struct cbuf *bufp = &buffer;
623 struct cbuf dbuffer;
624 struct cbuf *dbufp = &dbuffer;
625 int i = 0; 328 int i = 0;
626 329
627 buf_init(bufp, buf, buflen); 330 buf_init(bufp, buf, buflen);
628 buf_init(dbufp, (char *)rcall + sizeof(struct v9fs_fcall),
629 rcalllen - sizeof(struct v9fs_fcall));
630 331
631	rcall->size = msgsize;
332	rcall->size = buf_get_int32(bufp);
632 rcall->id = buf_get_int8(bufp); 333 rcall->id = buf_get_int8(bufp);
633 rcall->tag = buf_get_int16(bufp); 334 rcall->tag = buf_get_int16(bufp);
634 335
635 dprintk(DEBUG_CONV, "size %d id %d tag %d\n", rcall->size, rcall->id, 336 dprintk(DEBUG_CONV, "size %d id %d tag %d\n", rcall->size, rcall->id,
636 rcall->tag); 337 rcall->tag);
338
637 switch (rcall->id) { 339 switch (rcall->id) {
638 default: 340 default:
639 eprintk(KERN_ERR, "unknown message type: %d\n", rcall->id); 341 eprintk(KERN_ERR, "unknown message type: %d\n", rcall->id);
640 return -EPROTO; 342 return -EPROTO;
641 case RVERSION: 343 case RVERSION:
642 rcall->params.rversion.msize = buf_get_int32(bufp); 344 rcall->params.rversion.msize = buf_get_int32(bufp);
643		rcall->params.rversion.version = buf_get_stringb(bufp, dbufp);
345		buf_get_str(bufp, &rcall->params.rversion.version);
644 break; 346 break;
645 case RFLUSH: 347 case RFLUSH:
646 break; 348 break;
@@ -651,34 +353,27 @@ v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize,
651 break; 353 break;
652 case RWALK: 354 case RWALK:
653 rcall->params.rwalk.nwqid = buf_get_int16(bufp); 355 rcall->params.rwalk.nwqid = buf_get_int16(bufp);
654		rcall->params.rwalk.wqids = buf_alloc(dbufp,
655			rcall->params.rwalk.nwqid * sizeof(struct v9fs_qid));
656		if (rcall->params.rwalk.wqids)
657			for (i = 0; i < rcall->params.rwalk.nwqid; i++) {
658				rcall->params.rwalk.wqids[i].type =
659					buf_get_int8(bufp);
660				rcall->params.rwalk.wqids[i].version =
661					buf_get_int16(bufp);
662				rcall->params.rwalk.wqids[i].path =
663					buf_get_int64(bufp);
664			}
356		if (rcall->params.rwalk.nwqid > V9FS_MAXWELEM) {
357			eprintk(KERN_ERR, "Rwalk with more than %d qids: %d\n",
358				V9FS_MAXWELEM, rcall->params.rwalk.nwqid);
359			return -EPROTO;
360		}
361
362		for (i = 0; i < rcall->params.rwalk.nwqid; i++)
363			buf_get_qid(bufp, &rcall->params.rwalk.wqids[i]);
665 break; 364 break;
666 case ROPEN: 365 case ROPEN:
667		rcall->params.ropen.qid.type = buf_get_int8(bufp);
668		rcall->params.ropen.qid.version = buf_get_int32(bufp);
669		rcall->params.ropen.qid.path = buf_get_int64(bufp);
366		buf_get_qid(bufp, &rcall->params.ropen.qid);
670 rcall->params.ropen.iounit = buf_get_int32(bufp); 367 rcall->params.ropen.iounit = buf_get_int32(bufp);
671 break; 368 break;
672 case RCREATE: 369 case RCREATE:
673		rcall->params.rcreate.qid.type = buf_get_int8(bufp);
674		rcall->params.rcreate.qid.version = buf_get_int32(bufp);
675		rcall->params.rcreate.qid.path = buf_get_int64(bufp);
370		buf_get_qid(bufp, &rcall->params.rcreate.qid);
676 rcall->params.rcreate.iounit = buf_get_int32(bufp); 371 rcall->params.rcreate.iounit = buf_get_int32(bufp);
677 break; 372 break;
678 case RREAD: 373 case RREAD:
679 rcall->params.rread.count = buf_get_int32(bufp); 374 rcall->params.rread.count = buf_get_int32(bufp);
680		rcall->params.rread.data = buf_get_datab(bufp, dbufp,
681			rcall->params.rread.count);
375		rcall->params.rread.data = bufp->p;
376		buf_check_size(bufp, rcall->params.rread.count);
682 break; 377 break;
683 case RWRITE: 378 case RWRITE:
684 rcall->params.rwrite.count = buf_get_int32(bufp); 379 rcall->params.rwrite.count = buf_get_int32(bufp);
@@ -689,20 +384,443 @@ v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize,
689 break; 384 break;
690 case RSTAT: 385 case RSTAT:
691 buf_get_int16(bufp); 386 buf_get_int16(bufp);
692		rcall->params.rstat.stat =
693			deserialize_statb(v9ses, bufp, dbufp);
387		buf_get_stat(bufp, &rcall->params.rstat.stat, extended);
694 break; 388 break;
695 case RWSTAT: 389 case RWSTAT:
696 break; 390 break;
697 case RERROR: 391 case RERROR:
698		rcall->params.rerror.error = buf_get_stringb(bufp, dbufp);
699		if (v9ses->extended)
700			rcall->params.rerror.errno = buf_get_int16(bufp);
392		buf_get_str(bufp, &rcall->params.rerror.error);
393		if (extended)
394			rcall->params.rerror.errno = buf_get_int16(bufp);
701 break; 395 break;
702 } 396 }
703 397
704	if (buf_check_overflow(bufp) || buf_check_overflow(dbufp))
705		return -EIO;
398	if (buf_check_overflow(bufp)) {
399		dprintk(DEBUG_ERROR, "buffer overflow\n");
400		return -EIO;
401	}
402
403	return bufp->p - bufp->sp;
404}
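Since the rewritten v9fs_deserialize_fcall() now reads the size[4] prefix itself and leaves string fields pointing into the receive buffer instead of copying them, a caller only needs the raw bytes and the protocol dialect. A hypothetical usage sketch (demo_parse is invented; the error codes follow the function above):

/* Sketch: unmarshal one complete 9P message held in data[0..len). */
static int demo_parse(void *data, u32 len, int dotu)
{
	struct v9fs_fcall rcall;
	int n;

	n = v9fs_deserialize_fcall(data, len, &rcall, dotu);
	if (n < 0)
		return n;	/* -EPROTO on a bad id, -EIO on overflow */

	/* n bytes were consumed; rcall's string fields (v9fs_str) point
	 * into data, so data must outlive rcall. */
	return 0;
}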
405
406static inline void v9fs_put_int8(struct cbuf *bufp, u8 val, u8 * p)
407{
408 *p = val;
409 buf_put_int8(bufp, val);
410}
411
412static inline void v9fs_put_int16(struct cbuf *bufp, u16 val, u16 * p)
413{
414 *p = val;
415 buf_put_int16(bufp, val);
416}
417
418static inline void v9fs_put_int32(struct cbuf *bufp, u32 val, u32 * p)
419{
420 *p = val;
421 buf_put_int32(bufp, val);
422}
423
424static inline void v9fs_put_int64(struct cbuf *bufp, u64 val, u64 * p)
425{
426 *p = val;
427 buf_put_int64(bufp, val);
428}
429
430static void
431v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str)
432{
433 if (data) {
434 str->len = strlen(data);
435 str->str = bufp->p;
436 } else {
437 str->len = 0;
438 str->str = NULL;
439 }
440
441 buf_put_stringn(bufp, data, str->len);
442}
443
444static int
445v9fs_put_user_data(struct cbuf *bufp, const char __user * data, int count,
446 unsigned char **pdata)
447{
448 *pdata = buf_alloc(bufp, count);
449 return copy_from_user(*pdata, data, count);
450}
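One subtlety worth flagging: copy_from_user() returns the number of bytes it failed to copy, not a negative errno, so the nonzero value that v9fs_put_user_data() passes back is a count. A defensive wrapper would normalize it before handing it to ERR_PTR(); the helper below is an illustrative sketch, not part of the patch:

/* Sketch: convert the "bytes not copied" result into -EFAULT. */
static int demo_put_user_data(struct cbuf *bufp, const char __user *data,
			      int count, unsigned char **pdata)
{
	if (v9fs_put_user_data(bufp, data, count, pdata))
		return -EFAULT;
	return 0;
}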
451
452static void
453v9fs_put_wstat(struct cbuf *bufp, struct v9fs_wstat *wstat,
454 struct v9fs_stat *stat, int statsz, int extended)
455{
456 v9fs_put_int16(bufp, statsz, &stat->size);
457 v9fs_put_int16(bufp, wstat->type, &stat->type);
458 v9fs_put_int32(bufp, wstat->dev, &stat->dev);
459 v9fs_put_int8(bufp, wstat->qid.type, &stat->qid.type);
460 v9fs_put_int32(bufp, wstat->qid.version, &stat->qid.version);
461 v9fs_put_int64(bufp, wstat->qid.path, &stat->qid.path);
462 v9fs_put_int32(bufp, wstat->mode, &stat->mode);
463 v9fs_put_int32(bufp, wstat->atime, &stat->atime);
464 v9fs_put_int32(bufp, wstat->mtime, &stat->mtime);
465 v9fs_put_int64(bufp, wstat->length, &stat->length);
466
467 v9fs_put_str(bufp, wstat->name, &stat->name);
468 v9fs_put_str(bufp, wstat->uid, &stat->uid);
469 v9fs_put_str(bufp, wstat->gid, &stat->gid);
470 v9fs_put_str(bufp, wstat->muid, &stat->muid);
471
472 if (extended) {
473 v9fs_put_str(bufp, wstat->extension, &stat->extension);
474 v9fs_put_int32(bufp, wstat->n_uid, &stat->n_uid);
475 v9fs_put_int32(bufp, wstat->n_gid, &stat->n_gid);
476 v9fs_put_int32(bufp, wstat->n_muid, &stat->n_muid);
477 }
478}
479
480static struct v9fs_fcall *
481v9fs_create_common(struct cbuf *bufp, u32 size, u8 id)
482{
483 struct v9fs_fcall *fc;
484
485 size += 4 + 1 + 2; /* size[4] id[1] tag[2] */
486 fc = kmalloc(sizeof(struct v9fs_fcall) + size, GFP_KERNEL);
487 if (!fc)
488 return ERR_PTR(-ENOMEM);
489
490 fc->sdata = (char *)fc + sizeof(*fc);
491
492 buf_init(bufp, (char *)fc->sdata, size);
493 v9fs_put_int32(bufp, size, &fc->size);
494 v9fs_put_int8(bufp, id, &fc->id);
495 v9fs_put_int16(bufp, V9FS_NOTAG, &fc->tag);
496
497 return fc;
498}
499
500void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag)
501{
502 fc->tag = tag;
503 *(__le16 *) (fc->sdata + 5) = cpu_to_le16(tag);
504}
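v9fs_set_tag() works because the marshaled header layout is fixed: size[4] at offset 0, id[1] at offset 4, tag[2] at offset 5, all little-endian. A one-line sketch of the inverse operation (demo_read_tag is invented for illustration):

/* Sketch: recover the tag from an already-marshaled message. */
static inline u16 demo_read_tag(const unsigned char *sdata)
{
	return sdata[5] | (sdata[6] << 8);	/* tag[2] after size[4] id[1] */
}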
505
506struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version)
507{
508 int size;
509 struct v9fs_fcall *fc;
510 struct cbuf buffer;
511 struct cbuf *bufp = &buffer;
512
513 size = 4 + 2 + strlen(version); /* msize[4] version[s] */
514 fc = v9fs_create_common(bufp, size, TVERSION);
515 if (IS_ERR(fc))
516 goto error;
517
518 v9fs_put_int32(bufp, msize, &fc->params.tversion.msize);
519 v9fs_put_str(bufp, version, &fc->params.tversion.version);
520
521 if (buf_check_overflow(bufp)) {
522 kfree(fc);
523 fc = ERR_PTR(-ENOMEM);
524 }
525 error:
526 return fc;
527}
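A hypothetical end-to-end use of this constructor, assuming an initialized mux (demo_negotiate and the msize/version values are illustrative only):

/* Sketch: negotiate the protocol version over an existing mux. */
static int demo_negotiate(struct v9fs_mux_data *m)
{
	struct v9fs_fcall *tc, *rc = NULL;
	int err;

	tc = v9fs_create_tversion(8192, "9P2000");
	if (IS_ERR(tc))
		return PTR_ERR(tc);

	err = v9fs_mux_rpc(m, tc, &rc);	/* rc receives the Rversion */
	kfree(tc);
	kfree(rc);
	return err;
}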
528
529struct v9fs_fcall *v9fs_create_tauth(u32 afid, char *uname, char *aname)
530{
531 int size;
532 struct v9fs_fcall *fc;
533 struct cbuf buffer;
534 struct cbuf *bufp = &buffer;
535
536 size = 4 + 2 + strlen(uname) + 2 + strlen(aname); /* afid[4] uname[s] aname[s] */
537 fc = v9fs_create_common(bufp, size, TAUTH);
538 if (IS_ERR(fc))
539 goto error;
540
541 v9fs_put_int32(bufp, afid, &fc->params.tauth.afid);
542 v9fs_put_str(bufp, uname, &fc->params.tauth.uname);
543 v9fs_put_str(bufp, aname, &fc->params.tauth.aname);
544
545 if (buf_check_overflow(bufp)) {
546 kfree(fc);
547 fc = ERR_PTR(-ENOMEM);
548 }
549 error:
550 return fc;
551}
552
553struct v9fs_fcall *
554v9fs_create_tattach(u32 fid, u32 afid, char *uname, char *aname)
555{
556 int size;
557 struct v9fs_fcall *fc;
558 struct cbuf buffer;
559 struct cbuf *bufp = &buffer;
560
561 size = 4 + 4 + 2 + strlen(uname) + 2 + strlen(aname); /* fid[4] afid[4] uname[s] aname[s] */
562 fc = v9fs_create_common(bufp, size, TATTACH);
563 if (IS_ERR(fc))
564 goto error;
565
566 v9fs_put_int32(bufp, fid, &fc->params.tattach.fid);
567 v9fs_put_int32(bufp, afid, &fc->params.tattach.afid);
568 v9fs_put_str(bufp, uname, &fc->params.tattach.uname);
569 v9fs_put_str(bufp, aname, &fc->params.tattach.aname);
570
571 error:
572 return fc;
573}
574
575struct v9fs_fcall *v9fs_create_tflush(u16 oldtag)
576{
577 int size;
578 struct v9fs_fcall *fc;
579 struct cbuf buffer;
580 struct cbuf *bufp = &buffer;
581
582 size = 2; /* oldtag[2] */
583 fc = v9fs_create_common(bufp, size, TFLUSH);
584 if (IS_ERR(fc))
585 goto error;
586
587 v9fs_put_int16(bufp, oldtag, &fc->params.tflush.oldtag);
588
589 if (buf_check_overflow(bufp)) {
590 kfree(fc);
591 fc = ERR_PTR(-ENOMEM);
592 }
593 error:
594 return fc;
595}
596
597struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname,
598 char **wnames)
599{
600 int i, size;
601 struct v9fs_fcall *fc;
602 struct cbuf buffer;
603 struct cbuf *bufp = &buffer;
604
605 if (nwname > V9FS_MAXWELEM) {
606 dprintk(DEBUG_ERROR, "nwname > %d\n", V9FS_MAXWELEM);
607 return NULL;
608 }
609
610 size = 4 + 4 + 2; /* fid[4] newfid[4] nwname[2] ... */
611 for (i = 0; i < nwname; i++) {
612 size += 2 + strlen(wnames[i]); /* wname[s] */
613 }
614
615 fc = v9fs_create_common(bufp, size, TWALK);
616 if (IS_ERR(fc))
617 goto error;
618
619 v9fs_put_int32(bufp, fid, &fc->params.twalk.fid);
620 v9fs_put_int32(bufp, newfid, &fc->params.twalk.newfid);
621 v9fs_put_int16(bufp, nwname, &fc->params.twalk.nwname);
622 for (i = 0; i < nwname; i++) {
623 v9fs_put_str(bufp, wnames[i], &fc->params.twalk.wnames[i]);
624 }
625
626 if (buf_check_overflow(bufp)) {
627 kfree(fc);
628 fc = ERR_PTR(-ENOMEM);
629 }
630 error:
631 return fc;
632}
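Twalk carries at most V9FS_MAXWELEM name elements per message, which is why the constructor above rejects larger nwname values instead of truncating. Longer paths must be walked in chunks; the loop below is a hedged sketch of that idea (demo_walk_long is invented, and it relies on 9P permitting a fid to be walked onto itself after the first chunk):

/* Sketch: walk a long path V9FS_MAXWELEM elements at a time. */
static int demo_walk_long(struct v9fs_mux_data *m, u32 fid, u32 newfid,
			  u16 nwname, char **wnames)
{
	struct v9fs_fcall *tc, *rc;
	u16 done = 0;
	int err = 0;

	while (done < nwname && !err) {
		u16 n = min_t(u16, nwname - done, V9FS_MAXWELEM);

		tc = v9fs_create_twalk(done ? newfid : fid, newfid,
				       n, wnames + done);
		if (!tc)
			return -EINVAL;
		if (IS_ERR(tc))
			return PTR_ERR(tc);

		err = v9fs_mux_rpc(m, tc, &rc);
		kfree(tc);
		kfree(rc);
		done += n;
	}

	return err;
}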
706
707	return rcall->size;
633
634struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode)
635{
636 int size;
637 struct v9fs_fcall *fc;
638 struct cbuf buffer;
639 struct cbuf *bufp = &buffer;
640
641 size = 4 + 1; /* fid[4] mode[1] */
642 fc = v9fs_create_common(bufp, size, TOPEN);
643 if (IS_ERR(fc))
644 goto error;
645
646 v9fs_put_int32(bufp, fid, &fc->params.topen.fid);
647 v9fs_put_int8(bufp, mode, &fc->params.topen.mode);
648
649 if (buf_check_overflow(bufp)) {
650 kfree(fc);
651 fc = ERR_PTR(-ENOMEM);
652 }
653 error:
654 return fc;
655}
656
657struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode)
658{
659 int size;
660 struct v9fs_fcall *fc;
661 struct cbuf buffer;
662 struct cbuf *bufp = &buffer;
663
664 size = 4 + 2 + strlen(name) + 4 + 1; /* fid[4] name[s] perm[4] mode[1] */
665 fc = v9fs_create_common(bufp, size, TCREATE);
666 if (IS_ERR(fc))
667 goto error;
668
669 v9fs_put_int32(bufp, fid, &fc->params.tcreate.fid);
670 v9fs_put_str(bufp, name, &fc->params.tcreate.name);
671 v9fs_put_int32(bufp, perm, &fc->params.tcreate.perm);
672 v9fs_put_int8(bufp, mode, &fc->params.tcreate.mode);
673
674 if (buf_check_overflow(bufp)) {
675 kfree(fc);
676 fc = ERR_PTR(-ENOMEM);
677 }
678 error:
679 return fc;
680}
681
682struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count)
683{
684 int size;
685 struct v9fs_fcall *fc;
686 struct cbuf buffer;
687 struct cbuf *bufp = &buffer;
688
689 size = 4 + 8 + 4; /* fid[4] offset[8] count[4] */
690 fc = v9fs_create_common(bufp, size, TREAD);
691 if (IS_ERR(fc))
692 goto error;
693
694 v9fs_put_int32(bufp, fid, &fc->params.tread.fid);
695 v9fs_put_int64(bufp, offset, &fc->params.tread.offset);
696 v9fs_put_int32(bufp, count, &fc->params.tread.count);
697
698 if (buf_check_overflow(bufp)) {
699 kfree(fc);
700 fc = ERR_PTR(-ENOMEM);
701 }
702 error:
703 return fc;
704}
705
706struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count,
707 const char __user * data)
708{
709 int size, err;
710 struct v9fs_fcall *fc;
711 struct cbuf buffer;
712 struct cbuf *bufp = &buffer;
713
714 size = 4 + 8 + 4 + count; /* fid[4] offset[8] count[4] data[count] */
715 fc = v9fs_create_common(bufp, size, TWRITE);
716 if (IS_ERR(fc))
717 goto error;
718
719 v9fs_put_int32(bufp, fid, &fc->params.twrite.fid);
720 v9fs_put_int64(bufp, offset, &fc->params.twrite.offset);
721 v9fs_put_int32(bufp, count, &fc->params.twrite.count);
722 err = v9fs_put_user_data(bufp, data, count, &fc->params.twrite.data);
723 if (err) {
724 kfree(fc);
725 fc = ERR_PTR(err);
726 }
727
728 if (buf_check_overflow(bufp)) {
729 kfree(fc);
730 fc = ERR_PTR(-ENOMEM);
731 }
732 error:
733 return fc;
734}
735
736struct v9fs_fcall *v9fs_create_tclunk(u32 fid)
737{
738 int size;
739 struct v9fs_fcall *fc;
740 struct cbuf buffer;
741 struct cbuf *bufp = &buffer;
742
743 size = 4; /* fid[4] */
744 fc = v9fs_create_common(bufp, size, TCLUNK);
745 if (IS_ERR(fc))
746 goto error;
747
748 v9fs_put_int32(bufp, fid, &fc->params.tclunk.fid);
749
750 if (buf_check_overflow(bufp)) {
751 kfree(fc);
752 fc = ERR_PTR(-ENOMEM);
753 }
754 error:
755 return fc;
756}
757
758struct v9fs_fcall *v9fs_create_tremove(u32 fid)
759{
760 int size;
761 struct v9fs_fcall *fc;
762 struct cbuf buffer;
763 struct cbuf *bufp = &buffer;
764
765 size = 4; /* fid[4] */
766 fc = v9fs_create_common(bufp, size, TREMOVE);
767 if (IS_ERR(fc))
768 goto error;
769
770 v9fs_put_int32(bufp, fid, &fc->params.tremove.fid);
771
772 if (buf_check_overflow(bufp)) {
773 kfree(fc);
774 fc = ERR_PTR(-ENOMEM);
775 }
776 error:
777 return fc;
778}
779
780struct v9fs_fcall *v9fs_create_tstat(u32 fid)
781{
782 int size;
783 struct v9fs_fcall *fc;
784 struct cbuf buffer;
785 struct cbuf *bufp = &buffer;
786
787 size = 4; /* fid[4] */
788 fc = v9fs_create_common(bufp, size, TSTAT);
789 if (IS_ERR(fc))
790 goto error;
791
792 v9fs_put_int32(bufp, fid, &fc->params.tstat.fid);
793
794 if (buf_check_overflow(bufp)) {
795 kfree(fc);
796 fc = ERR_PTR(-ENOMEM);
797 }
798 error:
799 return fc;
800}
801
802struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat,
803 int extended)
804{
805 int size, statsz;
806 struct v9fs_fcall *fc;
807 struct cbuf buffer;
808 struct cbuf *bufp = &buffer;
809
810 statsz = v9fs_size_wstat(wstat, extended);
811 size = 4 + 2 + 2 + statsz; /* fid[4] stat[n] */
812 fc = v9fs_create_common(bufp, size, TWSTAT);
813 if (IS_ERR(fc))
814 goto error;
815
816 v9fs_put_int32(bufp, fid, &fc->params.twstat.fid);
817 buf_put_int16(bufp, statsz + 2);
818 v9fs_put_wstat(bufp, wstat, &fc->params.twstat.stat, statsz, extended);
819
820 if (buf_check_overflow(bufp)) {
821 kfree(fc);
822 fc = ERR_PTR(-ENOMEM);
823 }
824 error:
825 return fc;
708}
826}
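The double length prefix in Twstat is easy to trip over: the stat's own size[2] excludes itself, while the enclosing count is statsz + 2 because it does include those two bytes. A worked example of the arithmetic, hedged to plain 9P2000 (no .u extension):

/* Worked example (illustrative): four strings totalling 20 bytes.
 * fixed fields after size[2]:
 *   type[2]+dev[4]+qid[13]+mode[4]+atime[4]+mtime[4]+length[8] = 39
 * strings: 4 count prefixes (2 bytes each) + 20 bytes of data  = 28
 *   statsz (stat->size, excludes its own 2 bytes)              = 67
 * wire:  size[4] id[1] tag[2] fid[4] n[2] stat[...]
 *   n    = statsz + 2 = 69
 *   size = 7 + 4 + 2 + 2 + 67 = 82 bytes in total
 */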
diff --git a/fs/9p/conv.h b/fs/9p/conv.h
index ee849613c61..26a736e4a2e 100644
--- a/fs/9p/conv.h
+++ b/fs/9p/conv.h
@@ -1,8 +1,9 @@
1/* 1/*
2 * linux/fs/9p/conv.h 2 * linux/fs/9p/conv.h
3 * 3 *
4 * 9P protocol conversion definitions 4 * 9P protocol conversion definitions.
5 * 5 *
6 * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> 7 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
7 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> 8 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
8 * 9 *
@@ -24,13 +25,27 @@
24 * 25 *
25 */ 26 */
26 27
27int v9fs_deserialize_stat(struct v9fs_session_info *, void *buf,
28	u32 buflen, struct v9fs_stat *stat, u32 statlen);
29int v9fs_serialize_fcall(struct v9fs_session_info *, struct v9fs_fcall *tcall,
30	void *buf, u32 buflen);
31int v9fs_deserialize_fcall(struct v9fs_session_info *, u32 msglen,
32	void *buf, u32 buflen, struct v9fs_fcall *rcall,
33	int rcalllen);
34
35/* this one is actually in error.c right now */
36int v9fs_errstr2errno(char *errstr);
28int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat,
29	int extended);
30int v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall,
31	int extended);
32
33void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag);
34
35struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version);
36struct v9fs_fcall *v9fs_create_tauth(u32 afid, char *uname, char *aname);
37struct v9fs_fcall *v9fs_create_tattach(u32 fid, u32 afid, char *uname,
38 char *aname);
39struct v9fs_fcall *v9fs_create_tflush(u16 oldtag);
40struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname,
41 char **wnames);
42struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode);
43struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode);
44struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count);
45struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count,
46 const char __user *data);
47struct v9fs_fcall *v9fs_create_tclunk(u32 fid);
48struct v9fs_fcall *v9fs_create_tremove(u32 fid);
49struct v9fs_fcall *v9fs_create_tstat(u32 fid);
50struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat,
51 int extended);
diff --git a/fs/9p/debug.h b/fs/9p/debug.h
index 4445f06919d..fe551032788 100644
--- a/fs/9p/debug.h
+++ b/fs/9p/debug.h
@@ -51,16 +51,23 @@ do { \
51#if DEBUG_DUMP_PKT 51#if DEBUG_DUMP_PKT
52static inline void dump_data(const unsigned char *data, unsigned int datalen) 52static inline void dump_data(const unsigned char *data, unsigned int datalen)
53{ 53{
54	int i, j;
55	int len = datalen;
56
57	printk(KERN_DEBUG "data ");
58	for (i = 0; i < len; i += 4) {
59		for (j = 0; (j < 4) && (i + j < len); j++)
60			printk(KERN_DEBUG "%02x", data[i + j]);
61		printk(KERN_DEBUG " ");
62	}
63	printk(KERN_DEBUG "\n");
64}
54	int i, n;
55	char buf[5*8];
56
57	n = 0;
58	i = 0;
59	while (i < datalen) {
60		n += snprintf(buf+n, sizeof(buf)-n, "%02x", data[i++]);
61		if (i%4 == 0)
62			n += snprintf(buf+n, sizeof(buf)-n, " ");
63
64		if (i%16 == 0) {
65			dprintk(DEBUG_ERROR, "%s\n", buf);
66			n = 0;
67		}
68	}
69
70	dprintk(DEBUG_ERROR, "%s\n", buf);
71}
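The rewritten dump_data() batches hex digits into a 40-byte line buffer rather than issuing one printk per byte; 16 bytes per line in groups of four fills at most 32 digits plus 4 spaces plus the NUL. The output shape below is illustrative:

/* For 18 input bytes 0x01..0x12 the function emits two lines:
 *   01020304 05060708 090a0b0c 0d0e0f10
 *   1112
 */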
65#else /* DEBUG_DUMP_PKT */ 72#else /* DEBUG_DUMP_PKT */
66static inline void dump_data(const unsigned char *data, unsigned int datalen) 73static inline void dump_data(const unsigned char *data, unsigned int datalen)
diff --git a/fs/9p/error.c b/fs/9p/error.c
index 834cb179e38..e4b6f8f38b6 100644
--- a/fs/9p/error.c
+++ b/fs/9p/error.c
@@ -33,7 +33,6 @@
33 33
34#include <linux/list.h> 34#include <linux/list.h>
35#include <linux/jhash.h> 35#include <linux/jhash.h>
36#include <linux/string.h>
37 36
38#include "debug.h" 37#include "debug.h"
39#include "error.h" 38#include "error.h"
@@ -55,7 +54,8 @@ int v9fs_error_init(void)
55 54
56 /* load initial error map into hash table */ 55 /* load initial error map into hash table */
57 for (c = errmap; c->name != NULL; c++) { 56 for (c = errmap; c->name != NULL; c++) {
58		bucket = jhash(c->name, strlen(c->name), 0) % ERRHASHSZ;
57		c->namelen = strlen(c->name);
58		bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ;
59 INIT_HLIST_NODE(&c->list); 59 INIT_HLIST_NODE(&c->list);
60 hlist_add_head(&c->list, &hash_errmap[bucket]); 60 hlist_add_head(&c->list, &hash_errmap[bucket]);
61 } 61 }
@@ -69,15 +69,15 @@ int v9fs_error_init(void)
69 * 69 *
70 */ 70 */
71 71
72int v9fs_errstr2errno(char *errstr)
73{
74	int errno = 0;
75	struct hlist_node *p = NULL;
76	struct errormap *c = NULL;
77	int bucket = jhash(errstr, strlen(errstr), 0) % ERRHASHSZ;
78
79	hlist_for_each_entry(c, p, &hash_errmap[bucket], list) {
80		if (!strcmp(c->name, errstr)) {
72int v9fs_errstr2errno(char *errstr, int len)
73{
74	int errno = 0;
75	struct hlist_node *p = NULL;
76	struct errormap *c = NULL;
77	int bucket = jhash(errstr, len, 0) % ERRHASHSZ;
78
79	hlist_for_each_entry(c, p, &hash_errmap[bucket], list) {
80		if (c->namelen==len && !memcmp(c->name, errstr, len)) {
81 errno = c->val; 81 errno = c->val;
82 break; 82 break;
83 } 83 }
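Hashing by an explicit length matters because Rerror strings now arrive as counted v9fs_str values pointing into the receive buffer, with no NUL terminator to rely on. A hypothetical caller (demo_map_error is invented; the fallback mirrors process_request() in mux.c):

/* Sketch: map a counted error string to an errno-style value. */
static int demo_map_error(struct v9fs_str *ename)
{
	int err = v9fs_errstr2errno(ename->str, ename->len);

	if (!err)		/* no match in the table */
		err = -ESERVERFAULT;
	return err;
}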
diff --git a/fs/9p/error.h b/fs/9p/error.h
index 78f89acf7c9..a9794e85fe5 100644
--- a/fs/9p/error.h
+++ b/fs/9p/error.h
@@ -36,6 +36,7 @@ struct errormap {
36 char *name; 36 char *name;
37 int val; 37 int val;
38 38
39 int namelen;
39 struct hlist_node list; 40 struct hlist_node list;
40}; 41};
41 42
@@ -175,4 +176,3 @@ static struct errormap errmap[] = {
175}; 176};
176 177
177extern int v9fs_error_init(void); 178extern int v9fs_error_init(void);
178extern int v9fs_errstr2errno(char *errstr);
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index d95f8626d17..eda449778fa 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -31,9 +31,6 @@
31#include "v9fs.h" 31#include "v9fs.h"
32#include "9p.h" 32#include "9p.h"
33#include "v9fs_vfs.h" 33#include "v9fs_vfs.h"
34#include "transport.h"
35#include "mux.h"
36#include "conv.h"
37#include "fid.h" 34#include "fid.h"
38 35
39/** 36/**
@@ -164,7 +161,7 @@ static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry)
164 return v9fs_fid_create(dentry, v9ses, fidnum, 0); 161 return v9fs_fid_create(dentry, v9ses, fidnum, 0);
165 162
166clunk_fid: 163clunk_fid:
167	v9fs_t_clunk(v9ses, fidnum, NULL);
164	v9fs_t_clunk(v9ses, fidnum);
168 return ERR_PTR(err); 165 return ERR_PTR(err);
169} 166}
170 167
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index 8835b576f74..945cb368d45 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -4,7 +4,7 @@
4 * Protocol Multiplexer 4 * Protocol Multiplexer
5 * 5 *
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> 6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
 7 * Copyright (C) 2004 by Latchesar Ionkov <lucho@ionkov.net>
 7 * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
8 * 8 *
9 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by 10 * it under the terms of the GNU General Public License as published by
@@ -28,448 +28,943 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/errno.h> 29#include <linux/errno.h>
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/poll.h>
31#include <linux/kthread.h> 32#include <linux/kthread.h>
32#include <linux/idr.h> 33#include <linux/idr.h>
33 34
34#include "debug.h" 35#include "debug.h"
35#include "v9fs.h" 36#include "v9fs.h"
36#include "9p.h" 37#include "9p.h"
37#include "transport.h"
38#include "conv.h" 38#include "conv.h"
39#include "transport.h"
39#include "mux.h" 40#include "mux.h"
40 41
42#define ERREQFLUSH 1
43#define SCHED_TIMEOUT 10
44#define MAXPOLLWADDR 2
45
46enum {
47 Rworksched = 1, /* read work scheduled or running */
48 Rpending = 2, /* can read */
49 Wworksched = 4, /* write work scheduled or running */
50 Wpending = 8, /* can write */
51};
52
53struct v9fs_mux_poll_task;
54
55struct v9fs_req {
56 int tag;
57 struct v9fs_fcall *tcall;
58 struct v9fs_fcall *rcall;
59 int err;
60 v9fs_mux_req_callback cb;
61 void *cba;
62 struct list_head req_list;
63};
64
65struct v9fs_mux_data {
66 spinlock_t lock;
67 struct list_head mux_list;
68 struct v9fs_mux_poll_task *poll_task;
69 int msize;
70 unsigned char *extended;
71 struct v9fs_transport *trans;
72 struct v9fs_idpool tidpool;
73 int err;
74 wait_queue_head_t equeue;
75 struct list_head req_list;
76 struct list_head unsent_req_list;
77 struct v9fs_fcall *rcall;
78 int rpos;
79 char *rbuf;
80 int wpos;
81 int wsize;
82 char *wbuf;
83 wait_queue_t poll_wait[MAXPOLLWADDR];
84 wait_queue_head_t *poll_waddr[MAXPOLLWADDR];
85 poll_table pt;
86 struct work_struct rq;
87 struct work_struct wq;
88 unsigned long wsched;
89};
90
91struct v9fs_mux_poll_task {
92 struct task_struct *task;
93 struct list_head mux_list;
94 int muxnum;
95};
96
97struct v9fs_mux_rpc {
98 struct v9fs_mux_data *m;
99 struct v9fs_req *req;
100 int err;
101 struct v9fs_fcall *rcall;
102 wait_queue_head_t wqueue;
103};
104
105static int v9fs_poll_proc(void *);
106static void v9fs_read_work(void *);
107static void v9fs_write_work(void *);
108static void v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address,
109 poll_table * p);
110static u16 v9fs_mux_get_tag(struct v9fs_mux_data *);
111static void v9fs_mux_put_tag(struct v9fs_mux_data *, u16);
112
113static DECLARE_MUTEX(v9fs_mux_task_lock);
114static struct workqueue_struct *v9fs_mux_wq;
115
116static int v9fs_mux_num;
117static int v9fs_mux_poll_task_num;
118static struct v9fs_mux_poll_task v9fs_mux_poll_tasks[100];
119
120int v9fs_mux_global_init(void)
121{
122 int i;
123
124 for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++)
125 v9fs_mux_poll_tasks[i].task = NULL;
126
127 v9fs_mux_wq = create_workqueue("v9fs");
128 if (!v9fs_mux_wq)
129 return -ENOMEM;
130
131 return 0;
132}
133
134void v9fs_mux_global_exit(void)
135{
136 destroy_workqueue(v9fs_mux_wq);
137}
138
41/**
42 * dprintcond - print condition of session info
43 * @v9ses: session info structure
44 * @req: RPC request structure
45 *
46 */
139/**
140 * v9fs_mux_calc_poll_procs - calculates the number of polling procs
141 * based on the number of mounted v9fs filesystems.
142 *
143 * The current implementation returns sqrt of the number of mounts.
144 */
145inline int v9fs_mux_calc_poll_procs(int muxnum)
146{
147 int n;
148
149 if (v9fs_mux_poll_task_num)
150 n = muxnum / v9fs_mux_poll_task_num +
151 (muxnum % v9fs_mux_poll_task_num ? 1 : 0);
152 else
153 n = 1;
154
155 if (n > ARRAY_SIZE(v9fs_mux_poll_tasks))
156 n = ARRAY_SIZE(v9fs_mux_poll_tasks);
157
158 return n;
159}
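Concretely, the helper performs a ceiling division of muxes over the running poll tasks, clamped to the static task table; applied incrementally as mounts come and go, that policy keeps the task count near the square root of the mount count, as the comment says. A small numeric check (illustrative only):

/* Example: 7 muxes over 3 poll tasks -> 7/3 + (7%3 ? 1 : 0) = 3. */
static int demo_calc(void)
{
	int muxnum = 7, tasks = 3;

	return muxnum / tasks + (muxnum % tasks ? 1 : 0);
}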
47 160
48static inline int
49dprintcond(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
50{
51	dprintk(DEBUG_MUX, "condition: %d, %p\n", v9ses->transport->status,
52		req->rcall);
161static int v9fs_mux_poll_start(struct v9fs_mux_data *m)
162{
163	int i, n;
164	struct v9fs_mux_poll_task *vpt, *vptlast;
165 struct task_struct *pproc;
166
167 dprintk(DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, v9fs_mux_num,
168 v9fs_mux_poll_task_num);
169 up(&v9fs_mux_task_lock);
170
171 n = v9fs_mux_calc_poll_procs(v9fs_mux_num + 1);
172 if (n > v9fs_mux_poll_task_num) {
173 for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) {
174 if (v9fs_mux_poll_tasks[i].task == NULL) {
175 vpt = &v9fs_mux_poll_tasks[i];
176 dprintk(DEBUG_MUX, "create proc %p\n", vpt);
177 pproc = kthread_create(v9fs_poll_proc, vpt,
178 "v9fs-poll");
179
180 if (!IS_ERR(pproc)) {
181 vpt->task = pproc;
182 INIT_LIST_HEAD(&vpt->mux_list);
183 vpt->muxnum = 0;
184 v9fs_mux_poll_task_num++;
185 wake_up_process(vpt->task);
186 }
187 break;
188 }
189 }
190
191 if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks))
192 dprintk(DEBUG_ERROR, "warning: no free poll slots\n");
193 }
194
195 n = (v9fs_mux_num + 1) / v9fs_mux_poll_task_num +
196 ((v9fs_mux_num + 1) % v9fs_mux_poll_task_num ? 1 : 0);
197
198 vptlast = NULL;
199 for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) {
200 vpt = &v9fs_mux_poll_tasks[i];
201 if (vpt->task != NULL) {
202 vptlast = vpt;
203 if (vpt->muxnum < n) {
204 dprintk(DEBUG_MUX, "put in proc %d\n", i);
205 list_add(&m->mux_list, &vpt->mux_list);
206 vpt->muxnum++;
207 m->poll_task = vpt;
208 memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
209 init_poll_funcptr(&m->pt, v9fs_pollwait);
210 break;
211 }
212 }
213 }
214
215 if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) {
216 if (vptlast == NULL)
217 return -ENOMEM;
218
219 dprintk(DEBUG_MUX, "put in proc %d\n", i);
220 list_add(&m->mux_list, &vptlast->mux_list);
221 vptlast->muxnum++;
222 m->poll_task = vptlast;
223 memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
224 init_poll_funcptr(&m->pt, v9fs_pollwait);
225 }
226
227 v9fs_mux_num++;
228 down(&v9fs_mux_task_lock);
229
53 return 0; 230 return 0;
54} 231}
55 232
233static void v9fs_mux_poll_stop(struct v9fs_mux_data *m)
234{
235 int i;
236 struct v9fs_mux_poll_task *vpt;
237
238 up(&v9fs_mux_task_lock);
239 vpt = m->poll_task;
240 list_del(&m->mux_list);
241 for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
242 if (m->poll_waddr[i] != NULL) {
243 remove_wait_queue(m->poll_waddr[i], &m->poll_wait[i]);
244 m->poll_waddr[i] = NULL;
245 }
246 }
247 vpt->muxnum--;
248 if (!vpt->muxnum) {
249 dprintk(DEBUG_MUX, "destroy proc %p\n", vpt);
250 send_sig(SIGKILL, vpt->task, 1);
251 vpt->task = NULL;
252 v9fs_mux_poll_task_num--;
253 }
254 v9fs_mux_num--;
255 down(&v9fs_mux_task_lock);
256}
257
56/**
57 * xread - force read of a certain number of bytes
58 * @v9ses: session info structure
59 * @ptr: pointer to buffer
60 * @sz: number of bytes to read
61 *
62 * Chuck Cranor CS-533 project1
63 */
64
65static int xread(struct v9fs_session_info *v9ses, void *ptr, unsigned long sz)
66{
67	int rd = 0;
68	int ret = 0;
69	while (rd < sz) {
70		ret = v9ses->transport->read(v9ses->transport, ptr, sz - rd);
71		if (ret <= 0) {
72			dprintk(DEBUG_ERROR, "xread errno %d\n", ret);
73			return ret;
74		}
75		rd += ret;
76		ptr += ret;
77	}
78	return (rd);
79}
258/**
259 * v9fs_mux_init - allocate and initialize the per-session mux data
260 * Creates the polling task if this is the first session.
261 *
262 * @trans - transport structure
263 * @msize - maximum message size
264 * @extended - pointer to the extended flag
265 */
266struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize,
267	unsigned char *extended)
268{
269	int i, n;
270	struct v9fs_mux_data *m, *mtmp;
271
272	dprintk(DEBUG_MUX, "transport %p msize %d\n", trans, msize);
273	m = kmalloc(sizeof(struct v9fs_mux_data), GFP_KERNEL);
274	if (!m)
275		return ERR_PTR(-ENOMEM);
276
277	spin_lock_init(&m->lock);
278	INIT_LIST_HEAD(&m->mux_list);
279	m->msize = msize;
280	m->extended = extended;
281	m->trans = trans;
282	idr_init(&m->tidpool.pool);
283	init_MUTEX(&m->tidpool.lock);
284	m->err = 0;
285	init_waitqueue_head(&m->equeue);
286	INIT_LIST_HEAD(&m->req_list);
287	INIT_LIST_HEAD(&m->unsent_req_list);
288	m->rcall = NULL;
289	m->rpos = 0;
290	m->rbuf = NULL;
291	m->wpos = m->wsize = 0;
292	m->wbuf = NULL;
293	INIT_WORK(&m->rq, v9fs_read_work, m);
294	INIT_WORK(&m->wq, v9fs_write_work, m);
295	m->wsched = 0;
296	memset(&m->poll_waddr, 0, sizeof(m->poll_waddr));
297	m->poll_task = NULL;
298	n = v9fs_mux_poll_start(m);
299	if (n)
300		return ERR_PTR(n);
301
302	n = trans->poll(trans, &m->pt);
303	if (n & POLLIN) {
304		dprintk(DEBUG_MUX, "mux %p can read\n", m);
305		set_bit(Rpending, &m->wsched);
306	}
307
308	if (n & POLLOUT) {
309		dprintk(DEBUG_MUX, "mux %p can write\n", m);
310		set_bit(Wpending, &m->wsched);
311	}
312
313	for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) {
314		if (IS_ERR(m->poll_waddr[i])) {
315			v9fs_mux_poll_stop(m);
316			mtmp = (void *)m->poll_waddr; /* the error code */
317			kfree(m);
318			m = mtmp;
319			break;
320		}
321	}
322
323	return m;
324}
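A hedged sketch of the mount-time call, assuming the session structure carries a mux pointer as elsewhere in this patch (demo_session_start is invented):

/* Sketch: attach a mux to a freshly connected transport. */
static int demo_session_start(struct v9fs_session_info *v9ses,
			      struct v9fs_transport *trans)
{
	v9ses->mux = v9fs_mux_init(trans, v9ses->maxdata + V9FS_IOHDRSZ,
				   &v9ses->extended);
	if (IS_ERR(v9ses->mux)) {
		int err = PTR_ERR(v9ses->mux);

		v9ses->mux = NULL;
		return err;
	}
	return 0;
}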
80 325
81/**
82 * read_message - read a full 9P2000 fcall packet
83 * @v9ses: session info structure
84 * @rcall: fcall structure to read into
85 * @rcalllen: size of fcall buffer
86 *
87 */
326/**
327 * v9fs_mux_destroy - cancels all pending requests and frees mux resources
328 */
329void v9fs_mux_destroy(struct v9fs_mux_data *m)
330{
331 dprintk(DEBUG_MUX, "mux %p prev %p next %p\n", m,
332 m->mux_list.prev, m->mux_list.next);
333 v9fs_mux_cancel(m, -ECONNRESET);
334
335 if (!list_empty(&m->req_list)) {
336 /* wait until all processes waiting on this session exit */
337 dprintk(DEBUG_MUX, "mux %p waiting for empty request queue\n",
338 m);
339 wait_event_timeout(m->equeue, (list_empty(&m->req_list)), 5000);
340 dprintk(DEBUG_MUX, "mux %p request queue empty: %d\n", m,
341 list_empty(&m->req_list));
342 }
343
344 v9fs_mux_poll_stop(m);
345 m->trans = NULL;
346
347 kfree(m);
348}
88
89static int
90read_message(struct v9fs_session_info *v9ses,
91	struct v9fs_fcall *rcall, int rcalllen)
92{
349
350/**
351 * v9fs_pollwait - called by the file's poll operation to add the v9fs-poll
352 * task to the file's wait queue
353 */
354static void
355v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address,
356	poll_table * p)
357{
92{ 357{
93 unsigned char buf[4]; 358 int i;
94 void *data; 359 struct v9fs_mux_data *m;
95 int size = 0; 360
96 int res = 0; 361 m = container_of(p, struct v9fs_mux_data, pt);
97 362 for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++)
98 res = xread(v9ses, buf, sizeof(buf)); 363 if (m->poll_waddr[i] == NULL)
99 if (res < 0) { 364 break;
100 dprintk(DEBUG_ERROR, 365
101 "Reading of count field failed returned: %d\n", res); 366 if (i >= ARRAY_SIZE(m->poll_waddr)) {
102 return res; 367 dprintk(DEBUG_ERROR, "not enough wait_address slots\n");
368 return;
103 } 369 }
104 370
105 if (res < 4) { 371 m->poll_waddr[i] = wait_address;
106 dprintk(DEBUG_ERROR, 372
107 "Reading of count field failed returned: %d\n", res); 373 if (!wait_address) {
108 return -EIO; 374 dprintk(DEBUG_ERROR, "no wait_address\n");
375 m->poll_waddr[i] = ERR_PTR(-EIO);
376 return;
109 } 377 }
110 378
111 size = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24); 379 init_waitqueue_entry(&m->poll_wait[i], m->poll_task->task);
112 dprintk(DEBUG_MUX, "got a packet count: %d\n", size); 380 add_wait_queue(wait_address, &m->poll_wait[i]);
381}
382
383/**
384 * v9fs_poll_mux - polls a mux and schedules read or write works if necessary
385 */
386static inline void v9fs_poll_mux(struct v9fs_mux_data *m)
387{
388 int n;
113 389
114 /* adjust for the four bytes of size */ 390 if (m->err < 0)
115 size -= 4; 391 return;
116 392
117 if (size > v9ses->maxdata) { 393 n = m->trans->poll(m->trans, NULL);
118 dprintk(DEBUG_ERROR, "packet too big: %d\n", size); 394 if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) {
119 return -E2BIG; 395 dprintk(DEBUG_MUX, "error mux %p err %d\n", m, n);
396 if (n >= 0)
397 n = -ECONNRESET;
398 v9fs_mux_cancel(m, n);
120 } 399 }
121 400
122 data = kmalloc(size, GFP_KERNEL); 401 if (n & POLLIN) {
123 if (!data) { 402 set_bit(Rpending, &m->wsched);
124 eprintk(KERN_WARNING, "out of memory\n"); 403 dprintk(DEBUG_MUX, "mux %p can read\n", m);
125 return -ENOMEM; 404 if (!test_and_set_bit(Rworksched, &m->wsched)) {
405 dprintk(DEBUG_MUX, "schedule read work mux %p\n", m);
406 queue_work(v9fs_mux_wq, &m->rq);
407 }
126 } 408 }
127 409
128 res = xread(v9ses, data, size); 410 if (n & POLLOUT) {
129 if (res < size) { 411 set_bit(Wpending, &m->wsched);
130 dprintk(DEBUG_ERROR, "Reading of fcall failed returned: %d\n", 412 dprintk(DEBUG_MUX, "mux %p can write\n", m);
131 res); 413 if ((m->wsize || !list_empty(&m->unsent_req_list))
132 kfree(data); 414 && !test_and_set_bit(Wworksched, &m->wsched)) {
133 return res; 415 dprintk(DEBUG_MUX, "schedule write work mux %p\n", m);
416 queue_work(v9fs_mux_wq, &m->wq);
417 }
134 } 418 }
419}
420
421/**
422 * v9fs_poll_proc - polls all v9fs transports for new events and queues
423 * the appropriate work to the work queue
424 */
425static int v9fs_poll_proc(void *a)
426{
427 struct v9fs_mux_data *m, *mtmp;
428 struct v9fs_mux_poll_task *vpt;
135 429
136 /* we now have an in-memory string that is the reply. 430 vpt = a;
137 * deserialize it. There is very little to go wrong at this point 431 dprintk(DEBUG_MUX, "start %p %p\n", current, vpt);
138 * save for v9fs_alloc errors. 432 allow_signal(SIGKILL);
139 */ 433 while (!kthread_should_stop()) {
140 res = v9fs_deserialize_fcall(v9ses, size, data, v9ses->maxdata, 434 set_current_state(TASK_INTERRUPTIBLE);
141 rcall, rcalllen); 435 if (signal_pending(current))
436 break;
142 437
143 kfree(data); 438 list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) {
439 v9fs_poll_mux(m);
440 }
144 441
145 if (res < 0) 442 dprintk(DEBUG_MUX, "sleeping...\n");
146 return res; 443 schedule_timeout(SCHED_TIMEOUT * HZ);
444 }
147 445
446 __set_current_state(TASK_RUNNING);
447 dprintk(DEBUG_MUX, "finish\n");
148 return 0; 448 return 0;
149} 449}
150 450
151/**
152 * v9fs_recv - receive an RPC response for a particular tag
153 * @v9ses: session info structure
154 * @req: RPC request structure
155 *
156 */
451/**
452 * v9fs_write_work - called when a transport can send some data
453 */
157 454static void v9fs_write_work(void *a)
158static int v9fs_recv(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
159{ 455{
160 int ret = 0; 456 int n, err;
457 struct v9fs_mux_data *m;
458 struct v9fs_req *req;
161 459
162 dprintk(DEBUG_MUX, "waiting for response: %d\n", req->tcall->tag); 460 m = a;
163 ret = wait_event_interruptible(v9ses->read_wait,
164 ((v9ses->transport->status != Connected) ||
165 (req->rcall != 0) || (req->err < 0) ||
166 dprintcond(v9ses, req)));
167 461
168 dprintk(DEBUG_MUX, "got it: rcall %p\n", req->rcall); 462 if (m->err < 0) {
463 clear_bit(Wworksched, &m->wsched);
464 return;
465 }
169 466
170 spin_lock(&v9ses->muxlock); 467 if (!m->wsize) {
171 list_del(&req->next); 468 if (list_empty(&m->unsent_req_list)) {
172 spin_unlock(&v9ses->muxlock); 469 clear_bit(Wworksched, &m->wsched);
470 return;
471 }
173 472
174 if (req->err < 0) 473 spin_lock(&m->lock);
175 return req->err; 474 req =
475 list_entry(m->unsent_req_list.next, struct v9fs_req,
476 req_list);
477 list_move_tail(&req->req_list, &m->req_list);
478 m->wbuf = req->tcall->sdata;
479 m->wsize = req->tcall->size;
480 m->wpos = 0;
481 dump_data(m->wbuf, m->wsize);
482 spin_unlock(&m->lock);
483 }
176 484
177 if (v9ses->transport->status == Disconnected) 485 dprintk(DEBUG_MUX, "mux %p pos %d size %d\n", m, m->wpos, m->wsize);
178 return -ECONNRESET; 486 clear_bit(Wpending, &m->wsched);
487 err = m->trans->write(m->trans, m->wbuf + m->wpos, m->wsize - m->wpos);
488 dprintk(DEBUG_MUX, "mux %p sent %d bytes\n", m, err);
489 if (err == -EAGAIN) {
490 clear_bit(Wworksched, &m->wsched);
491 return;
492 }
179 493
180 return ret; 494 if (err <= 0)
181} 495 goto error;
182 496
183/**
184 * v9fs_send - send a 9P request
185 * @v9ses: session info structure
186 * @req: RPC request to send
187 *
188 */
497	m->wpos += err;
498	if (m->wpos == m->wsize)
499		m->wpos = m->wsize = 0;
500
501	if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) {
502		if (test_and_clear_bit(Wpending, &m->wsched))
503 n = POLLOUT;
504 else
505 n = m->trans->poll(m->trans, NULL);
506
507 if (n & POLLOUT) {
508 dprintk(DEBUG_MUX, "schedule write work mux %p\n", m);
509 queue_work(v9fs_mux_wq, &m->wq);
510 } else
511 clear_bit(Wworksched, &m->wsched);
512 } else
513 clear_bit(Wworksched, &m->wsched);
514
515 return;
189 516
190static int v9fs_send(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req) 517 error:
518 v9fs_mux_cancel(m, err);
519 clear_bit(Wworksched, &m->wsched);
520}
521
522static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req)
191{ 523{
192 int ret = -1; 524 int ecode, tag;
193 void *data = NULL; 525 struct v9fs_str *ename;
194 struct v9fs_fcall *tcall = req->tcall;
195 526
196 data = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL); 527 tag = req->tag;
197 if (!data) 528 if (req->rcall->id == RERROR && !req->err) {
198 return -ENOMEM; 529 ecode = req->rcall->params.rerror.errno;
530 ename = &req->rcall->params.rerror.error;
199 531
200 tcall->size = 0; /* enforce size recalculation */ 532 dprintk(DEBUG_MUX, "Rerror %.*s\n", ename->len, ename->str);
201 ret =
202 v9fs_serialize_fcall(v9ses, tcall, data,
203 v9ses->maxdata + V9FS_IOHDRSZ);
204 if (ret < 0)
205 goto free_data;
206 533
207 spin_lock(&v9ses->muxlock); 534 if (*m->extended)
208 list_add(&req->next, &v9ses->mux_fcalls); 535 req->err = -ecode;
209 spin_unlock(&v9ses->muxlock);
210 536
211 dprintk(DEBUG_MUX, "sending message: tag %d size %d\n", tcall->tag, 537 if (!req->err) {
212 tcall->size); 538 req->err = v9fs_errstr2errno(ename->str, ename->len);
213 ret = v9ses->transport->write(v9ses->transport, data, tcall->size);
214 539
215 if (ret != tcall->size) { 540 if (!req->err) { /* string match failed */
216 spin_lock(&v9ses->muxlock); 541 PRINT_FCALL_ERROR("unknown error", req->rcall);
217 list_del(&req->next); 542 }
218 kfree(req->rcall); 543
544 if (!req->err)
545 req->err = -ESERVERFAULT;
546 }
547 } else if (req->tcall && req->rcall->id != req->tcall->id + 1) {
548 dprintk(DEBUG_ERROR, "fcall mismatch: expected %d, got %d\n",
549 req->tcall->id + 1, req->rcall->id);
550 if (!req->err)
551 req->err = -EIO;
552 }
219 553
220 spin_unlock(&v9ses->muxlock); 554 if (req->cb && req->err != ERREQFLUSH) {
221 if (ret >= 0) 555 dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n",
222 ret = -EREMOTEIO; 556 req->tcall, req->rcall);
557
558 (*req->cb) (req->cba, req->tcall, req->rcall, req->err);
559 req->cb = NULL;
223 } else 560 } else
224 ret = 0; 561 kfree(req->rcall);
225 562
226 free_data: 563 v9fs_mux_put_tag(m, tag);
227 kfree(data); 564
228 return ret; 565 wake_up(&m->equeue);
566 kfree(req);
229} 567}
230 568
231/**
232 * v9fs_mux_rpc - send a request, receive a response
233 * @v9ses: session info structure
234 * @tcall: fcall to send
235 * @rcall: buffer to place response into
236 *
237 */
569/**
570 * v9fs_read_work - called when there is some data to be read from a transport
571 */
238 572static void v9fs_read_work(void *a)
239long
240v9fs_mux_rpc(struct v9fs_session_info *v9ses, struct v9fs_fcall *tcall,
241 struct v9fs_fcall **rcall)
242{ 573{
243 int tid = -1; 574 int n, err;
244 struct v9fs_fcall *fcall = NULL; 575 struct v9fs_mux_data *m;
245 struct v9fs_rpcreq req; 576 struct v9fs_req *req, *rptr, *rreq;
246 int ret = -1; 577 struct v9fs_fcall *rcall;
247 578 char *rbuf;
248 if (!v9ses) 579
249 return -EINVAL; 580 m = a;
250 581
251 if (!v9ses->transport || v9ses->transport->status != Connected) 582 if (m->err < 0)
252 return -EIO; 583 return;
584
585 rcall = NULL;
586 dprintk(DEBUG_MUX, "start mux %p pos %d\n", m, m->rpos);
587
588 if (!m->rcall) {
589 m->rcall =
590 kmalloc(sizeof(struct v9fs_fcall) + m->msize, GFP_KERNEL);
591 if (!m->rcall) {
592 err = -ENOMEM;
593 goto error;
594 }
253 595
254 if (rcall) 596 m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall);
255 *rcall = NULL; 597 m->rpos = 0;
598 }
256 599
257 if (tcall->id != TVERSION) { 600 clear_bit(Rpending, &m->wsched);
258 tid = v9fs_get_idpool(&v9ses->tidpool); 601 err = m->trans->read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos);
259 if (tid < 0) 602 dprintk(DEBUG_MUX, "mux %p got %d bytes\n", m, err);
260 return -ENOMEM; 603 if (err == -EAGAIN) {
604 clear_bit(Rworksched, &m->wsched);
605 return;
261 } 606 }
262 607
263 tcall->tag = tid; 608 if (err <= 0)
609 goto error;
264 610
265 req.tcall = tcall; 611 m->rpos += err;
266 req.err = 0; 612 while (m->rpos > 4) {
267 req.rcall = NULL; 613 n = le32_to_cpu(*(__le32 *) m->rbuf);
614 if (n >= m->msize) {
615 dprintk(DEBUG_ERROR,
616 "requested packet size too big: %d\n", n);
617 err = -EIO;
618 goto error;
619 }
268 620
269 ret = v9fs_send(v9ses, &req); 621 if (m->rpos < n)
622 break;
270 623
271 if (ret < 0) { 624 dump_data(m->rbuf, n);
272 if (tcall->id != TVERSION) 625 err =
273 v9fs_put_idpool(tid, &v9ses->tidpool); 626 v9fs_deserialize_fcall(m->rbuf, n, m->rcall, *m->extended);
274 dprintk(DEBUG_MUX, "error %d\n", ret); 627 if (err < 0) {
275 return ret; 628 goto error;
276 } 629 }
630
631 rcall = m->rcall;
632 rbuf = m->rbuf;
633 if (m->rpos > n) {
634 m->rcall = kmalloc(sizeof(struct v9fs_fcall) + m->msize,
635 GFP_KERNEL);
636 if (!m->rcall) {
637 err = -ENOMEM;
638 goto error;
639 }
277 640
278 ret = v9fs_recv(v9ses, &req); 641 m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall);
279 642 memmove(m->rbuf, rbuf + n, m->rpos - n);
280 fcall = req.rcall; 643 m->rpos -= n;
281 644 } else {
282 dprintk(DEBUG_MUX, "received: tag=%x, ret=%d\n", tcall->tag, ret); 645 m->rcall = NULL;
283 if (ret == -ERESTARTSYS) { 646 m->rbuf = NULL;
284 if (v9ses->transport->status != Disconnected 647 m->rpos = 0;
285 && tcall->id != TFLUSH) {
286 unsigned long flags;
287
288 dprintk(DEBUG_MUX, "flushing the tag: %d\n",
289 tcall->tag);
290 clear_thread_flag(TIF_SIGPENDING);
291 v9fs_t_flush(v9ses, tcall->tag);
292 spin_lock_irqsave(&current->sighand->siglock, flags);
293 recalc_sigpending();
294 spin_unlock_irqrestore(&current->sighand->siglock,
295 flags);
296 dprintk(DEBUG_MUX, "flushing done\n");
297 } 648 }
298 649
299 goto release_req; 650 dprintk(DEBUG_MUX, "mux %p fcall id %d tag %d\n", m, rcall->id,
300 } else if (ret < 0) 651 rcall->tag);
301 goto release_req; 652
302 653 req = NULL;
303 if (!fcall) 654 spin_lock(&m->lock);
304 ret = -EIO; 655 list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
305 else { 656 if (rreq->tag == rcall->tag) {
306 if (fcall->id == RERROR) { 657 req = rreq;
307 ret = v9fs_errstr2errno(fcall->params.rerror.error); 658 req->rcall = rcall;
308 if (ret == 0) { /* string match failed */ 659 list_del(&req->req_list);
309 if (fcall->params.rerror.errno) 660 spin_unlock(&m->lock);
310 ret = -(fcall->params.rerror.errno); 661 process_request(m, req);
311 else 662 break;
312 ret = -ESERVERFAULT;
313 } 663 }
314 } else if (fcall->id != tcall->id + 1) { 664
315 dprintk(DEBUG_ERROR, 665 }
316 "fcall mismatch: expected %d, got %d\n", 666
317 tcall->id + 1, fcall->id); 667 if (!req) {
318 ret = -EIO; 668 spin_unlock(&m->lock);
669 if (err >= 0 && rcall->id != RFLUSH)
670 dprintk(DEBUG_ERROR,
671 "unexpected response mux %p id %d tag %d\n",
672 m, rcall->id, rcall->tag);
673 kfree(rcall);
319 } 674 }
320 } 675 }
321 676
322 release_req: 677 if (!list_empty(&m->req_list)) {
323 if (tcall->id != TVERSION) 678 if (test_and_clear_bit(Rpending, &m->wsched))
324 v9fs_put_idpool(tid, &v9ses->tidpool); 679 n = POLLIN;
325 if (rcall) 680 else
326 *rcall = fcall; 681 n = m->trans->poll(m->trans, NULL);
327 else 682
328 kfree(fcall); 683 if (n & POLLIN) {
684 dprintk(DEBUG_MUX, "schedule read work mux %p\n", m);
685 queue_work(v9fs_mux_wq, &m->rq);
686 } else
687 clear_bit(Rworksched, &m->wsched);
688 } else
689 clear_bit(Rworksched, &m->wsched);
690
691 return;
329 692
330 return ret; 693 error:
694 v9fs_mux_cancel(m, err);
695 clear_bit(Rworksched, &m->wsched);
331} 696}
332 697
333/**
334 * v9fs_mux_cancel_requests - cancels all pending requests
335 *
336 * @v9ses: session info structure
337 * @err: error code to return to the requests
338 */
698/**
699 * v9fs_send_request - send 9P request
700 * The function can sleep until the request is scheduled for sending.
701 * The function can be interrupted. Return from the function is not
702 * a guarantee that the request is sent successfully. Can return errors
703 * that can be retrieved by PTR_ERR macros.
704 *
705 * @m: mux data
706 * @tc: request to be sent
707 * @cb: callback function to call when response is received
708 * @cba: parameter to pass to the callback function
709 */
339void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err) 710static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m,
711 struct v9fs_fcall *tc,
712 v9fs_mux_req_callback cb, void *cba)
340{ 713{
341 struct v9fs_rpcreq *rptr; 714 int n;
342 struct v9fs_rpcreq *rreq; 715 struct v9fs_req *req;
343 716
344 dprintk(DEBUG_MUX, " %d\n", err); 717 dprintk(DEBUG_MUX, "mux %p task %p tcall %p id %d\n", m, current,
345 spin_lock(&v9ses->muxlock); 718 tc, tc->id);
346 list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) { 719 if (m->err < 0)
347 rreq->err = err; 720 return ERR_PTR(m->err);
348 }
349 spin_unlock(&v9ses->muxlock);
350 wake_up_all(&v9ses->read_wait);
351}
352 721
353/** 722 req = kmalloc(sizeof(struct v9fs_req), GFP_KERNEL);
354 * v9fs_recvproc - kproc to handle demultiplexing responses 723 if (!req)
355 * @data: session info structure 724 return ERR_PTR(-ENOMEM);
356 *
357 */
358 725
359static int v9fs_recvproc(void *data) 726 if (tc->id == TVERSION)
360{ 727 n = V9FS_NOTAG;
361 struct v9fs_session_info *v9ses = (struct v9fs_session_info *)data; 728 else
362 struct v9fs_fcall *rcall = NULL; 729 n = v9fs_mux_get_tag(m);
363 struct v9fs_rpcreq *rptr;
364 struct v9fs_rpcreq *req;
365 struct v9fs_rpcreq *rreq;
366 int err = 0;
367 730
368 allow_signal(SIGKILL); 731 if (n < 0)
369 set_current_state(TASK_INTERRUPTIBLE); 732 return ERR_PTR(-ENOMEM);
370 complete(&v9ses->proccmpl);
371 while (!kthread_should_stop() && err >= 0) {
372 req = rptr = rreq = NULL;
373
374 rcall = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL);
375 if (!rcall) {
376 eprintk(KERN_ERR, "no memory for buffers\n");
377 break;
378 }
379 733
380 err = read_message(v9ses, rcall, v9ses->maxdata + V9FS_IOHDRSZ); 734 v9fs_set_tag(tc, n);
381 spin_lock(&v9ses->muxlock);
382 if (err < 0) {
383 list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
384 rreq->err = err;
385 }
386 if(err != -ERESTARTSYS)
387 eprintk(KERN_ERR,
388 "Transport error while reading message %d\n", err);
389 } else {
390 list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
391 if (rreq->tcall->tag == rcall->tag) {
392 req = rreq;
393 req->rcall = rcall;
394 break;
395 }
396 }
397 }
398 735
399 if (req && (req->tcall->id == TFLUSH)) { 736 req->tag = n;
400 struct v9fs_rpcreq *treq = NULL; 737 req->tcall = tc;
401 list_for_each_entry_safe(treq, rptr, &v9ses->mux_fcalls, next) { 738 req->rcall = NULL;
402 if (treq->tcall->tag == 739 req->err = 0;
403 req->tcall->params.tflush.oldtag) { 740 req->cb = cb;
404 list_del(&rptr->next); 741 req->cba = cba;
405 kfree(treq->rcall); 742
406 break; 743 spin_lock(&m->lock);
407 } 744 list_add_tail(&req->req_list, &m->unsent_req_list);
745 spin_unlock(&m->lock);
746
747 if (test_and_clear_bit(Wpending, &m->wsched))
748 n = POLLOUT;
749 else
750 n = m->trans->poll(m->trans, NULL);
751
752 if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
753 queue_work(v9fs_mux_wq, &m->wq);
754
755 return req;
756}
757
758static inline void
759v9fs_mux_flush_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc,
760 int err)
761{
762 v9fs_mux_req_callback cb;
763 int tag;
764 struct v9fs_mux_data *m;
765 struct v9fs_req *req, *rptr;
766
767 m = a;
768 dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m, tc,
769 rc, err, tc->params.tflush.oldtag);
770
771 spin_lock(&m->lock);
772 cb = NULL;
773 tag = tc->params.tflush.oldtag;
774 list_for_each_entry_safe(req, rptr, &m->req_list, req_list) {
775 if (req->tag == tag) {
776 list_del(&req->req_list);
777 if (req->cb) {
778 cb = req->cb;
779 req->cb = NULL;
780 spin_unlock(&m->lock);
781 (*cb) (req->cba, req->tcall, req->rcall,
782 req->err);
408 } 783 }
784 kfree(req);
785 wake_up(&m->equeue);
786 break;
409 } 787 }
788 }
410 789
411 spin_unlock(&v9ses->muxlock); 790 if (!cb)
791 spin_unlock(&m->lock);
412 792
413 if (!req) { 793 v9fs_mux_put_tag(m, tag);
414 if (err >= 0) 794 kfree(tc);
415 dprintk(DEBUG_ERROR, 795 kfree(rc);
416 "unexpected response: id %d tag %d\n", 796}
417 rcall->id, rcall->tag);
418 797
419 kfree(rcall); 798static void
420 } 799v9fs_mux_flush_request(struct v9fs_mux_data *m, struct v9fs_req *req)
800{
801 struct v9fs_fcall *fc;
421 802
422 wake_up_all(&v9ses->read_wait); 803 dprintk(DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag);
423 set_current_state(TASK_INTERRUPTIBLE); 804
805 fc = v9fs_create_tflush(req->tag);
806 v9fs_send_request(m, fc, v9fs_mux_flush_cb, m);
807}
808
809static void
810v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err)
811{
812 struct v9fs_mux_rpc *r;
813
814 if (err == ERREQFLUSH) {
815 dprintk(DEBUG_MUX, "err req flush\n");
816 return;
424 } 817 }
425 818
426 v9ses->transport->close(v9ses->transport); 819 r = a;
820 dprintk(DEBUG_MUX, "mux %p req %p tc %p rc %p err %d\n", r->m, r->req,
821 tc, rc, err);
822 r->rcall = rc;
823 r->err = err;
824 wake_up(&r->wqueue);
825}
427 826
428	/* Inform all pending processes about the failure */
429	wake_up_all(&v9ses->read_wait);
827/**
828 * v9fs_mux_rpc - sends 9P request and waits until a response is available.
829 * The function can be interrupted.
830 * @m: mux data
831 * @tc: request to be sent
832 * @rc: pointer where a pointer to the response is stored
833 */
834int
835v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc,
836 struct v9fs_fcall **rc)
837{
838 int err;
839 unsigned long flags;
840 struct v9fs_req *req;
841 struct v9fs_mux_rpc r;
842
843 r.err = 0;
844 r.rcall = NULL;
845 r.m = m;
846 init_waitqueue_head(&r.wqueue);
847
848 if (rc)
849 *rc = NULL;
850
851 req = v9fs_send_request(m, tc, v9fs_mux_rpc_cb, &r);
852 if (IS_ERR(req)) {
853 err = PTR_ERR(req);
854 dprintk(DEBUG_MUX, "error %d\n", err);
855 return PTR_ERR(req);
856 }
430 857
431 if (signal_pending(current)) 858 r.req = req;
432 complete(&v9ses->proccmpl); 859 dprintk(DEBUG_MUX, "mux %p tc %p tag %d rpc %p req %p\n", m, tc,
860 req->tag, &r, req);
861 err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0);
862 if (r.err < 0)
863 err = r.err;
864
865 if (err == -ERESTARTSYS && m->trans->status == Connected && m->err == 0) {
866 spin_lock(&m->lock);
867 req->tcall = NULL;
868 req->err = ERREQFLUSH;
869 spin_unlock(&m->lock);
870
871 clear_thread_flag(TIF_SIGPENDING);
872 v9fs_mux_flush_request(m, req);
873 spin_lock_irqsave(&current->sighand->siglock, flags);
874 recalc_sigpending();
875 spin_unlock_irqrestore(&current->sighand->siglock, flags);
876 }
433 877
434 dprintk(DEBUG_MUX, "recvproc: end\n"); 878 if (!err) {
435 v9ses->recvproc = NULL; 879 if (r.rcall)
880 dprintk(DEBUG_MUX, "got response id %d tag %d\n",
881 r.rcall->id, r.rcall->tag);
882
883 if (rc)
884 *rc = r.rcall;
885 else
886 kfree(r.rcall);
887 } else {
888 kfree(r.rcall);
889 dprintk(DEBUG_MUX, "got error %d\n", err);
890 if (err > 0)
891 err = -EIO;
892 }
436 893
437 return err >= 0; 894 return err;
438} 895}
439 896
440/**
441 * v9fs_mux_init - initialize multiplexer (spawn kproc)
442 * @v9ses: session info structure
443 * @dev_name: mount device information (to create unique kproc)
444 *
445 */
897/**
898 * v9fs_mux_rpcnb - sends 9P request without waiting for response.
899 * @m: mux data
900 * @tc: request to be sent
901 * @cb: callback function to be called when response arrives
902 * @cba: value to pass to the callback function
903 */
904int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc,
905 v9fs_mux_req_callback cb, void *a)
906{
907 int err;
908 struct v9fs_req *req;
909
910 req = v9fs_send_request(m, tc, cb, a);
911 if (IS_ERR(req)) {
912 err = PTR_ERR(req);
913 dprintk(DEBUG_MUX, "error %d\n", err);
914 return PTR_ERR(req);
915 }
916
917 dprintk(DEBUG_MUX, "mux %p tc %p tag %d\n", m, tc, req->tag);
918 return 0;
919}
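A hypothetical non-blocking caller, to contrast with the synchronous path above; the callback runs from the mux workqueue and must not block (the demo_* names are invented):

/* Sketch: fire-and-forget Tclunk with cleanup in the callback. */
static void demo_clunk_done(void *a, struct v9fs_fcall *tc,
			    struct v9fs_fcall *rc, int err)
{
	kfree(tc);
	kfree(rc);
}

static int demo_clunk_async(struct v9fs_mux_data *m, u32 fid)
{
	struct v9fs_fcall *tc = v9fs_create_tclunk(fid);

	if (IS_ERR(tc))
		return PTR_ERR(tc);

	return v9fs_mux_rpcnb(m, tc, demo_clunk_done, NULL);
}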
446 920
447int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name)
921/**
922 * v9fs_mux_cancel - cancel all pending requests with error
923 * @m: mux data
924 * @err: error code
925 */
926void v9fs_mux_cancel(struct v9fs_mux_data *m, int err)
448{ 927{
449 char procname[60]; 928 struct v9fs_req *req, *rtmp;
450 929 LIST_HEAD(cancel_list);
451 strncpy(procname, dev_name, sizeof(procname)); 930
452 procname[sizeof(procname) - 1] = 0; 931 dprintk(DEBUG_MUX, "mux %p err %d\n", m, err);
453 932 m->err = err;
454 init_waitqueue_head(&v9ses->read_wait); 933 spin_lock(&m->lock);
455 init_completion(&v9ses->fcread); 934 list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
456 init_completion(&v9ses->proccmpl); 935 list_move(&req->req_list, &cancel_list);
457 spin_lock_init(&v9ses->muxlock);
458 INIT_LIST_HEAD(&v9ses->mux_fcalls);
459 v9ses->recvproc = NULL;
460 v9ses->curfcall = NULL;
461
462 v9ses->recvproc = kthread_create(v9fs_recvproc, v9ses,
463 "v9fs_recvproc %s", procname);
464
465 if (IS_ERR(v9ses->recvproc)) {
466 eprintk(KERN_ERR, "cannot create receiving thread\n");
467 v9fs_session_close(v9ses);
468 return -ECONNABORTED;
469 } 936 }
937 spin_unlock(&m->lock);
470 938
471 wake_up_process(v9ses->recvproc); 939 list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
472 wait_for_completion(&v9ses->proccmpl); 940 list_del(&req->req_list);
941 if (!req->err)
942 req->err = err;
473 943
474 return 0; 944 if (req->cb)
945 (*req->cb) (req->cba, req->tcall, req->rcall, req->err);
946 else
947 kfree(req->rcall);
948
949 kfree(req);
950 }
951
952 wake_up(&m->equeue);
953}
954
955static u16 v9fs_mux_get_tag(struct v9fs_mux_data *m)
956{
957 int tag;
958
959 tag = v9fs_get_idpool(&m->tidpool);
960 if (tag < 0)
961 return V9FS_NOTAG;
962 else
963 return (u16) tag;
964}
965
966static void v9fs_mux_put_tag(struct v9fs_mux_data *m, u16 tag)
967{
968 if (tag != V9FS_NOTAG && v9fs_check_idpool(tag, &m->tidpool))
969 v9fs_put_idpool(tag, &m->tidpool);
475}
970}
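To summarize the tag discipline these two helpers enforce: every in-flight request owns a unique 16-bit tag from the idpool, TVERSION alone uses the reserved V9FS_NOTAG, and tags return to the pool when the response is processed. A sketch, written as if inside mux.c since both helpers are static:

/* Sketch: a full allocate/release cycle through the tag pool. */
static void demo_tag_cycle(struct v9fs_mux_data *m)
{
	u16 tag = v9fs_mux_get_tag(m);	/* V9FS_NOTAG when exhausted */

	if (tag != V9FS_NOTAG)
		v9fs_mux_put_tag(m, tag);
}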
diff --git a/fs/9p/mux.h b/fs/9p/mux.h
index 4994cb10bad..9473b84f24b 100644
--- a/fs/9p/mux.h
+++ b/fs/9p/mux.h
@@ -3,6 +3,7 @@
3 * 3 *
4 * Multiplexer Definitions 4 * Multiplexer Definitions
5 * 5 *
6 * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> 7 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
7 * 8 *
8 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
@@ -23,19 +24,35 @@
23 * 24 *
24 */ 25 */
25 26
26/* structure to manage each RPC transaction */ 27struct v9fs_mux_data;
27 28
28struct v9fs_rpcreq { 29/**
29 struct v9fs_fcall *tcall; 30 * v9fs_mux_req_callback - callback function that is called when the
30 struct v9fs_fcall *rcall; 31 * response of a request is received. The callback is called from
31 int err; /* error code if response failed */ 32 * a workqueue and shouldn't block.
33 *
34 * @a - the pointer that was specified when the request was sent to be
35 * passed to the callback
36 * @tc - request call
37 * @rc - response call
38 * @err - error code (non-zero if error occurred)
39 */
40typedef void (*v9fs_mux_req_callback)(void *a, struct v9fs_fcall *tc,
41 struct v9fs_fcall *rc, int err);
42
43int v9fs_mux_global_init(void);
44void v9fs_mux_global_exit(void);
32 45
33 /* XXX - could we put scatter/gather buffers here? */ 46struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize,
47 unsigned char *extended);
48void v9fs_mux_destroy(struct v9fs_mux_data *);
34 49
35 struct list_head next; 50int v9fs_mux_send(struct v9fs_mux_data *m, struct v9fs_fcall *tc);
36}; 51struct v9fs_fcall *v9fs_mux_recv(struct v9fs_mux_data *m);
52int v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, struct v9fs_fcall **rc);
53int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc,
54 v9fs_mux_req_callback cb, void *a);
37 55
38int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name); 56void v9fs_mux_flush(struct v9fs_mux_data *m, int sendflush);
39long v9fs_mux_rpc(struct v9fs_session_info *v9ses, 57void v9fs_mux_cancel(struct v9fs_mux_data *m, int err);
40 struct v9fs_fcall *tcall, struct v9fs_fcall **rcall); 58int v9fs_errstr2errno(char *errstr, int len);
41void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err);
diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c
index 63b58ce98ff..1a28ef97a3d 100644
--- a/fs/9p/trans_fd.c
+++ b/fs/9p/trans_fd.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * File Descriptor Transport Layer 4 * File Descriptor Transport Layer
5 * 5 *
6 * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
6 * Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com> 7 * Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
7 * 8 *
8 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
@@ -106,9 +107,6 @@ v9fs_fd_init(struct v9fs_session_info *v9ses, const char *addr, char *data)
106 return -ENOPROTOOPT; 107 return -ENOPROTOOPT;
107 } 108 }
108 109
109 sema_init(&trans->writelock, 1);
110 sema_init(&trans->readlock, 1);
111
112 ts = kmalloc(sizeof(struct v9fs_trans_fd), GFP_KERNEL); 110 ts = kmalloc(sizeof(struct v9fs_trans_fd), GFP_KERNEL);
113 111
114 if (!ts) 112 if (!ts)
@@ -148,12 +146,12 @@ static void v9fs_fd_close(struct v9fs_transport *trans)
148 if (!trans) 146 if (!trans)
149 return; 147 return;
150 148
151 trans->status = Disconnected; 149 ts = xchg(&trans->priv, NULL);
152 ts = trans->priv;
153 150
154 if (!ts) 151 if (!ts)
155 return; 152 return;
156 153
154 trans->status = Disconnected;
157 if (ts->in_file) 155 if (ts->in_file)
158 fput(ts->in_file); 156 fput(ts->in_file);
159 157
@@ -163,10 +161,55 @@ static void v9fs_fd_close(struct v9fs_transport *trans)
163 kfree(ts); 161 kfree(ts);
164} 162}
165 163
164static unsigned int
165v9fs_fd_poll(struct v9fs_transport *trans, struct poll_table_struct *pt)
166{
167 int ret, n;
168 struct v9fs_trans_fd *ts;
169 mm_segment_t oldfs;
170
171 if (!trans)
172 return -EIO;
173
174 ts = trans->priv;
175 if (trans->status != Connected || !ts)
176 return -EIO;
177
178 oldfs = get_fs();
179 set_fs(get_ds());
180
181 if (!ts->in_file->f_op || !ts->in_file->f_op->poll) {
182 ret = -EIO;
183 goto end;
184 }
185
186 ret = ts->in_file->f_op->poll(ts->in_file, pt);
187
188 if (ts->out_file != ts->in_file) {
189 if (!ts->out_file->f_op || !ts->out_file->f_op->poll) {
190 ret = -EIO;
191 goto end;
192 }
193
194 n = ts->out_file->f_op->poll(ts->out_file, pt);
195
196 ret &= ~POLLOUT;
197 n &= ~POLLIN;
198
199 ret |= n;
200 }
201
202end:
203 set_fs(oldfs);
204 return ret;
205}
206
207
166struct v9fs_transport v9fs_trans_fd = { 208struct v9fs_transport v9fs_trans_fd = {
167 .init = v9fs_fd_init, 209 .init = v9fs_fd_init,
168 .write = v9fs_fd_send, 210 .write = v9fs_fd_send,
169 .read = v9fs_fd_recv, 211 .read = v9fs_fd_recv,
170 .close = v9fs_fd_close, 212 .close = v9fs_fd_close,
213 .poll = v9fs_fd_poll,
171}; 214};
172 215
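
The subtle part of v9fs_fd_poll is the mask merge for split read/write descriptors: only the POLLIN-side bits of the in_file result and only the POLLOUT-side bits of the out_file result are kept before OR-ing them together. A compilable user-space reduction of just that bit manipulation:

#include <poll.h>
#include <stdio.h>

static unsigned int merge_poll(unsigned int in_mask, unsigned int out_mask)
{
        in_mask &= ~POLLOUT;    /* readability comes from the read side */
        out_mask &= ~POLLIN;    /* writability comes from the write side */
        return in_mask | out_mask;
}

int main(void)
{
        /* read side is readable, write side is writable */
        unsigned int m = merge_poll(POLLIN, POLLOUT);
        printf("merged mask: %#x\n", m);        /* POLLIN | POLLOUT */
        return 0;
}
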
diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c
index a93c2bf94c3..44e830697ac 100644
--- a/fs/9p/trans_sock.c
+++ b/fs/9p/trans_sock.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * Socket Transport Layer 4 * Socket Transport Layer
5 * 5 *
6 * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> 7 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
7 * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com> 8 * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
8 * Copyright (C) 1995, 1996 by Olaf Kirch <okir@monad.swb.de> 9 * Copyright (C) 1995, 1996 by Olaf Kirch <okir@monad.swb.de>
@@ -26,6 +27,7 @@
26 */ 27 */
27 28
28#include <linux/config.h> 29#include <linux/config.h>
30#include <linux/in.h>
29#include <linux/module.h> 31#include <linux/module.h>
30#include <linux/net.h> 32#include <linux/net.h>
31#include <linux/ipv6.h> 33#include <linux/ipv6.h>
@@ -35,6 +37,7 @@
35#include <asm/uaccess.h> 37#include <asm/uaccess.h>
36#include <linux/inet.h> 38#include <linux/inet.h>
37#include <linux/idr.h> 39#include <linux/idr.h>
40#include <linux/file.h>
38 41
39#include "debug.h" 42#include "debug.h"
40#include "v9fs.h" 43#include "v9fs.h"
@@ -44,6 +47,7 @@
44 47
45struct v9fs_trans_sock { 48struct v9fs_trans_sock {
46 struct socket *s; 49 struct socket *s;
50 struct file *filp;
47}; 51};
48 52
49/** 53/**
@@ -56,41 +60,26 @@ struct v9fs_trans_sock {
56 60
57static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len) 61static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len)
58{ 62{
59 struct msghdr msg; 63 int ret;
60 struct kvec iov; 64 struct v9fs_trans_sock *ts;
61 int result;
62 mm_segment_t oldfs;
63 struct v9fs_trans_sock *ts = trans ? trans->priv : NULL;
64 65
65 if (trans->status == Disconnected) 66 if (!trans || trans->status == Disconnected) {
67 dprintk(DEBUG_ERROR, "disconnected ...\n");
66 return -EREMOTEIO; 68 return -EREMOTEIO;
69 }
67 70
68 result = -EINVAL; 71 ts = trans->priv;
69
70 oldfs = get_fs();
71 set_fs(get_ds());
72
73 iov.iov_base = v;
74 iov.iov_len = len;
75 msg.msg_name = NULL;
76 msg.msg_namelen = 0;
77 msg.msg_iovlen = 1;
78 msg.msg_control = NULL;
79 msg.msg_controllen = 0;
80 msg.msg_namelen = 0;
81 msg.msg_flags = MSG_NOSIGNAL;
82 72
83 result = kernel_recvmsg(ts->s, &msg, &iov, 1, len, 0); 73 if (!(ts->filp->f_flags & O_NONBLOCK))
74 dprintk(DEBUG_ERROR, "blocking read ...\n");
84 75
85 dprintk(DEBUG_TRANS, "socket state %d\n", ts->s->state); 76 ret = kernel_read(ts->filp, ts->filp->f_pos, v, len);
86 set_fs(oldfs); 77 if (ret <= 0) {
87 78 if (ret != -ERESTARTSYS && ret != -EAGAIN)
88 if (result <= 0) {
89 if (result != -ERESTARTSYS)
90 trans->status = Disconnected; 79 trans->status = Disconnected;
91 } 80 }
92 81
93 return result; 82 return ret;
94} 83}
95 84
96/** 85/**
@@ -103,40 +92,72 @@ static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len)
103 92
104static int v9fs_sock_send(struct v9fs_transport *trans, void *v, int len) 93static int v9fs_sock_send(struct v9fs_transport *trans, void *v, int len)
105{ 94{
106 struct kvec iov; 95 int ret;
107 struct msghdr msg;
108 int result = -1;
109 mm_segment_t oldfs; 96 mm_segment_t oldfs;
110 struct v9fs_trans_sock *ts = trans ? trans->priv : NULL; 97 struct v9fs_trans_sock *ts;
111 98
112 dprintk(DEBUG_TRANS, "Sending packet size %d (%x)\n", len, len); 99 if (!trans || trans->status == Disconnected) {
113 dump_data(v, len); 100 dprintk(DEBUG_ERROR, "disconnected ...\n");
101 return -EREMOTEIO;
102 }
103
104 ts = trans->priv;
105 if (!ts) {
106 dprintk(DEBUG_ERROR, "no transport ...\n");
107 return -EREMOTEIO;
108 }
114 109
115 down(&trans->writelock); 110 if (!(ts->filp->f_flags & O_NONBLOCK))
111 dprintk(DEBUG_ERROR, "blocking write ...\n");
116 112
117 oldfs = get_fs(); 113 oldfs = get_fs();
118 set_fs(get_ds()); 114 set_fs(get_ds());
119 iov.iov_base = v; 115 ret = vfs_write(ts->filp, (void __user *)v, len, &ts->filp->f_pos);
120 iov.iov_len = len;
121 msg.msg_name = NULL;
122 msg.msg_namelen = 0;
123 msg.msg_iovlen = 1;
124 msg.msg_control = NULL;
125 msg.msg_controllen = 0;
126 msg.msg_namelen = 0;
127 msg.msg_flags = MSG_NOSIGNAL;
128 result = kernel_sendmsg(ts->s, &msg, &iov, 1, len);
129 set_fs(oldfs); 116 set_fs(oldfs);
130 117
131 if (result < 0) { 118 if (ret < 0) {
132 if (result != -ERESTARTSYS) 119 if (ret != -ERESTARTSYS)
133 trans->status = Disconnected; 120 trans->status = Disconnected;
134 } 121 }
135 122
136 up(&trans->writelock); 123 return ret;
137 return result; 124}
125
126static unsigned int v9fs_sock_poll(struct v9fs_transport *trans,
127 struct poll_table_struct *pt) {
128
129 int ret;
130 struct v9fs_trans_sock *ts;
131 mm_segment_t oldfs;
132
133 if (!trans) {
134 dprintk(DEBUG_ERROR, "no transport\n");
135 return -EIO;
136 }
137
138 ts = trans->priv;
139 if (trans->status != Connected || !ts) {
140 dprintk(DEBUG_ERROR, "transport disconnected: %d\n", trans->status);
141 return -EIO;
142 }
143
144 oldfs = get_fs();
145 set_fs(get_ds());
146
147 if (!ts->filp->f_op || !ts->filp->f_op->poll) {
148 dprintk(DEBUG_ERROR, "no poll operation\n");
149 ret = -EIO;
150 goto end;
151 }
152
153 ret = ts->filp->f_op->poll(ts->filp, pt);
154
155end:
156 set_fs(oldfs);
157 return ret;
138} 158}
139 159
160
140/** 161/**
141 * v9fs_tcp_init - initialize TCP socket 162 * v9fs_tcp_init - initialize TCP socket
142 * @v9ses: session information 163 * @v9ses: session information
@@ -153,9 +174,9 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data)
153 int rc = 0; 174 int rc = 0;
154 struct v9fs_trans_sock *ts = NULL; 175 struct v9fs_trans_sock *ts = NULL;
155 struct v9fs_transport *trans = v9ses->transport; 176 struct v9fs_transport *trans = v9ses->transport;
177 int fd;
156 178
157 sema_init(&trans->writelock, 1); 179 trans->status = Disconnected;
158 sema_init(&trans->readlock, 1);
159 180
160 ts = kmalloc(sizeof(struct v9fs_trans_sock), GFP_KERNEL); 181 ts = kmalloc(sizeof(struct v9fs_trans_sock), GFP_KERNEL);
161 182
@@ -164,6 +185,7 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data)
164 185
165 trans->priv = ts; 186 trans->priv = ts;
166 ts->s = NULL; 187 ts->s = NULL;
188 ts->filp = NULL;
167 189
168 if (!addr) 190 if (!addr)
169 return -EINVAL; 191 return -EINVAL;
@@ -184,7 +206,18 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data)
184 return rc; 206 return rc;
185 } 207 }
186 csocket->sk->sk_allocation = GFP_NOIO; 208 csocket->sk->sk_allocation = GFP_NOIO;
209
210 fd = sock_map_fd(csocket);
211 if (fd < 0) {
212 sock_release(csocket);
213 kfree(ts);
214 trans->priv = NULL;
215 return fd;
216 }
217
187 ts->s = csocket; 218 ts->s = csocket;
219 ts->filp = fget(fd);
220 ts->filp->f_flags |= O_NONBLOCK;
188 trans->status = Connected; 221 trans->status = Connected;
189 222
190 return 0; 223 return 0;
@@ -202,7 +235,7 @@ static int
202v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, 235v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name,
203 char *data) 236 char *data)
204{ 237{
205 int rc; 238 int rc, fd;
206 struct socket *csocket; 239 struct socket *csocket;
207 struct sockaddr_un sun_server; 240 struct sockaddr_un sun_server;
208 struct v9fs_transport *trans; 241 struct v9fs_transport *trans;
@@ -212,6 +245,8 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name,
212 csocket = NULL; 245 csocket = NULL;
213 trans = v9ses->transport; 246 trans = v9ses->transport;
214 247
248 trans->status = Disconnected;
249
215 if (strlen(dev_name) > UNIX_PATH_MAX) { 250 if (strlen(dev_name) > UNIX_PATH_MAX) {
216 eprintk(KERN_ERR, "v9fs_trans_unix: address too long: %s\n", 251 eprintk(KERN_ERR, "v9fs_trans_unix: address too long: %s\n",
217 dev_name); 252 dev_name);
@@ -224,9 +259,7 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name,
224 259
225 trans->priv = ts; 260 trans->priv = ts;
226 ts->s = NULL; 261 ts->s = NULL;
227 262 ts->filp = NULL;
228 sema_init(&trans->writelock, 1);
229 sema_init(&trans->readlock, 1);
230 263
231 sun_server.sun_family = PF_UNIX; 264 sun_server.sun_family = PF_UNIX;
232 strcpy(sun_server.sun_path, dev_name); 265 strcpy(sun_server.sun_path, dev_name);
@@ -240,7 +273,18 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name,
240 return rc; 273 return rc;
241 } 274 }
242 csocket->sk->sk_allocation = GFP_NOIO; 275 csocket->sk->sk_allocation = GFP_NOIO;
276
277 fd = sock_map_fd(csocket);
278 if (fd < 0) {
279 sock_release(csocket);
280 kfree(ts);
281 trans->priv = NULL;
282 return fd;
283 }
284
243 ts->s = csocket; 285 ts->s = csocket;
286 ts->filp = fget(fd);
287 ts->filp->f_flags |= O_NONBLOCK;
244 trans->status = Connected; 288 trans->status = Connected;
245 289
246 return 0; 290 return 0;
@@ -261,12 +305,11 @@ static void v9fs_sock_close(struct v9fs_transport *trans)
261 305
262 ts = trans->priv; 306 ts = trans->priv;
263 307
264 if ((ts) && (ts->s)) { 308 if ((ts) && (ts->filp)) {
265 dprintk(DEBUG_TRANS, "closing the socket %p\n", ts->s); 309 fput(ts->filp);
266 sock_release(ts->s); 310 ts->filp = NULL;
267 ts->s = NULL; 311 ts->s = NULL;
268 trans->status = Disconnected; 312 trans->status = Disconnected;
269 dprintk(DEBUG_TRANS, "socket closed\n");
270 } 313 }
271 314
272 kfree(ts); 315 kfree(ts);
@@ -279,6 +322,7 @@ struct v9fs_transport v9fs_trans_tcp = {
279 .write = v9fs_sock_send, 322 .write = v9fs_sock_send,
280 .read = v9fs_sock_recv, 323 .read = v9fs_sock_recv,
281 .close = v9fs_sock_close, 324 .close = v9fs_sock_close,
325 .poll = v9fs_sock_poll,
282}; 326};
283 327
284struct v9fs_transport v9fs_trans_unix = { 328struct v9fs_transport v9fs_trans_unix = {
@@ -286,4 +330,5 @@ struct v9fs_transport v9fs_trans_unix = {
286 .write = v9fs_sock_send, 330 .write = v9fs_sock_send,
287 .read = v9fs_sock_recv, 331 .read = v9fs_sock_recv,
288 .close = v9fs_sock_close, 332 .close = v9fs_sock_close,
333 .poll = v9fs_sock_poll,
289}; 334};
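
Both socket initializers now wrap the connected socket in a struct file (sock_map_fd plus fget) and force O_NONBLOCK, so the mux can drive it from poll callbacks without sleeping in read or write. The user-space equivalent of that setup is a one-line fcntl on a stream socket; a sketch:

#include <fcntl.h>
#include <sys/socket.h>
#include <unistd.h>

static int nonblocking_stream_socket(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        /* mirrors ts->filp->f_flags |= O_NONBLOCK in the patch */
        if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}
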
diff --git a/fs/9p/transport.h b/fs/9p/transport.h
index 9e9cd418efd..91fcdb94b36 100644
--- a/fs/9p/transport.h
+++ b/fs/9p/transport.h
@@ -3,6 +3,7 @@
3 * 3 *
4 * Transport Definition 4 * Transport Definition
5 * 5 *
6 * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net>
6 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> 7 * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
7 * 8 *
8 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
@@ -31,14 +32,13 @@ enum v9fs_transport_status {
31 32
32struct v9fs_transport { 33struct v9fs_transport {
33 enum v9fs_transport_status status; 34 enum v9fs_transport_status status;
34 struct semaphore writelock;
35 struct semaphore readlock;
36 void *priv; 35 void *priv;
37 36
38 int (*init) (struct v9fs_session_info *, const char *, char *); 37 int (*init) (struct v9fs_session_info *, const char *, char *);
39 int (*write) (struct v9fs_transport *, void *, int); 38 int (*write) (struct v9fs_transport *, void *, int);
40 int (*read) (struct v9fs_transport *, void *, int); 39 int (*read) (struct v9fs_transport *, void *, int);
41 void (*close) (struct v9fs_transport *); 40 void (*close) (struct v9fs_transport *);
41 unsigned int (*poll)(struct v9fs_transport *, struct poll_table_struct *);
42}; 42};
43 43
44extern struct v9fs_transport v9fs_trans_tcp; 44extern struct v9fs_transport v9fs_trans_tcp;
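
struct v9fs_transport is an ops table, which is why growing the transport layer costs one new function-pointer slot plus an implementation per transport. A stripped-down sketch of the pattern (names invented), showing how a caller can treat a missing hook as an unsupported capability:

struct transport_ops {
        int (*write)(void *priv, const void *buf, int len);
        int (*read)(void *priv, void *buf, int len);
        unsigned int (*poll)(void *priv);       /* the new slot */
};

static int transport_read(const struct transport_ops *t, void *priv,
                          void *buf, int len)
{
        if (!t->read)
                return -1;      /* transport lacks this capability */
        return t->read(priv, buf, len);
}
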
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 418c3743fde..5250c428fc1 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -37,7 +37,6 @@
37#include "v9fs_vfs.h" 37#include "v9fs_vfs.h"
38#include "transport.h" 38#include "transport.h"
39#include "mux.h" 39#include "mux.h"
40#include "conv.h"
41 40
42/* TODO: sysfs or debugfs interface */ 41/* TODO: sysfs or debugfs interface */
43int v9fs_debug_level = 0; /* feature-rific global debug level */ 42int v9fs_debug_level = 0; /* feature-rific global debug level */
@@ -213,7 +212,8 @@ retry:
213 return -1; 212 return -1;
214 } 213 }
215 214
216 error = idr_get_new(&p->pool, NULL, &i); 215 /* no need to store exactly p, we just need something non-null */
216 error = idr_get_new(&p->pool, p, &i);
217 up(&p->lock); 217 up(&p->lock);
218 218
219 if (error == -EAGAIN) 219 if (error == -EAGAIN)
@@ -243,6 +243,16 @@ void v9fs_put_idpool(int id, struct v9fs_idpool *p)
243} 243}
244 244
245/** 245/**
246 * v9fs_check_idpool - check if the specified id is available
247 * @id - id to check
248 * @p - pool
249 */
250int v9fs_check_idpool(int id, struct v9fs_idpool *p)
251{
252 return idr_find(&p->pool, id) != NULL;
253}
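
Storing a non-NULL payload in the idr (the idr_get_new change above) is what makes this membership test work: idr_find() returns the stored pointer, so a NULL payload would make every allocated id look free. A toy model of the invariant, with a plain array standing in for the idr:

#define POOL_MAX 64

static void *slots[POOL_MAX];           /* NULL == free */

static int check_idpool(int id)
{
        return id >= 0 && id < POOL_MAX && slots[id] != NULL;
}

static int get_idpool(void *owner)      /* owner must be non-NULL */
{
        for (int i = 0; i < POOL_MAX; i++)
                if (!slots[i]) {
                        slots[i] = owner;
                        return i;
                }
        return -1;
}
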
254
255/**
246 * v9fs_session_init - initialize session 256 * v9fs_session_init - initialize session
247 * @v9ses: session information structure 257 * @v9ses: session information structure
248 * @dev_name: device being mounted 258 * @dev_name: device being mounted
@@ -259,6 +269,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
259 int n = 0; 269 int n = 0;
260 int newfid = -1; 270 int newfid = -1;
261 int retval = -EINVAL; 271 int retval = -EINVAL;
272 struct v9fs_str *version;
262 273
263 v9ses->name = __getname(); 274 v9ses->name = __getname();
264 if (!v9ses->name) 275 if (!v9ses->name)
@@ -281,9 +292,6 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
281 /* id pools that are session-dependent: FIDs and TIDs */ 292 /* id pools that are session-dependent: FIDs and TIDs */
282 idr_init(&v9ses->fidpool.pool); 293 idr_init(&v9ses->fidpool.pool);
283 init_MUTEX(&v9ses->fidpool.lock); 294 init_MUTEX(&v9ses->fidpool.lock);
284 idr_init(&v9ses->tidpool.pool);
285 init_MUTEX(&v9ses->tidpool.lock);
286
287 295
288 switch (v9ses->proto) { 296 switch (v9ses->proto) {
289 case PROTO_TCP: 297 case PROTO_TCP:
@@ -320,7 +328,12 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
320 v9ses->shutdown = 0; 328 v9ses->shutdown = 0;
321 v9ses->session_hung = 0; 329 v9ses->session_hung = 0;
322 330
323 if ((retval = v9fs_mux_init(v9ses, dev_name)) < 0) { 331 v9ses->mux = v9fs_mux_init(v9ses->transport, v9ses->maxdata + V9FS_IOHDRSZ,
332 &v9ses->extended);
333
334 if (IS_ERR(v9ses->mux)) {
335 retval = PTR_ERR(v9ses->mux);
336 v9ses->mux = NULL;
324 dprintk(DEBUG_ERROR, "problem initializing mux\n"); 337 dprintk(DEBUG_ERROR, "problem initializing mux\n");
325 goto SessCleanUp; 338 goto SessCleanUp;
326 } 339 }
@@ -339,13 +352,16 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
339 goto FreeFcall; 352 goto FreeFcall;
340 } 353 }
341 354
342 /* Really should check for 9P1 and report error */ 355 version = &fcall->params.rversion.version;
343 if (!strcmp(fcall->params.rversion.version, "9P2000.u")) { 356 if (version->len==8 && !memcmp(version->str, "9P2000.u", 8)) {
344 dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n"); 357 dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n");
345 v9ses->extended = 1; 358 v9ses->extended = 1;
346 } else { 359 } else if (version->len==6 && !memcmp(version->str, "9P2000", 6)) {
347 dprintk(DEBUG_9P, "9P2000 legacy mode enabled\n"); 360 dprintk(DEBUG_9P, "9P2000 legacy mode enabled\n");
348 v9ses->extended = 0; 361 v9ses->extended = 0;
362 } else {
363 retval = -EREMOTEIO;
364 goto FreeFcall;
349 } 365 }
350 366
351 n = fcall->params.rversion.msize; 367 n = fcall->params.rversion.msize;
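
The version strings are now counted strings (struct v9fs_str holds a length and an unterminated buffer), hence the len check plus memcmp in place of strcmp, and the new hard failure on unrecognized versions. The comparison one might factor into a helper looks roughly like this (cstr is a made-up name):

#include <string.h>

struct cstr {
        int len;
        const char *str;        /* not NUL-terminated */
};

static int cstr_eq(const struct cstr *s, const char *lit)
{
        size_t n = strlen(lit);

        return s->len == (int)n && memcmp(s->str, lit, n) == 0;
}

/* usage: if (cstr_eq(version, "9P2000.u")) ... enable extensions */
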
@@ -381,7 +397,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
381 } 397 }
382 398
383 if (v9ses->afid != ~0) { 399 if (v9ses->afid != ~0) {
384 if (v9fs_t_clunk(v9ses, v9ses->afid, NULL)) 400 if (v9fs_t_clunk(v9ses, v9ses->afid))
385 dprintk(DEBUG_ERROR, "clunk failed\n"); 401 dprintk(DEBUG_ERROR, "clunk failed\n");
386 } 402 }
387 403
@@ -403,13 +419,16 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
403 419
404void v9fs_session_close(struct v9fs_session_info *v9ses) 420void v9fs_session_close(struct v9fs_session_info *v9ses)
405{ 421{
406 if (v9ses->recvproc) { 422 if (v9ses->mux) {
407 send_sig(SIGKILL, v9ses->recvproc, 1); 423 v9fs_mux_destroy(v9ses->mux);
408 wait_for_completion(&v9ses->proccmpl); 424 v9ses->mux = NULL;
409 } 425 }
410 426
411 if (v9ses->transport) 427 if (v9ses->transport) {
412 v9ses->transport->close(v9ses->transport); 428 v9ses->transport->close(v9ses->transport);
429 kfree(v9ses->transport);
430 v9ses->transport = NULL;
431 }
413 432
414 __putname(v9ses->name); 433 __putname(v9ses->name);
415 __putname(v9ses->remotename); 434 __putname(v9ses->remotename);
@@ -420,8 +439,9 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
420 * and cancel all pending requests. 439 * and cancel all pending requests.
421 */ 440 */
422void v9fs_session_cancel(struct v9fs_session_info *v9ses) { 441void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
442 dprintk(DEBUG_ERROR, "cancel session %p\n", v9ses);
423 v9ses->transport->status = Disconnected; 443 v9ses->transport->status = Disconnected;
424 v9fs_mux_cancel_requests(v9ses, -EIO); 444 v9fs_mux_cancel(v9ses->mux, -EIO);
425} 445}
426 446
427extern int v9fs_error_init(void); 447extern int v9fs_error_init(void);
@@ -433,11 +453,17 @@ extern int v9fs_error_init(void);
433 453
434static int __init init_v9fs(void) 454static int __init init_v9fs(void)
435{ 455{
456 int ret;
457
436 v9fs_error_init(); 458 v9fs_error_init();
437 459
438 printk(KERN_INFO "Installing v9fs 9P2000 file system support\n"); 460 printk(KERN_INFO "Installing v9fs 9P2000 file system support\n");
439 461
440 return register_filesystem(&v9fs_fs_type); 462 ret = v9fs_mux_global_init();
463 if (!ret)
464 ret = register_filesystem(&v9fs_fs_type);
465
466 return ret;
441} 467}
442 468
443/** 469/**
@@ -447,6 +473,7 @@ static int __init init_v9fs(void)
447 473
448static void __exit exit_v9fs(void) 474static void __exit exit_v9fs(void)
449{ 475{
476 v9fs_mux_global_exit();
450 unregister_filesystem(&v9fs_fs_type); 477 unregister_filesystem(&v9fs_fs_type);
451} 478}
452 479
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 45dcef42bdd..f337da7a0ee 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -57,24 +57,14 @@ struct v9fs_session_info {
57 57
58 /* book keeping */ 58 /* book keeping */
59 struct v9fs_idpool fidpool; /* The FID pool for file descriptors */ 59 struct v9fs_idpool fidpool; /* The FID pool for file descriptors */
60 struct v9fs_idpool tidpool; /* The TID pool for transactions ids */
61 60
62 /* transport information */
63 struct v9fs_transport *transport; 61 struct v9fs_transport *transport;
62 struct v9fs_mux_data *mux;
64 63
65 int inprogress; /* session in progress => true */ 64 int inprogress; /* session in progress => true */
66 int shutdown; /* session shutting down. no more attaches. */ 65 int shutdown; /* session shutting down. no more attaches. */
67 unsigned char session_hung; 66 unsigned char session_hung;
68 67 struct dentry *debugfs_dir;
69 /* mux private data */
70 struct v9fs_fcall *curfcall;
71 wait_queue_head_t read_wait;
72 struct completion fcread;
73 struct completion proccmpl;
74 struct task_struct *recvproc;
75
76 spinlock_t muxlock;
77 struct list_head mux_fcalls;
78}; 68};
79 69
80/* possible values of ->proto */ 70/* possible values of ->proto */
@@ -84,11 +74,14 @@ enum {
84 PROTO_FD, 74 PROTO_FD,
85}; 75};
86 76
77extern struct dentry *v9fs_debugfs_root;
78
87int v9fs_session_init(struct v9fs_session_info *, const char *, char *); 79int v9fs_session_init(struct v9fs_session_info *, const char *, char *);
88struct v9fs_session_info *v9fs_inode2v9ses(struct inode *); 80struct v9fs_session_info *v9fs_inode2v9ses(struct inode *);
89void v9fs_session_close(struct v9fs_session_info *v9ses); 81void v9fs_session_close(struct v9fs_session_info *v9ses);
90int v9fs_get_idpool(struct v9fs_idpool *p); 82int v9fs_get_idpool(struct v9fs_idpool *p);
91void v9fs_put_idpool(int id, struct v9fs_idpool *p); 83void v9fs_put_idpool(int id, struct v9fs_idpool *p);
84int v9fs_check_idpool(int id, struct v9fs_idpool *p);
92void v9fs_session_cancel(struct v9fs_session_info *v9ses); 85void v9fs_session_cancel(struct v9fs_session_info *v9ses);
93 86
94#define V9FS_MAGIC 0x01021997 87#define V9FS_MAGIC 0x01021997
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 2f2cea7ee3e..69cf2905dc9 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -39,15 +39,15 @@
39 */ 39 */
40 40
41extern struct file_system_type v9fs_fs_type; 41extern struct file_system_type v9fs_fs_type;
42extern struct address_space_operations v9fs_addr_operations;
42extern struct file_operations v9fs_file_operations; 43extern struct file_operations v9fs_file_operations;
43extern struct file_operations v9fs_dir_operations; 44extern struct file_operations v9fs_dir_operations;
44extern struct dentry_operations v9fs_dentry_operations; 45extern struct dentry_operations v9fs_dentry_operations;
45 46
46struct inode *v9fs_get_inode(struct super_block *sb, int mode); 47struct inode *v9fs_get_inode(struct super_block *sb, int mode);
47ino_t v9fs_qid2ino(struct v9fs_qid *qid); 48ino_t v9fs_qid2ino(struct v9fs_qid *qid);
48void v9fs_mistat2inode(struct v9fs_stat *, struct inode *, 49void v9fs_stat2inode(struct v9fs_stat *, struct inode *, struct super_block *);
49 struct super_block *);
50int v9fs_dir_release(struct inode *inode, struct file *filp); 50int v9fs_dir_release(struct inode *inode, struct file *filp);
51int v9fs_file_open(struct inode *inode, struct file *file); 51int v9fs_file_open(struct inode *inode, struct file *file);
52void v9fs_inode2mistat(struct inode *inode, struct v9fs_stat *mistat); 52void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat);
53void v9fs_dentry_release(struct dentry *); 53void v9fs_dentry_release(struct dentry *);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
new file mode 100644
index 00000000000..8100fb5171b
--- /dev/null
+++ b/fs/9p/vfs_addr.c
@@ -0,0 +1,109 @@
1/*
2 * linux/fs/9p/vfs_addr.c
3 *
4 * This file contains vfs address (mmap) ops for 9P2000.
5 *
6 * Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
7 * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to:
21 * Free Software Foundation
22 * 51 Franklin Street, Fifth Floor
23 * Boston, MA 02111-1301 USA
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/errno.h>
29#include <linux/fs.h>
30#include <linux/file.h>
31#include <linux/stat.h>
32#include <linux/string.h>
33#include <linux/smp_lock.h>
34#include <linux/inet.h>
35#include <linux/version.h>
36#include <linux/pagemap.h>
37#include <linux/idr.h>
38
39#include "debug.h"
40#include "v9fs.h"
41#include "9p.h"
42#include "v9fs_vfs.h"
43#include "fid.h"
44
45/**
46 * v9fs_vfs_readpage - read an entire page in from 9P
47 *
48 * @file: file being read
49 * @page: structure to page
50 *
51 */
52
53static int v9fs_vfs_readpage(struct file *filp, struct page *page)
54{
55 char *buffer = NULL;
56 int retval = -EIO;
57 loff_t offset = page_offset(page);
58 int count = PAGE_CACHE_SIZE;
59 struct inode *inode = filp->f_dentry->d_inode;
60 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
61 int rsize = v9ses->maxdata - V9FS_IOHDRSZ;
62 struct v9fs_fid *v9f = filp->private_data;
63 struct v9fs_fcall *fcall = NULL;
64 int fid = v9f->fid;
65 int total = 0;
66 int result = 0;
67
68 buffer = kmap(page);
69 do {
70 if (count < rsize)
71 rsize = count;
72
73 result = v9fs_t_read(v9ses, fid, offset, rsize, &fcall);
74
75 if (result < 0) {
76 printk(KERN_ERR "v9fs_t_read returned %d\n",
77 result);
78
79 kfree(fcall);
80 goto UnmapAndUnlock;
81 } else
82 offset += result;
83
84 memcpy(buffer, fcall->params.rread.data, result);
85
86 count -= result;
87 buffer += result;
88 total += result;
89
90 kfree(fcall);
91
92 if (result < rsize)
93 break;
94 } while (count);
95
96 memset(buffer, 0, count);
97 flush_dcache_page(page);
98 SetPageUptodate(page);
99 retval = 0;
100
101UnmapAndUnlock:
102 kunmap(page);
103 unlock_page(page);
104 return retval;
105}
106
107struct address_space_operations v9fs_addr_operations = {
108 .readpage = v9fs_vfs_readpage,
109};
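
The readpage loop reads at most rsize bytes per round trip until the page is full or the server returns a short read, then zero-fills the tail so the entire page is defined before SetPageUptodate. A user-space analogue of the same loop, assuming a pread-able descriptor in place of v9fs_t_read:

#include <string.h>
#include <unistd.h>

#define PAGE_SZ 4096

static int fill_page(int fd, off_t off, char *page, int rsize)
{
        int count = PAGE_SZ;
        ssize_t n;

        while (count > 0) {
                n = pread(fd, page, count < rsize ? count : rsize, off);
                if (n < 0)
                        return -1;
                off += n;
                page += n;
                count -= (int)n;
                if (n < rsize)
                        break;          /* short read: server is done */
        }
        memset(page, 0, count);         /* zero the rest of the page */
        return PAGE_SZ - count;         /* bytes actually read */
}
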
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index a6aa947de0f..2dd806dac9f 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -40,7 +40,6 @@
40#include "v9fs.h" 40#include "v9fs.h"
41#include "9p.h" 41#include "9p.h"
42#include "v9fs_vfs.h" 42#include "v9fs_vfs.h"
43#include "conv.h"
44#include "fid.h" 43#include "fid.h"
45 44
46/** 45/**
@@ -95,24 +94,22 @@ static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd)
95 94
96void v9fs_dentry_release(struct dentry *dentry) 95void v9fs_dentry_release(struct dentry *dentry)
97{ 96{
97 int err;
98
98 dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); 99 dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
99 100
100 if (dentry->d_fsdata != NULL) { 101 if (dentry->d_fsdata != NULL) {
101 struct list_head *fid_list = dentry->d_fsdata; 102 struct list_head *fid_list = dentry->d_fsdata;
102 struct v9fs_fid *temp = NULL; 103 struct v9fs_fid *temp = NULL;
103 struct v9fs_fid *current_fid = NULL; 104 struct v9fs_fid *current_fid = NULL;
104 struct v9fs_fcall *fcall = NULL;
105 105
106 list_for_each_entry_safe(current_fid, temp, fid_list, list) { 106 list_for_each_entry_safe(current_fid, temp, fid_list, list) {
107 if (v9fs_t_clunk 107 err = v9fs_t_clunk(current_fid->v9ses, current_fid->fid);
108 (current_fid->v9ses, current_fid->fid, &fcall))
109 dprintk(DEBUG_ERROR, "clunk failed: %s\n",
110 FCALL_ERROR(fcall));
111 108
112 v9fs_put_idpool(current_fid->fid, 109 if (err < 0)
113 &current_fid->v9ses->fidpool); 110 dprintk(DEBUG_ERROR, "clunk failed: %d name %s\n",
111 err, dentry->d_iname);
114 112
115 kfree(fcall);
116 v9fs_fid_destroy(current_fid); 113 v9fs_fid_destroy(current_fid);
117 } 114 }
118 115
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 57a43b8feef..ae6d032b9b5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -37,8 +37,8 @@
37#include "debug.h" 37#include "debug.h"
38#include "v9fs.h" 38#include "v9fs.h"
39#include "9p.h" 39#include "9p.h"
40#include "v9fs_vfs.h"
41#include "conv.h" 40#include "conv.h"
41#include "v9fs_vfs.h"
42#include "fid.h" 42#include "fid.h"
43 43
44/** 44/**
@@ -74,20 +74,16 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
74 struct inode *inode = filp->f_dentry->d_inode; 74 struct inode *inode = filp->f_dentry->d_inode;
75 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); 75 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
76 struct v9fs_fid *file = filp->private_data; 76 struct v9fs_fid *file = filp->private_data;
77 unsigned int i, n; 77 unsigned int i, n, s;
78 int fid = -1; 78 int fid = -1;
79 int ret = 0; 79 int ret = 0;
80 struct v9fs_stat *mi = NULL; 80 struct v9fs_stat stat;
81 int over = 0; 81 int over = 0;
82 82
83 dprintk(DEBUG_VFS, "name %s\n", filp->f_dentry->d_name.name); 83 dprintk(DEBUG_VFS, "name %s\n", filp->f_dentry->d_name.name);
84 84
85 fid = file->fid; 85 fid = file->fid;
86 86
87 mi = kmalloc(v9ses->maxdata, GFP_KERNEL);
88 if (!mi)
89 return -ENOMEM;
90
91 if (file->rdir_fcall && (filp->f_pos != file->rdir_pos)) { 87 if (file->rdir_fcall && (filp->f_pos != file->rdir_pos)) {
92 kfree(file->rdir_fcall); 88 kfree(file->rdir_fcall);
93 file->rdir_fcall = NULL; 89 file->rdir_fcall = NULL;
@@ -97,20 +93,20 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
97 n = file->rdir_fcall->params.rread.count; 93 n = file->rdir_fcall->params.rread.count;
98 i = file->rdir_fpos; 94 i = file->rdir_fpos;
99 while (i < n) { 95 while (i < n) {
100 int s = v9fs_deserialize_stat(v9ses, 96 s = v9fs_deserialize_stat(
101 file->rdir_fcall->params.rread.data + i, 97 file->rdir_fcall->params.rread.data + i,
102 n - i, mi, v9ses->maxdata); 98 n - i, &stat, v9ses->extended);
103 99
104 if (s == 0) { 100 if (s == 0) {
105 dprintk(DEBUG_ERROR, 101 dprintk(DEBUG_ERROR,
106 "error while deserializing mistat\n"); 102 "error while deserializing stat\n");
107 ret = -EIO; 103 ret = -EIO;
108 goto FreeStructs; 104 goto FreeStructs;
109 } 105 }
110 106
111 over = filldir(dirent, mi->name, strlen(mi->name), 107 over = filldir(dirent, stat.name.str, stat.name.len,
112 filp->f_pos, v9fs_qid2ino(&mi->qid), 108 filp->f_pos, v9fs_qid2ino(&stat.qid),
113 dt_type(mi)); 109 dt_type(&stat));
114 110
115 if (over) { 111 if (over) {
116 file->rdir_fpos = i; 112 file->rdir_fpos = i;
@@ -130,7 +126,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
130 126
131 while (!over) { 127 while (!over) {
132 ret = v9fs_t_read(v9ses, fid, filp->f_pos, 128 ret = v9fs_t_read(v9ses, fid, filp->f_pos,
133 v9ses->maxdata-V9FS_IOHDRSZ, &fcall); 129 v9ses->maxdata-V9FS_IOHDRSZ, &fcall);
134 if (ret < 0) { 130 if (ret < 0) {
135 dprintk(DEBUG_ERROR, "error while reading: %d: %p\n", 131 dprintk(DEBUG_ERROR, "error while reading: %d: %p\n",
136 ret, fcall); 132 ret, fcall);
@@ -141,19 +137,18 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
141 n = ret; 137 n = ret;
142 i = 0; 138 i = 0;
143 while (i < n) { 139 while (i < n) {
144 int s = v9fs_deserialize_stat(v9ses, 140 s = v9fs_deserialize_stat(fcall->params.rread.data + i,
145 fcall->params.rread.data + i, n - i, mi, 141 n - i, &stat, v9ses->extended);
146 v9ses->maxdata);
147 142
148 if (s == 0) { 143 if (s == 0) {
149 dprintk(DEBUG_ERROR, 144 dprintk(DEBUG_ERROR,
150 "error while deserializing mistat\n"); 145 "error while deserializing stat\n");
151 return -EIO; 146 return -EIO;
152 } 147 }
153 148
154 over = filldir(dirent, mi->name, strlen(mi->name), 149 over = filldir(dirent, stat.name.str, stat.name.len,
155 filp->f_pos, v9fs_qid2ino(&mi->qid), 150 filp->f_pos, v9fs_qid2ino(&stat.qid),
156 dt_type(mi)); 151 dt_type(&stat));
157 152
158 if (over) { 153 if (over) {
159 file->rdir_fcall = fcall; 154 file->rdir_fcall = fcall;
@@ -172,7 +167,6 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
172 167
173 FreeStructs: 168 FreeStructs:
174 kfree(fcall); 169 kfree(fcall);
175 kfree(mi);
176 return ret; 170 return ret;
177} 171}
178 172
@@ -193,18 +187,15 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
193 fid->fid); 187 fid->fid);
194 fidnum = fid->fid; 188 fidnum = fid->fid;
195 189
196 filemap_fdatawrite(inode->i_mapping); 190 filemap_write_and_wait(inode->i_mapping);
197 filemap_fdatawait(inode->i_mapping);
198 191
199 if (fidnum >= 0) { 192 if (fidnum >= 0) {
200 dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen, 193 dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen,
201 fid->fid); 194 fid->fid);
202 195
203 if (v9fs_t_clunk(v9ses, fidnum, NULL)) 196 if (v9fs_t_clunk(v9ses, fidnum))
204 dprintk(DEBUG_ERROR, "clunk failed\n"); 197 dprintk(DEBUG_ERROR, "clunk failed\n");
205 198
206 v9fs_put_idpool(fid->fid, &v9ses->fidpool);
207
208 kfree(fid->rdir_fcall); 199 kfree(fid->rdir_fcall);
209 kfree(fid); 200 kfree(fid);
210 201
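
The readdir rewrite deserializes each wire-format stat record in place: every record is self-sized, the loop advances by the bytes consumed, and a zero return signals a truncated or corrupt buffer. A sketch of that walk, with a stand-in decoder that only reads 9P's size[2] prefix (framing simplified):

#include <errno.h>

/* Each record begins with a 2-byte little-endian size that does not
 * include the prefix itself, as in the 9P stat layout. */
static unsigned int decode_stat(const unsigned char *p, unsigned int n)
{
        unsigned int size;

        if (n < 2)
                return 0;
        size = p[0] | (p[1] << 8);
        if (2 + size > n)
                return 0;               /* record crosses buffer end */
        return 2 + size;                /* bytes consumed */
}

static int walk_entries(const unsigned char *buf, unsigned int n)
{
        unsigned int i = 0, s;

        while (i < n) {
                s = decode_stat(buf + i, n - i);
                if (s == 0)
                        return -EIO;    /* mirrors the loop above */
                /* ... hand the decoded entry to filldir here ... */
                i += s;
        }
        return 0;
}
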
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 89c849da850..c7e14d91721 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -32,6 +32,7 @@
32#include <linux/string.h> 32#include <linux/string.h>
33#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
34#include <linux/inet.h> 34#include <linux/inet.h>
35#include <linux/version.h>
35#include <linux/list.h> 36#include <linux/list.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37#include <linux/idr.h> 38#include <linux/idr.h>
@@ -117,9 +118,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
117 118
118 result = v9fs_t_open(v9ses, newfid, open_mode, &fcall); 119 result = v9fs_t_open(v9ses, newfid, open_mode, &fcall);
119 if (result < 0) { 120 if (result < 0) {
120 dprintk(DEBUG_ERROR, 121 PRINT_FCALL_ERROR("open failed", fcall);
121 "open failed, open_mode 0x%x: %s\n", open_mode,
122 FCALL_ERROR(fcall));
123 kfree(fcall); 122 kfree(fcall);
124 return result; 123 return result;
125 } 124 }
@@ -165,8 +164,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
165 return -ENOLCK; 164 return -ENOLCK;
166 165
167 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { 166 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
168 filemap_fdatawrite(inode->i_mapping); 167 filemap_write_and_wait(inode->i_mapping);
169 filemap_fdatawait(inode->i_mapping);
170 invalidate_inode_pages(&inode->i_data); 168 invalidate_inode_pages(&inode->i_data);
171 } 169 }
172 170
@@ -257,7 +255,6 @@ v9fs_file_write(struct file *filp, const char __user * data,
257 int result = -EIO; 255 int result = -EIO;
258 int rsize = 0; 256 int rsize = 0;
259 int total = 0; 257 int total = 0;
260 char *buf;
261 258
262 dprintk(DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, 259 dprintk(DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count,
263 (int)*offset); 260 (int)*offset);
@@ -265,28 +262,14 @@ v9fs_file_write(struct file *filp, const char __user * data,
265 if (v9fid->iounit != 0 && rsize > v9fid->iounit) 262 if (v9fid->iounit != 0 && rsize > v9fid->iounit)
266 rsize = v9fid->iounit; 263 rsize = v9fid->iounit;
267 264
268 buf = kmalloc(v9ses->maxdata - V9FS_IOHDRSZ, GFP_KERNEL);
269 if (!buf)
270 return -ENOMEM;
271
272 do { 265 do {
273 if (count < rsize) 266 if (count < rsize)
274 rsize = count; 267 rsize = count;
275 268
276 result = copy_from_user(buf, data, rsize); 269 result = v9fs_t_write(v9ses, fid, *offset, rsize, data, &fcall);
277 if (result) {
278 dprintk(DEBUG_ERROR, "Problem copying from user\n");
279 kfree(buf);
280 return -EFAULT;
281 }
282
283 dump_data(buf, rsize);
284 result = v9fs_t_write(v9ses, fid, *offset, rsize, buf, &fcall);
285 if (result < 0) { 270 if (result < 0) {
286 eprintk(KERN_ERR, "error while writing: %s(%d)\n", 271 PRINT_FCALL_ERROR("error while writing", fcall);
287 FCALL_ERROR(fcall), result);
288 kfree(fcall); 272 kfree(fcall);
289 kfree(buf);
290 return result; 273 return result;
291 } else 274 } else
292 *offset += result; 275 *offset += result;
@@ -306,7 +289,9 @@ v9fs_file_write(struct file *filp, const char __user * data,
306 total += result; 289 total += result;
307 } while (count); 290 } while (count);
308 291
309 kfree(buf); 292 if(inode->i_mapping->nrpages)
293 invalidate_inode_pages2(inode->i_mapping);
294
310 return total; 295 return total;
311} 296}
312 297
@@ -317,4 +302,5 @@ struct file_operations v9fs_file_operations = {
317 .open = v9fs_file_open, 302 .open = v9fs_file_open,
318 .release = v9fs_dir_release, 303 .release = v9fs_dir_release,
319 .lock = v9fs_file_lock, 304 .lock = v9fs_file_lock,
305 .mmap = generic_file_mmap,
320}; 306};
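
Two cache rules arrive together here: generic_file_mmap makes cached pages possible, and the invalidate_inode_pages2() call after a successful write drops them, since the data went straight to the server and the cache is stale. A user-space gesture at the same rule, advisory only, using POSIX fadvise:

#include <fcntl.h>
#include <unistd.h>

static int write_then_drop_cache(int fd, const void *buf, size_t len,
                                 off_t off)
{
        ssize_t n = pwrite(fd, buf, len, off);

        if (n < 0)
                return -1;
        /* ask the kernel to drop cached pages so reads refetch */
        (void)posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
        return (int)n;
}
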
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index be7288184fa..91f552454c7 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -40,7 +40,6 @@
40#include "v9fs.h" 40#include "v9fs.h"
41#include "9p.h" 41#include "9p.h"
42#include "v9fs_vfs.h" 42#include "v9fs_vfs.h"
43#include "conv.h"
44#include "fid.h" 43#include "fid.h"
45 44
46static struct inode_operations v9fs_dir_inode_operations; 45static struct inode_operations v9fs_dir_inode_operations;
@@ -127,100 +126,32 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
127} 126}
128 127
129/** 128/**
130 * v9fs_blank_mistat - helper function to setup a 9P stat structure 129 * v9fs_blank_wstat - helper function to setup a 9P stat structure
131 * @v9ses: 9P session info (for determining extended mode) 130 * @v9ses: 9P session info (for determining extended mode)
132 * @mistat: structure to initialize 131 * @wstat: structure to initialize
133 * 132 *
134 */ 133 */
135 134
136static void 135static void
137v9fs_blank_mistat(struct v9fs_session_info *v9ses, struct v9fs_stat *mistat) 136v9fs_blank_wstat(struct v9fs_wstat *wstat)
138{ 137{
139 mistat->type = ~0; 138 wstat->type = ~0;
140 mistat->dev = ~0; 139 wstat->dev = ~0;
141 mistat->qid.type = ~0; 140 wstat->qid.type = ~0;
142 mistat->qid.version = ~0; 141 wstat->qid.version = ~0;
143 *((long long *)&mistat->qid.path) = ~0; 142 *((long long *)&wstat->qid.path) = ~0;
144 mistat->mode = ~0; 143 wstat->mode = ~0;
145 mistat->atime = ~0; 144 wstat->atime = ~0;
146 mistat->mtime = ~0; 145 wstat->mtime = ~0;
147 mistat->length = ~0; 146 wstat->length = ~0;
148 mistat->name = mistat->data; 147 wstat->name = NULL;
149 mistat->uid = mistat->data; 148 wstat->uid = NULL;
150 mistat->gid = mistat->data; 149 wstat->gid = NULL;
151 mistat->muid = mistat->data; 150 wstat->muid = NULL;
152 if (v9ses->extended) { 151 wstat->n_uid = ~0;
153 mistat->n_uid = ~0; 152 wstat->n_gid = ~0;
154 mistat->n_gid = ~0; 153 wstat->n_muid = ~0;
155 mistat->n_muid = ~0; 154 wstat->extension = NULL;
156 mistat->extension = mistat->data;
157 }
158 *mistat->data = 0;
159}
160
161/**
162 * v9fs_mistat2unix - convert mistat to unix stat
163 * @mistat: Plan 9 metadata (mistat) structure
164 * @buf: unix metadata (stat) structure to populate
165 * @sb: superblock
166 *
167 */
168
169static void
170v9fs_mistat2unix(struct v9fs_stat *mistat, struct stat *buf,
171 struct super_block *sb)
172{
173 struct v9fs_session_info *v9ses = sb ? sb->s_fs_info : NULL;
174
175 buf->st_nlink = 1;
176
177 buf->st_atime = mistat->atime;
178 buf->st_mtime = mistat->mtime;
179 buf->st_ctime = mistat->mtime;
180
181 buf->st_uid = (unsigned short)-1;
182 buf->st_gid = (unsigned short)-1;
183
184 if (v9ses && v9ses->extended) {
185 /* TODO: string to uid mapping via user-space daemon */
186 if (mistat->n_uid != -1)
187 sscanf(mistat->uid, "%x", (unsigned int *)&buf->st_uid);
188
189 if (mistat->n_gid != -1)
190 sscanf(mistat->gid, "%x", (unsigned int *)&buf->st_gid);
191 }
192
193 if (buf->st_uid == (unsigned short)-1)
194 buf->st_uid = v9ses->uid;
195 if (buf->st_gid == (unsigned short)-1)
196 buf->st_gid = v9ses->gid;
197
198 buf->st_mode = p9mode2unixmode(v9ses, mistat->mode);
199 if ((S_ISBLK(buf->st_mode)) || (S_ISCHR(buf->st_mode))) {
200 char type = 0;
201 int major = -1;
202 int minor = -1;
203 sscanf(mistat->extension, "%c %u %u", &type, &major, &minor);
204 switch (type) {
205 case 'c':
206 buf->st_mode &= ~S_IFBLK;
207 buf->st_mode |= S_IFCHR;
208 break;
209 case 'b':
210 break;
211 default:
212 dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n",
213 type, mistat->extension);
214 };
215 buf->st_rdev = MKDEV(major, minor);
216 } else
217 buf->st_rdev = 0;
218
219 buf->st_size = mistat->length;
220
221 buf->st_blksize = sb->s_blocksize;
222 buf->st_blocks =
223 (buf->st_size + buf->st_blksize - 1) >> sb->s_blocksize_bits;
224} 155}
225 156
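
v9fs_blank_wstat encodes 9P's Twstat convention: a numeric field of all ones and a NULL string both mean "leave this attribute unchanged", so callers blank the structure and then set only the fields they intend to change, as the rename and setattr paths below do. A trimmed-down sketch of the calling pattern:

struct demo_wstat {
        unsigned int mode, atime, mtime;
        char *name;
};

static void blank_wstat(struct demo_wstat *w)
{
        w->mode  = ~0;          /* ~0 == "don't change" on the wire */
        w->atime = ~0;
        w->mtime = ~0;
        w->name  = NULL;        /* NULL string == "don't change" */
}

/* rename touches nothing but the name */
static void prep_rename(struct demo_wstat *w, char *newname)
{
        blank_wstat(w);
        w->name = newname;
}
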
226/** 157/**
@@ -246,6 +177,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
246 inode->i_blocks = 0; 177 inode->i_blocks = 0;
247 inode->i_rdev = 0; 178 inode->i_rdev = 0;
248 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 179 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
180 inode->i_mapping->a_ops = &v9fs_addr_operations;
249 181
250 switch (mode & S_IFMT) { 182 switch (mode & S_IFMT) {
251 case S_IFIFO: 183 case S_IFIFO:
@@ -312,12 +244,12 @@ v9fs_create(struct inode *dir,
312 struct inode *file_inode = NULL; 244 struct inode *file_inode = NULL;
313 struct v9fs_fcall *fcall = NULL; 245 struct v9fs_fcall *fcall = NULL;
314 struct v9fs_qid qid; 246 struct v9fs_qid qid;
315 struct stat newstat;
316 int dirfidnum = -1; 247 int dirfidnum = -1;
317 long newfid = -1; 248 long newfid = -1;
318 int result = 0; 249 int result = 0;
319 unsigned int iounit = 0; 250 unsigned int iounit = 0;
320 int wfidno = -1; 251 int wfidno = -1;
252 int err;
321 253
322 perm = unixmode2p9mode(v9ses, perm); 254 perm = unixmode2p9mode(v9ses, perm);
323 255
@@ -349,57 +281,64 @@ v9fs_create(struct inode *dir,
349 281
350 result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall); 282 result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall);
351 if (result < 0) { 283 if (result < 0) {
352 dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall)); 284 PRINT_FCALL_ERROR("clone error", fcall);
353 v9fs_put_idpool(newfid, &v9ses->fidpool); 285 v9fs_put_idpool(newfid, &v9ses->fidpool);
354 newfid = -1; 286 newfid = -1;
355 goto CleanUpFid; 287 goto CleanUpFid;
356 } 288 }
357 289
358 kfree(fcall); 290 kfree(fcall);
291 fcall = NULL;
359 292
360 result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name, 293 result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name,
361 perm, open_mode, &fcall); 294 perm, open_mode, &fcall);
362 if (result < 0) { 295 if (result < 0) {
363 dprintk(DEBUG_ERROR, "create fails: %s(%d)\n", 296 PRINT_FCALL_ERROR("create fails", fcall);
364 FCALL_ERROR(fcall), result);
365
366 goto CleanUpFid; 297 goto CleanUpFid;
367 } 298 }
368 299
369 iounit = fcall->params.rcreate.iounit; 300 iounit = fcall->params.rcreate.iounit;
370 qid = fcall->params.rcreate.qid; 301 qid = fcall->params.rcreate.qid;
371 kfree(fcall); 302 kfree(fcall);
303 fcall = NULL;
372 304
373 fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1); 305 if (!(perm&V9FS_DMDIR)) {
374 dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate); 306 fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1);
375 if (!fid) { 307 dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate);
376 result = -ENOMEM; 308 if (!fid) {
377 goto CleanUpFid; 309 result = -ENOMEM;
378 } 310 goto CleanUpFid;
311 }
379 312
380 fid->qid = qid; 313 fid->qid = qid;
381 fid->iounit = iounit; 314 fid->iounit = iounit;
315 } else {
316 err = v9fs_t_clunk(v9ses, newfid);
317 newfid = -1;
318 if (err < 0)
319 dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err);
320 }
382 321
383 /* walk to the newly created file and put the fid in the dentry */ 322 /* walk to the newly created file and put the fid in the dentry */
384 wfidno = v9fs_get_idpool(&v9ses->fidpool); 323 wfidno = v9fs_get_idpool(&v9ses->fidpool);
385 if (newfid < 0) { 324 if (wfidno < 0) {
386 eprintk(KERN_WARNING, "no free fids available\n"); 325 eprintk(KERN_WARNING, "no free fids available\n");
387 return -ENOSPC; 326 return -ENOSPC;
388 } 327 }
389 328
390 result = v9fs_t_walk(v9ses, dirfidnum, wfidno, 329 result = v9fs_t_walk(v9ses, dirfidnum, wfidno,
391 (char *) file_dentry->d_name.name, NULL); 330 (char *) file_dentry->d_name.name, &fcall);
392 if (result < 0) { 331 if (result < 0) {
393 dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall)); 332 PRINT_FCALL_ERROR("clone error", fcall);
394 v9fs_put_idpool(wfidno, &v9ses->fidpool); 333 v9fs_put_idpool(wfidno, &v9ses->fidpool);
395 wfidno = -1; 334 wfidno = -1;
396 goto CleanUpFid; 335 goto CleanUpFid;
397 } 336 }
337 kfree(fcall);
338 fcall = NULL;
398 339
399 if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) { 340 if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) {
400 if (!v9fs_t_clunk(v9ses, newfid, &fcall)) { 341 v9fs_put_idpool(wfidno, &v9ses->fidpool);
401 v9fs_put_idpool(wfidno, &v9ses->fidpool);
402 }
403 342
404 goto CleanUpFid; 343 goto CleanUpFid;
405 } 344 }
@@ -409,60 +348,43 @@ v9fs_create(struct inode *dir,
409 (perm & V9FS_DMDEVICE)) 348 (perm & V9FS_DMDEVICE))
410 return 0; 349 return 0;
411 350
412 result = v9fs_t_stat(v9ses, newfid, &fcall); 351 result = v9fs_t_stat(v9ses, wfidno, &fcall);
413 if (result < 0) { 352 if (result < 0) {
414 dprintk(DEBUG_ERROR, "stat error: %s(%d)\n", FCALL_ERROR(fcall), 353 PRINT_FCALL_ERROR("stat error", fcall);
415 result);
416 goto CleanUpFid; 354 goto CleanUpFid;
417 } 355 }
418 356
419 v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb);
420 357
421 file_inode = v9fs_get_inode(sb, newstat.st_mode); 358 file_inode = v9fs_get_inode(sb,
359 p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode));
360
422 if ((!file_inode) || IS_ERR(file_inode)) { 361 if ((!file_inode) || IS_ERR(file_inode)) {
423 dprintk(DEBUG_ERROR, "create inode failed\n"); 362 dprintk(DEBUG_ERROR, "create inode failed\n");
424 result = -EBADF; 363 result = -EBADF;
425 goto CleanUpFid; 364 goto CleanUpFid;
426 } 365 }
427 366
428 v9fs_mistat2inode(fcall->params.rstat.stat, file_inode, sb); 367 v9fs_stat2inode(&fcall->params.rstat.stat, file_inode, sb);
429 kfree(fcall); 368 kfree(fcall);
369 fcall = NULL;
370 file_dentry->d_op = &v9fs_dentry_operations;
430 d_instantiate(file_dentry, file_inode); 371 d_instantiate(file_dentry, file_inode);
431 372
432 if (perm & V9FS_DMDIR) {
433 if (!v9fs_t_clunk(v9ses, newfid, &fcall))
434 v9fs_put_idpool(newfid, &v9ses->fidpool);
435 else
436 dprintk(DEBUG_ERROR, "clunk for mkdir failed: %s\n",
437 FCALL_ERROR(fcall));
438 kfree(fcall);
439 fid->fidopen = 0;
440 fid->fidcreate = 0;
441 d_drop(file_dentry);
442 }
443
444 return 0; 373 return 0;
445 374
446 CleanUpFid: 375 CleanUpFid:
447 kfree(fcall); 376 kfree(fcall);
377 fcall = NULL;
448 378
449 if (newfid >= 0) { 379 if (newfid >= 0) {
450 if (!v9fs_t_clunk(v9ses, newfid, &fcall)) 380 err = v9fs_t_clunk(v9ses, newfid);
451 v9fs_put_idpool(newfid, &v9ses->fidpool); 381 if (err < 0)
452 else 382 dprintk(DEBUG_ERROR, "clunk failed: %d\n", err);
453 dprintk(DEBUG_ERROR, "clunk failed: %s\n",
454 FCALL_ERROR(fcall));
455
456 kfree(fcall);
457 } 383 }
458 if (wfidno >= 0) { 384 if (wfidno >= 0) {
459 if (!v9fs_t_clunk(v9ses, wfidno, &fcall)) 385 err = v9fs_t_clunk(v9ses, wfidno);
460 v9fs_put_idpool(wfidno, &v9ses->fidpool); 386 if (err < 0)
461 else 387 dprintk(DEBUG_ERROR, "clunk failed: %d\n", err);
462 dprintk(DEBUG_ERROR, "clunk failed: %s\n",
463 FCALL_ERROR(fcall));
464
465 kfree(fcall);
466 } 388 }
467 return result; 389 return result;
468} 390}
@@ -507,10 +429,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
507 } 429 }
508 430
509 result = v9fs_t_remove(v9ses, fid, &fcall); 431 result = v9fs_t_remove(v9ses, fid, &fcall);
510 if (result < 0) 432 if (result < 0) {
511 dprintk(DEBUG_ERROR, "remove of file fails: %s(%d)\n", 433 PRINT_FCALL_ERROR("remove fails", fcall);
512 FCALL_ERROR(fcall), result); 434 } else {
513 else {
514 v9fs_put_idpool(fid, &v9ses->fidpool); 435 v9fs_put_idpool(fid, &v9ses->fidpool);
515 v9fs_fid_destroy(v9fid); 436 v9fs_fid_destroy(v9fid);
516 } 437 }
@@ -565,7 +486,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
565 struct v9fs_fid *fid; 486 struct v9fs_fid *fid;
566 struct inode *inode; 487 struct inode *inode;
567 struct v9fs_fcall *fcall = NULL; 488 struct v9fs_fcall *fcall = NULL;
568 struct stat newstat;
569 int dirfidnum = -1; 489 int dirfidnum = -1;
570 int newfid = -1; 490 int newfid = -1;
571 int result = 0; 491 int result = 0;
@@ -618,8 +538,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
618 goto FreeFcall; 538 goto FreeFcall;
619 } 539 }
620 540
621 v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb); 541 inode = v9fs_get_inode(sb, p9mode2unixmode(v9ses,
622 inode = v9fs_get_inode(sb, newstat.st_mode); 542 fcall->params.rstat.stat.mode));
623 543
624 if (IS_ERR(inode) && (PTR_ERR(inode) == -ENOSPC)) { 544 if (IS_ERR(inode) && (PTR_ERR(inode) == -ENOSPC)) {
625 eprintk(KERN_WARNING, "inode alloc failes, returns %ld\n", 545 eprintk(KERN_WARNING, "inode alloc failes, returns %ld\n",
@@ -629,7 +549,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
629 goto FreeFcall; 549 goto FreeFcall;
630 } 550 }
631 551
632 inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat->qid); 552 inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid);
633 553
634 fid = v9fs_fid_create(dentry, v9ses, newfid, 0); 554 fid = v9fs_fid_create(dentry, v9ses, newfid, 0);
635 if (fid == NULL) { 555 if (fid == NULL) {
@@ -638,10 +558,10 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
638 goto FreeFcall; 558 goto FreeFcall;
639 } 559 }
640 560
641 fid->qid = fcall->params.rstat.stat->qid; 561 fid->qid = fcall->params.rstat.stat.qid;
642 562
643 dentry->d_op = &v9fs_dentry_operations; 563 dentry->d_op = &v9fs_dentry_operations;
644 v9fs_mistat2inode(fcall->params.rstat.stat, inode, inode->i_sb); 564 v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb);
645 565
646 d_add(dentry, inode); 566 d_add(dentry, inode);
647 kfree(fcall); 567 kfree(fcall);
@@ -697,7 +617,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
697 v9fs_fid_lookup(old_dentry->d_parent); 617 v9fs_fid_lookup(old_dentry->d_parent);
698 struct v9fs_fid *newdirfid = 618 struct v9fs_fid *newdirfid =
699 v9fs_fid_lookup(new_dentry->d_parent); 619 v9fs_fid_lookup(new_dentry->d_parent);
700 struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); 620 struct v9fs_wstat wstat;
701 struct v9fs_fcall *fcall = NULL; 621 struct v9fs_fcall *fcall = NULL;
702 int fid = -1; 622 int fid = -1;
703 int olddirfidnum = -1; 623 int olddirfidnum = -1;
@@ -706,9 +626,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
706 626
707 dprintk(DEBUG_VFS, "\n"); 627 dprintk(DEBUG_VFS, "\n");
708 628
709 if (!mistat)
710 return -ENOMEM;
711
712 if ((!oldfid) || (!olddirfid) || (!newdirfid)) { 629 if ((!oldfid) || (!olddirfid) || (!newdirfid)) {
713 dprintk(DEBUG_ERROR, "problem with arguments\n"); 630 dprintk(DEBUG_ERROR, "problem with arguments\n");
714 return -EBADF; 631 return -EBADF;
@@ -732,33 +649,22 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
732 goto FreeFcallnBail; 649 goto FreeFcallnBail;
733 } 650 }
734 651
735 v9fs_blank_mistat(v9ses, mistat); 652 v9fs_blank_wstat(&wstat);
736 653 wstat.muid = v9ses->name;
737 strcpy(mistat->data + 1, v9ses->name); 654 wstat.name = (char *) new_dentry->d_name.name;
738 mistat->name = mistat->data + 1 + strlen(v9ses->name);
739
740 if (new_dentry->d_name.len >
741 (v9ses->maxdata - strlen(v9ses->name) - sizeof(struct v9fs_stat))) {
742 dprintk(DEBUG_ERROR, "new name too long\n");
743 goto FreeFcallnBail;
744 }
745 655
746 strcpy(mistat->name, new_dentry->d_name.name); 656 retval = v9fs_t_wstat(v9ses, fid, &wstat, &fcall);
747 retval = v9fs_t_wstat(v9ses, fid, mistat, &fcall);
748 657
749 FreeFcallnBail: 658 FreeFcallnBail:
750 kfree(mistat);
751
752 if (retval < 0) 659 if (retval < 0)
753 dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", 660 PRINT_FCALL_ERROR("wstat error", fcall);
754 FCALL_ERROR(fcall));
755 661
756 kfree(fcall); 662 kfree(fcall);
757 return retval; 663 return retval;
758} 664}
759 665
760/** 666/**
761 * v9fs_vfs_getattr - retreive file metadata 667 * v9fs_vfs_getattr - retrieve file metadata
762 * @mnt - mount information 668 * @mnt - mount information
763 * @dentry - file to get attributes on 669 * @dentry - file to get attributes on
764 * @stat - metadata structure to populate 670 * @stat - metadata structure to populate
@@ -786,7 +692,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
786 if (err < 0) 692 if (err < 0)
787 dprintk(DEBUG_ERROR, "stat error\n"); 693 dprintk(DEBUG_ERROR, "stat error\n");
788 else { 694 else {
789 v9fs_mistat2inode(fcall->params.rstat.stat, dentry->d_inode, 695 v9fs_stat2inode(&fcall->params.rstat.stat, dentry->d_inode,
790 dentry->d_inode->i_sb); 696 dentry->d_inode->i_sb);
791 generic_fillattr(dentry->d_inode, stat); 697 generic_fillattr(dentry->d_inode, stat);
792 } 698 }
@@ -807,57 +713,44 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
807 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); 713 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
808 struct v9fs_fid *fid = v9fs_fid_lookup(dentry); 714 struct v9fs_fid *fid = v9fs_fid_lookup(dentry);
809 struct v9fs_fcall *fcall = NULL; 715 struct v9fs_fcall *fcall = NULL;
810 struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); 716 struct v9fs_wstat wstat;
811 int res = -EPERM; 717 int res = -EPERM;
812 718
813 dprintk(DEBUG_VFS, "\n"); 719 dprintk(DEBUG_VFS, "\n");
814 720
815 if (!mistat)
816 return -ENOMEM;
817
818 if (!fid) { 721 if (!fid) {
819 dprintk(DEBUG_ERROR, 722 dprintk(DEBUG_ERROR,
820 "Couldn't find fid associated with dentry\n"); 723 "Couldn't find fid associated with dentry\n");
821 return -EBADF; 724 return -EBADF;
822 } 725 }
823 726
824 v9fs_blank_mistat(v9ses, mistat); 727 v9fs_blank_wstat(&wstat);
825 if (iattr->ia_valid & ATTR_MODE) 728 if (iattr->ia_valid & ATTR_MODE)
826 mistat->mode = unixmode2p9mode(v9ses, iattr->ia_mode); 729 wstat.mode = unixmode2p9mode(v9ses, iattr->ia_mode);
827 730
828 if (iattr->ia_valid & ATTR_MTIME) 731 if (iattr->ia_valid & ATTR_MTIME)
829 mistat->mtime = iattr->ia_mtime.tv_sec; 732 wstat.mtime = iattr->ia_mtime.tv_sec;
830 733
831 if (iattr->ia_valid & ATTR_ATIME) 734 if (iattr->ia_valid & ATTR_ATIME)
832 mistat->atime = iattr->ia_atime.tv_sec; 735 wstat.atime = iattr->ia_atime.tv_sec;
833 736
834 if (iattr->ia_valid & ATTR_SIZE) 737 if (iattr->ia_valid & ATTR_SIZE)
835 mistat->length = iattr->ia_size; 738 wstat.length = iattr->ia_size;
836 739
837 if (v9ses->extended) { 740 if (v9ses->extended) {
838 char *ptr = mistat->data+1; 741 if (iattr->ia_valid & ATTR_UID)
839 742 wstat.n_uid = iattr->ia_uid;
840 if (iattr->ia_valid & ATTR_UID) {
841 mistat->uid = ptr;
842 ptr += 1+sprintf(ptr, "%08x", iattr->ia_uid);
843 mistat->n_uid = iattr->ia_uid;
844 }
845 743
846 if (iattr->ia_valid & ATTR_GID) { 744 if (iattr->ia_valid & ATTR_GID)
847 mistat->gid = ptr; 745 wstat.n_gid = iattr->ia_gid;
848 ptr += 1+sprintf(ptr, "%08x", iattr->ia_gid);
849 mistat->n_gid = iattr->ia_gid;
850 }
851 } 746 }
852 747
853 res = v9fs_t_wstat(v9ses, fid->fid, mistat, &fcall); 748 res = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall);
854 749
855 if (res < 0) 750 if (res < 0)
856 dprintk(DEBUG_ERROR, "wstat error: %s\n", FCALL_ERROR(fcall)); 751 PRINT_FCALL_ERROR("wstat error", fcall);
857 752
858 kfree(mistat);
859 kfree(fcall); 753 kfree(fcall);
860
861 if (res >= 0) 754 if (res >= 0)
862 res = inode_setattr(dentry->d_inode, iattr); 755 res = inode_setattr(dentry->d_inode, iattr);
863 756
@@ -865,51 +758,47 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
865} 758}
866 759
867/** 760/**
868 * v9fs_mistat2inode - populate an inode structure with mistat info 761 * v9fs_stat2inode - populate an inode structure with mistat info
869 * @mistat: Plan 9 metadata (mistat) structure 762 * @stat: Plan 9 metadata (mistat) structure
870 * @inode: inode to populate 763 * @inode: inode to populate
871 * @sb: superblock of filesystem 764 * @sb: superblock of filesystem
872 * 765 *
873 */ 766 */
874 767
875void 768void
876v9fs_mistat2inode(struct v9fs_stat *mistat, struct inode *inode, 769v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode,
877 struct super_block *sb) 770 struct super_block *sb)
878{ 771{
772 int n;
773 char ext[32];
879 struct v9fs_session_info *v9ses = sb->s_fs_info; 774 struct v9fs_session_info *v9ses = sb->s_fs_info;
880 775
881 inode->i_nlink = 1; 776 inode->i_nlink = 1;
882 777
883 inode->i_atime.tv_sec = mistat->atime; 778 inode->i_atime.tv_sec = stat->atime;
884 inode->i_mtime.tv_sec = mistat->mtime; 779 inode->i_mtime.tv_sec = stat->mtime;
885 inode->i_ctime.tv_sec = mistat->mtime; 780 inode->i_ctime.tv_sec = stat->mtime;
886 781
887 inode->i_uid = -1; 782 inode->i_uid = v9ses->uid;
888 inode->i_gid = -1; 783 inode->i_gid = v9ses->gid;
889 784
890 if (v9ses->extended) { 785 if (v9ses->extended) {
891 /* TODO: string to uid mapping via user-space daemon */ 786 inode->i_uid = stat->n_uid;
892 inode->i_uid = mistat->n_uid; 787 inode->i_gid = stat->n_gid;
893 inode->i_gid = mistat->n_gid;
894
895 if (mistat->n_uid == -1)
896 sscanf(mistat->uid, "%x", &inode->i_uid);
897
898 if (mistat->n_gid == -1)
899 sscanf(mistat->gid, "%x", &inode->i_gid);
900 } 788 }
901 789
902 if (inode->i_uid == -1) 790 inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
903 inode->i_uid = v9ses->uid;
904 if (inode->i_gid == -1)
905 inode->i_gid = v9ses->gid;
906
907 inode->i_mode = p9mode2unixmode(v9ses, mistat->mode);
908 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { 791 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
909 char type = 0; 792 char type = 0;
910 int major = -1; 793 int major = -1;
911 int minor = -1; 794 int minor = -1;
912 sscanf(mistat->extension, "%c %u %u", &type, &major, &minor); 795
796 n = stat->extension.len;
797 if (n > sizeof(ext)-1)
798 n = sizeof(ext)-1;
799 memmove(ext, stat->extension.str, n);
800 ext[n] = 0;
801 sscanf(ext, "%c %u %u", &type, &major, &minor);
913 switch (type) { 802 switch (type) {
914 case 'c': 803 case 'c':
915 inode->i_mode &= ~S_IFBLK; 804 inode->i_mode &= ~S_IFBLK;
@@ -918,14 +807,14 @@ v9fs_mistat2inode(struct v9fs_stat *mistat, struct inode *inode,
918 case 'b': 807 case 'b':
919 break; 808 break;
920 default: 809 default:
921 dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n", 810 dprintk(DEBUG_ERROR, "Unknown special type %c (%.*s)\n",
922 type, mistat->extension); 811 type, stat->extension.len, stat->extension.str);
923 }; 812 };
924 inode->i_rdev = MKDEV(major, minor); 813 inode->i_rdev = MKDEV(major, minor);
925 } else 814 } else
926 inode->i_rdev = 0; 815 inode->i_rdev = 0;
927 816
928 inode->i_size = mistat->length; 817 inode->i_size = stat->length;
929 818
930 inode->i_blksize = sb->s_blocksize; 819 inode->i_blksize = sb->s_blocksize;
931 inode->i_blocks = 820 inode->i_blocks =
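The extension field is now a counted string (a len/str pair) rather than a NUL-terminated one, so it has to be copied into a bounded scratch buffer before sscanf() can parse it, exactly as the hunk above does with ext[32]. A self-contained userspace sketch of the same pattern (the struct and sample values are illustrative):

/* sscanf() needs NUL termination; 9P carries counted strings. */
#include <stdio.h>
#include <string.h>

struct counted_str { int len; const char *str; };

int main(void)
{
	struct counted_str ext = { 5, "b 8 1" };	/* block device 8:1 */
	char buf[32];
	char type = 0;
	unsigned major = 0, minor = 0;
	int n = ext.len;

	if (n > (int)sizeof(buf) - 1)
		n = sizeof(buf) - 1;		/* clamp, as the kernel code does */
	memmove(buf, ext.str, n);
	buf[n] = 0;				/* guarantee termination */

	sscanf(buf, "%c %u %u", &type, &major, &minor);
	printf("type=%c major=%u minor=%u\n", type, major, minor);
	return 0;
}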
@@ -953,71 +842,6 @@ ino_t v9fs_qid2ino(struct v9fs_qid *qid)
953} 842}
954 843
955/** 844/**
956 * v9fs_vfs_symlink - helper function to create symlinks
957 * @dir: directory inode containing symlink
958 * @dentry: dentry for symlink
959 * @symname: symlink data
960 *
961 * See 9P2000.u RFC for more information
962 *
963 */
964
965static int
966v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
967{
968 int retval = -EPERM;
969 struct v9fs_fid *newfid;
970 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
971 struct v9fs_fcall *fcall = NULL;
972 struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
973
974 dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
975 symname);
976
977 if (!mistat)
978 return -ENOMEM;
979
980 if (!v9ses->extended) {
981 dprintk(DEBUG_ERROR, "not extended\n");
982 goto FreeFcall;
983 }
984
985 /* issue a create */
986 retval = v9fs_create(dir, dentry, S_IFLNK, 0);
987 if (retval != 0)
988 goto FreeFcall;
989
990 newfid = v9fs_fid_lookup(dentry);
991
992 /* issue a twstat */
993 v9fs_blank_mistat(v9ses, mistat);
994 strcpy(mistat->data + 1, symname);
995 mistat->extension = mistat->data + 1;
996 retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
997 if (retval < 0) {
998 dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
999 FCALL_ERROR(fcall));
1000 goto FreeFcall;
1001 }
1002
1003 kfree(fcall);
1004
1005 if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
1006 dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
1007 FCALL_ERROR(fcall));
1008 goto FreeFcall;
1009 }
1010
1011 d_drop(dentry); /* FID - will this also clunk? */
1012
1013 FreeFcall:
1014 kfree(mistat);
1015 kfree(fcall);
1016
1017 return retval;
1018}
1019
1020/**
1021 * v9fs_readlink - read a symlink's location (internal version) 845 * v9fs_readlink - read a symlink's location (internal version)
1022 * @dentry: dentry for symlink 846 * @dentry: dentry for symlink
1023 * @buffer: buffer to load symlink location into 847 * @buffer: buffer to load symlink location into
@@ -1056,16 +880,17 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1056 if (!fcall) 880 if (!fcall)
1057 return -EIO; 881 return -EIO;
1058 882
1059 if (!(fcall->params.rstat.stat->mode & V9FS_DMSYMLINK)) { 883 if (!(fcall->params.rstat.stat.mode & V9FS_DMSYMLINK)) {
1060 retval = -EINVAL; 884 retval = -EINVAL;
1061 goto FreeFcall; 885 goto FreeFcall;
1062 } 886 }
1063 887
1064 /* copy extension buffer into buffer */ 888 /* copy extension buffer into buffer */
1065 if (strlen(fcall->params.rstat.stat->extension) < buflen) 889 if (fcall->params.rstat.stat.extension.len < buflen)
1066 buflen = strlen(fcall->params.rstat.stat->extension); 890 buflen = fcall->params.rstat.stat.extension.len;
1067 891
1068 memcpy(buffer, fcall->params.rstat.stat->extension, buflen + 1); 892 memcpy(buffer, fcall->params.rstat.stat.extension.str, buflen - 1);
893 buffer[buflen-1] = 0;
1069 894
1070 retval = buflen; 895 retval = buflen;
1071 896
@@ -1155,6 +980,77 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void
1155 __putname(s); 980 __putname(s);
1156} 981}
1157 982
983static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
984 int mode, const char *extension)
985{
986 int err, retval;
987 struct v9fs_session_info *v9ses;
988 struct v9fs_fcall *fcall;
989 struct v9fs_fid *fid;
990 struct v9fs_wstat wstat;
991
992 v9ses = v9fs_inode2v9ses(dir);
993 retval = -EPERM;
994 fcall = NULL;
995
996 if (!v9ses->extended) {
997 dprintk(DEBUG_ERROR, "not extended\n");
998 goto free_mem;
999 }
1000
1001 /* issue a create */
1002 retval = v9fs_create(dir, dentry, mode, 0);
1003 if (retval != 0)
1004 goto free_mem;
1005
1006 fid = v9fs_fid_get_created(dentry);
1007 if (!fid) {
1008 dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n");
1009 goto free_mem;
1010 }
1011
1012 /* issue a Twstat */
1013 v9fs_blank_wstat(&wstat);
1014 wstat.muid = v9ses->name;
1015 wstat.extension = (char *) extension;
1016 retval = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall);
1017 if (retval < 0) {
1018 PRINT_FCALL_ERROR("wstat error", fcall);
1019 goto free_mem;
1020 }
1021
1022 err = v9fs_t_clunk(v9ses, fid->fid);
1023 if (err < 0) {
1024 dprintk(DEBUG_ERROR, "clunk failed: %d\n", err);
1025 goto free_mem;
1026 }
1027
1028 d_drop(dentry); /* FID - will this also clunk? */
1029
1030free_mem:
1031 kfree(fcall);
1032 return retval;
1033}
1034
1035/**
1036 * v9fs_vfs_symlink - helper function to create symlinks
1037 * @dir: directory inode containing symlink
1038 * @dentry: dentry for symlink
1039 * @symname: symlink data
1040 *
1041 * See 9P2000.u RFC for more information
1042 *
1043 */
1044
1045static int
1046v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1047{
1048 dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
1049 symname);
1050
1051 return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname);
1052}
1053
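With the helper in place, symlink, link, and mknod (below) all reduce to v9fs_vfs_mkspecial() plus a 9P2000.u extension string. The formats used in this file are summarized here; the example fid and device numbers are illustrative:

/*
 * Extension strings handed to v9fs_vfs_mkspecial() in this file:
 *   symlink    -> the target path itself,  e.g. "some/target"
 *   hardlink   -> "hardlink(<fid>)\n",     e.g. "hardlink(42)\n"
 *   block dev  -> "b <major> <minor>",     e.g. "b 8 1"
 *   char dev   -> "c <major> <minor>",     e.g. "c 5 1"
 *   fifo       -> empty string
 */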
1158/** 1054/**
1159 * v9fs_vfs_link - create a hardlink 1055 * v9fs_vfs_link - create a hardlink
1160 * @old_dentry: dentry for file to link to 1056 * @old_dentry: dentry for file to link to
@@ -1171,64 +1067,24 @@ static int
1171v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, 1067v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1172 struct dentry *dentry) 1068 struct dentry *dentry)
1173{ 1069{
1174 int retval = -EPERM; 1070 int retval;
1175 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); 1071 struct v9fs_fid *oldfid;
1176 struct v9fs_fcall *fcall = NULL; 1072 char *name;
1177 struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
1178 struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry);
1179 struct v9fs_fid *newfid = NULL;
1180 char *symname = __getname();
1181 1073
1182 dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, 1074 dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
1183 old_dentry->d_name.name); 1075 old_dentry->d_name.name);
1184 1076
1185 if (!v9ses->extended) { 1077 oldfid = v9fs_fid_lookup(old_dentry);
1186 dprintk(DEBUG_ERROR, "not extended\n"); 1078 if (!oldfid) {
1187 goto FreeMem; 1079 dprintk(DEBUG_ERROR, "can't find oldfid\n");
1188 } 1080 return -EPERM;
1189
1190 /* get fid of old_dentry */
1191 sprintf(symname, "hardlink(%d)\n", oldfid->fid);
1192
1193 /* issue a create */
1194 retval = v9fs_create(dir, dentry, V9FS_DMLINK, 0);
1195 if (retval != 0)
1196 goto FreeMem;
1197
1198 newfid = v9fs_fid_lookup(dentry);
1199 if (!newfid) {
1200 dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n");
1201 goto FreeMem;
1202 }
1203
1204 /* issue a twstat */
1205 v9fs_blank_mistat(v9ses, mistat);
1206 strcpy(mistat->data + 1, symname);
1207 mistat->extension = mistat->data + 1;
1208 retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
1209 if (retval < 0) {
1210 dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
1211 FCALL_ERROR(fcall));
1212 goto FreeMem;
1213 }
1214
1215 kfree(fcall);
1216
1217 if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
1218 dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
1219 FCALL_ERROR(fcall));
1220 goto FreeMem;
1221 } 1081 }
1222 1082
1223 d_drop(dentry); /* FID - will this also clunk? */ 1083 name = __getname();
1224 1084 sprintf(name, "hardlink(%d)\n", oldfid->fid);
1225 kfree(fcall); 1085 retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name);
1226 fcall = NULL; 1086 __putname(name);
1227 1087
1228 FreeMem:
1229 kfree(mistat);
1230 kfree(fcall);
1231 __putname(symname);
1232 return retval; 1088 return retval;
1233} 1089}
1234 1090
@@ -1244,82 +1100,30 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1244static int 1100static int
1245v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) 1101v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1246{ 1102{
1247 int retval = -EPERM; 1103 int retval;
1248 struct v9fs_fid *newfid; 1104 char *name;
1249 struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
1250 struct v9fs_fcall *fcall = NULL;
1251 struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
1252 char *symname = __getname();
1253 1105
1254 dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, 1106 dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
1255 dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); 1107 dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
1256 1108
1257 if (!mistat) 1109 if (!new_valid_dev(rdev))
1258 return -ENOMEM; 1110 return -EINVAL;
1259
1260 if (!new_valid_dev(rdev)) {
1261 retval = -EINVAL;
1262 goto FreeMem;
1263 }
1264
1265 if (!v9ses->extended) {
1266 dprintk(DEBUG_ERROR, "not extended\n");
1267 goto FreeMem;
1268 }
1269
1270 /* issue a create */
1271 retval = v9fs_create(dir, dentry, mode, 0);
1272
1273 if (retval != 0)
1274 goto FreeMem;
1275
1276 newfid = v9fs_fid_lookup(dentry);
1277 if (!newfid) {
1278 dprintk(DEBUG_ERROR, "coudn't resove fid from dentry\n");
1279 retval = -EINVAL;
1280 goto FreeMem;
1281 }
1282 1111
1112 name = __getname();
1283 /* build extension */ 1113 /* build extension */
1284 if (S_ISBLK(mode)) 1114 if (S_ISBLK(mode))
1285 sprintf(symname, "b %u %u", MAJOR(rdev), MINOR(rdev)); 1115 sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev));
1286 else if (S_ISCHR(mode)) 1116 else if (S_ISCHR(mode))
1287 sprintf(symname, "c %u %u", MAJOR(rdev), MINOR(rdev)); 1117 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
1288 else if (S_ISFIFO(mode)) 1118 else if (S_ISFIFO(mode))
1289 ; /* DO NOTHING */ 1119 *name = 0;
1290 else { 1120 else {
1291 retval = -EINVAL; 1121 __putname(name);
1292 goto FreeMem; 1122 return -EINVAL;
1293 }
1294
1295 if (!S_ISFIFO(mode)) {
1296 /* issue a twstat */
1297 v9fs_blank_mistat(v9ses, mistat);
1298 strcpy(mistat->data + 1, symname);
1299 mistat->extension = mistat->data + 1;
1300 retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
1301 if (retval < 0) {
1302 dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
1303 FCALL_ERROR(fcall));
1304 goto FreeMem;
1305 }
1306 } 1123 }
1307 1124
1308 /* need to update dcache so we show up */ 1125 retval = v9fs_vfs_mkspecial(dir, dentry, mode, name);
1309 kfree(fcall); 1126 __putname(name);
1310
1311 if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
1312 dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
1313 FCALL_ERROR(fcall));
1314 goto FreeMem;
1315 }
1316
1317 d_drop(dentry); /* FID - will this also clunk? */
1318
1319 FreeMem:
1320 kfree(mistat);
1321 kfree(fcall);
1322 __putname(symname);
1323 1127
1324 return retval; 1128 return retval;
1325} 1129}
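A self-contained userspace sketch of the extension-building switch above (the helper name and buffer size are made up; major()/minor()/makedev() come from <sys/sysmacros.h>):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

static int build_extension(mode_t mode, dev_t rdev, char *name, size_t len)
{
	if (S_ISBLK(mode))
		snprintf(name, len, "b %u %u", major(rdev), minor(rdev));
	else if (S_ISCHR(mode))
		snprintf(name, len, "c %u %u", major(rdev), minor(rdev));
	else if (S_ISFIFO(mode))
		*name = 0;		/* FIFOs carry an empty extension */
	else
		return -1;		/* -EINVAL in the kernel code */
	return 0;
}

int main(void)
{
	char name[32];

	if (build_extension(S_IFBLK | 0600, makedev(8, 1), name, sizeof(name)) == 0)
		printf("extension: \"%s\"\n", name);	/* prints: b 8 1 */
	return 0;
}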
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 82c5b008407..2c4fa75be02 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -44,7 +44,6 @@
44#include "v9fs.h" 44#include "v9fs.h"
45#include "9p.h" 45#include "9p.h"
46#include "v9fs_vfs.h" 46#include "v9fs_vfs.h"
47#include "conv.h"
48#include "fid.h" 47#include "fid.h"
49 48
50static void v9fs_clear_inode(struct inode *); 49static void v9fs_clear_inode(struct inode *);
@@ -92,7 +91,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
92 sb->s_op = &v9fs_super_ops; 91 sb->s_op = &v9fs_super_ops;
93 92
94 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 93 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
95 MS_NODIRATIME | MS_NOATIME; 94 MS_NOATIME;
96} 95}
97 96
98/** 97/**
@@ -123,12 +122,13 @@ static struct super_block *v9fs_get_sb(struct file_system_type
123 122
124 dprintk(DEBUG_VFS, " \n"); 123 dprintk(DEBUG_VFS, " \n");
125 124
126 v9ses = kcalloc(1, sizeof(struct v9fs_session_info), GFP_KERNEL); 125 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
127 if (!v9ses) 126 if (!v9ses)
128 return ERR_PTR(-ENOMEM); 127 return ERR_PTR(-ENOMEM);
129 128
130 if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { 129 if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
131 dprintk(DEBUG_ERROR, "problem initiating session\n"); 130 dprintk(DEBUG_ERROR, "problem initiating session\n");
131 kfree(v9ses);
132 return ERR_PTR(newfid); 132 return ERR_PTR(newfid);
133 } 133 }
134 134
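Two independent fixes land in the hunk above: kcalloc(1, ...) becomes the more direct kzalloc(), and the error path now frees the session structure, which previously leaked whenever v9fs_session_init() failed. The shape of the fixed path, shown in isolation as a sketch:

struct v9fs_session_info *v9ses;

v9ses = kzalloc(sizeof(*v9ses), GFP_KERNEL);	/* zeroed single object */
if (!v9ses)
	return ERR_PTR(-ENOMEM);

newfid = v9fs_session_init(v9ses, dev_name, data);
if (newfid < 0) {
	kfree(v9ses);		/* the leak this hunk plugs */
	return ERR_PTR(newfid);
}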
@@ -157,7 +157,7 @@ static struct super_block *v9fs_get_sb(struct file_system_type
157 stat_result = v9fs_t_stat(v9ses, newfid, &fcall); 157 stat_result = v9fs_t_stat(v9ses, newfid, &fcall);
158 if (stat_result < 0) { 158 if (stat_result < 0) {
159 dprintk(DEBUG_ERROR, "stat error\n"); 159 dprintk(DEBUG_ERROR, "stat error\n");
160 v9fs_t_clunk(v9ses, newfid, NULL); 160 v9fs_t_clunk(v9ses, newfid);
161 v9fs_put_idpool(newfid, &v9ses->fidpool); 161 v9fs_put_idpool(newfid, &v9ses->fidpool);
162 } else { 162 } else {
163 /* Setup the Root Inode */ 163 /* Setup the Root Inode */
@@ -167,10 +167,10 @@ static struct super_block *v9fs_get_sb(struct file_system_type
167 goto put_back_sb; 167 goto put_back_sb;
168 } 168 }
169 169
170 root_fid->qid = fcall->params.rstat.stat->qid; 170 root_fid->qid = fcall->params.rstat.stat.qid;
171 root->d_inode->i_ino = 171 root->d_inode->i_ino =
172 v9fs_qid2ino(&fcall->params.rstat.stat->qid); 172 v9fs_qid2ino(&fcall->params.rstat.stat.qid);
173 v9fs_mistat2inode(fcall->params.rstat.stat, root->d_inode, sb); 173 v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb);
174 } 174 }
175 175
176 kfree(fcall); 176 kfree(fcall);
diff --git a/fs/Kconfig b/fs/Kconfig
index d5255e627b5..ef78e3a42d3 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -70,6 +70,7 @@ config FS_XIP
70 70
71config EXT3_FS 71config EXT3_FS
72 tristate "Ext3 journalling file system support" 72 tristate "Ext3 journalling file system support"
73 select JBD
73 help 74 help
74 This is the journaling version of the Second extended file system 75 This is the journaling version of the Second extended file system
75 (often called ext3), the de facto standard Linux file system 76 (often called ext3), the de facto standard Linux file system
@@ -138,23 +139,20 @@ config EXT3_FS_SECURITY
138 extended attributes for file security labels, say N. 139 extended attributes for file security labels, say N.
139 140
140config JBD 141config JBD
141# CONFIG_JBD could be its own option (even modular), but until there are
142# other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
143# dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
144 tristate 142 tristate
145 default EXT3_FS
146 help 143 help
147 This is a generic journaling layer for block devices. It is 144 This is a generic journaling layer for block devices. It is
148 currently used by the ext3 file system, but it could also be used to 145 currently used by the ext3 and OCFS2 file systems, but it could
149 add journal support to other file systems or block devices such as 146 also be used to add journal support to other file systems or block
150 RAID or LVM. 147 devices such as RAID or LVM.
151 148
152 If you are using the ext3 file system, you need to say Y here. If 149 If you are using the ext3 or OCFS2 file systems, you need to
 153 you are not using ext3 then you will probably want to say N. 150 say Y here. If you are not using ext3 or OCFS2 then you will probably
151 want to say N.
154 152
155 To compile this device as a module, choose M here: the module will be 153 To compile this device as a module, choose M here: the module will be
156 called jbd. If you are compiling ext3 into the kernel, you cannot 154 called jbd. If you are compiling ext3 or OCFS2 into the kernel,
157 compile this code as a module. 155 you cannot compile this code as a module.
158 156
159config JBD_DEBUG 157config JBD_DEBUG
160 bool "JBD (ext3) debugging support" 158 bool "JBD (ext3) debugging support"
@@ -326,6 +324,38 @@ config FS_POSIX_ACL
326 324
327source "fs/xfs/Kconfig" 325source "fs/xfs/Kconfig"
328 326
327config OCFS2_FS
328 tristate "OCFS2 file system support (EXPERIMENTAL)"
329 depends on NET && EXPERIMENTAL
330 select CONFIGFS_FS
331 select JBD
332 select CRC32
333 select INET
334 help
335 OCFS2 is a general purpose extent based shared disk cluster file
336 system with many similarities to ext3. It supports 64 bit inode
337 numbers, and has automatically extending metadata groups which may
338 also make it attractive for non-clustered use.
339
340 You'll want to install the ocfs2-tools package in order to at least
341 get "mount.ocfs2".
342
343 Project web page: http://oss.oracle.com/projects/ocfs2
344 Tools web page: http://oss.oracle.com/projects/ocfs2-tools
345 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
346
347 Note: Features which OCFS2 does not support yet:
348 - extended attributes
349 - shared writeable mmap
350 - loopback is supported, but data written will not
351 be cluster coherent.
352 - quotas
353 - cluster aware flock
354 - Directory change notification (F_NOTIFY)
355 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
356 - POSIX ACLs
357 - readpages / writepages (not user visible)
358
329config MINIX_FS 359config MINIX_FS
330 tristate "Minix fs support" 360 tristate "Minix fs support"
331 help 361 help
@@ -768,7 +798,7 @@ config PROC_KCORE
768 798
769config PROC_VMCORE 799config PROC_VMCORE
770 bool "/proc/vmcore support (EXPERIMENTAL)" 800 bool "/proc/vmcore support (EXPERIMENTAL)"
771 depends on PROC_FS && EMBEDDED && EXPERIMENTAL && CRASH_DUMP 801 depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
772 help 802 help
773 Exports the dump image of crashed kernel in ELF format. 803 Exports the dump image of crashed kernel in ELF format.
774 804
@@ -841,6 +871,20 @@ config RELAYFS_FS
841 871
842 If unsure, say N. 872 If unsure, say N.
843 873
874config CONFIGFS_FS
875 tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
876 depends on EXPERIMENTAL
877 help
878 configfs is a ram-based filesystem that provides the converse
879 of sysfs's functionality. Where sysfs is a filesystem-based
880 view of kernel objects, configfs is a filesystem-based manager
881 of kernel objects, or config_items.
882
883 Both sysfs and configfs can and should exist together on the
884 same system. One is not a replacement for the other.
885
886 If unsure, say N.
887
844endmenu 888endmenu
845 889
846menu "Miscellaneous filesystems" 890menu "Miscellaneous filesystems"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 175b2e8177c..f3d3d81eb7e 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -1,6 +1,6 @@
1config BINFMT_ELF 1config BINFMT_ELF
2 bool "Kernel support for ELF binaries" 2 bool "Kernel support for ELF binaries"
3 depends on MMU 3 depends on MMU && (BROKEN || !FRV)
4 default y 4 default y
5 ---help--- 5 ---help---
6 ELF (Executable and Linkable Format) is a format for libraries and 6 ELF (Executable and Linkable Format) is a format for libraries and
diff --git a/fs/Makefile b/fs/Makefile
index 4c265575907..1db711319c8 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -10,11 +10,11 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \
10 ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ 10 ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
11 attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ 11 attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
12 seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ 12 seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
13 ioprio.o pnode.o 13 ioprio.o pnode.o drop_caches.o
14 14
15obj-$(CONFIG_INOTIFY) += inotify.o 15obj-$(CONFIG_INOTIFY) += inotify.o
16obj-$(CONFIG_EPOLL) += eventpoll.o 16obj-$(CONFIG_EPOLL) += eventpoll.o
17obj-$(CONFIG_COMPAT) += compat.o 17obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
18 18
19nfsd-$(CONFIG_NFSD) := nfsctl.o 19nfsd-$(CONFIG_NFSD) := nfsctl.o
20obj-y += $(nfsd-y) $(nfsd-m) 20obj-y += $(nfsd-y) $(nfsd-m)
@@ -101,3 +101,5 @@ obj-$(CONFIG_BEFS_FS) += befs/
101obj-$(CONFIG_HOSTFS) += hostfs/ 101obj-$(CONFIG_HOSTFS) += hostfs/
102obj-$(CONFIG_HPPFS) += hppfs/ 102obj-$(CONFIG_HPPFS) += hppfs/
103obj-$(CONFIG_DEBUG_FS) += debugfs/ 103obj-$(CONFIG_DEBUG_FS) += debugfs/
104obj-$(CONFIG_CONFIGFS_FS) += configfs/
105obj-$(CONFIG_OCFS2_FS) += ocfs2/
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 9ebe881c678..44d439cb69f 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -244,10 +244,10 @@ affs_put_inode(struct inode *inode)
244 pr_debug("AFFS: put_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); 244 pr_debug("AFFS: put_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
245 affs_free_prealloc(inode); 245 affs_free_prealloc(inode);
246 if (atomic_read(&inode->i_count) == 1) { 246 if (atomic_read(&inode->i_count) == 1) {
247 down(&inode->i_sem); 247 mutex_lock(&inode->i_mutex);
248 if (inode->i_size != AFFS_I(inode)->mmu_private) 248 if (inode->i_size != AFFS_I(inode)->mmu_private)
249 affs_truncate(inode); 249 affs_truncate(inode);
250 up(&inode->i_sem); 250 mutex_unlock(&inode->i_mutex);
251 } 251 }
252} 252}
253 253
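This is the tree-wide i_sem to i_mutex conversion of this release: the inode semaphore becomes a mutex, and down()/up() pairs become mutex_lock()/mutex_unlock(), as also seen in the autofs and binfmt_misc hunks below. The converted affs critical section, in isolation:

mutex_lock(&inode->i_mutex);
if (inode->i_size != AFFS_I(inode)->mmu_private)
	affs_truncate(inode);
mutex_unlock(&inode->i_mutex);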
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 2fd62f89ae0..9cb206e9d4b 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -19,6 +19,7 @@
19#include "server.h" 19#include "server.h"
20#include "vnode.h" 20#include "vnode.h"
21#include "internal.h" 21#include "internal.h"
22#include "cmservice.h"
22 23
23/*****************************************************************************/ 24/*****************************************************************************/
24/* 25/*
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 0a57fd7c726..9eef6bf156a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -118,7 +118,7 @@ static int kafscmd(void *arg)
118 _SRXAFSCM_xxxx_t func; 118 _SRXAFSCM_xxxx_t func;
119 int die; 119 int die;
120 120
121 printk("kAFS: Started kafscmd %d\n", current->pid); 121 printk(KERN_INFO "kAFS: Started kafscmd %d\n", current->pid);
122 122
123 daemonize("kafscmd"); 123 daemonize("kafscmd");
124 124
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 6682d6d7f29..5c61c24dab2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -137,7 +137,7 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
137#endif 137#endif
138 138
139 /* determine how many magic numbers there should be in this page */ 139 /* determine how many magic numbers there should be in this page */
140 latter = dir->i_size - (page->index << PAGE_CACHE_SHIFT); 140 latter = dir->i_size - page_offset(page);
141 if (latter >= PAGE_SIZE) 141 if (latter >= PAGE_SIZE)
142 qty = PAGE_SIZE; 142 qty = PAGE_SIZE;
143 else 143 else
diff --git a/fs/afs/volume.h b/fs/afs/volume.h
index 1e691889c4c..bfdcf19ba3f 100644
--- a/fs/afs/volume.h
+++ b/fs/afs/volume.h
@@ -18,8 +18,6 @@
18#include "kafsasyncd.h" 18#include "kafsasyncd.h"
19#include "cache.h" 19#include "cache.h"
20 20
21#define __packed __attribute__((packed))
22
23typedef enum { 21typedef enum {
24 AFS_VLUPD_SLEEP, /* sleeping waiting for update timer to fire */ 22 AFS_VLUPD_SLEEP, /* sleeping waiting for update timer to fire */
25 AFS_VLUPD_PENDING, /* on pending queue */ 23 AFS_VLUPD_PENDING, /* on pending queue */
@@ -115,7 +113,7 @@ struct afs_volume
115 struct cachefs_cookie *cache; /* caching cookie */ 113 struct cachefs_cookie *cache; /* caching cookie */
116#endif 114#endif
117 afs_volid_t vid; /* volume ID */ 115 afs_volid_t vid; /* volume ID */
118 afs_voltype_t __packed type; /* type of volume */ 116 afs_voltype_t type; /* type of volume */
119 char type_force; /* force volume type (suppress R/O -> R/W) */ 117 char type_force; /* force volume type (suppress R/O -> R/W) */
120 unsigned short nservers; /* number of server slots filled */ 118 unsigned short nservers; /* number of server slots filled */
121 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ 119 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
diff --git a/fs/aio.c b/fs/aio.c
index 20bb919eb19..aec2b1916d1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -29,7 +29,6 @@
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/workqueue.h> 30#include <linux/workqueue.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/rcuref.h>
33 32
34#include <asm/kmap_types.h> 33#include <asm/kmap_types.h>
35#include <asm/uaccess.h> 34#include <asm/uaccess.h>
@@ -457,6 +456,8 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
457 456
458static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) 457static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
459{ 458{
459 assert_spin_locked(&ctx->ctx_lock);
460
460 if (req->ki_dtor) 461 if (req->ki_dtor)
461 req->ki_dtor(req); 462 req->ki_dtor(req);
462 kmem_cache_free(kiocb_cachep, req); 463 kmem_cache_free(kiocb_cachep, req);
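Throughout fs/aio.c this patch replaces "must be called with ctx_lock held" comments with assert_spin_locked(), turning an advisory note into a contract the kernel can actually check. The pattern, in isolation:

static void really_put_req(struct kioctx *ctx, struct kiocb *req)
{
	assert_spin_locked(&ctx->ctx_lock);	/* caller holds ctx->ctx_lock */

	if (req->ki_dtor)
		req->ki_dtor(req);
	kmem_cache_free(kiocb_cachep, req);
}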
@@ -498,6 +499,8 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
498 dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n", 499 dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n",
499 req, atomic_read(&req->ki_filp->f_count)); 500 req, atomic_read(&req->ki_filp->f_count));
500 501
502 assert_spin_locked(&ctx->ctx_lock);
503
501 req->ki_users --; 504 req->ki_users --;
502 if (unlikely(req->ki_users < 0)) 505 if (unlikely(req->ki_users < 0))
503 BUG(); 506 BUG();
@@ -510,7 +513,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
510 /* Must be done under the lock to serialise against cancellation. 513 /* Must be done under the lock to serialise against cancellation.
511 * Call this aio_fput as it duplicates fput via the fput_work. 514 * Call this aio_fput as it duplicates fput via the fput_work.
512 */ 515 */
513 if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) { 516 if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
514 get_ioctx(ctx); 517 get_ioctx(ctx);
515 spin_lock(&fput_lock); 518 spin_lock(&fput_lock);
516 list_add(&req->ki_list, &fput_head); 519 list_add(&req->ki_list, &fput_head);
@@ -619,14 +622,13 @@ static void unuse_mm(struct mm_struct *mm)
619 * the kiocb (to tell the caller to activate the work 622 * the kiocb (to tell the caller to activate the work
620 * queue to process it), or 0, if it found that it was 623 * queue to process it), or 0, if it found that it was
621 * already queued. 624 * already queued.
622 *
623 * Should be called with the spin lock iocb->ki_ctx->ctx_lock
624 * held
625 */ 625 */
626static inline int __queue_kicked_iocb(struct kiocb *iocb) 626static inline int __queue_kicked_iocb(struct kiocb *iocb)
627{ 627{
628 struct kioctx *ctx = iocb->ki_ctx; 628 struct kioctx *ctx = iocb->ki_ctx;
629 629
630 assert_spin_locked(&ctx->ctx_lock);
631
630 if (list_empty(&iocb->ki_run_list)) { 632 if (list_empty(&iocb->ki_run_list)) {
631 list_add_tail(&iocb->ki_run_list, 633 list_add_tail(&iocb->ki_run_list,
632 &ctx->run_list); 634 &ctx->run_list);
@@ -771,13 +773,15 @@ out:
771 * Process all pending retries queued on the ioctx 773 * Process all pending retries queued on the ioctx
772 * run list. 774 * run list.
773 * Assumes it is operating within the aio issuer's mm 775 * Assumes it is operating within the aio issuer's mm
774 * context. Expects to be called with ctx->ctx_lock held 776 * context.
775 */ 777 */
776static int __aio_run_iocbs(struct kioctx *ctx) 778static int __aio_run_iocbs(struct kioctx *ctx)
777{ 779{
778 struct kiocb *iocb; 780 struct kiocb *iocb;
779 LIST_HEAD(run_list); 781 LIST_HEAD(run_list);
780 782
783 assert_spin_locked(&ctx->ctx_lock);
784
781 list_splice_init(&ctx->run_list, &run_list); 785 list_splice_init(&ctx->run_list, &run_list);
782 while (!list_empty(&run_list)) { 786 while (!list_empty(&run_list)) {
783 iocb = list_entry(run_list.next, struct kiocb, 787 iocb = list_entry(run_list.next, struct kiocb,
@@ -937,28 +941,19 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
937 unsigned long tail; 941 unsigned long tail;
938 int ret; 942 int ret;
939 943
940 /* Special case handling for sync iocbs: events go directly 944 /*
941 * into the iocb for fast handling. Note that this will not 945 * Special case handling for sync iocbs:
942 * work if we allow sync kiocbs to be cancelled. in which 946 * - events go directly into the iocb for fast handling
943 * case the usage count checks will have to move under ctx_lock 947 * - the sync task with the iocb in its stack holds the single iocb
944 * for all cases. 948 * ref, no other paths have a way to get another ref
949 * - the sync task helpfully left a reference to itself in the iocb
945 */ 950 */
946 if (is_sync_kiocb(iocb)) { 951 if (is_sync_kiocb(iocb)) {
947 int ret; 952 BUG_ON(iocb->ki_users != 1);
948
949 iocb->ki_user_data = res; 953 iocb->ki_user_data = res;
950 if (iocb->ki_users == 1) { 954 iocb->ki_users = 0;
951 iocb->ki_users = 0;
952 ret = 1;
953 } else {
954 spin_lock_irq(&ctx->ctx_lock);
955 iocb->ki_users--;
956 ret = (0 == iocb->ki_users);
957 spin_unlock_irq(&ctx->ctx_lock);
958 }
959 /* sync iocbs put the task here for us */
960 wake_up_process(iocb->ki_obj.tsk); 955 wake_up_process(iocb->ki_obj.tsk);
961 return ret; 956 return 1;
962 } 957 }
963 958
964 info = &ctx->ring_info; 959 info = &ctx->ring_info;
@@ -1613,12 +1608,14 @@ asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr,
1613 1608
1614/* lookup_kiocb 1609/* lookup_kiocb
1615 * Finds a given iocb for cancellation. 1610 * Finds a given iocb for cancellation.
1616 * MUST be called with ctx->ctx_lock held.
1617 */ 1611 */
1618static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, 1612static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
1619 u32 key) 1613 u32 key)
1620{ 1614{
1621 struct list_head *pos; 1615 struct list_head *pos;
1616
1617 assert_spin_locked(&ctx->ctx_lock);
1618
1622 /* TODO: use a hash or array, this sucks. */ 1619 /* TODO: use a hash or array, this sucks. */
1623 list_for_each(pos, &ctx->active_reqs) { 1620 list_for_each(pos, &ctx->active_reqs) {
1624 struct kiocb *kiocb = list_kiocb(pos); 1621 struct kiocb *kiocb = list_kiocb(pos);
diff --git a/fs/attr.c b/fs/attr.c
index 67bcd9b14ea..97de9467087 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -10,11 +10,11 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
13#include <linux/capability.h>
13#include <linux/fsnotify.h> 14#include <linux/fsnotify.h>
14#include <linux/fcntl.h> 15#include <linux/fcntl.h>
15#include <linux/quotaops.h> 16#include <linux/quotaops.h>
16#include <linux/security.h> 17#include <linux/security.h>
17#include <linux/time.h>
18 18
19/* Taken over from the old code... */ 19/* Taken over from the old code... */
20 20
@@ -67,20 +67,12 @@ EXPORT_SYMBOL(inode_change_ok);
67int inode_setattr(struct inode * inode, struct iattr * attr) 67int inode_setattr(struct inode * inode, struct iattr * attr)
68{ 68{
69 unsigned int ia_valid = attr->ia_valid; 69 unsigned int ia_valid = attr->ia_valid;
70 int error = 0; 70
71 71 if (ia_valid & ATTR_SIZE &&
72 if (ia_valid & ATTR_SIZE) { 72 attr->ia_size != i_size_read(inode)) {
73 if (attr->ia_size != i_size_read(inode)) { 73 int error = vmtruncate(inode, attr->ia_size);
74 error = vmtruncate(inode, attr->ia_size); 74 if (error)
75 if (error || (ia_valid == ATTR_SIZE)) 75 return error;
76 goto out;
77 } else {
78 /*
79 * We skipped the truncate but must still update
80 * timestamps
81 */
82 ia_valid |= ATTR_MTIME|ATTR_CTIME;
83 }
84 } 76 }
85 77
86 if (ia_valid & ATTR_UID) 78 if (ia_valid & ATTR_UID)
@@ -104,8 +96,8 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
104 inode->i_mode = mode; 96 inode->i_mode = mode;
105 } 97 }
106 mark_inode_dirty(inode); 98 mark_inode_dirty(inode);
107out: 99
108 return error; 100 return 0;
109} 101}
110EXPORT_SYMBOL(inode_setattr); 102EXPORT_SYMBOL(inode_setattr);
111 103
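inode_setattr() now fails fast when vmtruncate() fails and otherwise applies the remaining attributes unconditionally. A typical caller, as in the v9fs setattr path earlier in this diff, commits the in-core change only after the backend accepted it:

res = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall);
kfree(fcall);
if (res >= 0)
	res = inode_setattr(dentry->d_inode, iattr);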
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index a1ab1c0ed21..870e2cf3301 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -10,6 +10,7 @@
10 * 10 *
11 * ------------------------------------------------------------------------- */ 11 * ------------------------------------------------------------------------- */
12 12
13#include <linux/capability.h>
13#include <linux/errno.h> 14#include <linux/errno.h>
14#include <linux/stat.h> 15#include <linux/stat.h>
15#include <linux/param.h> 16#include <linux/param.h>
@@ -229,9 +230,9 @@ static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentr
229 dentry->d_flags |= DCACHE_AUTOFS_PENDING; 230 dentry->d_flags |= DCACHE_AUTOFS_PENDING;
230 d_add(dentry, NULL); 231 d_add(dentry, NULL);
231 232
232 up(&dir->i_sem); 233 mutex_unlock(&dir->i_mutex);
233 autofs_revalidate(dentry, nd); 234 autofs_revalidate(dentry, nd);
234 down(&dir->i_sem); 235 mutex_lock(&dir->i_mutex);
235 236
236 /* 237 /*
237 * If we are still pending, check if we had to handle 238 * If we are still pending, check if we had to handle
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index fca83e28edc..385bed09b0d 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -209,7 +209,7 @@ static inline int simple_empty_nolock(struct dentry *dentry)
209 struct dentry *child; 209 struct dentry *child;
210 int ret = 0; 210 int ret = 0;
211 211
212 list_for_each_entry(child, &dentry->d_subdirs, d_child) 212 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
213 if (simple_positive(child)) 213 if (simple_positive(child))
214 goto out; 214 goto out;
215 ret = 1; 215 ret = 1;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index feb6ac427d0..dc39589df16 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -105,7 +105,7 @@ repeat:
105 next = this_parent->d_subdirs.next; 105 next = this_parent->d_subdirs.next;
106resume: 106resume:
107 while (next != &this_parent->d_subdirs) { 107 while (next != &this_parent->d_subdirs) {
108 struct dentry *dentry = list_entry(next, struct dentry, d_child); 108 struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
109 109
110 /* Negative dentry - give up */ 110 /* Negative dentry - give up */
111 if (!simple_positive(dentry)) { 111 if (!simple_positive(dentry)) {
@@ -138,7 +138,7 @@ resume:
138 } 138 }
139 139
140 if (this_parent != top) { 140 if (this_parent != top) {
141 next = this_parent->d_child.next; 141 next = this_parent->d_u.d_child.next;
142 this_parent = this_parent->d_parent; 142 this_parent = this_parent->d_parent;
143 goto resume; 143 goto resume;
144 } 144 }
@@ -163,7 +163,7 @@ repeat:
163 next = this_parent->d_subdirs.next; 163 next = this_parent->d_subdirs.next;
164resume: 164resume:
165 while (next != &this_parent->d_subdirs) { 165 while (next != &this_parent->d_subdirs) {
166 struct dentry *dentry = list_entry(next, struct dentry, d_child); 166 struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
167 167
168 /* Negative dentry - give up */ 168 /* Negative dentry - give up */
169 if (!simple_positive(dentry)) { 169 if (!simple_positive(dentry)) {
@@ -199,7 +199,7 @@ cont:
199 } 199 }
200 200
201 if (this_parent != parent) { 201 if (this_parent != parent) {
202 next = this_parent->d_child.next; 202 next = this_parent->d_u.d_child.next;
203 this_parent = this_parent->d_parent; 203 this_parent = this_parent->d_parent;
204 goto resume; 204 goto resume;
205 } 205 }
@@ -238,7 +238,7 @@ static struct dentry *autofs4_expire(struct super_block *sb,
238 /* On exit from the loop expire is set to a dgot dentry 238 /* On exit from the loop expire is set to a dgot dentry
239 * to expire or it's NULL */ 239 * to expire or it's NULL */
240 while ( next != &root->d_subdirs ) { 240 while ( next != &root->d_subdirs ) {
241 struct dentry *dentry = list_entry(next, struct dentry, d_child); 241 struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
242 242
243 /* Negative dentry - give up */ 243 /* Negative dentry - give up */
244 if ( !simple_positive(dentry) ) { 244 if ( !simple_positive(dentry) ) {
@@ -302,7 +302,7 @@ next:
302 expired, (int)expired->d_name.len, expired->d_name.name); 302 expired, (int)expired->d_name.len, expired->d_name.name);
303 spin_lock(&dcache_lock); 303 spin_lock(&dcache_lock);
304 list_del(&expired->d_parent->d_subdirs); 304 list_del(&expired->d_parent->d_subdirs);
305 list_add(&expired->d_parent->d_subdirs, &expired->d_child); 305 list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
306 spin_unlock(&dcache_lock); 306 spin_unlock(&dcache_lock);
307 return expired; 307 return expired;
308 } 308 }
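The dentry field d_child moved into a union (d_u) in this release, so every child-list walker is renamed accordingly; the traversal itself is unchanged. From the autofs_i.h hunk above, in isolation:

struct dentry *child;

list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
	if (simple_positive(child))
		goto out;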
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 818b37be515..2d3082854a2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -91,7 +91,7 @@ repeat:
91 next = this_parent->d_subdirs.next; 91 next = this_parent->d_subdirs.next;
92resume: 92resume:
93 while (next != &this_parent->d_subdirs) { 93 while (next != &this_parent->d_subdirs) {
94 struct dentry *dentry = list_entry(next, struct dentry, d_child); 94 struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
95 95
96 /* Negative dentry - don`t care */ 96 /* Negative dentry - don`t care */
97 if (!simple_positive(dentry)) { 97 if (!simple_positive(dentry)) {
@@ -117,7 +117,7 @@ resume:
117 if (this_parent != sbi->root) { 117 if (this_parent != sbi->root) {
118 struct dentry *dentry = this_parent; 118 struct dentry *dentry = this_parent;
119 119
120 next = this_parent->d_child.next; 120 next = this_parent->d_u.d_child.next;
121 this_parent = this_parent->d_parent; 121 this_parent = this_parent->d_parent;
122 spin_unlock(&dcache_lock); 122 spin_unlock(&dcache_lock);
123 DPRINTK("parent dentry %p %.*s", 123 DPRINTK("parent dentry %p %.*s",
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2a771ec6695..62d8d4acb8b 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -12,6 +12,7 @@
12 * 12 *
13 * ------------------------------------------------------------------------- */ 13 * ------------------------------------------------------------------------- */
14 14
15#include <linux/capability.h>
15#include <linux/errno.h> 16#include <linux/errno.h>
16#include <linux/stat.h> 17#include <linux/stat.h>
17#include <linux/param.h> 18#include <linux/param.h>
@@ -86,7 +87,7 @@ static int autofs4_root_readdir(struct file *file, void *dirent,
86 87
87/* Update usage from here to top of tree, so that scan of 88/* Update usage from here to top of tree, so that scan of
88 top-level directories will give a useful result */ 89 top-level directories will give a useful result */
89static void autofs4_update_usage(struct dentry *dentry) 90static void autofs4_update_usage(struct vfsmount *mnt, struct dentry *dentry)
90{ 91{
91 struct dentry *top = dentry->d_sb->s_root; 92 struct dentry *top = dentry->d_sb->s_root;
92 93
@@ -95,7 +96,7 @@ static void autofs4_update_usage(struct dentry *dentry)
95 struct autofs_info *ino = autofs4_dentry_ino(dentry); 96 struct autofs_info *ino = autofs4_dentry_ino(dentry);
96 97
97 if (ino) { 98 if (ino) {
98 update_atime(dentry->d_inode); 99 touch_atime(mnt, dentry);
99 ino->last_used = jiffies; 100 ino->last_used = jiffies;
100 } 101 }
101 } 102 }
@@ -143,7 +144,8 @@ static int autofs4_dcache_readdir(struct file * filp, void * dirent, filldir_t f
143 } 144 }
144 145
145 while(1) { 146 while(1) {
146 struct dentry *de = list_entry(list, struct dentry, d_child); 147 struct dentry *de = list_entry(list,
148 struct dentry, d_u.d_child);
147 149
148 if (!d_unhashed(de) && de->d_inode) { 150 if (!d_unhashed(de) && de->d_inode) {
149 spin_unlock(&dcache_lock); 151 spin_unlock(&dcache_lock);
@@ -193,6 +195,8 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
193 if (!empty) 195 if (!empty)
194 d_invalidate(dentry); 196 d_invalidate(dentry);
195 197
198 nd.dentry = dentry;
199 nd.mnt = mnt;
196 nd.flags = LOOKUP_DIRECTORY; 200 nd.flags = LOOKUP_DIRECTORY;
197 status = (dentry->d_op->d_revalidate)(dentry, &nd); 201 status = (dentry->d_op->d_revalidate)(dentry, &nd);
198 202
@@ -288,10 +292,10 @@ out:
288 return autofs4_dcache_readdir(file, dirent, filldir); 292 return autofs4_dcache_readdir(file, dirent, filldir);
289} 293}
290 294
291static int try_to_fill_dentry(struct dentry *dentry, 295static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int flags)
292 struct super_block *sb,
293 struct autofs_sb_info *sbi, int flags)
294{ 296{
297 struct super_block *sb = mnt->mnt_sb;
298 struct autofs_sb_info *sbi = autofs4_sbi(sb);
295 struct autofs_info *de_info = autofs4_dentry_ino(dentry); 299 struct autofs_info *de_info = autofs4_dentry_ino(dentry);
296 int status = 0; 300 int status = 0;
297 301
@@ -366,7 +370,7 @@ static int try_to_fill_dentry(struct dentry *dentry,
366 /* We don't update the usages for the autofs daemon itself, this 370 /* We don't update the usages for the autofs daemon itself, this
367 is necessary for recursive autofs mounts */ 371 is necessary for recursive autofs mounts */
368 if (!autofs4_oz_mode(sbi)) 372 if (!autofs4_oz_mode(sbi))
369 autofs4_update_usage(dentry); 373 autofs4_update_usage(mnt, dentry);
370 374
371 spin_lock(&dentry->d_lock); 375 spin_lock(&dentry->d_lock);
372 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; 376 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
@@ -391,7 +395,7 @@ static int autofs4_revalidate(struct dentry * dentry, struct nameidata *nd)
391 /* Pending dentry */ 395 /* Pending dentry */
392 if (autofs4_ispending(dentry)) { 396 if (autofs4_ispending(dentry)) {
393 if (!oz_mode) 397 if (!oz_mode)
394 status = try_to_fill_dentry(dentry, dir->i_sb, sbi, flags); 398 status = try_to_fill_dentry(nd->mnt, dentry, flags);
395 return status; 399 return status;
396 } 400 }
397 401
@@ -408,14 +412,14 @@ static int autofs4_revalidate(struct dentry * dentry, struct nameidata *nd)
408 dentry, dentry->d_name.len, dentry->d_name.name); 412 dentry, dentry->d_name.len, dentry->d_name.name);
409 spin_unlock(&dcache_lock); 413 spin_unlock(&dcache_lock);
410 if (!oz_mode) 414 if (!oz_mode)
411 status = try_to_fill_dentry(dentry, dir->i_sb, sbi, flags); 415 status = try_to_fill_dentry(nd->mnt, dentry, flags);
412 return status; 416 return status;
413 } 417 }
414 spin_unlock(&dcache_lock); 418 spin_unlock(&dcache_lock);
415 419
416 /* Update the usage list */ 420 /* Update the usage list */
417 if (!oz_mode) 421 if (!oz_mode)
418 autofs4_update_usage(dentry); 422 autofs4_update_usage(nd->mnt, dentry);
419 423
420 return 1; 424 return 1;
421} 425}
@@ -488,9 +492,9 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
488 d_add(dentry, NULL); 492 d_add(dentry, NULL);
489 493
490 if (dentry->d_op && dentry->d_op->d_revalidate) { 494 if (dentry->d_op && dentry->d_op->d_revalidate) {
491 up(&dir->i_sem); 495 mutex_unlock(&dir->i_mutex);
492 (dentry->d_op->d_revalidate)(dentry, nd); 496 (dentry->d_op->d_revalidate)(dentry, nd);
493 down(&dir->i_sem); 497 mutex_lock(&dir->i_mutex);
494 } 498 }
495 499
496 /* 500 /*
diff --git a/fs/befs/attribute.c b/fs/befs/attribute.c
deleted file mode 100644
index e329d727053..00000000000
--- a/fs/befs/attribute.c
+++ /dev/null
@@ -1,117 +0,0 @@
1/*
2 * linux/fs/befs/attribute.c
3 *
4 * Copyright (C) 2002 Will Dyson <will_dyson@pobox.com>
5 *
6 * Many thanks to Dominic Giampaolo, author of "Practical File System
7 * Design with the Be File System", for such a helpful book.
8 *
9 */
10
11#include <linux/fs.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14
15#include "befs.h"
16#include "endian.h"
17
18#define SD_DATA(sd)\
19 (void*)((char*)sd + sizeof(*sd) + (sd->name_size - sizeof(sd->name)))
20
21#define SD_NEXT(sd)\
22 (befs_small_data*)((char*)sd + sizeof(*sd) + (sd->name_size - \
23 sizeof(sd->name) + sd->data_size))
24
25int
26list_small_data(struct super_block *sb, befs_inode * inode, filldir_t filldir);
27
28befs_small_data *
29find_small_data(struct super_block *sb, befs_inode * inode,
30 const char *name);
31int
32read_small_data(struct super_block *sb, befs_inode * inode,
33 befs_small_data * sdata, void *buf, size_t bufsize);
34
35/**
36 *
37 *
38 *
39 *
40 *
41 */
42befs_small_data *
43find_small_data(struct super_block *sb, befs_inode * inode, const char *name)
44{
45 befs_small_data *sdata = inode->small_data;
46
47 while (sdata->type != 0) {
48 if (strcmp(name, sdata->name) != 0) {
49 return sdata;
50 }
51 sdata = SD_NEXT(sdata);
52 }
53 return NULL;
54}
55
56/**
57 *
58 *
59 *
60 *
61 *
62 */
63int
64read_small_data(struct super_block *sb, befs_inode * inode,
65 const char *name, void *buf, size_t bufsize)
66{
67 befs_small_data *sdata;
68
69 sdata = find_small_data(sb, inode, name);
70 if (sdata == NULL)
71 return BEFS_ERR;
72 else if (sdata->data_size > bufsize)
73 return BEFS_ERR;
74
75 memcpy(buf, SD_DATA(sdata), sdata->data_size);
76
77 return BEFS_OK;
78}
79
80/**
81 *
82 *
83 *
84 *
85 *
86 */
87int
88list_small_data(struct super_block *sb, befs_inode * inode)
89{
90
91}
92
93/**
94 *
95 *
96 *
97 *
98 *
99 */
100int
101list_attr(struct super_block *sb, befs_inode * inode)
102{
103
104}
105
106/**
107 *
108 *
109 *
110 *
111 *
112 */
113int
114read_attr(struct super_block *sb, befs_inode * inode)
115{
116
117}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 72011826f0c..f312103434d 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -33,8 +33,6 @@ static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
33static int load_aout_library(struct file*); 33static int load_aout_library(struct file*);
34static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file); 34static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file);
35 35
36extern void dump_thread(struct pt_regs *, struct user *);
37
38static struct linux_binfmt aout_format = { 36static struct linux_binfmt aout_format = {
39 .module = THIS_MODULE, 37 .module = THIS_MODULE,
40 .load_binary = load_aout_binary, 38 .load_binary = load_aout_binary,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f36f2210204..1b117a44129 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -58,7 +58,7 @@ extern int dump_fpu (struct pt_regs *, elf_fpregset_t *);
58 * If we don't support core dumping, then supply a NULL so we 58 * If we don't support core dumping, then supply a NULL so we
59 * don't even try. 59 * don't even try.
60 */ 60 */
61#ifdef USE_ELF_CORE_DUMP 61#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
62static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file); 62static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file);
63#else 63#else
64#define elf_core_dump NULL 64#define elf_core_dump NULL
@@ -288,11 +288,17 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
288 struct elf_phdr *eppnt, int prot, int type) 288 struct elf_phdr *eppnt, int prot, int type)
289{ 289{
290 unsigned long map_addr; 290 unsigned long map_addr;
291 unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr);
291 292
292 down_write(&current->mm->mmap_sem); 293 down_write(&current->mm->mmap_sem);
293 map_addr = do_mmap(filep, ELF_PAGESTART(addr), 294 /* mmap() will return -EINVAL if given a zero size, but a
294 eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr), prot, type, 295 * segment with zero filesize is perfectly valid */
295 eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr)); 296 if (eppnt->p_filesz + pageoffset)
297 map_addr = do_mmap(filep, ELF_PAGESTART(addr),
298 eppnt->p_filesz + pageoffset, prot, type,
299 eppnt->p_offset - pageoffset);
300 else
301 map_addr = ELF_PAGESTART(addr);
296 up_write(&current->mm->mmap_sem); 302 up_write(&current->mm->mmap_sem);
297 return(map_addr); 303 return(map_addr);
298} 304}
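The new guard exists because mmap() rejects zero-length requests with -EINVAL, while an ELF segment with p_filesz == 0 (all BSS) is perfectly valid and must still yield its page-aligned address. The same rule is visible from userspace:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* A zero-length mapping is always refused. */
	void *p = mmap(NULL, 0, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		printf("mmap(len=0): %s\n", strerror(errno));	/* EINVAL */
	return 0;
}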
@@ -616,7 +622,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
616 goto out_free_file; 622 goto out_free_file;
617 623
618 retval = -ENOMEM; 624 retval = -ENOMEM;
619 elf_interpreter = (char *) kmalloc(elf_ppnt->p_filesz, 625 elf_interpreter = kmalloc(elf_ppnt->p_filesz,
620 GFP_KERNEL); 626 GFP_KERNEL);
621 if (!elf_interpreter) 627 if (!elf_interpreter)
622 goto out_free_file; 628 goto out_free_file;
@@ -1107,7 +1113,7 @@ out:
1107 * Note that some platforms still use traditional core dumps and not 1113 * Note that some platforms still use traditional core dumps and not
1108 * the ELF core dump. Each platform can select it as appropriate. 1114 * the ELF core dump. Each platform can select it as appropriate.
1109 */ 1115 */
1110#ifdef USE_ELF_CORE_DUMP 1116#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
1111 1117
1112/* 1118/*
1113 * ELF core dumper 1119 * ELF core dumper
@@ -1212,7 +1218,7 @@ static int writenote(struct memelfnote *men, struct file *file)
1212 if (!dump_seek(file, (off))) \ 1218 if (!dump_seek(file, (off))) \
1213 goto end_coredump; 1219 goto end_coredump;
1214 1220
1215static inline void fill_elf_header(struct elfhdr *elf, int segs) 1221static void fill_elf_header(struct elfhdr *elf, int segs)
1216{ 1222{
1217 memcpy(elf->e_ident, ELFMAG, SELFMAG); 1223 memcpy(elf->e_ident, ELFMAG, SELFMAG);
1218 elf->e_ident[EI_CLASS] = ELF_CLASS; 1224 elf->e_ident[EI_CLASS] = ELF_CLASS;
@@ -1237,7 +1243,7 @@ static inline void fill_elf_header(struct elfhdr *elf, int segs)
1237 return; 1243 return;
1238} 1244}
1239 1245
1240static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, off_t offset) 1246static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, off_t offset)
1241{ 1247{
1242 phdr->p_type = PT_NOTE; 1248 phdr->p_type = PT_NOTE;
1243 phdr->p_offset = offset; 1249 phdr->p_offset = offset;
@@ -1628,17 +1634,17 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
1628 ELF_CORE_WRITE_EXTRA_DATA; 1634 ELF_CORE_WRITE_EXTRA_DATA;
1629#endif 1635#endif
1630 1636
1631 if ((off_t) file->f_pos != offset) { 1637 if ((off_t)file->f_pos != offset) {
1632 /* Sanity check */ 1638 /* Sanity check */
1633 printk("elf_core_dump: file->f_pos (%ld) != offset (%ld)\n", 1639 printk(KERN_WARNING "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n",
1634 (off_t) file->f_pos, offset); 1640 (off_t)file->f_pos, offset);
1635 } 1641 }
1636 1642
1637end_coredump: 1643end_coredump:
1638 set_fs(fs); 1644 set_fs(fs);
1639 1645
1640cleanup: 1646cleanup:
1641 while(!list_empty(&thread_list)) { 1647 while (!list_empty(&thread_list)) {
1642 struct list_head *tmp = thread_list.next; 1648 struct list_head *tmp = thread_list.next;
1643 list_del(tmp); 1649 list_del(tmp);
1644 kfree(list_entry(tmp, struct elf_thread_status, list)); 1650 kfree(list_entry(tmp, struct elf_thread_status, list));
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index e0344f69c79..5b3076e8ee9 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -187,7 +187,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
187 goto error; 187 goto error;
188 188
189 /* read the name of the interpreter into memory */ 189 /* read the name of the interpreter into memory */
190 interpreter_name = (char *) kmalloc(phdr->p_filesz, GFP_KERNEL); 190 interpreter_name = kmalloc(phdr->p_filesz, GFP_KERNEL);
191 if (!interpreter_name) 191 if (!interpreter_name)
192 goto error; 192 goto error;
193 193
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 9d6625829b9..108d56bbd0d 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -77,8 +77,6 @@ static int load_flat_shared_library(int id, struct lib_info *p);
77static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); 77static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
78static int flat_core_dump(long signr, struct pt_regs * regs, struct file *file); 78static int flat_core_dump(long signr, struct pt_regs * regs, struct file *file);
79 79
80extern void dump_thread(struct pt_regs *, struct user *);
81
82static struct linux_binfmt flat_format = { 80static struct linux_binfmt flat_format = {
83 .module = THIS_MODULE, 81 .module = THIS_MODULE,
84 .load_binary = load_flat_binary, 82 .load_binary = load_flat_binary,
@@ -444,19 +442,22 @@ static int load_flat_file(struct linux_binprm * bprm,
444 flags = ntohl(hdr->flags); 442 flags = ntohl(hdr->flags);
445 rev = ntohl(hdr->rev); 443 rev = ntohl(hdr->rev);
446 444
447 if (flags & FLAT_FLAG_KTRACE) 445 if (strncmp(hdr->magic, "bFLT", 4)) {
448 printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename);
449
450 if (strncmp(hdr->magic, "bFLT", 4) ||
451 (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION)) {
452 /* 446 /*
453 * because a lot of people do not manage to produce good 447 * because a lot of people do not manage to produce good
454 * flat binaries, we leave this printk to help them realise 448 * flat binaries, we leave this printk to help them realise
455 * the problem. We only print the error if its not a script file 449 * the problem. We only print the error if its not a script file
456 */ 450 */
457 if (strncmp(hdr->magic, "#!", 2)) 451 if (strncmp(hdr->magic, "#!", 2))
458 printk("BINFMT_FLAT: bad magic/rev (0x%x, need 0x%x)\n", 452 printk("BINFMT_FLAT: bad header magic\n");
459 rev, (int) FLAT_VERSION); 453 return -ENOEXEC;
454 }
455
456 if (flags & FLAT_FLAG_KTRACE)
457 printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename);
458
459 if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) {
460 printk("BINFMT_FLAT: bad flat file version 0x%x (supported 0x%x and 0x%x)\n", rev, FLAT_VERSION, OLD_FLAT_VERSION);
460 return -ENOEXEC; 461 return -ENOEXEC;
461 } 462 }
462 463
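The net effect of the load_flat_file() hunk above: the bFLT magic check now runs first, before the KTRACE printk and separately from the version check, so each failure path gets its own precise diagnostic. A condensed sketch of the resulting check order (abbreviated from the new code above, not a verbatim copy):

    /* sketch of the post-patch validation order in load_flat_file() */
    if (strncmp(hdr->magic, "bFLT", 4)) {
        if (strncmp(hdr->magic, "#!", 2))    /* stay quiet for script files */
            printk("BINFMT_FLAT: bad header magic\n");
        return -ENOEXEC;
    }
    if (flags & FLAT_FLAG_KTRACE)
        printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename);
    if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) {
        printk("BINFMT_FLAT: bad flat file version 0x%x (supported 0x%x and 0x%x)\n",
               rev, FLAT_VERSION, OLD_FLAT_VERSION);
        return -ENOEXEC;
    }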
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 2568eb41cb3..6a7b730c206 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -264,7 +264,7 @@ static int unquote(char *from)
264 return p - from; 264 return p - from;
265} 265}
266 266
267static inline char * check_special_flags (char * sfs, Node * e) 267static char * check_special_flags (char * sfs, Node * e)
268{ 268{
269 char * p = sfs; 269 char * p = sfs;
270 int cont = 1; 270 int cont = 1;
@@ -588,11 +588,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
588 case 2: set_bit(Enabled, &e->flags); 588 case 2: set_bit(Enabled, &e->flags);
589 break; 589 break;
590 case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root); 590 case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root);
591 down(&root->d_inode->i_sem); 591 mutex_lock(&root->d_inode->i_mutex);
592 592
593 kill_node(e); 593 kill_node(e);
594 594
595 up(&root->d_inode->i_sem); 595 mutex_unlock(&root->d_inode->i_mutex);
596 dput(root); 596 dput(root);
597 break; 597 break;
598 default: return res; 598 default: return res;
@@ -622,7 +622,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
622 return PTR_ERR(e); 622 return PTR_ERR(e);
623 623
624 root = dget(sb->s_root); 624 root = dget(sb->s_root);
625 down(&root->d_inode->i_sem); 625 mutex_lock(&root->d_inode->i_mutex);
626 dentry = lookup_one_len(e->name, root, strlen(e->name)); 626 dentry = lookup_one_len(e->name, root, strlen(e->name));
627 err = PTR_ERR(dentry); 627 err = PTR_ERR(dentry);
628 if (IS_ERR(dentry)) 628 if (IS_ERR(dentry))
@@ -658,7 +658,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
658out2: 658out2:
659 dput(dentry); 659 dput(dentry);
660out: 660out:
661 up(&root->d_inode->i_sem); 661 mutex_unlock(&root->d_inode->i_mutex);
662 dput(root); 662 dput(root);
663 663
664 if (err) { 664 if (err) {
@@ -703,12 +703,12 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
703 case 1: enabled = 0; break; 703 case 1: enabled = 0; break;
704 case 2: enabled = 1; break; 704 case 2: enabled = 1; break;
705 case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root); 705 case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root);
706 down(&root->d_inode->i_sem); 706 mutex_lock(&root->d_inode->i_mutex);
707 707
708 while (!list_empty(&entries)) 708 while (!list_empty(&entries))
709 kill_node(list_entry(entries.next, Node, list)); 709 kill_node(list_entry(entries.next, Node, list));
710 710
711 up(&root->d_inode->i_sem); 711 mutex_unlock(&root->d_inode->i_mutex);
712 dput(root); 712 dput(root);
713 default: return res; 713 default: return res;
714 } 714 }
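The down()/up() substitutions in this file are one instance of the tree-wide i_sem to i_mutex conversion, and the pattern is mechanical. A minimal before/after sketch using the bm_entry_write() case above:

    /* before: inode serialized with a counting semaphore */
    down(&root->d_inode->i_sem);
    kill_node(e);
    up(&root->d_inode->i_sem);

    /* after: a dedicated mutex type (stricter semantics, better debugging) */
    mutex_lock(&root->d_inode->i_mutex);
    kill_node(e);
    mutex_unlock(&root->d_inode->i_mutex);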
diff --git a/fs/bio.c b/fs/bio.c
index 460554b07ff..bbc442b8c86 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -123,9 +123,10 @@ static void bio_fs_destructor(struct bio *bio)
123 bio_free(bio, fs_bio_set); 123 bio_free(bio, fs_bio_set);
124} 124}
125 125
126inline void bio_init(struct bio *bio) 126void bio_init(struct bio *bio)
127{ 127{
128 bio->bi_next = NULL; 128 bio->bi_next = NULL;
129 bio->bi_bdev = NULL;
129 bio->bi_flags = 1 << BIO_UPTODATE; 130 bio->bi_flags = 1 << BIO_UPTODATE;
130 bio->bi_rw = 0; 131 bio->bi_rw = 0;
131 bio->bi_vcnt = 0; 132 bio->bi_vcnt = 0;
@@ -252,7 +253,7 @@ inline int bio_hw_segments(request_queue_t *q, struct bio *bio)
252 * the actual data it points to. Reference count of returned 253 * the actual data it points to. Reference count of returned
253 * bio will be one. 254 * bio will be one.
254 */ 255 */
255inline void __bio_clone(struct bio *bio, struct bio *bio_src) 256void __bio_clone(struct bio *bio, struct bio *bio_src)
256{ 257{
257 request_queue_t *q = bdev_get_queue(bio_src->bi_bdev); 258 request_queue_t *q = bdev_get_queue(bio_src->bi_bdev);
258 259
@@ -313,7 +314,8 @@ int bio_get_nr_vecs(struct block_device *bdev)
313} 314}
314 315
315static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page 316static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
316 *page, unsigned int len, unsigned int offset) 317 *page, unsigned int len, unsigned int offset,
318 unsigned short max_sectors)
317{ 319{
318 int retried_segments = 0; 320 int retried_segments = 0;
319 struct bio_vec *bvec; 321 struct bio_vec *bvec;
@@ -324,10 +326,31 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
324 if (unlikely(bio_flagged(bio, BIO_CLONED))) 326 if (unlikely(bio_flagged(bio, BIO_CLONED)))
325 return 0; 327 return 0;
326 328
327 if (bio->bi_vcnt >= bio->bi_max_vecs) 329 if (((bio->bi_size + len) >> 9) > max_sectors)
328 return 0; 330 return 0;
329 331
330 if (((bio->bi_size + len) >> 9) > q->max_sectors) 332 /*
333 * For filesystems with a blocksize smaller than the pagesize
334 * we will often be called with the same page as last time and
335 * a consecutive offset. Optimize this special case.
336 */
337 if (bio->bi_vcnt > 0) {
338 struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
339
340 if (page == prev->bv_page &&
341 offset == prev->bv_offset + prev->bv_len) {
342 prev->bv_len += len;
343 if (q->merge_bvec_fn &&
344 q->merge_bvec_fn(q, bio, prev) < len) {
345 prev->bv_len -= len;
346 return 0;
347 }
348
349 goto done;
350 }
351 }
352
353 if (bio->bi_vcnt >= bio->bi_max_vecs)
331 return 0; 354 return 0;
332 355
333 /* 356 /*
@@ -381,11 +404,31 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
381 bio->bi_vcnt++; 404 bio->bi_vcnt++;
382 bio->bi_phys_segments++; 405 bio->bi_phys_segments++;
383 bio->bi_hw_segments++; 406 bio->bi_hw_segments++;
407 done:
384 bio->bi_size += len; 408 bio->bi_size += len;
385 return len; 409 return len;
386} 410}
387 411
388/** 412/**
413 * bio_add_pc_page - attempt to add page to bio
414 * @bio: destination bio
415 * @page: page to add
416 * @len: vec entry length
417 * @offset: vec entry offset
418 *
419 * Attempt to add a page to the bio_vec maplist. This can fail for a
420 * number of reasons, such as the bio being full or target block
421 * device limitations. The target block device must allow bio's
422 * smaller than PAGE_SIZE, so it is always possible to add a single
423 * page to an empty bio. This should only be used by REQ_PC bios.
424 */
425int bio_add_pc_page(request_queue_t *q, struct bio *bio, struct page *page,
426 unsigned int len, unsigned int offset)
427{
428 return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
429}
430
431/**
389 * bio_add_page - attempt to add page to bio 432 * bio_add_page - attempt to add page to bio
390 * @bio: destination bio 433 * @bio: destination bio
391 * @page: page to add 434 * @page: page to add
@@ -401,8 +444,8 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
401int bio_add_page(struct bio *bio, struct page *page, unsigned int len, 444int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
402 unsigned int offset) 445 unsigned int offset)
403{ 446{
404 return __bio_add_page(bdev_get_queue(bio->bi_bdev), bio, page, 447 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
405 len, offset); 448 return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
406} 449}
407 450
408struct bio_map_data { 451struct bio_map_data {
@@ -514,7 +557,7 @@ struct bio *bio_copy_user(request_queue_t *q, unsigned long uaddr,
514 break; 557 break;
515 } 558 }
516 559
517 if (__bio_add_page(q, bio, page, bytes, 0) < bytes) { 560 if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
518 ret = -EINVAL; 561 ret = -EINVAL;
519 break; 562 break;
520 } 563 }
@@ -628,7 +671,8 @@ static struct bio *__bio_map_user_iov(request_queue_t *q,
628 /* 671 /*
629 * sorry... 672 * sorry...
630 */ 673 */
631 if (__bio_add_page(q, bio, pages[j], bytes, offset) < bytes) 674 if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
675 bytes)
632 break; 676 break;
633 677
634 len -= bytes; 678 len -= bytes;
@@ -801,8 +845,8 @@ static struct bio *__bio_map_kern(request_queue_t *q, void *data,
801 if (bytes > len) 845 if (bytes > len)
802 bytes = len; 846 bytes = len;
803 847
804 if (__bio_add_page(q, bio, virt_to_page(data), bytes, 848 if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
805 offset) < bytes) 849 offset) < bytes)
806 break; 850 break;
807 851
808 data += bytes; 852 data += bytes;
@@ -1228,6 +1272,7 @@ EXPORT_SYMBOL(bio_clone);
1228EXPORT_SYMBOL(bio_phys_segments); 1272EXPORT_SYMBOL(bio_phys_segments);
1229EXPORT_SYMBOL(bio_hw_segments); 1273EXPORT_SYMBOL(bio_hw_segments);
1230EXPORT_SYMBOL(bio_add_page); 1274EXPORT_SYMBOL(bio_add_page);
1275EXPORT_SYMBOL(bio_add_pc_page);
1231EXPORT_SYMBOL(bio_get_nr_vecs); 1276EXPORT_SYMBOL(bio_get_nr_vecs);
1232EXPORT_SYMBOL(bio_map_user); 1277EXPORT_SYMBOL(bio_map_user);
1233EXPORT_SYMBOL(bio_unmap_user); 1278EXPORT_SYMBOL(bio_unmap_user);
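The kernel-doc for bio_add_page()/bio_add_pc_page() above implies a standard caller pattern: keep adding pages until the return value falls short of the requested length, then submit the bio and start another. A hedged sketch of that loop (the pages[] array, nr_pages, and allocation are illustrative, not from this patch):

    struct bio *bio = bio_alloc(GFP_KERNEL, nr_pages);
    int i;

    for (i = 0; i < nr_pages; i++) {
        /* returns len on success, less (typically 0) when the bio is
         * full or a device limit such as max_sectors is reached */
        if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE)
            break;
    }
    /* submit what fit; the caller retries the remaining pages in a new bio */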
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e0df94c37b7..6e50346fb1e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -202,7 +202,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
202 loff_t size; 202 loff_t size;
203 loff_t retval; 203 loff_t retval;
204 204
205 down(&bd_inode->i_sem); 205 mutex_lock(&bd_inode->i_mutex);
206 size = i_size_read(bd_inode); 206 size = i_size_read(bd_inode);
207 207
208 switch (origin) { 208 switch (origin) {
@@ -219,7 +219,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
219 } 219 }
220 retval = offset; 220 retval = offset;
221 } 221 }
222 up(&bd_inode->i_sem); 222 mutex_unlock(&bd_inode->i_mutex);
223 return retval; 223 return retval;
224} 224}
225 225
diff --git a/fs/buffer.c b/fs/buffer.c
index 5287be18633..3dc712f29d2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -26,6 +26,7 @@
26#include <linux/percpu.h> 26#include <linux/percpu.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/smp_lock.h> 28#include <linux/smp_lock.h>
29#include <linux/capability.h>
29#include <linux/blkdev.h> 30#include <linux/blkdev.h>
30#include <linux/file.h> 31#include <linux/file.h>
31#include <linux/quotaops.h> 32#include <linux/quotaops.h>
@@ -153,14 +154,8 @@ int sync_blockdev(struct block_device *bdev)
153{ 154{
154 int ret = 0; 155 int ret = 0;
155 156
156 if (bdev) { 157 if (bdev)
157 int err; 158 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
158
159 ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
160 err = filemap_fdatawait(bdev->bd_inode->i_mapping);
161 if (!ret)
162 ret = err;
163 }
164 return ret; 159 return ret;
165} 160}
166EXPORT_SYMBOL(sync_blockdev); 161EXPORT_SYMBOL(sync_blockdev);
@@ -358,11 +353,11 @@ static long do_fsync(unsigned int fd, int datasync)
358 * We need to protect against concurrent writers, 353 * We need to protect against concurrent writers,
359 * which could cause livelocks in fsync_buffers_list 354 * which could cause livelocks in fsync_buffers_list
360 */ 355 */
361 down(&mapping->host->i_sem); 356 mutex_lock(&mapping->host->i_mutex);
362 err = file->f_op->fsync(file, file->f_dentry, datasync); 357 err = file->f_op->fsync(file, file->f_dentry, datasync);
363 if (!ret) 358 if (!ret)
364 ret = err; 359 ret = err;
365 up(&mapping->host->i_sem); 360 mutex_unlock(&mapping->host->i_mutex);
366 err = filemap_fdatawait(mapping); 361 err = filemap_fdatawait(mapping);
367 if (!ret) 362 if (!ret)
368 ret = err; 363 ret = err;
@@ -1032,7 +1027,7 @@ try_again:
1032 /* Link the buffer to its page */ 1027 /* Link the buffer to its page */
1033 set_bh_page(bh, page, offset); 1028 set_bh_page(bh, page, offset);
1034 1029
1035 bh->b_end_io = NULL; 1030 init_buffer(bh, NULL, NULL);
1036 } 1031 }
1037 return head; 1032 return head;
1038/* 1033/*
@@ -1170,7 +1165,7 @@ failed:
1170 * some of those buffers may be aliases of filesystem data. 1165 * some of those buffers may be aliases of filesystem data.
1171 * grow_dev_page() will go BUG() if this happens. 1166 * grow_dev_page() will go BUG() if this happens.
1172 */ 1167 */
1173static inline int 1168static int
1174grow_buffers(struct block_device *bdev, sector_t block, int size) 1169grow_buffers(struct block_device *bdev, sector_t block, int size)
1175{ 1170{
1176 struct page *page; 1171 struct page *page;
@@ -1396,7 +1391,7 @@ static void bh_lru_install(struct buffer_head *bh)
1396/* 1391/*
1397 * Look up the bh in this cpu's LRU. If it's there, move it to the head. 1392 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1398 */ 1393 */
1399static inline struct buffer_head * 1394static struct buffer_head *
1400lookup_bh_lru(struct block_device *bdev, sector_t block, int size) 1395lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1401{ 1396{
1402 struct buffer_head *ret = NULL; 1397 struct buffer_head *ret = NULL;
@@ -1546,7 +1541,7 @@ EXPORT_SYMBOL(set_bh_page);
1546/* 1541/*
1547 * Called when truncating a buffer on a page completely. 1542 * Called when truncating a buffer on a page completely.
1548 */ 1543 */
1549static inline void discard_buffer(struct buffer_head * bh) 1544static void discard_buffer(struct buffer_head * bh)
1550{ 1545{
1551 lock_buffer(bh); 1546 lock_buffer(bh);
1552 clear_buffer_dirty(bh); 1547 clear_buffer_dirty(bh);
@@ -1768,7 +1763,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1768 * handle that here by just cleaning them. 1763 * handle that here by just cleaning them.
1769 */ 1764 */
1770 1765
1771 block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1766 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1772 head = page_buffers(page); 1767 head = page_buffers(page);
1773 bh = head; 1768 bh = head;
1774 1769
@@ -2160,11 +2155,12 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
2160 * truncates. Uses prepare/commit_write to allow the filesystem to 2155 * truncates. Uses prepare/commit_write to allow the filesystem to
2161 * deal with the hole. 2156 * deal with the hole.
2162 */ 2157 */
2163int generic_cont_expand(struct inode *inode, loff_t size) 2158static int __generic_cont_expand(struct inode *inode, loff_t size,
2159 pgoff_t index, unsigned int offset)
2164{ 2160{
2165 struct address_space *mapping = inode->i_mapping; 2161 struct address_space *mapping = inode->i_mapping;
2166 struct page *page; 2162 struct page *page;
2167 unsigned long index, offset, limit; 2163 unsigned long limit;
2168 int err; 2164 int err;
2169 2165
2170 err = -EFBIG; 2166 err = -EFBIG;
@@ -2176,24 +2172,24 @@ int generic_cont_expand(struct inode *inode, loff_t size)
2176 if (size > inode->i_sb->s_maxbytes) 2172 if (size > inode->i_sb->s_maxbytes)
2177 goto out; 2173 goto out;
2178 2174
2179 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2180
2181 /* ugh. in prepare/commit_write, if from==to==start of block, we
2182 ** skip the prepare. make sure we never send an offset for the start
2183 ** of a block
2184 */
2185 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2186 offset++;
2187 }
2188 index = size >> PAGE_CACHE_SHIFT;
2189 err = -ENOMEM; 2175 err = -ENOMEM;
2190 page = grab_cache_page(mapping, index); 2176 page = grab_cache_page(mapping, index);
2191 if (!page) 2177 if (!page)
2192 goto out; 2178 goto out;
2193 err = mapping->a_ops->prepare_write(NULL, page, offset, offset); 2179 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2194 if (!err) { 2180 if (err) {
2195 err = mapping->a_ops->commit_write(NULL, page, offset, offset); 2181 /*
2182 * ->prepare_write() may have instantiated a few blocks
2183 * outside i_size. Trim these off again.
2184 */
2185 unlock_page(page);
2186 page_cache_release(page);
2187 vmtruncate(inode, inode->i_size);
2188 goto out;
2196 } 2189 }
2190
2191 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2192
2197 unlock_page(page); 2193 unlock_page(page);
2198 page_cache_release(page); 2194 page_cache_release(page);
2199 if (err > 0) 2195 if (err > 0)
@@ -2202,6 +2198,36 @@ out:
2202 return err; 2198 return err;
2203} 2199}
2204 2200
2201int generic_cont_expand(struct inode *inode, loff_t size)
2202{
2203 pgoff_t index;
2204 unsigned int offset;
2205
2206 offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2207
2208 /* ugh. in prepare/commit_write, if from==to==start of block, we
2209 ** skip the prepare. make sure we never send an offset for the start
2210 ** of a block
2211 */
2212 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2213 /* caller must handle this extra byte. */
2214 offset++;
2215 }
2216 index = size >> PAGE_CACHE_SHIFT;
2217
2218 return __generic_cont_expand(inode, size, index, offset);
2219}
2220
2221int generic_cont_expand_simple(struct inode *inode, loff_t size)
2222{
2223 loff_t pos = size - 1;
2224 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2225 unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2226
2227 /* prepare/commit_write can handle even if from==to==start of block. */
2228 return __generic_cont_expand(inode, size, index, offset);
2229}
2230
2205/* 2231/*
2206 * For moronic filesystems that do not allow holes in file. 2232 * For moronic filesystems that do not allow holes in file.
2207 * We may have to extend the file. 2233 * We may have to extend the file.
@@ -2313,7 +2339,7 @@ int generic_commit_write(struct file *file, struct page *page,
2313 __block_commit_write(inode,page,from,to); 2339 __block_commit_write(inode,page,from,to);
2314 /* 2340 /*
2315 * No need to use i_size_read() here, the i_size 2341 * No need to use i_size_read() here, the i_size
2316 * cannot change under us because we hold i_sem. 2342 * cannot change under us because we hold i_mutex.
2317 */ 2343 */
2318 if (pos > inode->i_size) { 2344 if (pos > inode->i_size) {
2319 i_size_write(inode, pos); 2345 i_size_write(inode, pos);
@@ -2610,7 +2636,7 @@ int block_truncate_page(struct address_space *mapping,
2610 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2636 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2611 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2637 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2612 unsigned blocksize; 2638 unsigned blocksize;
2613 pgoff_t iblock; 2639 sector_t iblock;
2614 unsigned length, pos; 2640 unsigned length, pos;
2615 struct inode *inode = mapping->host; 2641 struct inode *inode = mapping->host;
2616 struct page *page; 2642 struct page *page;
@@ -2626,7 +2652,7 @@ int block_truncate_page(struct address_space *mapping,
2626 return 0; 2652 return 0;
2627 2653
2628 length = blocksize - length; 2654 length = blocksize - length;
2629 iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2655 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2630 2656
2631 page = grab_cache_page(mapping, index); 2657 page = grab_cache_page(mapping, index);
2632 err = -ENOMEM; 2658 err = -ENOMEM;
@@ -3145,6 +3171,7 @@ EXPORT_SYMBOL(fsync_bdev);
3145EXPORT_SYMBOL(generic_block_bmap); 3171EXPORT_SYMBOL(generic_block_bmap);
3146EXPORT_SYMBOL(generic_commit_write); 3172EXPORT_SYMBOL(generic_commit_write);
3147EXPORT_SYMBOL(generic_cont_expand); 3173EXPORT_SYMBOL(generic_cont_expand);
3174EXPORT_SYMBOL(generic_cont_expand_simple);
3148EXPORT_SYMBOL(init_buffer); 3175EXPORT_SYMBOL(init_buffer);
3149EXPORT_SYMBOL(invalidate_bdev); 3176EXPORT_SYMBOL(invalidate_bdev);
3150EXPORT_SYMBOL(ll_rw_block); 3177EXPORT_SYMBOL(ll_rw_block);
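The difference between the two entry points above is only where the page index and in-page offset come from. A worked example for generic_cont_expand_simple(), assuming PAGE_CACHE_SIZE is 4096 and the file is being expanded to 8193 bytes:

    loff_t pos = 8193 - 1;                                  /* 8192 */
    pgoff_t index = pos >> PAGE_CACHE_SHIFT;                /* 8192 / 4096 = 2 */
    unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;    /* 0 + 1 = 1 */

    /* prepare/commit_write therefore run on the third page (index 2) with a
     * one-byte range, instantiating blocks out to the new size */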
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3b1b1eefdbb..21195c48163 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -35,7 +35,7 @@ static struct char_device_struct {
35 unsigned int major; 35 unsigned int major;
36 unsigned int baseminor; 36 unsigned int baseminor;
37 int minorct; 37 int minorct;
38 const char *name; 38 char name[64];
39 struct file_operations *fops; 39 struct file_operations *fops;
40 struct cdev *cdev; /* will die */ 40 struct cdev *cdev; /* will die */
41} *chrdevs[MAX_PROBE_HASH]; 41} *chrdevs[MAX_PROBE_HASH];
@@ -46,34 +46,84 @@ static inline int major_to_index(int major)
46 return major % MAX_PROBE_HASH; 46 return major % MAX_PROBE_HASH;
47} 47}
48 48
49/* get char device names in somewhat random order */ 49struct chrdev_info {
50int get_chrdev_list(char *page) 50 int index;
51{
52 struct char_device_struct *cd; 51 struct char_device_struct *cd;
53 int i, len; 52};
54 53
55 len = sprintf(page, "Character devices:\n"); 54void *get_next_chrdev(void *dev)
55{
56 struct chrdev_info *info;
56 57
58 if (dev == NULL) {
59 info = kmalloc(sizeof(*info), GFP_KERNEL);
60 if (!info)
61 goto out;
62 info->index=0;
63 info->cd = chrdevs[info->index];
64 if (info->cd)
65 goto out;
66 } else {
67 info = dev;
68 }
69
70 while (info->index < ARRAY_SIZE(chrdevs)) {
71 if (info->cd)
72 info->cd = info->cd->next;
73 if (info->cd)
74 goto out;
75 /*
76 * No devices on this chain, move to the next
77 */
78 info->index++;
79 info->cd = (info->index < ARRAY_SIZE(chrdevs)) ?
80 chrdevs[info->index] : NULL;
81 if (info->cd)
82 goto out;
83 }
84
85out:
86 return info;
87}
88
89void *acquire_chrdev_list(void)
90{
57 down(&chrdevs_lock); 91 down(&chrdevs_lock);
92 return get_next_chrdev(NULL);
93}
94
95void release_chrdev_list(void *dev)
96{
97 up(&chrdevs_lock);
98 kfree(dev);
99}
100
101
102int count_chrdev_list(void)
103{
104 struct char_device_struct *cd;
105 int i, count;
106
107 count = 0;
108
58 for (i = 0; i < ARRAY_SIZE(chrdevs) ; i++) { 109 for (i = 0; i < ARRAY_SIZE(chrdevs) ; i++) {
59 for (cd = chrdevs[i]; cd; cd = cd->next) { 110 for (cd = chrdevs[i]; cd; cd = cd->next)
60 /* 111 count++;
61 * if the current name, plus the 5 extra characters
62 * in the device line for this entry
63 * would run us off the page, we're done
64 */
65 if ((len+strlen(cd->name) + 5) >= PAGE_SIZE)
66 goto page_full;
67
68
69 len += sprintf(page+len, "%3d %s\n",
70 cd->major, cd->name);
71 }
72 } 112 }
73page_full:
74 up(&chrdevs_lock);
75 113
76 return len; 114 return count;
115}
116
117int get_chrdev_info(void *dev, int *major, char **name)
118{
119 struct chrdev_info *info = dev;
120
121 if (info->cd == NULL)
122 return 1;
123
124 *major = info->cd->major;
125 *name = info->cd->name;
126 return 0;
77} 127}
78 128
79/* 129/*
@@ -121,7 +171,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
121 cd->major = major; 171 cd->major = major;
122 cd->baseminor = baseminor; 172 cd->baseminor = baseminor;
123 cd->minorct = minorct; 173 cd->minorct = minorct;
124 cd->name = name; 174 strncpy(cd->name,name, 64);
125 175
126 i = major_to_index(major); 176 i = major_to_index(major);
127 177
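The replacement for get_chrdev_list() is an open/iterate/close API, presumably so /proc/devices can be produced incrementally instead of into a single page. A usage sketch built from the functions above (the printk stands in for whatever the consumer actually emits):

    void *dev = acquire_chrdev_list();      /* takes chrdevs_lock */
    int major;
    char *name;

    while (!get_chrdev_info(dev, &major, &name)) {
        printk("%3d %s\n", major, name);    /* one registered major per entry */
        dev = get_next_chrdev(dev);         /* walk the hash chains in order */
    }
    release_chrdev_list(dev);               /* drops the lock, frees the state */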
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index eab3750cf30..d335015473a 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,8 +1,22 @@
1Version 1.40
2------------
3Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance
4of readpages by eliminating one extra memcpy. Allow update of file size
5from remote server even if file is open for write as long as mount is
6directio. Recognize share mode security and send NTLM encrypted password
7on tree connect if share mode negotiated.
8
1Version 1.39 9Version 1.39
2------------ 10------------
3Defer close of a file handle slightly if pending writes depend on that file handle 11Defer close of a file handle slightly if pending writes depend on that handle
4(this reduces the EBADF bad file handle errors that can be logged under heavy 12(this reduces the EBADF bad file handle errors that can be logged under heavy
5stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2 13stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2
14Fix SFU style symlinks and mknod needed for servers which do not support the
15CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative
16dentries so files that the client sees as deleted but that later get created
17on the server will be recognized. Add client side permission check on setattr.
18Timeout stuck requests better (where server has never responded or sent corrupt
19responses).
6 20
7Version 1.38 21Version 1.38
8------------ 22------------
diff --git a/fs/cifs/README b/fs/cifs/README
index bb90941826a..b0070d1b149 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -278,7 +278,9 @@ A partial list of the supported mount options follows:
278 (such as Windows), permissions can also be checked at the 278 (such as Windows), permissions can also be checked at the
279 client, and a crude form of client side permission checking 279 client, and a crude form of client side permission checking
280 can be enabled by specifying file_mode and dir_mode on 280 can be enabled by specifying file_mode and dir_mode on
281 the client 281 the client. Note that the mount.cifs helper must be
282 at version 1.10 or higher to support specifying the uid
283 (or gid) in non-numeric form.
282 gid If CIFS Unix extensions are not supported by the server 284 gid If CIFS Unix extensions are not supported by the server
283 this overrides the default gid for inodes. 285 this overrides the default gid for inodes.
284 file_mode If CIFS Unix extensions are not supported by the server 286 file_mode If CIFS Unix extensions are not supported by the server
@@ -345,7 +347,10 @@ A partial list of the supported mount options follows:
345 client system. It is typically only needed when the server 347 client system. It is typically only needed when the server
346 supports the CIFS Unix Extensions but the UIDs/GIDs on the 348 supports the CIFS Unix Extensions but the UIDs/GIDs on the
347 client and server system do not match closely enough to allow 349 client and server system do not match closely enough to allow
348 access by the user doing the mount. 350 access by the user doing the mount, but it may be useful with
351 non-CIFS Unix Extension mounts for cases in which the default
352 mode is specified on the mount but is not to be enforced on the
353 client (e.g. perhaps when MultiUserMount is enabled)
349 Note that this does not affect the normal ACL check on the 354 Note that this does not affect the normal ACL check on the
350 target machine done by the server software (of the server 355 target machine done by the server software (of the server
351 ACL against the user name provided at mount time). 356 ACL against the user name provided at mount time).
@@ -368,15 +373,21 @@ A partial list of the supported mount options follows:
368 setuids If the CIFS Unix extensions are negotiated with the server 373 setuids If the CIFS Unix extensions are negotiated with the server
369 the client will attempt to set the effective uid and gid of 374 the client will attempt to set the effective uid and gid of
370 the local process on newly created files, directories, and 375 the local process on newly created files, directories, and
371 devices (create, mkdir, mknod). 376 devices (create, mkdir, mknod). If the CIFS Unix Extensions
377 are not negotiated, for newly created files and directories
378 instead of using the default uid and gid specified on the
379 mount, cache the new file's uid and gid locally, which means
380 that the uid for the file can change when the inode is
381 reloaded (or the user remounts the share).
372 nosetuids The client will not attempt to set the uid and gid on 382 nosetuids The client will not attempt to set the uid and gid on
373 newly created files, directories, and devices (create, 383 newly created files, directories, and devices (create,
374 mkdir, mknod) which will result in the server setting the 384 mkdir, mknod) which will result in the server setting the
375 uid and gid to the default (usually the server uid of the 385 uid and gid to the default (usually the server uid of the
376 user who mounted the share). Letting the server (rather than 386 user who mounted the share). Letting the server (rather than
377 the client) set the uid and gid is the default. This 387 the client) set the uid and gid is the default. If the CIFS
378 parameter has no effect if the CIFS Unix Extensions are not 388 Unix Extensions are not negotiated then the uid and gid for
379 negotiated. 389 new files will appear to be the uid (gid) of the mounter or the
390 uid (gid) parameter specified on the mount.
380 netbiosname When mounting to servers via port 139, specifies the RFC1001 391 netbiosname When mounting to servers via port 139, specifies the RFC1001
381 source name to use to represent the client netbios machine 392 source name to use to represent the client netbios machine
382 name when doing the RFC1001 netbios session initialize. 393 name when doing the RFC1001 netbios session initialize.
@@ -418,7 +429,24 @@ A partial list of the supported mount options follows:
418 byte range locks). 429 byte range locks).
419 remount remount the share (often used to change from ro to rw mounts 430 remount remount the share (often used to change from ro to rw mounts
420 or vice versa) 431 or vice versa)
421 432 sfu When the CIFS Unix Extensions are not negotiated, attempt to
433 create device files and fifos in a format compatible with
434 Services for Unix (SFU). In addition retrieve bits 10-12
435 of the mode via the SETFILEBITS extended attribute (as
436 SFU does). In the future the bottom 9 bits of the mode
437 will also be emulated using queries of the security
438 descriptor (ACL).
439sec Security mode. Allowed values are:
440 none attempt to connect as a null user (no name)
441 krb5 Use Kerberos version 5 authentication
442 krb5i Use Kerberos authentication and packet signing
443 ntlm Use NTLM password hashing (default)
444 ntlmi Use NTLM password hashing with signing (if
445 /proc/fs/cifs/PacketSigningEnabled on or if
446 server requires signing also can be the default)
447 ntlmv2 Use NTLMv2 password hashing
448 ntlmv2i Use NTLMv2 password hashing with packet signing
449
422The mount.cifs mount helper also accepts a few mount options before -o 450The mount.cifs mount helper also accepts a few mount options before -o
423including: 451including:
424 452
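Combining options from the table above, a typical invocation requesting NTLMv2 with packet signing might look like the following (server, share, and user names are placeholders):

    mount -t cifs //server/share /mnt -o user=jdoe,sec=ntlmv2i,file_mode=0644,dir_mode=0755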
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index c909298d11e..fc34c74ec4b 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -1,4 +1,4 @@
1version 1.37 October 9, 2005 1Version 1.39 November 30, 2005
2 2
3A Partial List of Missing Features 3A Partial List of Missing Features
4================================== 4==================================
@@ -58,7 +58,7 @@ o) Improve performance of readpages by sending more than one read
58at a time when 8 pages or more are requested. In conjunction 58at a time when 8 pages or more are requested. In conjunction
59add support for async_cifs_readpages. 59add support for async_cifs_readpages.
60 60
61p) Add support for storing symlink and fifo info to Windows servers 61p) Add support for storing symlink info to Windows servers
62in the Extended Attribute format their SFU clients would recognize. 62in the Extended Attribute format their SFU clients would recognize.
63 63
64q) Finish fcntl D_NOTIFY support so kde and gnome file list windows 64q) Finish fcntl D_NOTIFY support so kde and gnome file list windows
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 22a444a3fe4..f4124a32bef 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -219,6 +219,10 @@ cifs_stats_write(struct file *file, const char __user *buffer,
219 219
220 if (c == '1' || c == 'y' || c == 'Y' || c == '0') { 220 if (c == '1' || c == 'y' || c == 'Y' || c == '0') {
221 read_lock(&GlobalSMBSeslock); 221 read_lock(&GlobalSMBSeslock);
222#ifdef CONFIG_CIFS_STATS2
223 atomic_set(&totBufAllocCount, 0);
224 atomic_set(&totSmBufAllocCount, 0);
225#endif /* CONFIG_CIFS_STATS2 */
222 list_for_each(tmp, &GlobalTreeConnectionList) { 226 list_for_each(tmp, &GlobalTreeConnectionList) {
223 tcon = list_entry(tmp, struct cifsTconInfo, 227 tcon = list_entry(tmp, struct cifsTconInfo,
224 cifsConnectionList); 228 cifsConnectionList);
@@ -276,6 +280,14 @@ cifs_stats_read(char *buf, char **beginBuffer, off_t offset,
276 smBufAllocCount.counter,cifs_min_small); 280 smBufAllocCount.counter,cifs_min_small);
277 length += item_length; 281 length += item_length;
278 buf += item_length; 282 buf += item_length;
283#ifdef CONFIG_CIFS_STATS2
284 item_length = sprintf(buf, "Total Large %d Small %d Allocations\n",
285 atomic_read(&totBufAllocCount),
286 atomic_read(&totSmBufAllocCount));
287 length += item_length;
288 buf += item_length;
289#endif /* CONFIG_CIFS_STATS2 */
290
279 item_length = 291 item_length =
280 sprintf(buf,"Operations (MIDs): %d\n", 292 sprintf(buf,"Operations (MIDs): %d\n",
281 midCount.counter); 293 midCount.counter);
@@ -389,8 +401,8 @@ static read_proc_t ntlmv2_enabled_read;
389static write_proc_t ntlmv2_enabled_write; 401static write_proc_t ntlmv2_enabled_write;
390static read_proc_t packet_signing_enabled_read; 402static read_proc_t packet_signing_enabled_read;
391static write_proc_t packet_signing_enabled_write; 403static write_proc_t packet_signing_enabled_write;
392static read_proc_t quotaEnabled_read; 404static read_proc_t experimEnabled_read;
393static write_proc_t quotaEnabled_write; 405static write_proc_t experimEnabled_write;
394static read_proc_t linuxExtensionsEnabled_read; 406static read_proc_t linuxExtensionsEnabled_read;
395static write_proc_t linuxExtensionsEnabled_write; 407static write_proc_t linuxExtensionsEnabled_write;
396 408
@@ -430,9 +442,9 @@ cifs_proc_init(void)
430 pde->write_proc = oplockEnabled_write; 442 pde->write_proc = oplockEnabled_write;
431 443
432 pde = create_proc_read_entry("Experimental", 0, proc_fs_cifs, 444 pde = create_proc_read_entry("Experimental", 0, proc_fs_cifs,
433 quotaEnabled_read, NULL); 445 experimEnabled_read, NULL);
434 if (pde) 446 if (pde)
435 pde->write_proc = quotaEnabled_write; 447 pde->write_proc = experimEnabled_write;
436 448
437 pde = create_proc_read_entry("LinuxExtensionsEnabled", 0, proc_fs_cifs, 449 pde = create_proc_read_entry("LinuxExtensionsEnabled", 0, proc_fs_cifs,
438 linuxExtensionsEnabled_read, NULL); 450 linuxExtensionsEnabled_read, NULL);
@@ -574,14 +586,13 @@ oplockEnabled_write(struct file *file, const char __user *buffer,
574} 586}
575 587
576static int 588static int
577quotaEnabled_read(char *page, char **start, off_t off, 589experimEnabled_read(char *page, char **start, off_t off,
578 int count, int *eof, void *data) 590 int count, int *eof, void *data)
579{ 591{
580 int len; 592 int len;
581 593
582 len = sprintf(page, "%d\n", experimEnabled); 594 len = sprintf(page, "%d\n", experimEnabled);
583/* could also check if quotas are enabled in kernel 595
584 as a whole first */
585 len -= off; 596 len -= off;
586 *start = page + off; 597 *start = page + off;
587 598
@@ -596,21 +607,23 @@ quotaEnabled_read(char *page, char **start, off_t off,
596 return len; 607 return len;
597} 608}
598static int 609static int
599quotaEnabled_write(struct file *file, const char __user *buffer, 610experimEnabled_write(struct file *file, const char __user *buffer,
600 unsigned long count, void *data) 611 unsigned long count, void *data)
601{ 612{
602 char c; 613 char c;
603 int rc; 614 int rc;
604 615
605 rc = get_user(c, buffer); 616 rc = get_user(c, buffer);
606 if (rc) 617 if (rc)
607 return rc; 618 return rc;
608 if (c == '0' || c == 'n' || c == 'N') 619 if (c == '0' || c == 'n' || c == 'N')
609 experimEnabled = 0; 620 experimEnabled = 0;
610 else if (c == '1' || c == 'y' || c == 'Y') 621 else if (c == '1' || c == 'y' || c == 'Y')
611 experimEnabled = 1; 622 experimEnabled = 1;
623 else if (c == '2')
624 experimEnabled = 2;
612 625
613 return count; 626 return count;
614} 627}
615 628
616static int 629static int
@@ -620,8 +633,6 @@ linuxExtensionsEnabled_read(char *page, char **start, off_t off,
620 int len; 633 int len;
621 634
622 len = sprintf(page, "%d\n", linuxExtEnabled); 635 len = sprintf(page, "%d\n", linuxExtEnabled);
623/* could also check if quotas are enabled in kernel
624 as a whole first */
625 len -= off; 636 len -= off;
626 *start = page + off; 637 *start = page + off;
627 638
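With the rename, /proc/fs/cifs/Experimental now accepts a third state in addition to on/off; the write handler above parses a single character ('0'/'n'/'N', '1'/'y'/'Y', or '2'). Toggling it from userspace is just (illustrative):

    echo 2 > /proc/fs/cifs/Experimental    # select the second experimental mode
    cat /proc/fs/cifs/Experimental         # read back the current value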
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index f799f6f0e72..ad58eb0c4d6 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -24,9 +24,10 @@
24#define CIFS_MOUNT_DIRECT_IO 8 /* do not write nor read through page cache */ 24#define CIFS_MOUNT_DIRECT_IO 8 /* do not write nor read through page cache */
25#define CIFS_MOUNT_NO_XATTR 0x10 /* if set - disable xattr support */ 25#define CIFS_MOUNT_NO_XATTR 0x10 /* if set - disable xattr support */
26#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */ 26#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */
27#define CIFS_MOUNT_POSIX_PATHS 0x40 /* Negotiate posix pathnames if possible. */ 27#define CIFS_MOUNT_POSIX_PATHS 0x40 /* Negotiate posix pathnames if possible. */
28#define CIFS_MOUNT_UNX_EMUL 0x80 /* Network compat with SFUnix emulation */ 28#define CIFS_MOUNT_UNX_EMUL 0x80 /* Network compat with SFUnix emulation */
29#define CIFS_MOUNT_NO_BRL 0x100 /* No sending byte range locks to srv */ 29#define CIFS_MOUNT_NO_BRL 0x100 /* No sending byte range locks to srv */
30#define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */
30 31
31struct cifs_sb_info { 32struct cifs_sb_info {
32 struct cifsTconInfo *tcon; /* primary mount */ 33 struct cifsTconInfo *tcon; /* primary mount */
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 4e12053f080..d2b12825594 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifs_unicode.c 2 * fs/cifs/cifs_unicode.c
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2000,2002 4 * Copyright (c) International Business Machines Corp., 2000,2005
5 * Modified by Steve French (sfrench@us.ibm.com) 5 * Modified by Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -31,7 +31,7 @@
31 * 31 *
32 */ 32 */
33int 33int
34cifs_strfromUCS_le(char *to, const wchar_t * from, /* LITTLE ENDIAN */ 34cifs_strfromUCS_le(char *to, const __le16 * from,
35 int len, const struct nls_table *codepage) 35 int len, const struct nls_table *codepage)
36{ 36{
37 int i; 37 int i;
@@ -60,25 +60,26 @@ cifs_strfromUCS_le(char *to, const wchar_t * from, /* LITTLE ENDIAN */
60 * 60 *
61 */ 61 */
62int 62int
63cifs_strtoUCS(wchar_t * to, const char *from, int len, 63cifs_strtoUCS(__le16 * to, const char *from, int len,
64 const struct nls_table *codepage) 64 const struct nls_table *codepage)
65{ 65{
66 int charlen; 66 int charlen;
67 int i; 67 int i;
68 wchar_t * wchar_to = (wchar_t *)to; /* needed to quiet sparse */
68 69
69 for (i = 0; len && *from; i++, from += charlen, len -= charlen) { 70 for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
70 71
71 /* works for 2.4.0 kernel or later */ 72 /* works for 2.4.0 kernel or later */
72 charlen = codepage->char2uni(from, len, &to[i]); 73 charlen = codepage->char2uni(from, len, &wchar_to[i]);
73 if (charlen < 1) { 74 if (charlen < 1) {
74 cERROR(1, 75 cERROR(1,
75 ("cifs_strtoUCS: char2uni returned %d", 76 ("cifs_strtoUCS: char2uni returned %d",
76 charlen)); 77 charlen));
77 /* A question mark */ 78 /* A question mark */
78 to[i] = (wchar_t)cpu_to_le16(0x003f); 79 to[i] = cpu_to_le16(0x003f);
79 charlen = 1; 80 charlen = 1;
80 } else 81 } else
81 to[i] = (wchar_t)cpu_to_le16(to[i]); 82 to[i] = cpu_to_le16(wchar_to[i]);
82 83
83 } 84 }
84 85
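The type change from wchar_t to __le16 encodes the wire contract in the prototypes: SMB carries UCS-2 little-endian, while wchar_t is host byte order, and sparse can now flag any missing conversion. A minimal sketch of the distinction as seen on a big-endian host:

    wchar_t host = 0x003f;              /* '?' in host byte order */
    __le16 wire = cpu_to_le16(host);    /* byte-swapped on big-endian, no-op on LE */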
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index da8dde96527..39e5b970325 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -5,7 +5,7 @@
5 * Convert a unicode character to upper or lower case using 5 * Convert a unicode character to upper or lower case using
6 * compressed tables. 6 * compressed tables.
7 * 7 *
8 * Copyright (c) International Business Machines Corp., 2000,2002 8 * Copyright (c) International Business Machines Corp., 2000,2005
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify 10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by 11 * it under the terms of the GNU General Public License as published by
@@ -59,8 +59,8 @@ extern struct UniCaseRange UniLowerRange[];
59#endif /* UNIUPR_NOLOWER */ 59#endif /* UNIUPR_NOLOWER */
60 60
61#ifdef __KERNEL__ 61#ifdef __KERNEL__
62int cifs_strfromUCS_le(char *, const wchar_t *, int, const struct nls_table *); 62int cifs_strfromUCS_le(char *, const __le16 *, int, const struct nls_table *);
63int cifs_strtoUCS(wchar_t *, const char *, int, const struct nls_table *); 63int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
64#endif 64#endif
65 65
66/* 66/*
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index decd138f14d..da2ad5b451a 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -242,7 +242,7 @@ static signed char UniCaseRangeLff20[27] = {
242/* 242/*
243 * Lower Case Range 243 * Lower Case Range
244 */ 244 */
245const static struct UniCaseRange CifsUniLowerRange[] = { 245static const struct UniCaseRange CifsUniLowerRange[] = {
246 0x0380, 0x03ab, UniCaseRangeL0380, 246 0x0380, 0x03ab, UniCaseRangeL0380,
247 0x0400, 0x042f, UniCaseRangeL0400, 247 0x0400, 0x042f, UniCaseRangeL0400,
248 0x0490, 0x04cb, UniCaseRangeL0490, 248 0x0490, 0x04cb, UniCaseRangeL0490,
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
new file mode 100644
index 00000000000..d0776ac2b80
--- /dev/null
+++ b/fs/cifs/cifsacl.h
@@ -0,0 +1,38 @@
1/*
2 * fs/cifs/cifsacl.h
3 *
4 * Copyright (c) International Business Machines Corp., 2005
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifndef _CIFSACL_H
23#define _CIFSACL_H
24
25struct cifs_sid {
26 __u8 revision; /* revision level */
27 __u8 num_subauths;
28 __u8 authority[6];
29 __u32 sub_auth[4];
30 /* next sub_auth if any ... */
31} __attribute__((packed));
32
33/* everyone */
34extern const struct cifs_sid sid_everyone;
35/* group users */
36extern const struct cifs_sid sid_user;
37
38#endif /* _CIFSACL_H */
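For orientation, a well-known SID such as S-1-1-0 ("everyone") would map onto this layout roughly as follows; the values come from the published SID format, not from this patch:

    const struct cifs_sid sid_everyone_example = {
        .revision = 1,                      /* the "1" after "S-" */
        .num_subauths = 1,
        .authority = {0, 0, 0, 0, 0, 1},    /* World authority */
        .sub_auth = {0, 0, 0, 0},           /* single subauthority: 0 */
    };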
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 1959c7c4b18..a2c24858d40 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifsencrypt.c 2 * fs/cifs/cifsencrypt.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2003 4 * Copyright (C) International Business Machines Corp., 2005
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -82,6 +82,59 @@ int cifs_sign_smb(struct smb_hdr * cifs_pdu, struct TCP_Server_Info * server,
82 return rc; 82 return rc;
83} 83}
84 84
85static int cifs_calc_signature2(const struct kvec * iov, int n_vec,
86 const char * key, char * signature)
87{
88 struct MD5Context context;
89
90 if((iov == NULL) || (signature == NULL))
91 return -EINVAL;
92
93 MD5Init(&context);
94 MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16);
95
96/* MD5Update(&context,cifs_pdu->Protocol,cifs_pdu->smb_buf_length); */ /* BB FIXME BB */
97
98 MD5Final(signature,&context);
99
100 return -EOPNOTSUPP;
101/* return 0; */
102}
103
104
105int cifs_sign_smb2(struct kvec * iov, int n_vec, struct TCP_Server_Info *server,
106 __u32 * pexpected_response_sequence_number)
107{
108 int rc = 0;
109 char smb_signature[20];
110 struct smb_hdr * cifs_pdu = iov[0].iov_base;
111
112 if((cifs_pdu == NULL) || (server == NULL))
113 return -EINVAL;
114
115 if((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0)
116 return rc;
117
118 spin_lock(&GlobalMid_Lock);
119 cifs_pdu->Signature.Sequence.SequenceNumber =
120 cpu_to_le32(server->sequence_number);
121 cifs_pdu->Signature.Sequence.Reserved = 0;
122
123 *pexpected_response_sequence_number = server->sequence_number++;
124 server->sequence_number++;
125 spin_unlock(&GlobalMid_Lock);
126
127 rc = cifs_calc_signature2(iov, n_vec, server->mac_signing_key,
128 smb_signature);
129 if(rc)
130 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
131 else
132 memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8);
133
134 return rc;
135
136}
137
85int cifs_verify_signature(struct smb_hdr * cifs_pdu, const char * mac_key, 138int cifs_verify_signature(struct smb_hdr * cifs_pdu, const char * mac_key,
86 __u32 expected_sequence_number) 139 __u32 expected_sequence_number)
87{ 140{
@@ -149,7 +202,7 @@ int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, struct nls_table * nls_
149 char temp_hash[16]; 202 char temp_hash[16];
150 struct HMACMD5Context ctx; 203 struct HMACMD5Context ctx;
151 char * ucase_buf; 204 char * ucase_buf;
152 wchar_t * unicode_buf; 205 __le16 * unicode_buf;
153 unsigned int i,user_name_len,dom_name_len; 206 unsigned int i,user_name_len,dom_name_len;
154 207
155 if(ses == NULL) 208 if(ses == NULL)
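The sequencing discipline in cifs_sign_smb2() above is the subtle part: the PDU is stamped with the current sequence number, the same value is handed back to the caller for checking the reply's signature, and the counter advances twice under GlobalMid_Lock, reserving a slot for the response and keeping concurrent callers from reusing numbers. The core, condensed from the code above:

    spin_lock(&GlobalMid_Lock);
    cifs_pdu->Signature.Sequence.SequenceNumber =
        cpu_to_le32(server->sequence_number);
    *pexpected_response_sequence_number = server->sequence_number++;
    server->sequence_number++;    /* second bump reserves a slot for the response */
    spin_unlock(&GlobalMid_Lock);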
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 682b0235ad9..79eeccd0437 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -32,6 +32,7 @@
32#include <linux/seq_file.h> 32#include <linux/seq_file.h>
33#include <linux/vfs.h> 33#include <linux/vfs.h>
34#include <linux/mempool.h> 34#include <linux/mempool.h>
35#include <linux/delay.h>
35#include "cifsfs.h" 36#include "cifsfs.h"
36#include "cifspdu.h" 37#include "cifspdu.h"
37#define DECLARE_GLOBALS_HERE 38#define DECLARE_GLOBALS_HERE
@@ -429,6 +430,11 @@ static void cifs_umount_begin(struct super_block * sblock)
429 { 430 {
430 cFYI(1,("wake up tasks now - umount begin not complete")); 431 cFYI(1,("wake up tasks now - umount begin not complete"));
431 wake_up_all(&tcon->ses->server->request_q); 432 wake_up_all(&tcon->ses->server->request_q);
433 wake_up_all(&tcon->ses->server->response_q);
434 msleep(1); /* yield */
435 /* we have to kick the requests once more */
436 wake_up_all(&tcon->ses->server->response_q);
437 msleep(1);
432 } 438 }
433/* BB FIXME - finish add checks for tidStatus BB */ 439/* BB FIXME - finish add checks for tidStatus BB */
434 440
@@ -483,56 +489,40 @@ cifs_get_sb(struct file_system_type *fs_type,
483 return sb; 489 return sb;
484} 490}
485 491
486static ssize_t 492static ssize_t cifs_file_writev(struct file *file, const struct iovec *iov,
487cifs_read_wrapper(struct file * file, char __user *read_data, size_t read_size, 493 unsigned long nr_segs, loff_t *ppos)
488 loff_t * poffset)
489{ 494{
490 if(file->f_dentry == NULL) 495 struct inode *inode = file->f_dentry->d_inode;
491 return -EIO; 496 ssize_t written;
492 else if(file->f_dentry->d_inode == NULL)
493 return -EIO;
494
495 cFYI(1,("In read_wrapper size %zd at %lld",read_size,*poffset));
496 497
497 if(CIFS_I(file->f_dentry->d_inode)->clientCanCacheRead) { 498 written = generic_file_writev(file, iov, nr_segs, ppos);
498 return generic_file_read(file,read_data,read_size,poffset); 499 if (!CIFS_I(inode)->clientCanCacheAll)
499 } else { 500 filemap_fdatawrite(inode->i_mapping);
500 /* BB do we need to lock inode from here until after invalidate? */ 501 return written;
501/* if(file->f_dentry->d_inode->i_mapping) {
502 filemap_fdatawrite(file->f_dentry->d_inode->i_mapping);
503 filemap_fdatawait(file->f_dentry->d_inode->i_mapping);
504 }*/
505/* cifs_revalidate(file->f_dentry);*/ /* BB fixme */
506
507 /* BB we should make timer configurable - perhaps
508 by simply calling cifs_revalidate here */
509 /* invalidate_remote_inode(file->f_dentry->d_inode);*/
510 return generic_file_read(file,read_data,read_size,poffset);
511 }
512} 502}
513 503
514static ssize_t 504static ssize_t cifs_file_aio_write(struct kiocb *iocb, const char __user *buf,
515cifs_write_wrapper(struct file * file, const char __user *write_data, 505 size_t count, loff_t pos)
516 size_t write_size, loff_t * poffset)
517{ 506{
507 struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
518 ssize_t written; 508 ssize_t written;
519 509
520 if(file->f_dentry == NULL) 510 written = generic_file_aio_write(iocb, buf, count, pos);
521 return -EIO; 511 if (!CIFS_I(inode)->clientCanCacheAll)
522 else if(file->f_dentry->d_inode == NULL) 512 filemap_fdatawrite(inode->i_mapping);
523 return -EIO;
524
525 cFYI(1,("In write_wrapper size %zd at %lld",write_size,*poffset));
526
527 written = generic_file_write(file,write_data,write_size,poffset);
528 if(!CIFS_I(file->f_dentry->d_inode)->clientCanCacheAll) {
529 if(file->f_dentry->d_inode->i_mapping) {
530 filemap_fdatawrite(file->f_dentry->d_inode->i_mapping);
531 }
532 }
533 return written; 513 return written;
534} 514}
535 515
516static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
517{
518 /* origin == SEEK_END => we must revalidate the cached file length */
519 if (origin == 2) {
520 int retval = cifs_revalidate(file->f_dentry);
521 if (retval < 0)
522 return (loff_t)retval;
523 }
524 return remote_llseek(file, offset, origin);
525}
536 526
537static struct file_system_type cifs_fs_type = { 527static struct file_system_type cifs_fs_type = {
538 .owner = THIS_MODULE, 528 .owner = THIS_MODULE,
@@ -594,8 +584,12 @@ struct inode_operations cifs_symlink_inode_ops = {
594}; 584};
595 585
596struct file_operations cifs_file_ops = { 586struct file_operations cifs_file_ops = {
597 .read = cifs_read_wrapper, 587 .read = do_sync_read,
598 .write = cifs_write_wrapper, 588 .write = do_sync_write,
589 .readv = generic_file_readv,
590 .writev = cifs_file_writev,
591 .aio_read = generic_file_aio_read,
592 .aio_write = cifs_file_aio_write,
599 .open = cifs_open, 593 .open = cifs_open,
600 .release = cifs_close, 594 .release = cifs_close,
601 .lock = cifs_lock, 595 .lock = cifs_lock,
@@ -603,15 +597,12 @@ struct file_operations cifs_file_ops = {
603 .flush = cifs_flush, 597 .flush = cifs_flush,
604 .mmap = cifs_file_mmap, 598 .mmap = cifs_file_mmap,
605 .sendfile = generic_file_sendfile, 599 .sendfile = generic_file_sendfile,
600 .llseek = cifs_llseek,
606#ifdef CONFIG_CIFS_POSIX 601#ifdef CONFIG_CIFS_POSIX
607 .ioctl = cifs_ioctl, 602 .ioctl = cifs_ioctl,
608#endif /* CONFIG_CIFS_POSIX */ 603#endif /* CONFIG_CIFS_POSIX */
609 604
610#ifdef CONFIG_CIFS_EXPERIMENTAL 605#ifdef CONFIG_CIFS_EXPERIMENTAL
611 .readv = generic_file_readv,
612 .writev = generic_file_writev,
613 .aio_read = generic_file_aio_read,
614 .aio_write = generic_file_aio_write,
615 .dir_notify = cifs_dir_notify, 606 .dir_notify = cifs_dir_notify,
616#endif /* CONFIG_CIFS_EXPERIMENTAL */ 607#endif /* CONFIG_CIFS_EXPERIMENTAL */
617}; 608};
@@ -630,12 +621,53 @@ struct file_operations cifs_file_direct_ops = {
630#ifdef CONFIG_CIFS_POSIX 621#ifdef CONFIG_CIFS_POSIX
631 .ioctl = cifs_ioctl, 622 .ioctl = cifs_ioctl,
632#endif /* CONFIG_CIFS_POSIX */ 623#endif /* CONFIG_CIFS_POSIX */
624 .llseek = cifs_llseek,
625#ifdef CONFIG_CIFS_EXPERIMENTAL
626 .dir_notify = cifs_dir_notify,
627#endif /* CONFIG_CIFS_EXPERIMENTAL */
628};
629struct file_operations cifs_file_nobrl_ops = {
630 .read = do_sync_read,
631 .write = do_sync_write,
632 .readv = generic_file_readv,
633 .writev = cifs_file_writev,
634 .aio_read = generic_file_aio_read,
635 .aio_write = cifs_file_aio_write,
636 .open = cifs_open,
637 .release = cifs_close,
638 .fsync = cifs_fsync,
639 .flush = cifs_flush,
640 .mmap = cifs_file_mmap,
641 .sendfile = generic_file_sendfile,
642 .llseek = cifs_llseek,
643#ifdef CONFIG_CIFS_POSIX
644 .ioctl = cifs_ioctl,
645#endif /* CONFIG_CIFS_POSIX */
633 646
634#ifdef CONFIG_CIFS_EXPERIMENTAL 647#ifdef CONFIG_CIFS_EXPERIMENTAL
635 .dir_notify = cifs_dir_notify, 648 .dir_notify = cifs_dir_notify,
636#endif /* CONFIG_CIFS_EXPERIMENTAL */ 649#endif /* CONFIG_CIFS_EXPERIMENTAL */
637}; 650};
638 651
652struct file_operations cifs_file_direct_nobrl_ops = {
653 /* no mmap, no aio, no readv -
654 BB reevaluate whether they can be done with directio, no cache */
655 .read = cifs_user_read,
656 .write = cifs_user_write,
657 .open = cifs_open,
658 .release = cifs_close,
659 .fsync = cifs_fsync,
660 .flush = cifs_flush,
661 .sendfile = generic_file_sendfile, /* BB removeme BB */
662#ifdef CONFIG_CIFS_POSIX
663 .ioctl = cifs_ioctl,
664#endif /* CONFIG_CIFS_POSIX */
665 .llseek = cifs_llseek,
666#ifdef CONFIG_CIFS_EXPERIMENTAL
667 .dir_notify = cifs_dir_notify,
668#endif /* CONFIG_CIFS_EXPERIMENTAL */
669};
670
639struct file_operations cifs_dir_ops = { 671struct file_operations cifs_dir_ops = {
640 .readdir = cifs_readdir, 672 .readdir = cifs_readdir,
641 .release = cifs_closedir, 673 .release = cifs_closedir,
@@ -714,7 +746,7 @@ cifs_init_request_bufs(void)
714 kmem_cache_destroy(cifs_req_cachep); 746 kmem_cache_destroy(cifs_req_cachep);
715 return -ENOMEM; 747 return -ENOMEM;
716 } 748 }
717 /* 256 (MAX_CIFS_HDR_SIZE bytes is enough for most SMB responses and 749 /* MAX_CIFS_SMALL_BUFFER_SIZE bytes is enough for most SMB responses and
718 almost all handle based requests (but not write response, nor is it 750 almost all handle based requests (but not write response, nor is it
719 sufficient for path based requests). A smaller size would have 751 sufficient for path based requests). A smaller size would have
720 been more efficient (compacting multiple slab items on one 4k page) 752 been more efficient (compacting multiple slab items on one 4k page)
@@ -723,7 +755,8 @@ cifs_init_request_bufs(void)
723 efficient to alloc 1 per page off the slab compared to 17K (5page) 755 efficient to alloc 1 per page off the slab compared to 17K (5page)
724 alloc of large cifs buffers even when page debugging is on */ 756 alloc of large cifs buffers even when page debugging is on */
725 cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq", 757 cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq",
726 MAX_CIFS_HDR_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL, NULL); 758 MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN,
759 NULL, NULL);
727 if (cifs_sm_req_cachep == NULL) { 760 if (cifs_sm_req_cachep == NULL) {
728 mempool_destroy(cifs_req_poolp); 761 mempool_destroy(cifs_req_poolp);
729 kmem_cache_destroy(cifs_req_cachep); 762 kmem_cache_destroy(cifs_req_cachep);
@@ -841,9 +874,9 @@ static int cifs_oplock_thread(void * dummyarg)
841 DeleteOplockQEntry(oplock_item); 874 DeleteOplockQEntry(oplock_item);
842 /* can not grab inode sem here since it would 875 /* can not grab inode sem here since it would
843 deadlock when oplock received on delete 876 deadlock when oplock received on delete
844 since vfs_unlink holds the i_sem across 877 since vfs_unlink holds the i_mutex across
845 the call */ 878 the call */
846 /* down(&inode->i_sem);*/ 879 /* mutex_lock(&inode->i_mutex);*/
847 if (S_ISREG(inode->i_mode)) { 880 if (S_ISREG(inode->i_mode)) {
848 rc = filemap_fdatawrite(inode->i_mapping); 881 rc = filemap_fdatawrite(inode->i_mapping);
849 if(CIFS_I(inode)->clientCanCacheRead == 0) { 882 if(CIFS_I(inode)->clientCanCacheRead == 0) {
@@ -852,7 +885,7 @@ static int cifs_oplock_thread(void * dummyarg)
852 } 885 }
853 } else 886 } else
854 rc = 0; 887 rc = 0;
855 /* up(&inode->i_sem);*/ 888 /* mutex_unlock(&inode->i_mutex);*/
856 if (rc) 889 if (rc)
857 CIFS_I(inode)->write_behind_rc = rc; 890 CIFS_I(inode)->write_behind_rc = rc;
858 cFYI(1,("Oplock flush inode %p rc %d",inode,rc)); 891 cFYI(1,("Oplock flush inode %p rc %d",inode,rc));
@@ -882,6 +915,9 @@ static int cifs_oplock_thread(void * dummyarg)
882 915
883static int cifs_dnotify_thread(void * dummyarg) 916static int cifs_dnotify_thread(void * dummyarg)
884{ 917{
918 struct list_head *tmp;
919 struct cifsSesInfo *ses;
920
885 daemonize("cifsdnotifyd"); 921 daemonize("cifsdnotifyd");
886 allow_signal(SIGTERM); 922 allow_signal(SIGTERM);
887 923
@@ -890,7 +926,19 @@ static int cifs_dnotify_thread(void * dummyarg)
890 if(try_to_freeze()) 926 if(try_to_freeze())
891 continue; 927 continue;
892 set_current_state(TASK_INTERRUPTIBLE); 928 set_current_state(TASK_INTERRUPTIBLE);
893 schedule_timeout(39*HZ); 929 schedule_timeout(15*HZ);
930 read_lock(&GlobalSMBSeslock);
931 /* check for any stuck requests that need
932 to be woken up; wake the response queue so
933 the waiting thread can wake up and error out */
934 list_for_each(tmp, &GlobalSMBSessionList) {
935 ses = list_entry(tmp, struct cifsSesInfo,
936 cifsSessionList);
937 if(ses && ses->server &&
938 atomic_read(&ses->server->inFlight))
939 wake_up_all(&ses->server->response_q);
940 }
941 read_unlock(&GlobalSMBSeslock);
894 } while(!signal_pending(current)); 942 } while(!signal_pending(current));
895 complete_and_exit (&cifs_dnotify_exited, 0); 943 complete_and_exit (&cifs_dnotify_exited, 0);
896} 944}
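
The reworked dnotify thread now doubles as a watchdog: every 15 seconds it walks the global session list and kicks each server's response queue so requests stranded by a dead session can wake and error out. The wake_up_all() is safe to fire spuriously, since each waiter re-checks its own mid-entry state after waking. The scan, isolated as a kernel-style sketch using the names from the hunk above:

static void example_wake_stuck_requests(void)
{
	struct list_head *tmp;
	struct cifsSesInfo *ses;

	read_lock(&GlobalSMBSeslock);
	list_for_each(tmp, &GlobalSMBSessionList) {
		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
		/* only servers with requests still in flight need a kick */
		if (ses && ses->server &&
		    atomic_read(&ses->server->inFlight))
			wake_up_all(&ses->server->response_q);
	}
	read_unlock(&GlobalSMBSeslock);
}
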
@@ -920,6 +968,12 @@ init_cifs(void)
920 atomic_set(&tconInfoReconnectCount, 0); 968 atomic_set(&tconInfoReconnectCount, 0);
921 969
922 atomic_set(&bufAllocCount, 0); 970 atomic_set(&bufAllocCount, 0);
971 atomic_set(&smBufAllocCount, 0);
972#ifdef CONFIG_CIFS_STATS2
973 atomic_set(&totBufAllocCount, 0);
974 atomic_set(&totSmBufAllocCount, 0);
975#endif /* CONFIG_CIFS_STATS2 */
976
923 atomic_set(&midCount, 0); 977 atomic_set(&midCount, 0);
924 GlobalCurrentXid = 0; 978 GlobalCurrentXid = 0;
925 GlobalTotalActiveXid = 0; 979 GlobalTotalActiveXid = 0;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 1223fa81dbd..821a8eb2255 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -63,6 +63,8 @@ extern struct inode_operations cifs_symlink_inode_ops;
63/* Functions related to files and directories */ 63/* Functions related to files and directories */
64extern struct file_operations cifs_file_ops; 64extern struct file_operations cifs_file_ops;
65extern struct file_operations cifs_file_direct_ops; /* if directio mount */ 65extern struct file_operations cifs_file_direct_ops; /* if directio mount */
66extern struct file_operations cifs_file_nobrl_ops;
67extern struct file_operations cifs_file_direct_nobrl_ops; /* if directio mount */
66extern int cifs_open(struct inode *inode, struct file *file); 68extern int cifs_open(struct inode *inode, struct file *file);
67extern int cifs_close(struct inode *inode, struct file *file); 69extern int cifs_close(struct inode *inode, struct file *file);
68extern int cifs_closedir(struct inode *inode, struct file *file); 70extern int cifs_closedir(struct inode *inode, struct file *file);
@@ -97,5 +99,5 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
97extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); 99extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
98extern int cifs_ioctl (struct inode * inode, struct file * filep, 100extern int cifs_ioctl (struct inode * inode, struct file * filep,
99 unsigned int command, unsigned long arg); 101 unsigned int command, unsigned long arg);
100#define CIFS_VERSION "1.39" 102#define CIFS_VERSION "1.40"
101#endif /* _CIFSFS_H */ 103#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1ba08f8c5bc..7bed27601ce 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -233,6 +233,8 @@ struct cifsTconInfo {
233 atomic_t num_hardlinks; 233 atomic_t num_hardlinks;
234 atomic_t num_symlinks; 234 atomic_t num_symlinks;
235 atomic_t num_locks; 235 atomic_t num_locks;
236 atomic_t num_acl_get;
237 atomic_t num_acl_set;
236#ifdef CONFIG_CIFS_STATS2 238#ifdef CONFIG_CIFS_STATS2
237 unsigned long long time_writes; 239 unsigned long long time_writes;
238 unsigned long long time_reads; 240 unsigned long long time_reads;
@@ -285,6 +287,7 @@ struct cifs_search_info {
285 unsigned endOfSearch:1; 287 unsigned endOfSearch:1;
286 unsigned emptyDir:1; 288 unsigned emptyDir:1;
287 unsigned unicode:1; 289 unsigned unicode:1;
290 unsigned smallBuf:1; /* so we know which buf_release function to call */
288}; 291};
289 292
290struct cifsFileInfo { 293struct cifsFileInfo {
@@ -420,7 +423,12 @@ struct dir_notify_req {
420#define MID_RESPONSE_RECEIVED 4 423#define MID_RESPONSE_RECEIVED 4
421#define MID_RETRY_NEEDED 8 /* session closed while this request out */ 424#define MID_RETRY_NEEDED 8 /* session closed while this request out */
422#define MID_NO_RESP_NEEDED 0x10 425#define MID_NO_RESP_NEEDED 0x10
423#define MID_SMALL_BUFFER 0x20 /* 112 byte response buffer instead of 4K */ 426
427/* Types of response buffer returned from SendReceive2 */
428#define CIFS_NO_BUFFER 0 /* Response buffer not returned */
429#define CIFS_SMALL_BUFFER 1
430#define CIFS_LARGE_BUFFER 2
431#define CIFS_IOVEC 4 /* array of response buffers */
424 432
425/* 433/*
426 ***************************************************************** 434 *****************************************************************
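
These flags replace the old MID_SMALL_BUFFER bit: SendReceive2 now reports which pool the response buffer came from, and each caller must release it to the matching pool. The diff open-codes that two-way release at every call site (see CIFSSMBRead, CIFSSMBWrite2 and qsec_out later in this patch); restated as a single illustrative helper:

static void example_release_resp_buf(int resp_buf_type, void *buf)
{
	if (resp_buf_type == CIFS_SMALL_BUFFER)
		cifs_small_buf_release(buf);
	else if (resp_buf_type == CIFS_LARGE_BUFFER)
		cifs_buf_release(buf);
	/* CIFS_NO_BUFFER: nothing was handed back, nothing to free */
}
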
@@ -505,8 +513,12 @@ GLOBAL_EXTERN atomic_t tcpSesReconnectCount;
505GLOBAL_EXTERN atomic_t tconInfoReconnectCount; 513GLOBAL_EXTERN atomic_t tconInfoReconnectCount;
506 514
507/* Various Debug counters to remove someday (BB) */ 515/* Various Debug counters to remove someday (BB) */
508GLOBAL_EXTERN atomic_t bufAllocCount; 516GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */
509GLOBAL_EXTERN atomic_t smBufAllocCount; 517#ifdef CONFIG_CIFS_STATS2
518GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */
519GLOBAL_EXTERN atomic_t totSmBufAllocCount;
520#endif
521GLOBAL_EXTERN atomic_t smBufAllocCount;
510GLOBAL_EXTERN atomic_t midCount; 522GLOBAL_EXTERN atomic_t midCount;
511 523
512/* Misc globals */ 524/* Misc globals */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 48a05b9df7e..cc2471094ca 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifspdu.h 2 * fs/cifs/cifspdu.h
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2002 4 * Copyright (c) International Business Machines Corp., 2002,2005
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -80,7 +80,11 @@
80#define NT_TRANSACT_GET_USER_QUOTA 0x07 80#define NT_TRANSACT_GET_USER_QUOTA 0x07
81#define NT_TRANSACT_SET_USER_QUOTA 0x08 81#define NT_TRANSACT_SET_USER_QUOTA 0x08
82 82
83#define MAX_CIFS_HDR_SIZE 256 /* is future chained NTCreateXReadX bigger? */ 83#define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */
84/* future chained NTCreateXReadX bigger, but for time being NTCreateX biggest */
85/* among the requests (NTCreateX response is bigger with wct of 34) */
86#define MAX_CIFS_HDR_SIZE 0x58 /* 4 len + 32 hdr + (2*24 wct) + 2 bct + 2 pad */
87#define CIFS_SMALL_PATH 120 /* allows for (448-88)/3 */
84 88
85/* internal cifs vfs structures */ 89/* internal cifs vfs structures */
86/***************************************************************** 90/*****************************************************************
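
The new constants are internally consistent: 4 (RFC1001 length) + 32 (SMB header) + 2*24 (setup words) + 2 (byte count) + 2 (pad) = 88 = 0x58, and (448 - 88) / 3 = 120, matching the CIFS_SMALL_PATH comment. A compile-time restatement of that arithmetic, using C11 _Static_assert purely for illustration:

_Static_assert(4 + 32 + (2 * 24) + 2 + 2 == 0x58,
	       "MAX_CIFS_HDR_SIZE arithmetic");
_Static_assert((448 - 0x58) / 3 == 120,
	       "CIFS_SMALL_PATH arithmetic");
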
@@ -524,7 +528,7 @@ typedef union smb_com_session_setup_andx {
524 /* STRING PrimaryDomain */ 528 /* STRING PrimaryDomain */
525 /* STRING NativeOS */ 529 /* STRING NativeOS */
526 /* STRING NativeLanMan */ 530 /* STRING NativeLanMan */
527 } __attribute__((packed)) old_req; /* pre-NTLM (LANMAN2.1) request format */ 531 } __attribute__((packed)) old_req; /* pre-NTLM (LANMAN2.1) req format */
528 532
529 struct { /* default (NTLM) response format */ 533 struct { /* default (NTLM) response format */
530 struct smb_hdr hdr; /* wct = 3 */ 534 struct smb_hdr hdr; /* wct = 3 */
@@ -536,7 +540,7 @@ typedef union smb_com_session_setup_andx {
536 unsigned char NativeOS[1]; /* followed by */ 540 unsigned char NativeOS[1]; /* followed by */
537/* unsigned char * NativeLanMan; */ 541/* unsigned char * NativeLanMan; */
538/* unsigned char * PrimaryDomain; */ 542/* unsigned char * PrimaryDomain; */
539 } __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response format */ 543 } __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */
540} __attribute__((packed)) SESSION_SETUP_ANDX; 544} __attribute__((packed)) SESSION_SETUP_ANDX;
541 545
542#define CIFS_NETWORK_OPSYS "CIFS VFS Client for Linux" 546#define CIFS_NETWORK_OPSYS "CIFS VFS Client for Linux"
@@ -603,7 +607,9 @@ typedef struct smb_com_logoff_andx_rsp {
603 __u16 ByteCount; 607 __u16 ByteCount;
604} __attribute__((packed)) LOGOFF_ANDX_RSP; 608} __attribute__((packed)) LOGOFF_ANDX_RSP;
605 609
606typedef union smb_com_tree_disconnect { /* as an alternative can use flag on tree_connect PDU to effect disconnect *//* probably the simplest SMB PDU */ 610typedef union smb_com_tree_disconnect { /* as an alternative can use flag on
611 tree_connect PDU to effect disconnect */
612 /* tdis is probably simplest SMB PDU */
607 struct { 613 struct {
608 struct smb_hdr hdr; /* wct = 0 */ 614 struct smb_hdr hdr; /* wct = 0 */
609 __u16 ByteCount; /* bcc = 0 */ 615 __u16 ByteCount; /* bcc = 0 */
@@ -1001,10 +1007,49 @@ typedef struct smb_com_setattr_rsp {
1001 1007
1002/* empty wct response to setattr */ 1008/* empty wct response to setattr */
1003 1009
1004/***************************************************/ 1010/*******************************************************/
1005/* NT Transact structure definitions follow */ 1011/* NT Transact structure definitions follow */
1006/* Currently only ioctl and notify are implemented */ 1012/* Currently only ioctl, acl (get security descriptor) */
1007/***************************************************/ 1013/* and notify are implemented */
1014/*******************************************************/
1015typedef struct smb_com_ntransact_req {
1016 struct smb_hdr hdr; /* wct >= 19 */
1017 __u8 MaxSetupCount;
1018 __u16 Reserved;
1019 __le32 TotalParameterCount;
1020 __le32 TotalDataCount;
1021 __le32 MaxParameterCount;
1022 __le32 MaxDataCount;
1023 __le32 ParameterCount;
1024 __le32 ParameterOffset;
1025 __le32 DataCount;
1026 __le32 DataOffset;
1027 __u8 SetupCount; /* four setup words follow subcommand */
1028 /* SNIA spec incorrectly included spurious pad here */
1029 __le16 SubCommand; /* 2 = IOCTL/FSCTL */
1030 /* SetupCount words follow then */
1031 __le16 ByteCount;
1032 __u8 Pad[3];
1033 __u8 Parms[0];
1034} __attribute__((packed)) NTRANSACT_REQ;
1035
1036typedef struct smb_com_ntransact_rsp {
1037 struct smb_hdr hdr; /* wct = 18 */
1038 __u8 Reserved[3];
1039 __le32 TotalParameterCount;
1040 __le32 TotalDataCount;
1041 __le32 ParameterCount;
1042 __le32 ParameterOffset;
1043 __le32 ParameterDisplacement;
1044 __le32 DataCount;
1045 __le32 DataOffset;
1046 __le32 DataDisplacement;
1047 __u8 SetupCount; /* 0 */
1048 __u16 ByteCount;
1049 /* __u8 Pad[3]; */
1050 /* parms and data follow */
1051} __attribute__((packed)) NTRANSACT_RSP;
1052
1008typedef struct smb_com_transaction_ioctl_req { 1053typedef struct smb_com_transaction_ioctl_req {
1009 struct smb_hdr hdr; /* wct = 23 */ 1054 struct smb_hdr hdr; /* wct = 23 */
1010 __u8 MaxSetupCount; 1055 __u8 MaxSetupCount;
@@ -1019,11 +1064,11 @@ typedef struct smb_com_transaction_ioctl_req {
1019 __le32 DataOffset; 1064 __le32 DataOffset;
1020 __u8 SetupCount; /* four setup words follow subcommand */ 1065 __u8 SetupCount; /* four setup words follow subcommand */
1021 /* SNIA spec incorrectly included spurious pad here */ 1066 /* SNIA spec incorrectly included spurious pad here */
1022 __le16 SubCommand;/* 2 = IOCTL/FSCTL */ 1067 __le16 SubCommand; /* 2 = IOCTL/FSCTL */
1023 __le32 FunctionCode; 1068 __le32 FunctionCode;
1024 __u16 Fid; 1069 __u16 Fid;
1025 __u8 IsFsctl; /* 1 = File System Control, 0 = device control (IOCTL)*/ 1070 __u8 IsFsctl; /* 1 = File System Control 0 = device control (IOCTL) */
1026 __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS share)*/ 1071 __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */
1027 __le16 ByteCount; 1072 __le16 ByteCount;
1028 __u8 Pad[3]; 1073 __u8 Pad[3];
1029 __u8 Data[1]; 1074 __u8 Data[1];
@@ -1043,9 +1088,35 @@ typedef struct smb_com_transaction_ioctl_rsp {
1043 __u8 SetupCount; /* 1 */ 1088 __u8 SetupCount; /* 1 */
1044 __le16 ReturnedDataLen; 1089 __le16 ReturnedDataLen;
1045 __u16 ByteCount; 1090 __u16 ByteCount;
1046 __u8 Pad[3];
1047} __attribute__((packed)) TRANSACT_IOCTL_RSP; 1091} __attribute__((packed)) TRANSACT_IOCTL_RSP;
1048 1092
1093#define CIFS_ACL_OWNER 1
1094#define CIFS_ACL_GROUP 2
1095#define CIFS_ACL_DACL 4
1096#define CIFS_ACL_SACL 8
1097
1098typedef struct smb_com_transaction_qsec_req {
1099 struct smb_hdr hdr; /* wct = 19 */
1100 __u8 MaxSetupCount;
1101 __u16 Reserved;
1102 __le32 TotalParameterCount;
1103 __le32 TotalDataCount;
1104 __le32 MaxParameterCount;
1105 __le32 MaxDataCount;
1106 __le32 ParameterCount;
1107 __le32 ParameterOffset;
1108 __le32 DataCount;
1109 __le32 DataOffset;
1110 __u8 SetupCount; /* no setup words follow subcommand */
1111 /* SNIA spec incorrectly included spurious pad here */
1112 __le16 SubCommand; /* 6 = QUERY_SECURITY_DESC */
1113 __le16 ByteCount; /* bcc = 3 + 8 */
1114 __u8 Pad[3];
1115 __u16 Fid;
1116 __u16 Reserved2;
1117 __le32 AclFlags;
1118} __attribute__((packed)) QUERY_SEC_DESC_REQ;
1119
1049typedef struct smb_com_transaction_change_notify_req { 1120typedef struct smb_com_transaction_change_notify_req {
1050 struct smb_hdr hdr; /* wct = 23 */ 1121 struct smb_hdr hdr; /* wct = 23 */
1051 __u8 MaxSetupCount; 1122 __u8 MaxSetupCount;
@@ -1066,10 +1137,12 @@ typedef struct smb_com_transaction_change_notify_req {
1066 __u8 WatchTree; /* 1 = Monitor subdirectories */ 1137 __u8 WatchTree; /* 1 = Monitor subdirectories */
1067 __u8 Reserved2; 1138 __u8 Reserved2;
1068 __le16 ByteCount; 1139 __le16 ByteCount;
1069/* __u8 Pad[3];*/ 1140/* __u8 Pad[3];*/
1070/* __u8 Data[1];*/ 1141/* __u8 Data[1];*/
1071} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ; 1142} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ;
1072 1143
1144/* BB eventually change to use generic ntransact rsp struct
1145 and validation routine */
1073typedef struct smb_com_transaction_change_notify_rsp { 1146typedef struct smb_com_transaction_change_notify_rsp {
1074 struct smb_hdr hdr; /* wct = 18 */ 1147 struct smb_hdr hdr; /* wct = 18 */
1075 __u8 Reserved[3]; 1148 __u8 Reserved[3];
@@ -2025,6 +2098,12 @@ typedef struct {
2025} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FF response data area */ 2098} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FF response data area */
2026 2099
2027 2100
2101struct win_dev {
2102 unsigned char type[8]; /* IntxCHR or IntxBLK */
2103 __le64 major;
2104 __le64 minor;
2105} __attribute__((packed));
2106
2028struct gea { 2107struct gea {
2029 unsigned char name_len; 2108 unsigned char name_len;
2030 char name[1]; 2109 char name[1];
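
The win_dev struct added above describes the SFU-style blob stored in a file to emulate a device node: an 8-byte tag ("IntxCHR" or "IntxBLK") followed by little-endian 64-bit major and minor numbers. A standalone decoding sketch; the helper name and return convention are invented for the example:

#include <stdint.h>
#include <string.h>

static uint64_t get_le64(const unsigned char *p)
{
	uint64_t v = 0;
	int i;

	for (i = 7; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}

/* returns 'c' for a char device, 'b' for a block device, 0 otherwise */
static int parse_win_dev(const unsigned char blob[24],
			 uint64_t *major, uint64_t *minor)
{
	int type;

	if (memcmp(blob, "IntxCHR", 7) == 0)
		type = 'c';
	else if (memcmp(blob, "IntxBLK", 7) == 0)
		type = 'b';
	else
		return 0;
	*major = get_le64(blob + 8);
	*minor = get_le64(blob + 16);
	return type;
}
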
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1b73f4f4c5c..3c03aadaff0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -48,8 +48,8 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
48 struct smb_hdr * /* out */ , 48 struct smb_hdr * /* out */ ,
49 int * /* bytes returned */ , const int long_op); 49 int * /* bytes returned */ , const int long_op);
50extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *, 50extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
51 struct kvec *, int /* nvec */, 51 struct kvec *, int /* nvec to send */,
52 int * /* bytes returned */ , const int long_op); 52 int * /* type of buf returned */ , const int long_op);
53extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid); 53extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid);
54extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length); 54extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length);
55extern int is_valid_oplock_break(struct smb_hdr *smb); 55extern int is_valid_oplock_break(struct smb_hdr *smb);
@@ -93,11 +93,12 @@ extern int CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
93 const struct nls_table *); 93 const struct nls_table *);
94 94
95extern int CIFSFindFirst(const int xid, struct cifsTconInfo *tcon, 95extern int CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
96 const char *searchName, const struct nls_table *nls_codepage, 96 const char *searchName, const struct nls_table *nls_codepage,
97 __u16 *searchHandle, struct cifs_search_info * psrch_inf, int map, const char dirsep); 97 __u16 *searchHandle, struct cifs_search_info * psrch_inf,
98 int map, const char dirsep);
98 99
99extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, 100extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
100 __u16 searchHandle, struct cifs_search_info * psrch_inf); 101 __u16 searchHandle, struct cifs_search_info * psrch_inf);
101 102
102extern int CIFSFindClose(const int, struct cifsTconInfo *tcon, 103extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
103 const __u16 search_handle); 104 const __u16 search_handle);
@@ -230,19 +231,18 @@ extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
230 const int smb_file_id); 231 const int smb_file_id);
231 232
232extern int CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, 233extern int CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
233 const int netfid, unsigned int count, 234 const int netfid, unsigned int count,
234 const __u64 lseek, unsigned int *nbytes, char **buf); 235 const __u64 lseek, unsigned int *nbytes, char **buf,
236 int * return_buf_type);
235extern int CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon, 237extern int CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
236 const int netfid, const unsigned int count, 238 const int netfid, const unsigned int count,
237 const __u64 lseek, unsigned int *nbytes, 239 const __u64 lseek, unsigned int *nbytes,
238 const char *buf, const char __user *ubuf, 240 const char *buf, const char __user *ubuf,
239 const int long_op); 241 const int long_op);
240#ifdef CONFIG_CIFS_EXPERIMENTAL
241extern int CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon, 242extern int CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
242 const int netfid, const unsigned int count, 243 const int netfid, const unsigned int count,
243 const __u64 offset, unsigned int *nbytes, 244 const __u64 offset, unsigned int *nbytes,
244 struct kvec *iov, const int nvec, const int long_op); 245 struct kvec *iov, const int nvec, const int long_op);
245#endif /* CONFIG_CIFS_EXPERIMENTAL */
246extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon, 246extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
247 const unsigned char *searchName, __u64 * inode_number, 247 const unsigned char *searchName, __u64 * inode_number,
248 const struct nls_table *nls_codepage, 248 const struct nls_table *nls_codepage,
@@ -269,6 +269,8 @@ extern void tconInfoFree(struct cifsTconInfo *);
269extern int cifs_reconnect(struct TCP_Server_Info *server); 269extern int cifs_reconnect(struct TCP_Server_Info *server);
270 270
271extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *,__u32 *); 271extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *,__u32 *);
272extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
273 __u32 *);
272extern int cifs_verify_signature(struct smb_hdr *, const char * mac_key, 274extern int cifs_verify_signature(struct smb_hdr *, const char * mac_key,
273 __u32 expected_sequence_number); 275 __u32 expected_sequence_number);
274extern int cifs_calculate_mac_key(char * key,const char * rn,const char * pass); 276extern int cifs_calculate_mac_key(char * key,const char * rn,const char * pass);
@@ -297,6 +299,9 @@ extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
297 const char *fileName, const char * ea_name, 299 const char *fileName, const char * ea_name,
298 const void * ea_value, const __u16 ea_value_len, 300 const void * ea_value, const __u16 ea_value_len,
299 const struct nls_table *nls_codepage, int remap_special_chars); 301 const struct nls_table *nls_codepage, int remap_special_chars);
302extern int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon,
303 __u16 fid, char *acl_inf, const int buflen,
304 const int acl_type /* ACCESS vs. DEFAULT */);
300extern int CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon, 305extern int CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
301 const unsigned char *searchName, 306 const unsigned char *searchName,
302 char *acl_inf, const int buflen,const int acl_type, 307 char *acl_inf, const int buflen,const int acl_type,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a53c596e108..217323b0c89 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -37,6 +37,7 @@
37#include "cifsproto.h" 37#include "cifsproto.h"
38#include "cifs_unicode.h" 38#include "cifs_unicode.h"
39#include "cifs_debug.h" 39#include "cifs_debug.h"
40#include "cifsacl.h"
40 41
41#ifdef CONFIG_CIFS_POSIX 42#ifdef CONFIG_CIFS_POSIX
42static struct { 43static struct {
@@ -90,6 +91,18 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
90 check for tcp and smb session status done differently 91 check for tcp and smb session status done differently
91 for those three - in the calling routine */ 92 for those three - in the calling routine */
92 if(tcon) { 93 if(tcon) {
94 if(tcon->tidStatus == CifsExiting) {
95 /* only tree disconnect, open, and write,
96 (and ulogoff which does not have tcon)
97 are allowed as we start force umount */
98 if((smb_command != SMB_COM_WRITE_ANDX) &&
99 (smb_command != SMB_COM_OPEN_ANDX) &&
100 (smb_command != SMB_COM_TREE_DISCONNECT)) {
101 cFYI(1,("can not send cmd %d while umounting",
102 smb_command));
103 return -ENODEV;
104 }
105 }
93 if((tcon->ses) && (tcon->ses->status != CifsExiting) && 106 if((tcon->ses) && (tcon->ses->status != CifsExiting) &&
94 (tcon->ses->server)){ 107 (tcon->ses->server)){
95 struct nls_table *nls_codepage; 108 struct nls_table *nls_codepage;
@@ -187,6 +200,19 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
187 check for tcp and smb session status done differently 200 check for tcp and smb session status done differently
188 for those three - in the calling routine */ 201 for those three - in the calling routine */
189 if(tcon) { 202 if(tcon) {
203 if(tcon->tidStatus == CifsExiting) {
204 /* only tree disconnect, open, and write,
205 (and ulogoff which does not have tcon)
206 are allowed as we start force umount */
207 if((smb_command != SMB_COM_WRITE_ANDX) &&
208 (smb_command != SMB_COM_OPEN_ANDX) &&
209 (smb_command != SMB_COM_TREE_DISCONNECT)) {
210 cFYI(1,("can not send cmd %d while umounting",
211 smb_command));
212 return -ENODEV;
213 }
214 }
215
190 if((tcon->ses) && (tcon->ses->status != CifsExiting) && 216 if((tcon->ses) && (tcon->ses->status != CifsExiting) &&
191 (tcon->ses->server)){ 217 (tcon->ses->server)){
192 struct nls_table *nls_codepage; 218 struct nls_table *nls_codepage;
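
The same CifsExiting test is added to both small_smb_init and smb_init: once a forced umount has begun, only the commands needed to drain and tear down the tree may still be sent. The duplicated condition, factored into a predicate as a sketch of one way to avoid repeating it:

static int example_allowed_during_umount(int smb_command)
{
	/* per the comment in the hunk: only tree disconnect, open and
	   write (plus ulogoff, which has no tcon) are allowed once a
	   force umount has started; everything else gets -ENODEV */
	return smb_command == SMB_COM_WRITE_ANDX ||
	       smb_command == SMB_COM_OPEN_ANDX ||
	       smb_command == SMB_COM_TREE_DISCONNECT;
}
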
@@ -347,8 +373,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
347 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, 373 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
348 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 374 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
349 if (rc == 0) { 375 if (rc == 0) {
350 server->secMode = pSMBr->SecurityMode; 376 server->secMode = pSMBr->SecurityMode;
351 server->secType = NTLM; /* BB override default for 377 if((server->secMode & SECMODE_USER) == 0)
378 cFYI(1,("share mode security"));
379 server->secType = NTLM; /* BB override default for
352 NTLMv2 or kerberos v5 */ 380 NTLMv2 or kerberos v5 */
353 /* one byte - no need to convert this or EncryptionKeyLen 381 /* one byte - no need to convert this or EncryptionKeyLen
354 from little endian */ 382 from little endian */
@@ -358,7 +386,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
358 min(le32_to_cpu(pSMBr->MaxBufferSize), 386 min(le32_to_cpu(pSMBr->MaxBufferSize),
359 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 387 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
360 server->maxRw = le32_to_cpu(pSMBr->MaxRawSize); 388 server->maxRw = le32_to_cpu(pSMBr->MaxRawSize);
361 cFYI(0, ("Max buf = %d ", ses->server->maxBuf)); 389 cFYI(0, ("Max buf = %d", ses->server->maxBuf));
362 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); 390 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
363 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 391 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
364 server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone); 392 server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone);
@@ -386,8 +414,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
386 (server->server_GUID, 414 (server->server_GUID,
387 pSMBr->u.extended_response. 415 pSMBr->u.extended_response.
388 GUID, 16) != 0) { 416 GUID, 16) != 0) {
389 cFYI(1, 417 cFYI(1, ("server UID changed"));
390 ("UID of server does not match previous connection to same ip address"));
391 memcpy(server-> 418 memcpy(server->
392 server_GUID, 419 server_GUID,
393 pSMBr->u. 420 pSMBr->u.
@@ -933,21 +960,19 @@ openRetry:
933 return rc; 960 return rc;
934} 961}
935 962
936/* If no buffer passed in, then caller wants to do the copy
937 as in the case of readpages so the SMB buffer must be
938 freed by the caller */
939
940int 963int
941CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, 964CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
942 const int netfid, const unsigned int count, 965 const int netfid, const unsigned int count,
943 const __u64 lseek, unsigned int *nbytes, char **buf) 966 const __u64 lseek, unsigned int *nbytes, char **buf,
967 int * pbuf_type)
944{ 968{
945 int rc = -EACCES; 969 int rc = -EACCES;
946 READ_REQ *pSMB = NULL; 970 READ_REQ *pSMB = NULL;
947 READ_RSP *pSMBr = NULL; 971 READ_RSP *pSMBr = NULL;
948 char *pReadData = NULL; 972 char *pReadData = NULL;
949 int bytes_returned;
950 int wct; 973 int wct;
974 int resp_buf_type = 0;
975 struct kvec iov[1];
951 976
952 cFYI(1,("Reading %d bytes on fid %d",count,netfid)); 977 cFYI(1,("Reading %d bytes on fid %d",count,netfid));
953 if(tcon->ses->capabilities & CAP_LARGE_FILES) 978 if(tcon->ses->capabilities & CAP_LARGE_FILES)
@@ -956,8 +981,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
956 wct = 10; /* old style read */ 981 wct = 10; /* old style read */
957 982
958 *nbytes = 0; 983 *nbytes = 0;
959 rc = smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **) &pSMB, 984 rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **) &pSMB);
960 (void **) &pSMBr);
961 if (rc) 985 if (rc)
962 return rc; 986 return rc;
963 987
@@ -965,13 +989,13 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
965 if (tcon->ses->server == NULL) 989 if (tcon->ses->server == NULL)
966 return -ECONNABORTED; 990 return -ECONNABORTED;
967 991
968 pSMB->AndXCommand = 0xFF; /* none */ 992 pSMB->AndXCommand = 0xFF; /* none */
969 pSMB->Fid = netfid; 993 pSMB->Fid = netfid;
970 pSMB->OffsetLow = cpu_to_le32(lseek & 0xFFFFFFFF); 994 pSMB->OffsetLow = cpu_to_le32(lseek & 0xFFFFFFFF);
971 if(wct == 12) 995 if(wct == 12)
972 pSMB->OffsetHigh = cpu_to_le32(lseek >> 32); 996 pSMB->OffsetHigh = cpu_to_le32(lseek >> 32);
973 else if((lseek >> 32) > 0) /* can not handle this big offset for old */ 997 else if((lseek >> 32) > 0) /* can not handle this big offset for old */
974 return -EIO; 998 return -EIO;
975 999
976 pSMB->Remaining = 0; 1000 pSMB->Remaining = 0;
977 pSMB->MaxCount = cpu_to_le16(count & 0xFFFF); 1001 pSMB->MaxCount = cpu_to_le16(count & 0xFFFF);
@@ -980,14 +1004,18 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
980 pSMB->ByteCount = 0; /* no need to do le conversion since 0 */ 1004 pSMB->ByteCount = 0; /* no need to do le conversion since 0 */
981 else { 1005 else {
982 /* old style read */ 1006 /* old style read */
983 struct smb_com_readx_req * pSMBW = 1007 struct smb_com_readx_req * pSMBW =
984 (struct smb_com_readx_req *)pSMB; 1008 (struct smb_com_readx_req *)pSMB;
985 pSMBW->ByteCount = 0; 1009 pSMBW->ByteCount = 0;
986 } 1010 }
987 1011
988 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1012 iov[0].iov_base = (char *)pSMB;
989 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 1013 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
1014 rc = SendReceive2(xid, tcon->ses, iov,
1015 1 /* num iovecs */,
1016 &resp_buf_type, 0);
990 cifs_stats_inc(&tcon->num_reads); 1017 cifs_stats_inc(&tcon->num_reads);
1018 pSMBr = (READ_RSP *)iov[0].iov_base;
991 if (rc) { 1019 if (rc) {
992 cERROR(1, ("Send error in read = %d", rc)); 1020 cERROR(1, ("Send error in read = %d", rc));
993 } else { 1021 } else {
@@ -997,33 +1025,43 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
997 *nbytes = data_length; 1025 *nbytes = data_length;
998 1026
999 /*check that DataLength would not go beyond end of SMB */ 1027 /*check that DataLength would not go beyond end of SMB */
1000 if ((data_length > CIFSMaxBufSize) 1028 if ((data_length > CIFSMaxBufSize)
1001 || (data_length > count)) { 1029 || (data_length > count)) {
1002 cFYI(1,("bad length %d for count %d",data_length,count)); 1030 cFYI(1,("bad length %d for count %d",data_length,count));
1003 rc = -EIO; 1031 rc = -EIO;
1004 *nbytes = 0; 1032 *nbytes = 0;
1005 } else { 1033 } else {
1006 pReadData = 1034 pReadData = (char *) (&pSMBr->hdr.Protocol) +
1007 (char *) (&pSMBr->hdr.Protocol) +
1008 le16_to_cpu(pSMBr->DataOffset); 1035 le16_to_cpu(pSMBr->DataOffset);
1009/* if(rc = copy_to_user(buf, pReadData, data_length)) { 1036/* if(rc = copy_to_user(buf, pReadData, data_length)) {
1010 cERROR(1,("Faulting on read rc = %d",rc)); 1037 cERROR(1,("Faulting on read rc = %d",rc));
1011 rc = -EFAULT; 1038 rc = -EFAULT;
1012 }*/ /* can not use copy_to_user when using page cache*/ 1039 }*/ /* can not use copy_to_user when using page cache*/
1013 if(*buf) 1040 if(*buf)
1014 memcpy(*buf,pReadData,data_length); 1041 memcpy(*buf,pReadData,data_length);
1015 } 1042 }
1016 } 1043 }
1017 if(*buf)
1018 cifs_buf_release(pSMB);
1019 else
1020 *buf = (char *)pSMB;
1021 1044
1022 /* Note: On -EAGAIN error only caller can retry on handle based calls 1045 cifs_small_buf_release(pSMB);
1046 if(*buf) {
1047 if(resp_buf_type == CIFS_SMALL_BUFFER)
1048 cifs_small_buf_release(iov[0].iov_base);
1049 else if(resp_buf_type == CIFS_LARGE_BUFFER)
1050 cifs_buf_release(iov[0].iov_base);
1051 } else /* return buffer to caller to free */ /* BB FIXME how do we tell caller if it is not a large buffer */ {
1052 *buf = iov[0].iov_base;
1053 if(resp_buf_type == CIFS_SMALL_BUFFER)
1054 *pbuf_type = CIFS_SMALL_BUFFER;
1055 else if(resp_buf_type == CIFS_LARGE_BUFFER)
1056 *pbuf_type = CIFS_LARGE_BUFFER;
1057 }
1058
1059 /* Note: On -EAGAIN error only caller can retry on handle based calls
1023 since file handle passed in no longer valid */ 1060 since file handle passed in no longer valid */
1024 return rc; 1061 return rc;
1025} 1062}
1026 1063
1064
1027int 1065int
1028CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon, 1066CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1029 const int netfid, const unsigned int count, 1067 const int netfid, const unsigned int count,
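
CIFSSMBRead now allocates only a small request buffer and receives through SendReceive2, so the response lands in whichever pool fits and the server-supplied DataLength must be treated as untrusted input. The hunk above bounds it against CIFSMaxBufSize and the requested count before the memcpy; a standalone restatement, which additionally bounds against the actually received length (that extra check is this sketch's addition, not in the patch):

#include <errno.h>
#include <stdint.h>
#include <string.h>

static int example_copy_read_data(char *dst, const unsigned char *resp,
				  size_t resp_len, uint32_t data_offset,
				  uint32_t data_length, uint32_t requested)
{
	if (data_length > requested)
		return -EIO;
	/* length-based comparison cannot wrap, unlike pointer sums */
	if (data_offset > resp_len || data_length > resp_len - data_offset)
		return -EIO;
	memcpy(dst, resp + data_offset, data_length);
	return 0;
}
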
@@ -1130,7 +1168,6 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1130 return rc; 1168 return rc;
1131} 1169}
1132 1170
1133#ifdef CONFIG_CIFS_EXPERIMENTAL
1134int 1171int
1135CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon, 1172CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1136 const int netfid, const unsigned int count, 1173 const int netfid, const unsigned int count,
@@ -1139,10 +1176,12 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1139{ 1176{
1140 int rc = -EACCES; 1177 int rc = -EACCES;
1141 WRITE_REQ *pSMB = NULL; 1178 WRITE_REQ *pSMB = NULL;
1142 int bytes_returned, wct; 1179 int wct;
1143 int smb_hdr_len; 1180 int smb_hdr_len;
1181 int resp_buf_type = 0;
1182
1183 cFYI(1,("write2 at %lld %d bytes", (long long)offset, count));
1144 1184
1145 cFYI(1,("write2 at %lld %d bytes",offset,count)); /* BB removeme BB */
1146 if(tcon->ses->capabilities & CAP_LARGE_FILES) 1185 if(tcon->ses->capabilities & CAP_LARGE_FILES)
1147 wct = 14; 1186 wct = 14;
1148 else 1187 else
@@ -1183,22 +1222,34 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1183 pSMBW->ByteCount = cpu_to_le16(count + 5); 1222 pSMBW->ByteCount = cpu_to_le16(count + 5);
1184 } 1223 }
1185 iov[0].iov_base = pSMB; 1224 iov[0].iov_base = pSMB;
1186 iov[0].iov_len = smb_hdr_len + 4; 1225 if(wct == 14)
1226 iov[0].iov_len = smb_hdr_len + 4;
1227 else /* wct == 12 pad bigger by four bytes */
1228 iov[0].iov_len = smb_hdr_len + 8;
1229
1187 1230
1188 rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &bytes_returned, 1231 rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type,
1189 long_op); 1232 long_op);
1190 cifs_stats_inc(&tcon->num_writes); 1233 cifs_stats_inc(&tcon->num_writes);
1191 if (rc) { 1234 if (rc) {
1192 cFYI(1, ("Send error Write2 = %d", rc)); 1235 cFYI(1, ("Send error Write2 = %d", rc));
1193 *nbytes = 0; 1236 *nbytes = 0;
1237 } else if(resp_buf_type == 0) {
1238 /* presumably this can not happen, but best to be safe */
1239 rc = -EIO;
1240 *nbytes = 0;
1194 } else { 1241 } else {
1195 WRITE_RSP * pSMBr = (WRITE_RSP *)pSMB; 1242 WRITE_RSP * pSMBr = (WRITE_RSP *)iov[0].iov_base;
1196 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1243 *nbytes = le16_to_cpu(pSMBr->CountHigh);
1197 *nbytes = (*nbytes) << 16; 1244 *nbytes = (*nbytes) << 16;
1198 *nbytes += le16_to_cpu(pSMBr->Count); 1245 *nbytes += le16_to_cpu(pSMBr->Count);
1199 } 1246 }
1200 1247
1201 cifs_small_buf_release(pSMB); 1248 cifs_small_buf_release(pSMB);
1249 if(resp_buf_type == CIFS_SMALL_BUFFER)
1250 cifs_small_buf_release(iov[0].iov_base);
1251 else if(resp_buf_type == CIFS_LARGE_BUFFER)
1252 cifs_buf_release(iov[0].iov_base);
1202 1253
1203 /* Note: On -EAGAIN error only caller can retry on handle based calls 1254 /* Note: On -EAGAIN error only caller can retry on handle based calls
1204 since file handle passed in no longer valid */ 1255 since file handle passed in no longer valid */
@@ -1207,8 +1258,6 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1207} 1258}
1208 1259
1209 1260
1210#endif /* CIFS_EXPERIMENTAL */
1211
1212int 1261int
1213CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, 1262CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1214 const __u16 smb_file_id, const __u64 len, 1263 const __u16 smb_file_id, const __u64 len,
@@ -1553,7 +1602,7 @@ createSymLinkRetry:
1553 1602
1554 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 1603 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
1555 name_len = 1604 name_len =
1556 cifs_strtoUCS((wchar_t *) pSMB->FileName, fromName, PATH_MAX 1605 cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX
1557 /* find define for this maxpathcomponent */ 1606 /* find define for this maxpathcomponent */
1558 , nls_codepage); 1607 , nls_codepage);
1559 name_len++; /* trailing null */ 1608 name_len++; /* trailing null */
@@ -1577,7 +1626,7 @@ createSymLinkRetry:
1577 data_offset = (char *) (&pSMB->hdr.Protocol) + offset; 1626 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
1578 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 1627 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
1579 name_len_target = 1628 name_len_target =
1580 cifs_strtoUCS((wchar_t *) data_offset, toName, PATH_MAX 1629 cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX
1581 /* find define for this maxpathcomponent */ 1630 /* find define for this maxpathcomponent */
1582 , nls_codepage); 1631 , nls_codepage);
1583 name_len_target++; /* trailing null */ 1632 name_len_target++; /* trailing null */
@@ -1803,7 +1852,7 @@ querySymLinkRetry:
1803 1852
1804 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 1853 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
1805 name_len = 1854 name_len =
1806 cifs_strtoUCS((wchar_t *) pSMB->FileName, searchName, PATH_MAX 1855 cifs_strtoUCS((__le16 *) pSMB->FileName, searchName, PATH_MAX
1807 /* find define for this maxpathcomponent */ 1856 /* find define for this maxpathcomponent */
1808 , nls_codepage); 1857 , nls_codepage);
1809 name_len++; /* trailing null */ 1858 name_len++; /* trailing null */
@@ -1860,7 +1909,7 @@ querySymLinkRetry:
1860 min_t(const int, buflen,count) / 2); 1909 min_t(const int, buflen,count) / 2);
1861 /* BB FIXME investigate remapping reserved chars here */ 1910 /* BB FIXME investigate remapping reserved chars here */
1862 cifs_strfromUCS_le(symlinkinfo, 1911 cifs_strfromUCS_le(symlinkinfo,
1863 (wchar_t *) ((char *)&pSMBr->hdr.Protocol + 1912 (__le16 *) ((char *)&pSMBr->hdr.Protocol +
1864 data_offset), 1913 data_offset),
1865 name_len, nls_codepage); 1914 name_len, nls_codepage);
1866 } else { 1915 } else {
@@ -1879,6 +1928,90 @@ querySymLinkRetry:
1879 return rc; 1928 return rc;
1880} 1929}
1881 1930
1931/* Initialize NT TRANSACT SMB into small smb request buffer.
1932 This assumes that all NT TRANSACTS that we init here have
1933 total parm and data under about 400 bytes (to fit in small cifs
1934 buffer size), which is the case so far; it easily fits. NB:
1935 the Setup words themselves, ByteCount,
1936 MaxSetupCount (size of returned setup area) and
1937 MaxParameterCount (returned parms size) must be set by the caller */
1938static int
1939smb_init_ntransact(const __u16 sub_command, const int setup_count,
1940 const int parm_len, struct cifsTconInfo *tcon,
1941 void ** ret_buf)
1942{
1943 int rc;
1944 __u32 temp_offset;
1945 struct smb_com_ntransact_req * pSMB;
1946
1947 rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
1948 (void **)&pSMB);
1949 if (rc)
1950 return rc;
1951 *ret_buf = (void *)pSMB;
1952 pSMB->Reserved = 0;
1953 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
1954 pSMB->TotalDataCount = 0;
1955 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
1956 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
1957 pSMB->ParameterCount = pSMB->TotalParameterCount;
1958 pSMB->DataCount = pSMB->TotalDataCount;
1959 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
1960 (setup_count * 2) - 4 /* for rfc1001 length itself */;
1961 pSMB->ParameterOffset = cpu_to_le32(temp_offset);
1962 pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
1963 pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
1964 pSMB->SubCommand = cpu_to_le16(sub_command);
1965 return 0;
1966}
1967
1968static int
1969validate_ntransact(char * buf, char ** ppparm, char ** ppdata,
1970 int * pdatalen, int * pparmlen)
1971{
1972 char * end_of_smb;
1973 __u32 data_count, data_offset, parm_count, parm_offset;
1974 struct smb_com_ntransact_rsp * pSMBr;
1975
1976 if(buf == NULL)
1977 return -EINVAL;
1978
1979 pSMBr = (struct smb_com_ntransact_rsp *)buf;
1980
1981 /* ByteCount was converted from little endian in SendReceive */
1982 end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
1983 (char *)&pSMBr->ByteCount;
1984
1985
1986 data_offset = le32_to_cpu(pSMBr->DataOffset);
1987 data_count = le32_to_cpu(pSMBr->DataCount);
1988 parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
1989 parm_count = le32_to_cpu(pSMBr->ParameterCount);
1990
1991 *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
1992 *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
1993
1994 /* should we also check that parm and data areas do not overlap? */
1995 if(*ppparm > end_of_smb) {
1996 cFYI(1,("parms start after end of smb"));
1997 return -EINVAL;
1998 } else if(parm_count + *ppparm > end_of_smb) {
1999 cFYI(1,("parm end after end of smb"));
2000 return -EINVAL;
2001 } else if(*ppdata > end_of_smb) {
2002 cFYI(1,("data starts after end of smb"));
2003 return -EINVAL;
2004 } else if(data_count + *ppdata > end_of_smb) {
2005 cFYI(1,("data %p + count %d (%p) ends after end of smb %p start %p",
2006 *ppdata, data_count, (data_count + *ppdata), end_of_smb, pSMBr)); /* BB FIXME */
2007 return -EINVAL;
2008 } else if(parm_count + data_count > pSMBr->ByteCount) {
2009 cFYI(1,("parm count and data count larger than SMB"));
2010 return -EINVAL;
2011 }
2012 return 0;
2013}
2014
1882int 2015int
1883CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, 2016CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
1884 const unsigned char *searchName, 2017 const unsigned char *searchName,
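
validate_ntransact checks each area with pointer arithmetic of the form parm_count + *ppparm > end_of_smb, which can wrap around the top of the address space if a malicious server supplies a huge count. Comparing lengths instead of pointers sidesteps that; a sketch of the equivalent overflow-safe test:

#include <stddef.h>
#include <stdint.h>

static int example_region_ok(size_t smb_len, uint32_t off, uint32_t cnt)
{
	if (off > smb_len)
		return 0;
	return cnt <= smb_len - off;	/* no pointer is ever formed */
}

/* usage: both areas must lie inside the received SMB
 *	if (!example_region_ok(len, parm_offset, parm_count) ||
 *	    !example_region_ok(len, data_offset, data_count))
 *		return -EINVAL;
 */
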
@@ -1901,7 +2034,8 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
1901 pSMB->TotalDataCount = 0; 2034 pSMB->TotalDataCount = 0;
1902 pSMB->MaxParameterCount = cpu_to_le32(2); 2035 pSMB->MaxParameterCount = cpu_to_le32(2);
1903 /* BB find exact data count max from sess structure BB */ 2036 /* BB find exact data count max from sess structure BB */
1904 pSMB->MaxDataCount = cpu_to_le32(4000); 2037 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
2038 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
1905 pSMB->MaxSetupCount = 4; 2039 pSMB->MaxSetupCount = 4;
1906 pSMB->Reserved = 0; 2040 pSMB->Reserved = 0;
1907 pSMB->ParameterOffset = 0; 2041 pSMB->ParameterOffset = 0;
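
This MaxDataCount expression, repeated in several hunks of this patch, rounds the space left after the worst-case header down to a multiple of 256 (the & 0xFFFFFF00 clears the low byte). A worked example with a server MaxBufferSize of 4356, a value commonly negotiated by Windows:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t max_buf = 4356;	/* negotiated buffer size */
	uint32_t hdr = 0x58;		/* MAX_CIFS_HDR_SIZE = 88 */
	uint32_t max_data = (max_buf - hdr) & 0xFFFFFF00;

	/* 4356 - 88 = 4268 = 0x10AC; clearing the low byte gives 0x1000 */
	assert(max_data == 4096);
	return 0;
}
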
@@ -1928,7 +2062,9 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
1928 rc = -EIO; /* bad smb */ 2062 rc = -EIO; /* bad smb */
1929 else { 2063 else {
1930 if(data_count && (data_count < 2048)) { 2064 if(data_count && (data_count < 2048)) {
1931 char * end_of_smb = pSMBr->ByteCount + (char *)&pSMBr->ByteCount; 2065 char * end_of_smb = 2 /* sizeof byte count */ +
2066 pSMBr->ByteCount +
2067 (char *)&pSMBr->ByteCount;
1932 2068
1933 struct reparse_data * reparse_buf = (struct reparse_data *) 2069 struct reparse_data * reparse_buf = (struct reparse_data *)
1934 ((char *)&pSMBr->hdr.Protocol + data_offset); 2070 ((char *)&pSMBr->hdr.Protocol + data_offset);
@@ -1951,7 +2087,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
1951 reparse_buf->TargetNameOffset), 2087 reparse_buf->TargetNameOffset),
1952 min(buflen/2, reparse_buf->TargetNameLen / 2)); 2088 min(buflen/2, reparse_buf->TargetNameLen / 2));
1953 cifs_strfromUCS_le(symlinkinfo, 2089 cifs_strfromUCS_le(symlinkinfo,
1954 (wchar_t *) (reparse_buf->LinkNamesBuf + 2090 (__le16 *) (reparse_buf->LinkNamesBuf +
1955 reparse_buf->TargetNameOffset), 2091 reparse_buf->TargetNameOffset),
1956 name_len, nls_codepage); 2092 name_len, nls_codepage);
1957 } else { /* ASCII names */ 2093 } else { /* ASCII names */
@@ -1983,9 +2119,9 @@ qreparse_out:
1983static void cifs_convert_ace(posix_acl_xattr_entry * ace, struct cifs_posix_ace * cifs_ace) 2119static void cifs_convert_ace(posix_acl_xattr_entry * ace, struct cifs_posix_ace * cifs_ace)
1984{ 2120{
1985 /* u8 cifs fields do not need le conversion */ 2121 /* u8 cifs fields do not need le conversion */
1986 ace->e_perm = (__u16)cifs_ace->cifs_e_perm; 2122 ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
1987 ace->e_tag = (__u16)cifs_ace->cifs_e_tag; 2123 ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
1988 ace->e_id = (__u32)le64_to_cpu(cifs_ace->cifs_uid); 2124 ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
1989 /* cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id)); */ 2125 /* cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id)); */
1990 2126
1991 return; 2127 return;
@@ -2037,7 +2173,7 @@ static int cifs_copy_posix_acl(char * trgt,char * src, const int buflen,
2037 } else if(size > buflen) { 2173 } else if(size > buflen) {
2038 return -ERANGE; 2174 return -ERANGE;
2039 } else /* buffer big enough */ { 2175 } else /* buffer big enough */ {
2040 local_acl->a_version = POSIX_ACL_XATTR_VERSION; 2176 local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
2041 for(i = 0;i < count ;i++) { 2177 for(i = 0;i < count ;i++) {
2042 cifs_convert_ace(&local_acl->a_entries[i],pACE); 2178 cifs_convert_ace(&local_acl->a_entries[i],pACE);
2043 pACE ++; 2179 pACE ++;
@@ -2051,14 +2187,14 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace * cifs_ace,
2051{ 2187{
2052 __u16 rc = 0; /* 0 = ACL converted ok */ 2188 __u16 rc = 0; /* 0 = ACL converted ok */
2053 2189
2054 cifs_ace->cifs_e_perm = (__u8)cpu_to_le16(local_ace->e_perm); 2190 cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm);
2055 cifs_ace->cifs_e_tag = (__u8)cpu_to_le16(local_ace->e_tag); 2191 cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag);
2056 /* BB is there a better way to handle the large uid? */ 2192 /* BB is there a better way to handle the large uid? */
2057 if(local_ace->e_id == -1) { 2193 if(local_ace->e_id == cpu_to_le32(-1)) {
2058 /* Probably no need to le convert -1 on any arch but can not hurt */ 2194 /* Probably no need to le convert -1 on any arch but can not hurt */
2059 cifs_ace->cifs_uid = cpu_to_le64(-1); 2195 cifs_ace->cifs_uid = cpu_to_le64(-1);
2060 } else 2196 } else
2061 cifs_ace->cifs_uid = (__u64)cpu_to_le32(local_ace->e_id); 2197 cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
2062 /*cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id));*/ 2198 /*cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id));*/
2063 return rc; 2199 return rc;
2064} 2200}
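
The ACL conversion fixes in these two hunks are endianness corrections: the xattr fields (e_perm, e_tag, e_id) are little-endian on the wire, so filling them needs cpu_to_le16/cpu_to_le32 and reading them needs le16_to_cpu/le32_to_cpu. The old raw stores happened to work on little-endian machines and silently wrote byte-swapped values on big-endian ones. A minimal standalone demonstration of the difference:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint16_t perm = 7;		/* e_perm in CPU order */
	unsigned char wire[2];

	/* the old code's effect: a raw store. Little-endian CPUs write
	   07 00 (correct by luck); big-endian CPUs write 00 07 (wrong) */
	memcpy(wire, &perm, sizeof(perm));

	/* the fixed behaviour: an explicit little-endian store,
	   which writes 07 00 on every architecture */
	wire[0] = (unsigned char)(perm & 0xff);
	wire[1] = (unsigned char)(perm >> 8);

	printf("le bytes: %02x %02x\n", wire[0], wire[1]);
	return 0;
}
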
@@ -2078,16 +2214,17 @@ static __u16 ACL_to_cifs_posix(char * parm_data,const char * pACL,const int bufl
2078 2214
2079 count = posix_acl_xattr_count((size_t)buflen); 2215 count = posix_acl_xattr_count((size_t)buflen);
2080 cFYI(1,("setting acl with %d entries from buf of length %d and version of %d", 2216 cFYI(1,("setting acl with %d entries from buf of length %d and version of %d",
2081 count,buflen,local_acl->a_version)); 2217 count, buflen, le32_to_cpu(local_acl->a_version)));
2082 if(local_acl->a_version != 2) { 2218 if(le32_to_cpu(local_acl->a_version) != 2) {
2083 cFYI(1,("unknown POSIX ACL version %d",local_acl->a_version)); 2219 cFYI(1,("unknown POSIX ACL version %d",
2220 le32_to_cpu(local_acl->a_version)));
2084 return 0; 2221 return 0;
2085 } 2222 }
2086 cifs_acl->version = cpu_to_le16(1); 2223 cifs_acl->version = cpu_to_le16(1);
2087 if(acl_type == ACL_TYPE_ACCESS) 2224 if(acl_type == ACL_TYPE_ACCESS)
2088 cifs_acl->access_entry_count = count; 2225 cifs_acl->access_entry_count = cpu_to_le16(count);
2089 else if(acl_type == ACL_TYPE_DEFAULT) 2226 else if(acl_type == ACL_TYPE_DEFAULT)
2090 cifs_acl->default_entry_count = count; 2227 cifs_acl->default_entry_count = cpu_to_le16(count);
2091 else { 2228 else {
2092 cFYI(1,("unknown ACL type %d",acl_type)); 2229 cFYI(1,("unknown ACL type %d",acl_type));
2093 return 0; 2230 return 0;
@@ -2171,6 +2308,7 @@ queryAclRetry:
2171 2308
2172 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2309 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2173 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2310 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2311 cifs_stats_inc(&tcon->num_acl_get);
2174 if (rc) { 2312 if (rc) {
2175 cFYI(1, ("Send error in Query POSIX ACL = %d", rc)); 2313 cFYI(1, ("Send error in Query POSIX ACL = %d", rc));
2176 } else { 2314 } else {
@@ -2358,6 +2496,92 @@ GetExtAttrOut:
2358 2496
2359#endif /* CONFIG_POSIX */ 2497#endif /* CONFIG_POSIX */
2360 2498
2499
2500/* security id for everyone */
2501const struct cifs_sid sid_everyone = {1, 1, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0}};
2502/* group users */
2503const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {32, 545, 0, 0}};
2504
2505/* Convert CIFS ACL to POSIX form */
2506static int parse_sec_desc(struct cifs_sid * psec_desc, int acl_len)
2507{
2508 return 0;
2509}
2510
2511/* Get Security Descriptor (by handle) from remote server for a file or dir */
2512int
2513CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
2514 /* BB fix up return info */ char *acl_inf, const int buflen,
2515 const int acl_type /* ACCESS/DEFAULT not sure implication */)
2516{
2517 int rc = 0;
2518 int buf_type = 0;
2519 QUERY_SEC_DESC_REQ * pSMB;
2520 struct kvec iov[1];
2521
2522 cFYI(1, ("GetCifsACL"));
2523
2524 rc = smb_init_ntransact(NT_TRANSACT_QUERY_SECURITY_DESC, 0,
2525 8 /* parm len */, tcon, (void **) &pSMB);
2526 if (rc)
2527 return rc;
2528
2529 pSMB->MaxParameterCount = cpu_to_le32(4);
2530 /* BB TEST with big acls that might need to be e.g. larger than 16K */
2531 pSMB->MaxSetupCount = 0;
2532 pSMB->Fid = fid; /* file handle always le */
2533 pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP |
2534 CIFS_ACL_DACL);
2535 pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */
2536 pSMB->hdr.smb_buf_length += 11;
2537 iov[0].iov_base = (char *)pSMB;
2538 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
2539
2540 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, 0);
2541 cifs_stats_inc(&tcon->num_acl_get);
2542 if (rc) {
2543 cFYI(1, ("Send error in QuerySecDesc = %d", rc));
2544 } else { /* decode response */
2545 struct cifs_sid * psec_desc;
2546 __le32 * parm;
2547 int parm_len;
2548 int data_len;
2549 int acl_len;
2550 struct smb_com_ntransact_rsp * pSMBr;
2551
2552/* validate_nttransact */
2553 rc = validate_ntransact(iov[0].iov_base, (char **)&parm,
2554 (char **)&psec_desc,
2555 &parm_len, &data_len);
2556
2557 if(rc)
2558 goto qsec_out;
2559 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
2560
2561 cERROR(1,("smb %p parm %p data %p",pSMBr,parm,psec_desc)); /* BB removeme BB */
2562
2563 if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
2564 rc = -EIO; /* bad smb */
2565 goto qsec_out;
2566 }
2567
2568/* BB check that data area is minimum length and as big as acl_len */
2569
2570 acl_len = le32_to_cpu(*(__le32 *)parm);
2571 /* BB check if(acl_len > bufsize) */
2572
2573 parse_sec_desc(psec_desc, acl_len);
2574 }
2575qsec_out:
2576 if(buf_type == CIFS_SMALL_BUFFER)
2577 cifs_small_buf_release(iov[0].iov_base);
2578 else if(buf_type == CIFS_LARGE_BUFFER)
2579 cifs_buf_release(iov[0].iov_base);
2580 cifs_small_buf_release(pSMB);
2581 return rc;
2582}
2583
2584
2361/* Legacy Query Path Information call for lookup to old servers such 2585/* Legacy Query Path Information call for lookup to old servers such
2362 as Win9x/WinME */ 2586 as Win9x/WinME */
2363int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon, 2587int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
@@ -3203,7 +3427,7 @@ getDFSRetry:
3203 temp = ((char *)referrals) + le16_to_cpu(referrals->DfsPathOffset); 3427 temp = ((char *)referrals) + le16_to_cpu(referrals->DfsPathOffset);
3204 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { 3428 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
3205 cifs_strfromUCS_le(*targetUNCs, 3429 cifs_strfromUCS_le(*targetUNCs,
3206 (wchar_t *) temp, name_len, nls_codepage); 3430 (__le16 *) temp, name_len, nls_codepage);
3207 } else { 3431 } else {
3208 strncpy(*targetUNCs,temp,name_len); 3432 strncpy(*targetUNCs,temp,name_len);
3209 } 3433 }
@@ -4256,7 +4480,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
4256{ 4480{
4257 int rc = 0; 4481 int rc = 0;
4258 struct smb_com_transaction_change_notify_req * pSMB = NULL; 4482 struct smb_com_transaction_change_notify_req * pSMB = NULL;
4259 struct smb_com_transaction_change_notify_rsp * pSMBr = NULL; 4483 struct smb_com_ntransaction_change_notify_rsp * pSMBr = NULL;
4260 struct dir_notify_req *dnotify_req; 4484 struct dir_notify_req *dnotify_req;
4261 int bytes_returned; 4485 int bytes_returned;
4262 4486
@@ -4271,6 +4495,10 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
4271 pSMB->MaxParameterCount = cpu_to_le32(2); 4495 pSMB->MaxParameterCount = cpu_to_le32(2);
4272 /* BB find exact data count max from sess structure BB */ 4496 /* BB find exact data count max from sess structure BB */
4273 pSMB->MaxDataCount = 0; /* same in little endian or be */ 4497 pSMB->MaxDataCount = 0; /* same in little endian or be */
4498/* BB VERIFY verify which is correct for above BB */
4499 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
4500 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
4501
4274 pSMB->MaxSetupCount = 4; 4502 pSMB->MaxSetupCount = 4;
4275 pSMB->Reserved = 0; 4503 pSMB->Reserved = 0;
4276 pSMB->ParameterOffset = 0; 4504 pSMB->ParameterOffset = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2cb620716bc..88f60aa5205 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -76,12 +76,19 @@ struct smb_vol {
76 unsigned setuids:1; 76 unsigned setuids:1;
77 unsigned noperm:1; 77 unsigned noperm:1;
78 unsigned no_psx_acl:1; /* set if posix acl support should be disabled */ 78 unsigned no_psx_acl:1; /* set if posix acl support should be disabled */
79 unsigned cifs_acl:1;
79 unsigned no_xattr:1; /* set if xattr (EA) support should be disabled*/ 80 unsigned no_xattr:1; /* set if xattr (EA) support should be disabled*/
80 unsigned server_ino:1; /* use inode numbers from server ie UniqueId */ 81 unsigned server_ino:1; /* use inode numbers from server ie UniqueId */
81 unsigned direct_io:1; 82 unsigned direct_io:1;
82 unsigned remap:1; /* set to remap seven reserved chars in filenames */ 83 unsigned remap:1; /* set to remap seven reserved chars in filenames */
83 unsigned posix_paths:1; /* unset to not ask for posix pathnames. */ 84 unsigned posix_paths:1; /* unset to not ask for posix pathnames. */
84 unsigned sfu_emul:1; 85 unsigned sfu_emul:1;
86 unsigned krb5:1;
87 unsigned ntlm:1;
88 unsigned ntlmv2:1;
89 unsigned nullauth:1; /* attempt to authenticate with null user */
90 unsigned sign:1;
91 unsigned seal:1; /* encrypt */
85 unsigned nocase; /* request case insensitive filenames */ 92 unsigned nocase; /* request case insensitive filenames */
86 unsigned nobrl; /* disable sending byte range locks to srv */ 93 unsigned nobrl; /* disable sending byte range locks to srv */
87 unsigned int rsize; 94 unsigned int rsize;
@@ -508,7 +515,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
508 /* else length ok */ 515 /* else length ok */
509 reconnect = 0; 516 reconnect = 0;
510 517
511 if(pdu_length > MAX_CIFS_HDR_SIZE - 4) { 518 if(pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
512 isLargeBuf = TRUE; 519 isLargeBuf = TRUE;
513 memcpy(bigbuf, smallbuf, 4); 520 memcpy(bigbuf, smallbuf, 4);
514 smb_buffer = bigbuf; 521 smb_buffer = bigbuf;
@@ -777,7 +784,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
777 784
778 /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ 785 /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
779 vol->rw = TRUE; 786 vol->rw = TRUE;
780 787 vol->ntlm = TRUE;
781 /* default is always to request posix paths. */ 788 /* default is always to request posix paths. */
782 vol->posix_paths = 1; 789 vol->posix_paths = 1;
783 790
@@ -903,6 +910,39 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
903 printk(KERN_WARNING "CIFS: ip address too long\n"); 910 printk(KERN_WARNING "CIFS: ip address too long\n");
904 return 1; 911 return 1;
905 } 912 }
913 } else if (strnicmp(data, "sec", 3) == 0) {
914 if (!value || !*value) {
915 cERROR(1,("no security value specified"));
916 continue;
917 } else if (strnicmp(value, "krb5i", 5) == 0) {
918 vol->sign = 1;
919 vol->krb5 = 1;
920 } else if (strnicmp(value, "krb5p", 5) == 0) {
921 /* vol->seal = 1;
922 vol->krb5 = 1; */
923 cERROR(1,("Krb5 cifs privacy not supported"));
924 return 1;
925 } else if (strnicmp(value, "krb5", 4) == 0) {
926 vol->krb5 = 1;
927 } else if (strnicmp(value, "ntlmv2i", 7) == 0) {
928 vol->ntlmv2 = 1;
929 vol->sign = 1;
930 } else if (strnicmp(value, "ntlmv2", 6) == 0) {
931 vol->ntlmv2 = 1;
932 } else if (strnicmp(value, "ntlmi", 5) == 0) {
933 vol->ntlm = 1;
934 vol->sign = 1;
935 } else if (strnicmp(value, "ntlm", 4) == 0) {
936 /* ntlm is default so can be turned off too */
937 vol->ntlm = 1;
938 } else if (strnicmp(value, "nontlm", 6) == 0) {
939 vol->ntlm = 0;
940 } else if (strnicmp(value, "none", 4) == 0) {
941 vol->nullauth = 1;
942 } else {
943 cERROR(1,("bad security option: %s", value));
944 return 1;
945 }
906 } else if ((strnicmp(data, "unc", 3) == 0) 946 } else if ((strnicmp(data, "unc", 3) == 0)
907 || (strnicmp(data, "target", 6) == 0) 947 || (strnicmp(data, "target", 6) == 0)
908 || (strnicmp(data, "path", 4) == 0)) { 948 || (strnicmp(data, "path", 4) == 0)) {
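
Review note: the strnicmp() chain above maps each sec= value onto the new
smb_vol security flags. A compact restatement follows (illustrative only,
not part of the patch; the flag names are the ones added to struct smb_vol
in the first hunk, and longer names must be matched before their prefixes,
exactly as the chain does):

    /* sec= value -> flags set; the "i" suffix adds mandatory signing */
    static const struct {
            const char *name;
            unsigned krb5:1, ntlm:1, ntlmv2:1, nullauth:1, sign:1;
    } sec_map[] = {
            { "krb5i",   1, 0, 0, 0, 1 },
            { "krb5",    1, 0, 0, 0, 0 },
            { "ntlmv2i", 0, 0, 1, 0, 1 },
            { "ntlmv2",  0, 0, 1, 0, 0 },
            { "ntlmi",   0, 1, 0, 0, 1 },
            { "ntlm",    0, 1, 0, 0, 0 },  /* default: vol->ntlm = TRUE */
            { "none",    0, 0, 0, 1, 0 },  /* null user; krb5p is rejected */
    };
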
@@ -1120,6 +1160,10 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
1120 vol->server_ino = 1; 1160 vol->server_ino = 1;
1121 } else if (strnicmp(data, "noserverino",9) == 0) { 1161 } else if (strnicmp(data, "noserverino",9) == 0) {
1122 vol->server_ino = 0; 1162 vol->server_ino = 0;
1163 } else if (strnicmp(data, "cifsacl",7) == 0) {
1164 vol->cifs_acl = 1;
1165 } else if (strnicmp(data, "nocifsacl", 9) == 0) {
1166 vol->cifs_acl = 0;
1123 } else if (strnicmp(data, "acl",3) == 0) { 1167 } else if (strnicmp(data, "acl",3) == 0) {
1124 vol->no_psx_acl = 0; 1168 vol->no_psx_acl = 0;
1125 } else if (strnicmp(data, "noacl",5) == 0) { 1169 } else if (strnicmp(data, "noacl",5) == 0) {
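
Note: cifsacl/nocifsacl only record intent here; the flag reaches the
superblock in the CIFS_MOUNT_CIFS_ACL hunk further down. A hypothetical
consumer-side test (get_cifs_acl() is a placeholder name, not something
this patch adds):

    if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
            /* fetch the Windows security descriptor for the path and
               map it into i_uid/i_gid/i_mode -- hypothetical helper */
            rc = get_cifs_acl(inode, full_path, cifs_sb, xid);
    }
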
@@ -1546,7 +1590,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1546 cFYI(1, ("Username: %s ", volume_info.username)); 1590 cFYI(1, ("Username: %s ", volume_info.username));
1547 1591
1548 } else { 1592 } else {
1549 cifserror("No username specified "); 1593 cifserror("No username specified");
1550 /* In userspace mount helper we can get user name from alternate 1594 /* In userspace mount helper we can get user name from alternate
1551 locations such as env variables and files on disk */ 1595 locations such as env variables and files on disk */
1552 kfree(volume_info.UNC); 1596 kfree(volume_info.UNC);
@@ -1587,7 +1631,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1587 return -EINVAL; 1631 return -EINVAL;
 1588 } else /* which server's DFS root would we connect to */ { 1632
1589 cERROR(1, 1633 cERROR(1,
1590 ("CIFS mount error: No UNC path (e.g. -o unc=//192.168.1.100/public) specified ")); 1634 ("CIFS mount error: No UNC path (e.g. -o unc=//192.168.1.100/public) specified"));
1591 kfree(volume_info.UNC); 1635 kfree(volume_info.UNC);
1592 kfree(volume_info.password); 1636 kfree(volume_info.password);
1593 FreeXid(xid); 1637 FreeXid(xid);
@@ -1626,7 +1670,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1626 1670
1627 1671
1628 if (srvTcp) { 1672 if (srvTcp) {
1629 cFYI(1, ("Existing tcp session with server found ")); 1673 cFYI(1, ("Existing tcp session with server found"));
1630 } else { /* create socket */ 1674 } else { /* create socket */
1631 if(volume_info.port) 1675 if(volume_info.port)
1632 sin_server.sin_port = htons(volume_info.port); 1676 sin_server.sin_port = htons(volume_info.port);
@@ -1689,11 +1733,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1689 1733
1690 if (existingCifsSes) { 1734 if (existingCifsSes) {
1691 pSesInfo = existingCifsSes; 1735 pSesInfo = existingCifsSes;
1692 cFYI(1, ("Existing smb sess found ")); 1736 cFYI(1, ("Existing smb sess found"));
1693 kfree(volume_info.password); 1737 kfree(volume_info.password);
1694 /* volume_info.UNC freed at end of function */ 1738 /* volume_info.UNC freed at end of function */
1695 } else if (!rc) { 1739 } else if (!rc) {
1696 cFYI(1, ("Existing smb sess not found ")); 1740 cFYI(1, ("Existing smb sess not found"));
1697 pSesInfo = sesInfoAlloc(); 1741 pSesInfo = sesInfoAlloc();
1698 if (pSesInfo == NULL) 1742 if (pSesInfo == NULL)
1699 rc = -ENOMEM; 1743 rc = -ENOMEM;
@@ -1751,7 +1795,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1751 cifs_sb->mnt_gid = volume_info.linux_gid; 1795 cifs_sb->mnt_gid = volume_info.linux_gid;
1752 cifs_sb->mnt_file_mode = volume_info.file_mode; 1796 cifs_sb->mnt_file_mode = volume_info.file_mode;
1753 cifs_sb->mnt_dir_mode = volume_info.dir_mode; 1797 cifs_sb->mnt_dir_mode = volume_info.dir_mode;
1754 cFYI(1,("file mode: 0x%x dir mode: 0x%x",cifs_sb->mnt_file_mode,cifs_sb->mnt_dir_mode)); 1798 cFYI(1,("file mode: 0x%x dir mode: 0x%x",
1799 cifs_sb->mnt_file_mode,cifs_sb->mnt_dir_mode));
1755 1800
1756 if(volume_info.noperm) 1801 if(volume_info.noperm)
1757 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 1802 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -1767,6 +1812,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1767 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; 1812 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
1768 if(volume_info.nobrl) 1813 if(volume_info.nobrl)
1769 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; 1814 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
1815 if(volume_info.cifs_acl)
1816 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
1770 1817
1771 if(volume_info.direct_io) { 1818 if(volume_info.direct_io) {
1772 cFYI(1,("mounting share using direct i/o")); 1819 cFYI(1,("mounting share using direct i/o"));
@@ -1777,7 +1824,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1777 find_unc(sin_server.sin_addr.s_addr, volume_info.UNC, 1824 find_unc(sin_server.sin_addr.s_addr, volume_info.UNC,
1778 volume_info.username); 1825 volume_info.username);
1779 if (tcon) { 1826 if (tcon) {
1780 cFYI(1, ("Found match on UNC path ")); 1827 cFYI(1, ("Found match on UNC path"));
1781 /* we can have only one retry value for a connection 1828 /* we can have only one retry value for a connection
1782 to a share so for resources mounted more than once 1829 to a share so for resources mounted more than once
1783 to the same server share the last value passed in 1830 to the same server share the last value passed in
@@ -1926,7 +1973,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
1926 __u32 capabilities; 1973 __u32 capabilities;
1927 __u16 count; 1974 __u16 count;
1928 1975
1929 cFYI(1, ("In sesssetup ")); 1976 cFYI(1, ("In sesssetup"));
1930 if(ses == NULL) 1977 if(ses == NULL)
1931 return -EINVAL; 1978 return -EINVAL;
1932 user = ses->userName; 1979 user = ses->userName;
@@ -1986,32 +2033,32 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 1986 bytes_returned = 0; /* skip null user */ 2033
1987 else 2034 else
1988 bytes_returned = 2035 bytes_returned =
1989 cifs_strtoUCS((wchar_t *) bcc_ptr, user, 100, 2036 cifs_strtoUCS((__le16 *) bcc_ptr, user, 100,
1990 nls_codepage); 2037 nls_codepage);
1991 /* convert number of 16 bit words to bytes */ 2038 /* convert number of 16 bit words to bytes */
1992 bcc_ptr += 2 * bytes_returned; 2039 bcc_ptr += 2 * bytes_returned;
1993 bcc_ptr += 2; /* trailing null */ 2040 bcc_ptr += 2; /* trailing null */
1994 if (domain == NULL) 2041 if (domain == NULL)
1995 bytes_returned = 2042 bytes_returned =
1996 cifs_strtoUCS((wchar_t *) bcc_ptr, 2043 cifs_strtoUCS((__le16 *) bcc_ptr,
1997 "CIFS_LINUX_DOM", 32, nls_codepage); 2044 "CIFS_LINUX_DOM", 32, nls_codepage);
1998 else 2045 else
1999 bytes_returned = 2046 bytes_returned =
2000 cifs_strtoUCS((wchar_t *) bcc_ptr, domain, 64, 2047 cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64,
2001 nls_codepage); 2048 nls_codepage);
2002 bcc_ptr += 2 * bytes_returned; 2049 bcc_ptr += 2 * bytes_returned;
2003 bcc_ptr += 2; 2050 bcc_ptr += 2;
2004 bytes_returned = 2051 bytes_returned =
2005 cifs_strtoUCS((wchar_t *) bcc_ptr, "Linux version ", 2052 cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
2006 32, nls_codepage); 2053 32, nls_codepage);
2007 bcc_ptr += 2 * bytes_returned; 2054 bcc_ptr += 2 * bytes_returned;
2008 bytes_returned = 2055 bytes_returned =
2009 cifs_strtoUCS((wchar_t *) bcc_ptr, system_utsname.release, 2056 cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release,
2010 32, nls_codepage); 2057 32, nls_codepage);
2011 bcc_ptr += 2 * bytes_returned; 2058 bcc_ptr += 2 * bytes_returned;
2012 bcc_ptr += 2; 2059 bcc_ptr += 2;
2013 bytes_returned = 2060 bytes_returned =
2014 cifs_strtoUCS((wchar_t *) bcc_ptr, CIFS_NETWORK_OPSYS, 2061 cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
2015 64, nls_codepage); 2062 64, nls_codepage);
2016 bcc_ptr += 2 * bytes_returned; 2063 bcc_ptr += 2 * bytes_returned;
2017 bcc_ptr += 2; 2064 bcc_ptr += 2;
@@ -2081,7 +2128,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2081 if(ses->serverOS == NULL) 2128 if(ses->serverOS == NULL)
2082 goto sesssetup_nomem; 2129 goto sesssetup_nomem;
2083 cifs_strfromUCS_le(ses->serverOS, 2130 cifs_strfromUCS_le(ses->serverOS,
2084 (wchar_t *)bcc_ptr, len,nls_codepage); 2131 (__le16 *)bcc_ptr, len,nls_codepage);
2085 bcc_ptr += 2 * (len + 1); 2132 bcc_ptr += 2 * (len + 1);
2086 remaining_words -= len + 1; 2133 remaining_words -= len + 1;
2087 ses->serverOS[2 * len] = 0; 2134 ses->serverOS[2 * len] = 0;
@@ -2093,7 +2140,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2093 if(ses->serverNOS == NULL) 2140 if(ses->serverNOS == NULL)
2094 goto sesssetup_nomem; 2141 goto sesssetup_nomem;
2095 cifs_strfromUCS_le(ses->serverNOS, 2142 cifs_strfromUCS_le(ses->serverNOS,
2096 (wchar_t *)bcc_ptr,len,nls_codepage); 2143 (__le16 *)bcc_ptr,len,nls_codepage);
2097 bcc_ptr += 2 * (len + 1); 2144 bcc_ptr += 2 * (len + 1);
2098 ses->serverNOS[2 * len] = 0; 2145 ses->serverNOS[2 * len] = 0;
2099 ses->serverNOS[1 + (2 * len)] = 0; 2146 ses->serverNOS[1 + (2 * len)] = 0;
@@ -2111,7 +2158,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2111 if(ses->serverDomain == NULL) 2158 if(ses->serverDomain == NULL)
2112 goto sesssetup_nomem; 2159 goto sesssetup_nomem;
2113 cifs_strfromUCS_le(ses->serverDomain, 2160 cifs_strfromUCS_le(ses->serverDomain,
2114 (wchar_t *)bcc_ptr,len,nls_codepage); 2161 (__le16 *)bcc_ptr,len,nls_codepage);
2115 bcc_ptr += 2 * (len + 1); 2162 bcc_ptr += 2 * (len + 1);
2116 ses->serverDomain[2*len] = 0; 2163 ses->serverDomain[2*len] = 0;
2117 ses->serverDomain[1+(2*len)] = 0; 2164 ses->serverDomain[1+(2*len)] = 0;
@@ -2255,30 +2302,30 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2255 bcc_ptr++; 2302 bcc_ptr++;
2256 } 2303 }
2257 bytes_returned = 2304 bytes_returned =
2258 cifs_strtoUCS((wchar_t *) bcc_ptr, user, 100, nls_codepage); 2305 cifs_strtoUCS((__le16 *) bcc_ptr, user, 100, nls_codepage);
2259 bcc_ptr += 2 * bytes_returned; /* convert num of 16 bit words to bytes */ 2306 bcc_ptr += 2 * bytes_returned; /* convert num of 16 bit words to bytes */
2260 bcc_ptr += 2; /* trailing null */ 2307 bcc_ptr += 2; /* trailing null */
2261 if (domain == NULL) 2308 if (domain == NULL)
2262 bytes_returned = 2309 bytes_returned =
2263 cifs_strtoUCS((wchar_t *) bcc_ptr, 2310 cifs_strtoUCS((__le16 *) bcc_ptr,
2264 "CIFS_LINUX_DOM", 32, nls_codepage); 2311 "CIFS_LINUX_DOM", 32, nls_codepage);
2265 else 2312 else
2266 bytes_returned = 2313 bytes_returned =
2267 cifs_strtoUCS((wchar_t *) bcc_ptr, domain, 64, 2314 cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64,
2268 nls_codepage); 2315 nls_codepage);
2269 bcc_ptr += 2 * bytes_returned; 2316 bcc_ptr += 2 * bytes_returned;
2270 bcc_ptr += 2; 2317 bcc_ptr += 2;
2271 bytes_returned = 2318 bytes_returned =
2272 cifs_strtoUCS((wchar_t *) bcc_ptr, "Linux version ", 2319 cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
2273 32, nls_codepage); 2320 32, nls_codepage);
2274 bcc_ptr += 2 * bytes_returned; 2321 bcc_ptr += 2 * bytes_returned;
2275 bytes_returned = 2322 bytes_returned =
2276 cifs_strtoUCS((wchar_t *) bcc_ptr, system_utsname.release, 32, 2323 cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32,
2277 nls_codepage); 2324 nls_codepage);
2278 bcc_ptr += 2 * bytes_returned; 2325 bcc_ptr += 2 * bytes_returned;
2279 bcc_ptr += 2; 2326 bcc_ptr += 2;
2280 bytes_returned = 2327 bytes_returned =
2281 cifs_strtoUCS((wchar_t *) bcc_ptr, CIFS_NETWORK_OPSYS, 2328 cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
2282 64, nls_codepage); 2329 64, nls_codepage);
2283 bcc_ptr += 2 * bytes_returned; 2330 bcc_ptr += 2 * bytes_returned;
2284 bcc_ptr += 2; 2331 bcc_ptr += 2;
@@ -2357,7 +2404,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2357 ses->serverOS = 2404 ses->serverOS =
2358 kzalloc(2 * (len + 1), GFP_KERNEL); 2405 kzalloc(2 * (len + 1), GFP_KERNEL);
2359 cifs_strfromUCS_le(ses->serverOS, 2406 cifs_strfromUCS_le(ses->serverOS,
2360 (wchar_t *) 2407 (__le16 *)
2361 bcc_ptr, len, 2408 bcc_ptr, len,
2362 nls_codepage); 2409 nls_codepage);
2363 bcc_ptr += 2 * (len + 1); 2410 bcc_ptr += 2 * (len + 1);
@@ -2372,7 +2419,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2372 kzalloc(2 * (len + 1), 2419 kzalloc(2 * (len + 1),
2373 GFP_KERNEL); 2420 GFP_KERNEL);
2374 cifs_strfromUCS_le(ses->serverNOS, 2421 cifs_strfromUCS_le(ses->serverNOS,
2375 (wchar_t *)bcc_ptr, 2422 (__le16 *)bcc_ptr,
2376 len, 2423 len,
2377 nls_codepage); 2424 nls_codepage);
2378 bcc_ptr += 2 * (len + 1); 2425 bcc_ptr += 2 * (len + 1);
@@ -2384,9 +2431,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 2384 /* last string is not always null terminated (e.g. for Windows XP & 2000) */ 2431
2385 ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL); 2432 ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL);
2386 cifs_strfromUCS_le(ses->serverDomain, 2433 cifs_strfromUCS_le(ses->serverDomain,
2387 (wchar_t *)bcc_ptr, 2434 (__le16 *)bcc_ptr,
2388 len, 2435 len, nls_codepage);
2389 nls_codepage);
2390 bcc_ptr += 2*(len+1); 2436 bcc_ptr += 2*(len+1);
2391 ses->serverDomain[2*len] = 0; 2437 ses->serverDomain[2*len] = 0;
2392 ses->serverDomain[1+(2*len)] = 0; 2438 ses->serverDomain[1+(2*len)] = 0;
@@ -2560,16 +2606,16 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
2560 } 2606 }
2561 2607
2562 bytes_returned = 2608 bytes_returned =
2563 cifs_strtoUCS((wchar_t *) bcc_ptr, "Linux version ", 2609 cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
2564 32, nls_codepage); 2610 32, nls_codepage);
2565 bcc_ptr += 2 * bytes_returned; 2611 bcc_ptr += 2 * bytes_returned;
2566 bytes_returned = 2612 bytes_returned =
2567 cifs_strtoUCS((wchar_t *) bcc_ptr, system_utsname.release, 32, 2613 cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32,
2568 nls_codepage); 2614 nls_codepage);
2569 bcc_ptr += 2 * bytes_returned; 2615 bcc_ptr += 2 * bytes_returned;
2570 bcc_ptr += 2; /* null terminate Linux version */ 2616 bcc_ptr += 2; /* null terminate Linux version */
2571 bytes_returned = 2617 bytes_returned =
2572 cifs_strtoUCS((wchar_t *) bcc_ptr, CIFS_NETWORK_OPSYS, 2618 cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
2573 64, nls_codepage); 2619 64, nls_codepage);
2574 bcc_ptr += 2 * bytes_returned; 2620 bcc_ptr += 2 * bytes_returned;
2575 *(bcc_ptr + 1) = 0; 2621 *(bcc_ptr + 1) = 0;
@@ -2673,7 +2719,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
2673 ses->serverOS = 2719 ses->serverOS =
2674 kzalloc(2 * (len + 1), GFP_KERNEL); 2720 kzalloc(2 * (len + 1), GFP_KERNEL);
2675 cifs_strfromUCS_le(ses->serverOS, 2721 cifs_strfromUCS_le(ses->serverOS,
2676 (wchar_t *) 2722 (__le16 *)
2677 bcc_ptr, len, 2723 bcc_ptr, len,
2678 nls_codepage); 2724 nls_codepage);
2679 bcc_ptr += 2 * (len + 1); 2725 bcc_ptr += 2 * (len + 1);
@@ -2690,7 +2736,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
2690 GFP_KERNEL); 2736 GFP_KERNEL);
2691 cifs_strfromUCS_le(ses-> 2737 cifs_strfromUCS_le(ses->
2692 serverNOS, 2738 serverNOS,
2693 (wchar_t *) 2739 (__le16 *)
2694 bcc_ptr, 2740 bcc_ptr,
2695 len, 2741 len,
2696 nls_codepage); 2742 nls_codepage);
@@ -2708,23 +2754,15 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
2708 1), 2754 1),
2709 GFP_KERNEL); 2755 GFP_KERNEL);
2710 cifs_strfromUCS_le 2756 cifs_strfromUCS_le
2711 (ses-> 2757 (ses->serverDomain,
2712 serverDomain, 2758 (__le16 *)bcc_ptr,
2713 (wchar_t *) 2759 len, nls_codepage);
2714 bcc_ptr, len,
2715 nls_codepage);
2716 bcc_ptr += 2760 bcc_ptr +=
2717 2 * (len + 1); 2761 2 * (len + 1);
2718 ses-> 2762 ses->serverDomain[2*len]
2719 serverDomain[2
2720 * len]
2721 = 0; 2763 = 0;
2722 ses-> 2764 ses->serverDomain
2723 serverDomain[1 2765 [1 + (2 * len)]
2724 +
2725 (2
2726 *
2727 len)]
2728 = 0; 2766 = 0;
2729 } /* else no more room so create dummy domain string */ 2767 } /* else no more room so create dummy domain string */
2730 else 2768 else
@@ -2903,7 +2941,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2903 SecurityBlob->DomainName.MaximumLength = 0; 2941 SecurityBlob->DomainName.MaximumLength = 0;
2904 } else { 2942 } else {
2905 __u16 len = 2943 __u16 len =
2906 cifs_strtoUCS((wchar_t *) bcc_ptr, domain, 64, 2944 cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64,
2907 nls_codepage); 2945 nls_codepage);
2908 len *= 2; 2946 len *= 2;
2909 SecurityBlob->DomainName.MaximumLength = 2947 SecurityBlob->DomainName.MaximumLength =
@@ -2921,7 +2959,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2921 SecurityBlob->UserName.MaximumLength = 0; 2959 SecurityBlob->UserName.MaximumLength = 0;
2922 } else { 2960 } else {
2923 __u16 len = 2961 __u16 len =
2924 cifs_strtoUCS((wchar_t *) bcc_ptr, user, 64, 2962 cifs_strtoUCS((__le16 *) bcc_ptr, user, 64,
2925 nls_codepage); 2963 nls_codepage);
2926 len *= 2; 2964 len *= 2;
2927 SecurityBlob->UserName.MaximumLength = 2965 SecurityBlob->UserName.MaximumLength =
@@ -2934,7 +2972,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2934 cpu_to_le16(len); 2972 cpu_to_le16(len);
2935 } 2973 }
2936 2974
2937 /* SecurityBlob->WorkstationName.Length = cifs_strtoUCS((wchar_t *) bcc_ptr, "AMACHINE",64, nls_codepage); 2975 /* SecurityBlob->WorkstationName.Length = cifs_strtoUCS((__le16 *) bcc_ptr, "AMACHINE",64, nls_codepage);
2938 SecurityBlob->WorkstationName.Length *= 2; 2976 SecurityBlob->WorkstationName.Length *= 2;
2939 SecurityBlob->WorkstationName.MaximumLength = cpu_to_le16(SecurityBlob->WorkstationName.Length); 2977 SecurityBlob->WorkstationName.MaximumLength = cpu_to_le16(SecurityBlob->WorkstationName.Length);
2940 SecurityBlob->WorkstationName.Buffer = cpu_to_le32(SecurityBlobLength); 2978 SecurityBlob->WorkstationName.Buffer = cpu_to_le32(SecurityBlobLength);
@@ -2947,16 +2985,16 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2947 bcc_ptr++; 2985 bcc_ptr++;
2948 } 2986 }
2949 bytes_returned = 2987 bytes_returned =
2950 cifs_strtoUCS((wchar_t *) bcc_ptr, "Linux version ", 2988 cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
2951 32, nls_codepage); 2989 32, nls_codepage);
2952 bcc_ptr += 2 * bytes_returned; 2990 bcc_ptr += 2 * bytes_returned;
2953 bytes_returned = 2991 bytes_returned =
2954 cifs_strtoUCS((wchar_t *) bcc_ptr, system_utsname.release, 32, 2992 cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32,
2955 nls_codepage); 2993 nls_codepage);
2956 bcc_ptr += 2 * bytes_returned; 2994 bcc_ptr += 2 * bytes_returned;
2957 bcc_ptr += 2; /* null term version string */ 2995 bcc_ptr += 2; /* null term version string */
2958 bytes_returned = 2996 bytes_returned =
2959 cifs_strtoUCS((wchar_t *) bcc_ptr, CIFS_NETWORK_OPSYS, 2997 cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
2960 64, nls_codepage); 2998 64, nls_codepage);
2961 bcc_ptr += 2 * bytes_returned; 2999 bcc_ptr += 2 * bytes_returned;
2962 *(bcc_ptr + 1) = 0; 3000 *(bcc_ptr + 1) = 0;
@@ -3069,7 +3107,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
3069 ses->serverOS = 3107 ses->serverOS =
3070 kzalloc(2 * (len + 1), GFP_KERNEL); 3108 kzalloc(2 * (len + 1), GFP_KERNEL);
3071 cifs_strfromUCS_le(ses->serverOS, 3109 cifs_strfromUCS_le(ses->serverOS,
3072 (wchar_t *) 3110 (__le16 *)
3073 bcc_ptr, len, 3111 bcc_ptr, len,
3074 nls_codepage); 3112 nls_codepage);
3075 bcc_ptr += 2 * (len + 1); 3113 bcc_ptr += 2 * (len + 1);
@@ -3086,7 +3124,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
3086 GFP_KERNEL); 3124 GFP_KERNEL);
3087 cifs_strfromUCS_le(ses-> 3125 cifs_strfromUCS_le(ses->
3088 serverNOS, 3126 serverNOS,
3089 (wchar_t *) 3127 (__le16 *)
3090 bcc_ptr, 3128 bcc_ptr,
3091 len, 3129 len,
3092 nls_codepage); 3130 nls_codepage);
@@ -3105,7 +3143,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
3105 cifs_strfromUCS_le 3143 cifs_strfromUCS_le
3106 (ses-> 3144 (ses->
3107 serverDomain, 3145 serverDomain,
3108 (wchar_t *) 3146 (__le16 *)
3109 bcc_ptr, len, 3147 bcc_ptr, len,
3110 nls_codepage); 3148 nls_codepage);
3111 bcc_ptr += 3149 bcc_ptr +=
@@ -3211,9 +3249,26 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3211 3249
3212 pSMB->AndXCommand = 0xFF; 3250 pSMB->AndXCommand = 0xFF;
3213 pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); 3251 pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO);
3214 pSMB->PasswordLength = cpu_to_le16(1); /* minimum */
3215 bcc_ptr = &pSMB->Password[0]; 3252 bcc_ptr = &pSMB->Password[0];
3216 bcc_ptr++; /* skip password */ 3253 if((ses->server->secMode) & SECMODE_USER) {
3254 pSMB->PasswordLength = cpu_to_le16(1); /* minimum */
3255 bcc_ptr++; /* skip password */
3256 } else {
3257 pSMB->PasswordLength = cpu_to_le16(CIFS_SESSION_KEY_SIZE);
3258 /* BB FIXME add code to fail this if NTLMv2 or Kerberos
3259 specified as required (when that support is added to
3260 the vfs in the future) as only NTLM or the much
3261 weaker LANMAN (which we do not send) is accepted
3262 by Samba (not sure whether other servers allow
3263 NTLMv2 password here) */
3264 SMBNTencrypt(ses->password,
3265 ses->server->cryptKey,
3266 bcc_ptr);
3267
3268 bcc_ptr += CIFS_SESSION_KEY_SIZE;
3269 *bcc_ptr = 0;
3270 bcc_ptr++; /* align */
3271 }
3217 3272
3218 if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 3273 if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
3219 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 3274 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
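
Review note: with share-level security the password now travels in the
tree connect request itself. The layout built by the else branch above,
assuming CIFS_SESSION_KEY_SIZE is the 24-byte NTLM response length that
the SMBNTencrypt() call implies:

    /* Password[0..23]  SMBNTencrypt(ses->password, cryptKey) output
     * Password[24]     0x00 pad byte (the "align" increment above)
     * then             tree name, UCS-2 if CAP_UNICODE else ASCII
     * With SECMODE_USER the block shrinks to a single null byte, since
     * the session rather than the tree connect carries credentials. */
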
@@ -3227,11 +3282,10 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3227 if (ses->capabilities & CAP_UNICODE) { 3282 if (ses->capabilities & CAP_UNICODE) {
3228 smb_buffer->Flags2 |= SMBFLG2_UNICODE; 3283 smb_buffer->Flags2 |= SMBFLG2_UNICODE;
3229 length = 3284 length =
3230 cifs_strtoUCS((wchar_t *) bcc_ptr, tree, 100, nls_codepage); 3285 cifs_strtoUCS((__le16 *) bcc_ptr, tree, 100, nls_codepage);
3231 bcc_ptr += 2 * length; /* convert num of 16 bit words to bytes */ 3286 bcc_ptr += 2 * length; /* convert num of 16 bit words to bytes */
3232 bcc_ptr += 2; /* skip trailing null */ 3287 bcc_ptr += 2; /* skip trailing null */
3233 } else { /* ASCII */ 3288 } else { /* ASCII */
3234
3235 strcpy(bcc_ptr, tree); 3289 strcpy(bcc_ptr, tree);
3236 bcc_ptr += strlen(tree) + 1; 3290 bcc_ptr += strlen(tree) + 1;
3237 } 3291 }
@@ -3263,7 +3317,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3263 tcon->nativeFileSystem = 3317 tcon->nativeFileSystem =
3264 kzalloc(length + 2, GFP_KERNEL); 3318 kzalloc(length + 2, GFP_KERNEL);
3265 cifs_strfromUCS_le(tcon->nativeFileSystem, 3319 cifs_strfromUCS_le(tcon->nativeFileSystem,
3266 (wchar_t *) bcc_ptr, 3320 (__le16 *) bcc_ptr,
3267 length, nls_codepage); 3321 length, nls_codepage);
3268 bcc_ptr += 2 * length; 3322 bcc_ptr += 2 * length;
3269 bcc_ptr[0] = 0; /* null terminate the string */ 3323 bcc_ptr[0] = 0; /* null terminate the string */
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 8dfe717a332..fed55e3c53d 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * vfs operations that deal with dentries 4 * vfs operations that deal with dentries
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2002,2003 6 * Copyright (C) International Business Machines Corp., 2002,2005
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * 8 *
9 * This library is free software; you can redistribute it and/or modify 9 * This library is free software; you can redistribute it and/or modify
@@ -200,8 +200,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
200 (oplock & CIFS_CREATE_ACTION)) 200 (oplock & CIFS_CREATE_ACTION))
201 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { 201 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
202 CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode, 202 CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode,
203 (__u64)current->euid, 203 (__u64)current->fsuid,
204 (__u64)current->egid, 204 (__u64)current->fsgid,
205 0 /* dev */, 205 0 /* dev */,
206 cifs_sb->local_nls, 206 cifs_sb->local_nls,
207 cifs_sb->mnt_cifs_flags & 207 cifs_sb->mnt_cifs_flags &
@@ -228,8 +228,15 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
228 else { 228 else {
229 rc = cifs_get_inode_info(&newinode, full_path, 229 rc = cifs_get_inode_info(&newinode, full_path,
230 buf, inode->i_sb,xid); 230 buf, inode->i_sb,xid);
231 if(newinode) 231 if(newinode) {
232 newinode->i_mode = mode; 232 newinode->i_mode = mode;
233 if((oplock & CIFS_CREATE_ACTION) &&
234 (cifs_sb->mnt_cifs_flags &
235 CIFS_MOUNT_SET_UID)) {
236 newinode->i_uid = current->fsuid;
237 newinode->i_gid = current->fsgid;
238 }
239 }
233 } 240 }
234 241
235 if (rc != 0) { 242 if (rc != 0) {
@@ -292,7 +299,8 @@ cifs_create_out:
292 return rc; 299 return rc;
293} 300}
294 301
295int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, dev_t device_number) 302int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
303 dev_t device_number)
296{ 304{
297 int rc = -EPERM; 305 int rc = -EPERM;
298 int xid; 306 int xid;
@@ -317,7 +325,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, dev_t dev
317 else if (pTcon->ses->capabilities & CAP_UNIX) { 325 else if (pTcon->ses->capabilities & CAP_UNIX) {
318 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { 326 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
319 rc = CIFSSMBUnixSetPerms(xid, pTcon, full_path, 327 rc = CIFSSMBUnixSetPerms(xid, pTcon, full_path,
320 mode,(__u64)current->euid,(__u64)current->egid, 328 mode,(__u64)current->fsuid,(__u64)current->fsgid,
321 device_number, cifs_sb->local_nls, 329 device_number, cifs_sb->local_nls,
322 cifs_sb->mnt_cifs_flags & 330 cifs_sb->mnt_cifs_flags &
323 CIFS_MOUNT_MAP_SPECIAL_CHR); 331 CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -368,7 +376,34 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, dev_t dev
368 376
369 if(!rc) { 377 if(!rc) {
370 /* BB Do not bother to decode buf since no 378 /* BB Do not bother to decode buf since no
371 local inode yet to put timestamps in */ 379 local inode yet to put timestamps in,
380 but we can reuse it safely */
381 int bytes_written;
382 struct win_dev *pdev;
383 pdev = (struct win_dev *)buf;
384 if(S_ISCHR(mode)) {
385 memcpy(pdev->type, "IntxCHR", 8);
386 pdev->major =
387 cpu_to_le64(MAJOR(device_number));
388 pdev->minor =
389 cpu_to_le64(MINOR(device_number));
390 rc = CIFSSMBWrite(xid, pTcon,
391 fileHandle,
392 sizeof(struct win_dev),
393 0, &bytes_written, (char *)pdev,
394 NULL, 0);
395 } else if(S_ISBLK(mode)) {
396 memcpy(pdev->type, "IntxBLK", 8);
397 pdev->major =
398 cpu_to_le64(MAJOR(device_number));
399 pdev->minor =
400 cpu_to_le64(MINOR(device_number));
401 rc = CIFSSMBWrite(xid, pTcon,
402 fileHandle,
403 sizeof(struct win_dev),
404 0, &bytes_written, (char *)pdev,
405 NULL, 0);
406 } /* else if(S_ISFIFO */
372 CIFSSMBClose(xid, pTcon, fileHandle); 407 CIFSSMBClose(xid, pTcon, fileHandle);
373 d_drop(direntry); 408 d_drop(direntry);
374 } 409 }
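
Both device branches above write the same 24-byte header and differ only
in the magic string. The layout implied by the memcpy()/cpu_to_le64()
calls (an inference from the accesses, not a quote of the header file):

    struct win_dev {
            unsigned char type[8];  /* "IntxCHR" or "IntxBLK" plus NUL */
            __le64 major;
            __le64 minor;
    } __attribute__((packed));      /* 24 bytes; decode_sfu_inode() in
                                       fs/cifs/inode.c reads them back */
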
@@ -437,12 +472,20 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct name
437 direntry->d_op = &cifs_dentry_ops; 472 direntry->d_op = &cifs_dentry_ops;
438 d_add(direntry, newInode); 473 d_add(direntry, newInode);
439 474
440 /* since paths are not looked up by component - the parent directories are presumed to be good here */ 475 /* since paths are not looked up by component - the parent
476 directories are presumed to be good here */
441 renew_parental_timestamps(direntry); 477 renew_parental_timestamps(direntry);
442 478
443 } else if (rc == -ENOENT) { 479 } else if (rc == -ENOENT) {
444 rc = 0; 480 rc = 0;
481 direntry->d_time = jiffies;
482 if (pTcon->nocase)
483 direntry->d_op = &cifs_ci_dentry_ops;
484 else
485 direntry->d_op = &cifs_dentry_ops;
445 d_add(direntry, NULL); 486 d_add(direntry, NULL);
487 /* if it was once a directory (but how can we tell?) we could do
488 shrink_dcache_parent(direntry); */
446 } else { 489 } else {
447 cERROR(1,("Error 0x%x on cifs_get_inode_info in lookup of %s", 490 cERROR(1,("Error 0x%x on cifs_get_inode_info in lookup of %s",
448 rc,full_path)); 491 rc,full_path));
@@ -461,21 +504,20 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
461{ 504{
462 int isValid = 1; 505 int isValid = 1;
463 506
464/* lock_kernel(); *//* surely we do not want to lock the kernel for a whole network round trip which could take seconds */
465
466 if (direntry->d_inode) { 507 if (direntry->d_inode) {
467 if (cifs_revalidate(direntry)) { 508 if (cifs_revalidate(direntry)) {
468 /* unlock_kernel(); */
469 return 0; 509 return 0;
470 } 510 }
471 } else { 511 } else {
472 cFYI(1, 512 cFYI(1, ("neg dentry 0x%p name = %s",
473 ("In cifs_d_revalidate with no inode but name = %s and dentry 0x%p", 513 direntry, direntry->d_name.name));
474 direntry->d_name.name, direntry)); 514 if(time_after(jiffies, direntry->d_time + HZ) ||
515 !lookupCacheEnabled) {
516 d_drop(direntry);
517 isValid = 0;
518 }
475 } 519 }
476 520
477/* unlock_kernel(); */
478
479 return isValid; 521 return isValid;
480} 522}
481 523
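
Review note: negative dentries are now cached with a short lifetime
instead of being dropped and rebuilt on every lookup. The policy above,
restated as a hypothetical helper:

    static int cifs_neg_dentry_valid(struct dentry *direntry)
    {
            if (!lookupCacheEnabled)
                    return 0;       /* never trust a cached ENOENT */
            /* trust the cached miss for about one second (HZ jiffies) */
            return !time_after(jiffies, direntry->d_time + HZ);
    }
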
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index da4f5e10b3c..77c990f0cb9 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -127,8 +127,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
127 if (file->f_dentry->d_inode->i_mapping) { 127 if (file->f_dentry->d_inode->i_mapping) {
128 /* BB no need to lock inode until after invalidate 128 /* BB no need to lock inode until after invalidate
129 since namei code should already have it locked? */ 129 since namei code should already have it locked? */
130 filemap_fdatawrite(file->f_dentry->d_inode->i_mapping); 130 filemap_write_and_wait(file->f_dentry->d_inode->i_mapping);
131 filemap_fdatawait(file->f_dentry->d_inode->i_mapping);
132 } 131 }
133 cFYI(1, ("invalidating remote inode since open detected it " 132 cFYI(1, ("invalidating remote inode since open detected it "
134 "changed")); 133 "changed"));
@@ -419,8 +418,7 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
419 pCifsInode = CIFS_I(inode); 418 pCifsInode = CIFS_I(inode);
420 if (pCifsInode) { 419 if (pCifsInode) {
421 if (can_flush) { 420 if (can_flush) {
422 filemap_fdatawrite(inode->i_mapping); 421 filemap_write_and_wait(inode->i_mapping);
423 filemap_fdatawait(inode->i_mapping);
424 /* temporarily disable caching while we 422 /* temporarily disable caching while we
425 go to server to get inode info */ 423 go to server to get inode info */
426 pCifsInode->clientCanCacheAll = FALSE; 424 pCifsInode->clientCanCacheAll = FALSE;
@@ -489,8 +487,10 @@ int cifs_close(struct inode *inode, struct file *file)
489 the struct would be in each open file, 487 the struct would be in each open file,
490 but this should give enough time to 488 but this should give enough time to
491 clear the socket */ 489 clear the socket */
490 write_unlock(&file->f_owner.lock);
492 cERROR(1,("close with pending writes")); 491 cERROR(1,("close with pending writes"));
493 msleep(timeout); 492 msleep(timeout);
493 write_lock(&file->f_owner.lock);
494 timeout *= 4; 494 timeout *= 4;
495 } 495 }
496 write_unlock(&file->f_owner.lock); 496 write_unlock(&file->f_owner.lock);
@@ -553,13 +553,13 @@ int cifs_closedir(struct inode *inode, struct file *file)
553 } 553 }
554 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; 554 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
555 if (ptmp) { 555 if (ptmp) {
556 /* BB removeme BB */ cFYI(1, ("freeing smb buf in srch struct in closedir")); 556 cFYI(1, ("closedir free smb buf in srch struct"));
557 pCFileStruct->srch_inf.ntwrk_buf_start = NULL; 557 pCFileStruct->srch_inf.ntwrk_buf_start = NULL;
558 cifs_buf_release(ptmp); 558 cifs_buf_release(ptmp);
559 } 559 }
560 ptmp = pCFileStruct->search_resume_name; 560 ptmp = pCFileStruct->search_resume_name;
561 if (ptmp) { 561 if (ptmp) {
562 /* BB removeme BB */ cFYI(1, ("freeing resume name in closedir")); 562 cFYI(1, ("closedir free resume name"));
563 pCFileStruct->search_resume_name = NULL; 563 pCFileStruct->search_resume_name = NULL;
564 kfree(ptmp); 564 kfree(ptmp);
565 } 565 }
@@ -868,10 +868,9 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
868 if (rc != 0) 868 if (rc != 0)
869 break; 869 break;
870 } 870 }
871#ifdef CONFIG_CIFS_EXPERIMENTAL
872 /* BB FIXME We can not sign across two buffers yet */ 871 /* BB FIXME We can not sign across two buffers yet */
873 if((experimEnabled) && ((pTcon->ses->server->secMode & 872 if((pTcon->ses->server->secMode &
874 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) == 0)) { 873 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) == 0) {
875 struct kvec iov[2]; 874 struct kvec iov[2];
876 unsigned int len; 875 unsigned int len;
877 876
@@ -887,7 +886,6 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
887 iov, 1, long_op); 886 iov, 1, long_op);
888 } else 887 } else
889 /* BB FIXME fixup indentation of line below */ 888 /* BB FIXME fixup indentation of line below */
890#endif
891 rc = CIFSSMBWrite(xid, pTcon, 889 rc = CIFSSMBWrite(xid, pTcon,
892 open_file->netfid, 890 open_file->netfid,
893 min_t(const int, cifs_sb->wsize, 891 min_t(const int, cifs_sb->wsize,
@@ -1024,7 +1022,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1024 return rc; 1022 return rc;
1025} 1023}
1026 1024
1027#ifdef CONFIG_CIFS_EXPERIMENTAL
1028static int cifs_writepages(struct address_space *mapping, 1025static int cifs_writepages(struct address_space *mapping,
1029 struct writeback_control *wbc) 1026 struct writeback_control *wbc)
1030{ 1027{
@@ -1227,7 +1224,6 @@ retry:
1227 1224
1228 return rc; 1225 return rc;
1229} 1226}
1230#endif
1231 1227
1232static int cifs_writepage(struct page* page, struct writeback_control *wbc) 1228static int cifs_writepage(struct page* page, struct writeback_control *wbc)
1233{ 1229{
@@ -1426,6 +1422,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1426 rc = -EAGAIN; 1422 rc = -EAGAIN;
1427 smb_read_data = NULL; 1423 smb_read_data = NULL;
1428 while (rc == -EAGAIN) { 1424 while (rc == -EAGAIN) {
1425 int buf_type = CIFS_NO_BUFFER;
1429 if ((open_file->invalidHandle) && 1426 if ((open_file->invalidHandle) &&
1430 (!open_file->closePend)) { 1427 (!open_file->closePend)) {
1431 rc = cifs_reopen_file(file->f_dentry->d_inode, 1428 rc = cifs_reopen_file(file->f_dentry->d_inode,
@@ -1434,20 +1431,22 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1434 break; 1431 break;
1435 } 1432 }
1436 rc = CIFSSMBRead(xid, pTcon, 1433 rc = CIFSSMBRead(xid, pTcon,
1437 open_file->netfid, 1434 open_file->netfid,
1438 current_read_size, *poffset, 1435 current_read_size, *poffset,
1439 &bytes_read, &smb_read_data); 1436 &bytes_read, &smb_read_data,
1437 &buf_type);
1440 pSMBr = (struct smb_com_read_rsp *)smb_read_data; 1438 pSMBr = (struct smb_com_read_rsp *)smb_read_data;
1441 if (copy_to_user(current_offset, 1439 if (copy_to_user(current_offset,
1442 smb_read_data + 4 /* RFC1001 hdr */ 1440 smb_read_data + 4 /* RFC1001 hdr */
1443 + le16_to_cpu(pSMBr->DataOffset), 1441 + le16_to_cpu(pSMBr->DataOffset),
1444 bytes_read)) { 1442 bytes_read)) {
1445 rc = -EFAULT; 1443 rc = -EFAULT;
1446 FreeXid(xid); 1444 }
1447 return rc;
1448 }
1449 if (smb_read_data) { 1445 if (smb_read_data) {
1450 cifs_buf_release(smb_read_data); 1446 if(buf_type == CIFS_SMALL_BUFFER)
1447 cifs_small_buf_release(smb_read_data);
1448 else if(buf_type == CIFS_LARGE_BUFFER)
1449 cifs_buf_release(smb_read_data);
1451 smb_read_data = NULL; 1450 smb_read_data = NULL;
1452 } 1451 }
1453 } 1452 }
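
The small/large buffer release pattern introduced here repeats in
cifs_read() and cifs_readpages() below. A hypothetical helper that would
factor it out (the patch open-codes it at each call site):

    static void cifs_release_read_buf(int buf_type, char *buf)
    {
            if (buf == NULL)
                    return;
            if (buf_type == CIFS_SMALL_BUFFER)
                    cifs_small_buf_release(buf);
            else if (buf_type == CIFS_LARGE_BUFFER)
                    cifs_buf_release(buf);
            /* CIFS_NO_BUFFER: data was copied out; nothing to free */
    }
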
@@ -1480,6 +1479,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1480 int xid; 1479 int xid;
1481 char *current_offset; 1480 char *current_offset;
1482 struct cifsFileInfo *open_file; 1481 struct cifsFileInfo *open_file;
1482 int buf_type = CIFS_NO_BUFFER;
1483 1483
1484 xid = GetXid(); 1484 xid = GetXid();
1485 cifs_sb = CIFS_SB(file->f_dentry->d_sb); 1485 cifs_sb = CIFS_SB(file->f_dentry->d_sb);
@@ -1516,9 +1516,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1516 break; 1516 break;
1517 } 1517 }
1518 rc = CIFSSMBRead(xid, pTcon, 1518 rc = CIFSSMBRead(xid, pTcon,
1519 open_file->netfid, 1519 open_file->netfid,
1520 current_read_size, *poffset, 1520 current_read_size, *poffset,
1521 &bytes_read, &current_offset); 1521 &bytes_read, &current_offset,
1522 &buf_type);
1522 } 1523 }
1523 if (rc || (bytes_read == 0)) { 1524 if (rc || (bytes_read == 0)) {
1524 if (total_read) { 1525 if (total_read) {
@@ -1616,6 +1617,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1616 struct smb_com_read_rsp *pSMBr; 1617 struct smb_com_read_rsp *pSMBr;
1617 struct pagevec lru_pvec; 1618 struct pagevec lru_pvec;
1618 struct cifsFileInfo *open_file; 1619 struct cifsFileInfo *open_file;
1620 int buf_type = CIFS_NO_BUFFER;
1619 1621
1620 xid = GetXid(); 1622 xid = GetXid();
1621 if (file->private_data == NULL) { 1623 if (file->private_data == NULL) {
@@ -1672,14 +1674,17 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1672 } 1674 }
1673 1675
1674 rc = CIFSSMBRead(xid, pTcon, 1676 rc = CIFSSMBRead(xid, pTcon,
1675 open_file->netfid, 1677 open_file->netfid,
1676 read_size, offset, 1678 read_size, offset,
1677 &bytes_read, &smb_read_data); 1679 &bytes_read, &smb_read_data,
1678 1680 &buf_type);
1679 /* BB more RC checks ? */ 1681 /* BB more RC checks ? */
1680 if (rc== -EAGAIN) { 1682 if (rc== -EAGAIN) {
1681 if (smb_read_data) { 1683 if (smb_read_data) {
1682 cifs_buf_release(smb_read_data); 1684 if(buf_type == CIFS_SMALL_BUFFER)
1685 cifs_small_buf_release(smb_read_data);
1686 else if(buf_type == CIFS_LARGE_BUFFER)
1687 cifs_buf_release(smb_read_data);
1683 smb_read_data = NULL; 1688 smb_read_data = NULL;
1684 } 1689 }
1685 } 1690 }
@@ -1736,7 +1741,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1736 break; 1741 break;
1737 } 1742 }
1738 if (smb_read_data) { 1743 if (smb_read_data) {
1739 cifs_buf_release(smb_read_data); 1744 if(buf_type == CIFS_SMALL_BUFFER)
1745 cifs_small_buf_release(smb_read_data);
1746 else if(buf_type == CIFS_LARGE_BUFFER)
1747 cifs_buf_release(smb_read_data);
1740 smb_read_data = NULL; 1748 smb_read_data = NULL;
1741 } 1749 }
1742 bytes_read = 0; 1750 bytes_read = 0;
@@ -1746,7 +1754,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1746 1754
1747/* need to free smb_read_data buf before exit */ 1755/* need to free smb_read_data buf before exit */
1748 if (smb_read_data) { 1756 if (smb_read_data) {
1749 cifs_buf_release(smb_read_data); 1757 if(buf_type == CIFS_SMALL_BUFFER)
1758 cifs_small_buf_release(smb_read_data);
1759 else if(buf_type == CIFS_LARGE_BUFFER)
1760 cifs_buf_release(smb_read_data);
1750 smb_read_data = NULL; 1761 smb_read_data = NULL;
1751 } 1762 }
1752 1763
@@ -1825,10 +1836,20 @@ int is_size_safe_to_change(struct cifsInodeInfo *cifsInode)
1825 open_file = find_writable_file(cifsInode); 1836 open_file = find_writable_file(cifsInode);
1826 1837
1827 if(open_file) { 1838 if(open_file) {
1839 struct cifs_sb_info *cifs_sb;
1840
1828 /* there is not actually a write pending so let 1841 /* there is not actually a write pending so let
1829 this handle go free and allow it to 1842 this handle go free and allow it to
1830 be closable if needed */ 1843 be closable if needed */
1831 atomic_dec(&open_file->wrtPending); 1844 atomic_dec(&open_file->wrtPending);
1845
1846 cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb);
1847 if ( cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO ) {
1848 /* since no page cache to corrupt on directio
1849 we can change size safely */
1850 return 1;
1851 }
1852
1832 return 0; 1853 return 0;
1833 } else 1854 } else
1834 return 1; 1855 return 1;
@@ -1873,9 +1894,7 @@ struct address_space_operations cifs_addr_ops = {
1873 .readpage = cifs_readpage, 1894 .readpage = cifs_readpage,
1874 .readpages = cifs_readpages, 1895 .readpages = cifs_readpages,
1875 .writepage = cifs_writepage, 1896 .writepage = cifs_writepage,
1876#ifdef CONFIG_CIFS_EXPERIMENTAL
1877 .writepages = cifs_writepages, 1897 .writepages = cifs_writepages,
1878#endif
1879 .prepare_write = cifs_prepare_write, 1898 .prepare_write = cifs_prepare_write,
1880 .commit_write = cifs_commit_write, 1899 .commit_write = cifs_commit_write,
1881 .set_page_dirty = __set_page_dirty_nobuffers, 1900 .set_page_dirty = __set_page_dirty_nobuffers,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 923d071163b..59359911f48 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -41,7 +41,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
41 char *tmp_path; 41 char *tmp_path;
42 42
43 pTcon = cifs_sb->tcon; 43 pTcon = cifs_sb->tcon;
44 cFYI(1, (" Getting info on %s ", search_path)); 44 cFYI(1, ("Getting info on %s ", search_path));
45 /* could have done a find first instead but this returns more info */ 45 /* could have done a find first instead but this returns more info */
46 rc = CIFSSMBUnixQPathInfo(xid, pTcon, search_path, &findData, 46 rc = CIFSSMBUnixQPathInfo(xid, pTcon, search_path, &findData,
47 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 47 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
@@ -97,9 +97,9 @@ int cifs_get_inode_info_unix(struct inode **pinode,
97 inode = *pinode; 97 inode = *pinode;
98 cifsInfo = CIFS_I(inode); 98 cifsInfo = CIFS_I(inode);
99 99
100 cFYI(1, (" Old time %ld ", cifsInfo->time)); 100 cFYI(1, ("Old time %ld ", cifsInfo->time));
101 cifsInfo->time = jiffies; 101 cifsInfo->time = jiffies;
102 cFYI(1, (" New time %ld ", cifsInfo->time)); 102 cFYI(1, ("New time %ld ", cifsInfo->time));
103 /* this is ok to set on every inode revalidate */ 103 /* this is ok to set on every inode revalidate */
104 atomic_set(&cifsInfo->inUse,1); 104 atomic_set(&cifsInfo->inUse,1);
105 105
@@ -111,6 +111,9 @@ int cifs_get_inode_info_unix(struct inode **pinode,
111 inode->i_ctime = 111 inode->i_ctime =
112 cifs_NTtimeToUnix(le64_to_cpu(findData.LastStatusChange)); 112 cifs_NTtimeToUnix(le64_to_cpu(findData.LastStatusChange));
113 inode->i_mode = le64_to_cpu(findData.Permissions); 113 inode->i_mode = le64_to_cpu(findData.Permissions);
114 /* since we set the inode type below we need to mask off
115 to avoid strange results if bits set above */
116 inode->i_mode &= ~S_IFMT;
114 if (type == UNIX_FILE) { 117 if (type == UNIX_FILE) {
115 inode->i_mode |= S_IFREG; 118 inode->i_mode |= S_IFREG;
116 } else if (type == UNIX_SYMLINK) { 119 } else if (type == UNIX_SYMLINK) {
@@ -129,6 +132,10 @@ int cifs_get_inode_info_unix(struct inode **pinode,
129 inode->i_mode |= S_IFIFO; 132 inode->i_mode |= S_IFIFO;
130 } else if (type == UNIX_SOCKET) { 133 } else if (type == UNIX_SOCKET) {
131 inode->i_mode |= S_IFSOCK; 134 inode->i_mode |= S_IFSOCK;
135 } else {
136 /* safest to call it a file if we do not know */
137 inode->i_mode |= S_IFREG;
138 cFYI(1,("unknown type %d",type));
132 } 139 }
133 inode->i_uid = le64_to_cpu(findData.Uid); 140 inode->i_uid = le64_to_cpu(findData.Uid);
134 inode->i_gid = le64_to_cpu(findData.Gid); 141 inode->i_gid = le64_to_cpu(findData.Gid);
@@ -155,34 +162,39 @@ int cifs_get_inode_info_unix(struct inode **pinode,
155 } 162 }
156 163
157 if (num_of_bytes < end_of_file) 164 if (num_of_bytes < end_of_file)
158 cFYI(1, ("allocation size less than end of file ")); 165 cFYI(1, ("allocation size less than end of file"));
159 cFYI(1, 166 cFYI(1,
160 ("Size %ld and blocks %ld", 167 ("Size %ld and blocks %ld",
161 (unsigned long) inode->i_size, inode->i_blocks)); 168 (unsigned long) inode->i_size, inode->i_blocks));
162 if (S_ISREG(inode->i_mode)) { 169 if (S_ISREG(inode->i_mode)) {
163 cFYI(1, (" File inode ")); 170 cFYI(1, ("File inode"));
164 inode->i_op = &cifs_file_inode_ops; 171 inode->i_op = &cifs_file_inode_ops;
165 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) 172 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
166 inode->i_fop = &cifs_file_direct_ops; 173 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
167 else 174 inode->i_fop =
175 &cifs_file_direct_nobrl_ops;
176 else
177 inode->i_fop = &cifs_file_direct_ops;
178 } else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
179 inode->i_fop = &cifs_file_nobrl_ops;
180 else /* not direct, send byte range locks */
168 inode->i_fop = &cifs_file_ops; 181 inode->i_fop = &cifs_file_ops;
169 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) 182
170 inode->i_fop->lock = NULL;
171 inode->i_data.a_ops = &cifs_addr_ops; 183 inode->i_data.a_ops = &cifs_addr_ops;
172 /* check if server can support readpages */ 184 /* check if server can support readpages */
173 if(pTcon->ses->server->maxBuf < 185 if(pTcon->ses->server->maxBuf <
174 4096 + MAX_CIFS_HDR_SIZE) 186 4096 + MAX_CIFS_HDR_SIZE)
175 inode->i_data.a_ops->readpages = NULL; 187 inode->i_data.a_ops->readpages = NULL;
176 } else if (S_ISDIR(inode->i_mode)) { 188 } else if (S_ISDIR(inode->i_mode)) {
177 cFYI(1, (" Directory inode")); 189 cFYI(1, ("Directory inode"));
178 inode->i_op = &cifs_dir_inode_ops; 190 inode->i_op = &cifs_dir_inode_ops;
179 inode->i_fop = &cifs_dir_ops; 191 inode->i_fop = &cifs_dir_ops;
180 } else if (S_ISLNK(inode->i_mode)) { 192 } else if (S_ISLNK(inode->i_mode)) {
181 cFYI(1, (" Symbolic Link inode ")); 193 cFYI(1, ("Symbolic Link inode"));
182 inode->i_op = &cifs_symlink_inode_ops; 194 inode->i_op = &cifs_symlink_inode_ops;
183 /* tmp_inode->i_fop = */ /* do not need to set to anything */ 195 /* tmp_inode->i_fop = */ /* do not need to set to anything */
184 } else { 196 } else {
185 cFYI(1, (" Init special inode ")); 197 cFYI(1, ("Init special inode"));
186 init_special_inode(inode, inode->i_mode, 198 init_special_inode(inode, inode->i_mode,
187 inode->i_rdev); 199 inode->i_rdev);
188 } 200 }
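
Review note: the removed lines poked inode->i_fop->lock = NULL, mutating
the shared cifs_file_ops table for every mount in the system. The
replacement instead selects one of four static tables:

    /*                    byte-range locks        nobrl mount option
     * cached I/O         cifs_file_ops           cifs_file_nobrl_ops
     * direct I/O         cifs_file_direct_ops    cifs_file_direct_nobrl_ops
     */
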
@@ -190,6 +202,112 @@ int cifs_get_inode_info_unix(struct inode **pinode,
190 return rc; 202 return rc;
191} 203}
192 204
205static int decode_sfu_inode(struct inode * inode, __u64 size,
206 const unsigned char *path,
207 struct cifs_sb_info *cifs_sb, int xid)
208{
209 int rc;
210 int oplock = FALSE;
211 __u16 netfid;
212 struct cifsTconInfo *pTcon = cifs_sb->tcon;
213 char buf[24];
214 unsigned int bytes_read;
215 char * pbuf;
216
217 pbuf = buf;
218
219 if(size == 0) {
220 inode->i_mode |= S_IFIFO;
221 return 0;
222 } else if (size < 8) {
223 return -EINVAL; /* EOPNOTSUPP? */
224 }
225
226 rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
227 CREATE_NOT_DIR, &netfid, &oplock, NULL,
228 cifs_sb->local_nls,
229 cifs_sb->mnt_cifs_flags &
230 CIFS_MOUNT_MAP_SPECIAL_CHR);
231 if (rc==0) {
232 int buf_type = CIFS_NO_BUFFER;
233 /* Read header */
234 rc = CIFSSMBRead(xid, pTcon,
235 netfid,
236 24 /* length */, 0 /* offset */,
237 &bytes_read, &pbuf, &buf_type);
238 if((rc == 0) && (bytes_read >= 8)) {
239 if(memcmp("IntxBLK", pbuf, 8) == 0) {
240 cFYI(1,("Block device"));
241 inode->i_mode |= S_IFBLK;
242 if(bytes_read == 24) {
243 /* we have enough to decode dev num */
244 __u64 mjr; /* major */
245 __u64 mnr; /* minor */
246 mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
247 mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
248 inode->i_rdev = MKDEV(mjr, mnr);
249 }
250 } else if(memcmp("IntxCHR", pbuf, 8) == 0) {
251 cFYI(1,("Char device"));
252 inode->i_mode |= S_IFCHR;
253 if(bytes_read == 24) {
254 /* we have enough to decode dev num */
255 __u64 mjr; /* major */
256 __u64 mnr; /* minor */
257 mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
258 mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
259 inode->i_rdev = MKDEV(mjr, mnr);
260 }
261 } else if(memcmp("IntxLNK", pbuf, 7) == 0) {
262 cFYI(1,("Symlink"));
263 inode->i_mode |= S_IFLNK;
264 } else {
265 inode->i_mode |= S_IFREG; /* file? */
266 rc = -EOPNOTSUPP;
267 }
268 } else {
269 inode->i_mode |= S_IFREG; /* then it is a file */
270 rc = -EOPNOTSUPP; /* or some unknown SFU type */
271 }
272 CIFSSMBClose(xid, pTcon, netfid);
273 }
274 return rc;
275
276}
277
278#define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID) /* SETFILEBITS valid bits */
279
280static int get_sfu_uid_mode(struct inode * inode,
281 const unsigned char *path,
282 struct cifs_sb_info *cifs_sb, int xid)
283{
284#ifdef CONFIG_CIFS_XATTR
285 ssize_t rc;
286 char ea_value[4];
287 __u32 mode;
288
289 rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
290 ea_value, 4 /* size of buf */, cifs_sb->local_nls,
291 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
292 if(rc < 0)
293 return (int)rc;
294 else if (rc > 3) {
295 mode = le32_to_cpu(*((__le32 *)ea_value));
296 inode->i_mode &= ~SFBITS_MASK;
297 cFYI(1,("special bits 0%o org mode 0%o", mode, inode->i_mode));
298 inode->i_mode = (mode & SFBITS_MASK) | inode->i_mode;
299 cFYI(1,("special mode bits 0%o", mode));
300 return 0;
301 } else {
302 return 0;
303 }
304#else
305 return -EOPNOTSUPP;
306#endif
307
308
309}
310
193int cifs_get_inode_info(struct inode **pinode, 311int cifs_get_inode_info(struct inode **pinode,
194 const unsigned char *search_path, FILE_ALL_INFO *pfindData, 312 const unsigned char *search_path, FILE_ALL_INFO *pfindData,
195 struct super_block *sb, int xid) 313 struct super_block *sb, int xid)
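
A worked example of the EA decoding in get_sfu_uid_mode() above (values
are illustrative): SETFILEBITS holds a little-endian 32-bit mode, of which
only the setuid/setgid/sticky bits survive SFBITS_MASK.

    /* ea_value on the wire: 04 0c 00 00  ->  le32 0x00000c04
     * 0x00000c04 & SFBITS_MASK = 0x0c00 = S_ISUID | S_ISGID
     * an inode that was 0644 therefore becomes 06644 */
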
@@ -202,7 +320,7 @@ int cifs_get_inode_info(struct inode **pinode,
202 char *buf = NULL; 320 char *buf = NULL;
203 321
204 pTcon = cifs_sb->tcon; 322 pTcon = cifs_sb->tcon;
205 cFYI(1,("Getting info on %s ", search_path)); 323 cFYI(1,("Getting info on %s", search_path));
206 324
207 if ((pfindData == NULL) && (*pinode != NULL)) { 325 if ((pfindData == NULL) && (*pinode != NULL)) {
208 if (CIFS_I(*pinode)->clientCanCacheRead) { 326 if (CIFS_I(*pinode)->clientCanCacheRead) {
@@ -303,9 +421,9 @@ int cifs_get_inode_info(struct inode **pinode,
303 inode = *pinode; 421 inode = *pinode;
304 cifsInfo = CIFS_I(inode); 422 cifsInfo = CIFS_I(inode);
305 cifsInfo->cifsAttrs = attr; 423 cifsInfo->cifsAttrs = attr;
306 cFYI(1, (" Old time %ld ", cifsInfo->time)); 424 cFYI(1, ("Old time %ld ", cifsInfo->time));
307 cifsInfo->time = jiffies; 425 cifsInfo->time = jiffies;
308 cFYI(1, (" New time %ld ", cifsInfo->time)); 426 cFYI(1, ("New time %ld ", cifsInfo->time));
309 427
310 /* blksize needs to be multiple of two. So safer to default to 428 /* blksize needs to be multiple of two. So safer to default to
311 blksize and blkbits set in superblock so 2**blkbits and blksize 429 blksize and blkbits set in superblock so 2**blkbits and blksize
@@ -319,13 +437,15 @@ int cifs_get_inode_info(struct inode **pinode,
319 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); 437 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
320 inode->i_ctime = 438 inode->i_ctime =
321 cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); 439 cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
322 cFYI(0, (" Attributes came in as 0x%x ", attr)); 440 cFYI(0, ("Attributes came in as 0x%x ", attr));
323 441
324 /* set default mode. will override for dirs below */ 442 /* set default mode. will override for dirs below */
325 if (atomic_read(&cifsInfo->inUse) == 0) 443 if (atomic_read(&cifsInfo->inUse) == 0)
326 /* new inode, can safely set these fields */ 444 /* new inode, can safely set these fields */
327 inode->i_mode = cifs_sb->mnt_file_mode; 445 inode->i_mode = cifs_sb->mnt_file_mode;
328 446 else /* since we set the inode type below we need to mask off
447 to avoid strange results if type changes and both get orred in */
448 inode->i_mode &= ~S_IFMT;
329/* if (attr & ATTR_REPARSE) */ 449/* if (attr & ATTR_REPARSE) */
330 /* We no longer handle these as symlinks because we could not 450 /* We no longer handle these as symlinks because we could not
331 follow them due to the absolute path with drive letter */ 451 follow them due to the absolute path with drive letter */
@@ -340,10 +460,16 @@ int cifs_get_inode_info(struct inode **pinode,
340 (pfindData->EndOfFile == 0)) { 460 (pfindData->EndOfFile == 0)) {
341 inode->i_mode = cifs_sb->mnt_file_mode; 461 inode->i_mode = cifs_sb->mnt_file_mode;
342 inode->i_mode |= S_IFIFO; 462 inode->i_mode |= S_IFIFO;
343/* BB Finish for SFU style symlinks and devies */ 463/* BB Finish for SFU style symlinks and devices */
344/* } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && 464 } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) &&
345 (cifsInfo->cifsAttrs & ATTR_SYSTEM) && ) */ 465 (cifsInfo->cifsAttrs & ATTR_SYSTEM)) {
346 466 if (decode_sfu_inode(inode,
467 le64_to_cpu(pfindData->EndOfFile),
468 search_path,
469 cifs_sb, xid)) {
470 cFYI(1,("Unrecognized sfu inode type"));
471 }
472 cFYI(1,("sfu mode 0%o",inode->i_mode));
347 } else { 473 } else {
348 inode->i_mode |= S_IFREG; 474 inode->i_mode |= S_IFREG;
349 /* treat the dos attribute of read-only as read-only 475 /* treat the dos attribute of read-only as read-only
@@ -368,7 +494,10 @@ int cifs_get_inode_info(struct inode **pinode,
368 494
369 /* BB fill in uid and gid here? with help from winbind? 495 /* BB fill in uid and gid here? with help from winbind?
370 or retrieve from NTFS stream extended attribute */ 496 or retrieve from NTFS stream extended attribute */
371 if (atomic_read(&cifsInfo->inUse) == 0) { 497 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
498 /* fill in uid, gid, mode from server ACL */
499 get_sfu_uid_mode(inode, search_path, cifs_sb, xid);
500 } else if (atomic_read(&cifsInfo->inUse) == 0) {
372 inode->i_uid = cifs_sb->mnt_uid; 501 inode->i_uid = cifs_sb->mnt_uid;
373 inode->i_gid = cifs_sb->mnt_gid; 502 inode->i_gid = cifs_sb->mnt_gid;
374 /* set so we do not keep refreshing these fields with 503 /* set so we do not keep refreshing these fields with
@@ -377,24 +506,29 @@ int cifs_get_inode_info(struct inode **pinode,
377 } 506 }
378 507
379 if (S_ISREG(inode->i_mode)) { 508 if (S_ISREG(inode->i_mode)) {
380 cFYI(1, (" File inode ")); 509 cFYI(1, ("File inode"));
381 inode->i_op = &cifs_file_inode_ops; 510 inode->i_op = &cifs_file_inode_ops;
382 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) 511 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
383 inode->i_fop = &cifs_file_direct_ops; 512 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
384 else 513 inode->i_fop =
514 &cifs_file_direct_nobrl_ops;
515 else
516 inode->i_fop = &cifs_file_direct_ops;
517 } else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
518 inode->i_fop = &cifs_file_nobrl_ops;
519 else /* not direct, send byte range locks */
385 inode->i_fop = &cifs_file_ops; 520 inode->i_fop = &cifs_file_ops;
386 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) 521
387 inode->i_fop->lock = NULL;
388 inode->i_data.a_ops = &cifs_addr_ops; 522 inode->i_data.a_ops = &cifs_addr_ops;
389 if(pTcon->ses->server->maxBuf < 523 if(pTcon->ses->server->maxBuf <
390 4096 + MAX_CIFS_HDR_SIZE) 524 4096 + MAX_CIFS_HDR_SIZE)
391 inode->i_data.a_ops->readpages = NULL; 525 inode->i_data.a_ops->readpages = NULL;
392 } else if (S_ISDIR(inode->i_mode)) { 526 } else if (S_ISDIR(inode->i_mode)) {
393 cFYI(1, (" Directory inode ")); 527 cFYI(1, ("Directory inode"));
394 inode->i_op = &cifs_dir_inode_ops; 528 inode->i_op = &cifs_dir_inode_ops;
395 inode->i_fop = &cifs_dir_ops; 529 inode->i_fop = &cifs_dir_ops;
396 } else if (S_ISLNK(inode->i_mode)) { 530 } else if (S_ISLNK(inode->i_mode)) {
397 cFYI(1, (" Symbolic Link inode ")); 531 cFYI(1, ("Symbolic Link inode"));
398 inode->i_op = &cifs_symlink_inode_ops; 532 inode->i_op = &cifs_symlink_inode_ops;
399 } else { 533 } else {
400 init_special_inode(inode, inode->i_mode, 534 init_special_inode(inode, inode->i_mode,
@@ -431,7 +565,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
431 struct cifsInodeInfo *cifsInode; 565 struct cifsInodeInfo *cifsInode;
432 FILE_BASIC_INFO *pinfo_buf; 566 FILE_BASIC_INFO *pinfo_buf;
433 567
434 cFYI(1, (" cifs_unlink, inode = 0x%p with ", inode)); 568 cFYI(1, ("cifs_unlink, inode = 0x%p with ", inode));
435 569
436 xid = GetXid(); 570 xid = GetXid();
437 571
@@ -577,7 +711,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
577 char *full_path = NULL; 711 char *full_path = NULL;
578 struct inode *newinode = NULL; 712 struct inode *newinode = NULL;
579 713
580 cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p ", mode, inode)); 714 cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode));
581 715
582 xid = GetXid(); 716 xid = GetXid();
583 717
@@ -617,8 +751,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
617 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { 751 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
618 CIFSSMBUnixSetPerms(xid, pTcon, full_path, 752 CIFSSMBUnixSetPerms(xid, pTcon, full_path,
619 mode, 753 mode,
620 (__u64)current->euid, 754 (__u64)current->fsuid,
621 (__u64)current->egid, 755 (__u64)current->fsgid,
622 0 /* dev_t */, 756 0 /* dev_t */,
623 cifs_sb->local_nls, 757 cifs_sb->local_nls,
624 cifs_sb->mnt_cifs_flags & 758 cifs_sb->mnt_cifs_flags &
@@ -635,6 +769,17 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 635 /* BB to be implemented via Windows security descriptors         769 /* BB to be implemented via Windows security descriptors
636 eg CIFSSMBWinSetPerms(xid, pTcon, full_path, mode, 770 eg CIFSSMBWinSetPerms(xid, pTcon, full_path, mode,
637 -1, -1, local_nls); */ 771 -1, -1, local_nls); */
772 if(direntry->d_inode) {
773 direntry->d_inode->i_mode = mode;
774 direntry->d_inode->i_mode |= S_IFDIR;
775 if(cifs_sb->mnt_cifs_flags &
776 CIFS_MOUNT_SET_UID) {
777 direntry->d_inode->i_uid =
778 current->fsuid;
779 direntry->d_inode->i_gid =
780 current->fsgid;
781 }
782 }
638 } 783 }
639 } 784 }
640 kfree(full_path); 785 kfree(full_path);
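[note] Two independent fixes in the mkdir hunk: ownership of the new directory is stamped with current->fsuid/fsgid (the identity used for filesystem access, which can differ from euid/egid, e.g. when nfsd acts on a client's behalf), and the local inode is updated optimistically so the client does not have to refetch attributes it just set. A userspace-flavoured sketch of the latter, with a made-up inode struct:

#include <sys/stat.h>   /* S_IFDIR */
#include <sys/types.h>  /* mode_t, uid_t, gid_t */

struct toy_inode { mode_t i_mode; uid_t i_uid; gid_t i_gid; };

static void stamp_new_dir(struct toy_inode *ino, mode_t mode,
                          uid_t fsuid, gid_t fsgid, int setuid_mount)
{
	ino->i_mode = mode | S_IFDIR;     /* force the directory type bit */
	if (setuid_mount) {               /* mount opted into client ids */
		ino->i_uid = fsuid;
		ino->i_gid = fsgid;
	}
}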
@@ -651,7 +796,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
651 char *full_path = NULL; 796 char *full_path = NULL;
652 struct cifsInodeInfo *cifsInode; 797 struct cifsInodeInfo *cifsInode;
653 798
654 cFYI(1, (" cifs_rmdir, inode = 0x%p with ", inode)); 799 cFYI(1, ("cifs_rmdir, inode = 0x%p with ", inode));
655 800
656 xid = GetXid(); 801 xid = GetXid();
657 802
@@ -896,9 +1041,9 @@ int cifs_revalidate(struct dentry *direntry)
896 } 1041 }
897 1042
898 /* can not grab this sem since kernel filesys locking documentation 1043 /* can not grab this sem since kernel filesys locking documentation
899 indicates i_sem may be taken by the kernel on lookup and rename 1044 indicates i_mutex may be taken by the kernel on lookup and rename
900 which could deadlock if we grab the i_sem here as well */ 1045 which could deadlock if we grab the i_mutex here as well */
901/* down(&direntry->d_inode->i_sem);*/ 1046/* mutex_lock(&direntry->d_inode->i_mutex);*/
902 /* need to write out dirty pages here */ 1047 /* need to write out dirty pages here */
903 if (direntry->d_inode->i_mapping) { 1048 if (direntry->d_inode->i_mapping) {
904 /* do we need to lock inode until after invalidate completes 1049 /* do we need to lock inode until after invalidate completes
@@ -906,17 +1051,23 @@ int cifs_revalidate(struct dentry *direntry)
906 filemap_fdatawrite(direntry->d_inode->i_mapping); 1051 filemap_fdatawrite(direntry->d_inode->i_mapping);
907 } 1052 }
908 if (invalidate_inode) { 1053 if (invalidate_inode) {
909 if (direntry->d_inode->i_mapping) 1054 /* shrink_dcache not necessary now that cifs dentry ops
910 filemap_fdatawait(direntry->d_inode->i_mapping); 1055 are exported for negative dentries */
911 /* may eventually have to do this for open files too */ 1056/* if(S_ISDIR(direntry->d_inode->i_mode))
912 if (list_empty(&(cifsInode->openFileList))) { 1057 shrink_dcache_parent(direntry); */
913 /* Has changed on server - flush read ahead pages */ 1058 if (S_ISREG(direntry->d_inode->i_mode)) {
914 cFYI(1, ("Invalidating read ahead data on " 1059 if (direntry->d_inode->i_mapping)
915 "closed file")); 1060 filemap_fdatawait(direntry->d_inode->i_mapping);
916 invalidate_remote_inode(direntry->d_inode); 1061 /* may eventually have to do this for open files too */
1062 if (list_empty(&(cifsInode->openFileList))) {
1063 /* changed on server - flush read ahead pages */
1064 cFYI(1, ("Invalidating read ahead data on "
1065 "closed file"));
1066 invalidate_remote_inode(direntry->d_inode);
1067 }
917 } 1068 }
918 } 1069 }
919/* up(&direntry->d_inode->i_sem); */ 1070/* mutex_unlock(&direntry->d_inode->i_mutex); */
920 1071
921 kfree(full_path); 1072 kfree(full_path);
922 FreeXid(xid); 1073 FreeXid(xid);
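[note] The revalidate hunk narrows cache invalidation to regular files (negative and directory dentries are now covered by the exported cifs dentry ops rather than shrink_dcache_parent) while keeping the write-then-wait-then-invalidate ordering. Condensed into one kernel-context sketch, not a standalone program, using the same calls as the hunk:

static void revalidate_pagecache(struct inode *inode,
                                 int changed_on_server, int has_open_files)
{
	if (inode->i_mapping)
		filemap_fdatawrite(inode->i_mapping);  /* start writeback */
	if (!changed_on_server || !S_ISREG(inode->i_mode))
		return;
	if (inode->i_mapping)
		filemap_fdatawait(inode->i_mapping);   /* let writes land */
	if (!has_open_files)
		invalidate_remote_inode(inode);        /* drop stale pages */
}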
@@ -970,11 +1121,22 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
970 1121
971 xid = GetXid(); 1122 xid = GetXid();
972 1123
 973 cFYI(1, (" In cifs_setattr, name = %s attrs->ia_valid 0x%x ",    1124 cFYI(1, ("In cifs_setattr, name = %s attrs->ia_valid 0x%x ",
974 direntry->d_name.name, attrs->ia_valid)); 1125 direntry->d_name.name, attrs->ia_valid));
1126
975 cifs_sb = CIFS_SB(direntry->d_inode->i_sb); 1127 cifs_sb = CIFS_SB(direntry->d_inode->i_sb);
976 pTcon = cifs_sb->tcon; 1128 pTcon = cifs_sb->tcon;
977 1129
1130 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
1131 /* check if we have permission to change attrs */
1132 rc = inode_change_ok(direntry->d_inode, attrs);
1133 if(rc < 0) {
1134 FreeXid(xid);
1135 return rc;
1136 } else
1137 rc = 0;
1138 }
1139
978 down(&direntry->d_sb->s_vfs_rename_sem); 1140 down(&direntry->d_sb->s_vfs_rename_sem);
979 full_path = build_path_from_dentry(direntry); 1141 full_path = build_path_from_dentry(direntry);
980 up(&direntry->d_sb->s_vfs_rename_sem); 1142 up(&direntry->d_sb->s_vfs_rename_sem);
@@ -987,8 +1149,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
987 /* BB check if we need to refresh inode from server now ? BB */ 1149 /* BB check if we need to refresh inode from server now ? BB */
988 1150
989 /* need to flush data before changing file size on server */ 1151 /* need to flush data before changing file size on server */
990 filemap_fdatawrite(direntry->d_inode->i_mapping); 1152 filemap_write_and_wait(direntry->d_inode->i_mapping);
991 filemap_fdatawait(direntry->d_inode->i_mapping);
992 1153
993 if (attrs->ia_valid & ATTR_SIZE) { 1154 if (attrs->ia_valid & ATTR_SIZE) {
994 /* To avoid spurious oplock breaks from server, in the case of 1155 /* To avoid spurious oplock breaks from server, in the case of
@@ -1014,7 +1175,9 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
1014 1 /* 45 seconds */); 1175 1 /* 45 seconds */);
1015 cFYI(1,("Wrt seteof rc %d", rc)); 1176 cFYI(1,("Wrt seteof rc %d", rc));
1016 } 1177 }
1017 } 1178 } else
1179 rc = -EINVAL;
1180
1018 if (rc != 0) { 1181 if (rc != 0) {
1019 /* Set file size by pathname rather than by handle 1182 /* Set file size by pathname rather than by handle
1020 either because no valid, writeable file handle for 1183 either because no valid, writeable file handle for
@@ -1086,6 +1249,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
1086 cifs_sb->mnt_cifs_flags & 1249 cifs_sb->mnt_cifs_flags &
1087 CIFS_MOUNT_MAP_SPECIAL_CHR); 1250 CIFS_MOUNT_MAP_SPECIAL_CHR);
1088 else if (attrs->ia_valid & ATTR_MODE) { 1251 else if (attrs->ia_valid & ATTR_MODE) {
1252 rc = 0;
1089 if ((mode & S_IWUGO) == 0) /* not writeable */ { 1253 if ((mode & S_IWUGO) == 0) /* not writeable */ {
1090 if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) 1254 if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0)
1091 time_buf.Attributes = 1255 time_buf.Attributes =
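[note] Three setattr changes worth calling out: the client now runs the generic inode_change_ok() permission check before contacting the server unless the noperm mount flag waives it, the fdatawrite + fdatawait pair collapses into filemap_write_and_wait(), and rc is reset to 0 before the ATTR_MODE branch so a stale error from the size path cannot leak through. A kernel-context sketch of the entry checks:

static int setattr_precheck(struct inode *inode, struct iattr *attrs,
                            int mount_no_perm)
{
	int rc;

	if (!mount_no_perm) {
		rc = inode_change_ok(inode, attrs); /* generic VFS checks */
		if (rc < 0)
			return rc;
	}
	/* flush dirty pages so a size change on the server cannot race
	 * writeback already in flight */
	filemap_write_and_wait(inode->i_mapping);
	return 0;
}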
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 34a06692e4f..812c6bb0fe3 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/misc.c 2 * fs/cifs/misc.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2004 4 * Copyright (C) International Business Machines Corp., 2002,2005
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -161,6 +161,9 @@ cifs_buf_get(void)
161 if (ret_buf) { 161 if (ret_buf) {
162 memset(ret_buf, 0, sizeof(struct smb_hdr) + 3); 162 memset(ret_buf, 0, sizeof(struct smb_hdr) + 3);
163 atomic_inc(&bufAllocCount); 163 atomic_inc(&bufAllocCount);
164#ifdef CONFIG_CIFS_STATS2
165 atomic_inc(&totBufAllocCount);
166#endif /* CONFIG_CIFS_STATS2 */
164 } 167 }
165 168
166 return ret_buf; 169 return ret_buf;
@@ -195,6 +198,10 @@ cifs_small_buf_get(void)
195 /* No need to clear memory here, cleared in header assemble */ 198 /* No need to clear memory here, cleared in header assemble */
196 /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ 199 /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
197 atomic_inc(&smBufAllocCount); 200 atomic_inc(&smBufAllocCount);
201#ifdef CONFIG_CIFS_STATS2
202 atomic_inc(&totSmBufAllocCount);
203#endif /* CONFIG_CIFS_STATS2 */
204
198 } 205 }
199 return ret_buf; 206 return ret_buf;
200} 207}
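[note] The two misc.c hunks add lifetime counters behind CONFIG_CIFS_STATS2: the existing counters track buffers currently outstanding and are decremented on free, while the new tot* counters only ever grow, which makes buffer churn over the life of the module visible. A kernel-flavoured sketch:

static atomic_t bufAllocCount;          /* buffers outstanding right now */
#ifdef CONFIG_CIFS_STATS2
static atomic_t totBufAllocCount;       /* buffers ever handed out */
#endif

static void account_buf_alloc(void)
{
	atomic_inc(&bufAllocCount);
#ifdef CONFIG_CIFS_STATS2
	atomic_inc(&totBufAllocCount);  /* never decremented on free */
#endif
}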
@@ -292,7 +299,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
292 struct cifsSesInfo * ses; 299 struct cifsSesInfo * ses;
293 char *temp = (char *) buffer; 300 char *temp = (char *) buffer;
294 301
295 memset(temp,0,MAX_CIFS_HDR_SIZE); 302 memset(temp,0,256); /* bigger than MAX_CIFS_HDR_SIZE */
296 303
297 buffer->smb_buf_length = 304 buffer->smb_buf_length =
298 (2 * word_count) + sizeof (struct smb_hdr) - 305 (2 * word_count) + sizeof (struct smb_hdr) -
@@ -348,12 +355,12 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
348 /* BB Add support for establishing new tCon and SMB Session */ 355 /* BB Add support for establishing new tCon and SMB Session */
349 /* with userid/password pairs found on the smb session */ 356 /* with userid/password pairs found on the smb session */
350 /* for other target tcp/ip addresses BB */ 357 /* for other target tcp/ip addresses BB */
351 if(current->uid != treeCon->ses->linux_uid) { 358 if(current->fsuid != treeCon->ses->linux_uid) {
352 cFYI(1,("Multiuser mode and UID did not match tcon uid ")); 359 cFYI(1,("Multiuser mode and UID did not match tcon uid"));
353 read_lock(&GlobalSMBSeslock); 360 read_lock(&GlobalSMBSeslock);
354 list_for_each(temp_item, &GlobalSMBSessionList) { 361 list_for_each(temp_item, &GlobalSMBSessionList) {
355 ses = list_entry(temp_item, struct cifsSesInfo, cifsSessionList); 362 ses = list_entry(temp_item, struct cifsSesInfo, cifsSessionList);
356 if(ses->linux_uid == current->uid) { 363 if(ses->linux_uid == current->fsuid) {
357 if(ses->server == treeCon->ses->server) { 364 if(ses->server == treeCon->ses->server) {
358 cFYI(1,("found matching uid substitute right smb_uid")); 365 cFYI(1,("found matching uid substitute right smb_uid"));
359 buffer->Uid = ses->Suid; 366 buffer->Uid = ses->Suid;
@@ -397,12 +404,12 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
397 if(smb->Command == SMB_COM_LOCKING_ANDX) 404 if(smb->Command == SMB_COM_LOCKING_ANDX)
398 return 0; 405 return 0;
399 else 406 else
400 cERROR(1, ("Rcvd Request not response ")); 407 cERROR(1, ("Rcvd Request not response"));
401 } 408 }
402 } else { /* bad signature or mid */ 409 } else { /* bad signature or mid */
403 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) 410 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
404 cERROR(1, 411 cERROR(1,
405 ("Bad protocol string signature header %x ", 412 ("Bad protocol string signature header %x",
406 *(unsigned int *) smb->Protocol)); 413 *(unsigned int *) smb->Protocol));
407 if (mid != smb->Mid) 414 if (mid != smb->Mid)
408 cERROR(1, ("Mids do not match")); 415 cERROR(1, ("Mids do not match"));
@@ -417,7 +424,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length)
417 __u32 len = smb->smb_buf_length; 424 __u32 len = smb->smb_buf_length;
418 __u32 clc_len; /* calculated length */ 425 __u32 clc_len; /* calculated length */
419 cFYI(0, 426 cFYI(0,
420 ("Entering checkSMB with Length: %x, smb_buf_length: %x ", 427 ("Entering checkSMB with Length: %x, smb_buf_length: %x",
421 length, len)); 428 length, len));
422 if (((unsigned int)length < 2 + sizeof (struct smb_hdr)) || 429 if (((unsigned int)length < 2 + sizeof (struct smb_hdr)) ||
423 (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)) { 430 (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)) {
@@ -451,9 +458,16 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length)
451 cERROR(1, ("bad smb size detected for Mid=%d", smb->Mid)); 458 cERROR(1, ("bad smb size detected for Mid=%d", smb->Mid));
452 /* Windows XP can return a few bytes too much, presumably 459 /* Windows XP can return a few bytes too much, presumably
453 an illegal pad, at the end of byte range lock responses 460 an illegal pad, at the end of byte range lock responses
454 so we allow for up to eight byte pad, as long as actual 461 so we allow for that three byte pad, as long as actual
455 received length is as long or longer than calculated length */ 462 received length is as long or longer than calculated length */
456 if((4+len > clc_len) && (len <= clc_len + 3)) 463 /* We have now had to extend this more, since there is a
464 case in which it needs to be bigger still to handle a
465 malformed response to transact2 findfirst from WinXP when
466 access denied is returned and thus bcc and wct are zero
467 but server says length is 0x21 bytes too long as if the server
 468 forgot to reset the smb rfc1001 length when it reset the
469 wct and bcc to minimum size and drop the t2 parms and data */
470 if((4+len > clc_len) && (len <= clc_len + 512))
457 return 0; 471 return 0;
458 else 472 else
459 return 1; 473 return 1;
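[note] The checkSMB change widens the tolerance for servers that send more bytes than the parsed word count and byte count imply: the old code allowed a small pad (WinXP padding on byte-range lock responses), but XP's malformed access-denied reply to a transact2 FindFirst overstates the RFC1001 length by 0x21 bytes, so the slack grows to 512. Restated as a standalone predicate; note checkSMB itself uses the inverse 0-on-success convention:

/* len is the RFC1001 length from the frame, clc_len the length implied
 * by wct/bcc; returns nonzero when the frame should be accepted */
static int smb_frame_acceptable(unsigned int len, unsigned int clc_len)
{
	if (clc_len == len + 4)
		return 1;                       /* lengths agree exactly */
	/* server sent surplus bytes: tolerate up to 512, as long as
	 * nothing the parsed fields promised was actually cut short */
	return (4 + len > clc_len) && (len <= clc_len + 512);
}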
@@ -678,7 +692,7 @@ cifsConvertToUCS(__le16 * target, const char *source, int maxlen,
678 __u16 temp; 692 __u16 temp;
679 693
680 if(!mapChars) 694 if(!mapChars)
681 return cifs_strtoUCS((wchar_t *) target, source, PATH_MAX, cp); 695 return cifs_strtoUCS(target, source, PATH_MAX, cp);
682 696
683 for(i = 0, j = 0; i < maxlen; j++) { 697 for(i = 0, j = 0; i < maxlen; j++) {
684 src_char = source[i]; 698 src_char = source[i];
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index f7814689844..5de74d216fd 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -330,7 +330,7 @@ static const struct {
330 ERRHRD, ERRgeneral, NT_STATUS_ACCOUNT_RESTRICTION}, { 330 ERRHRD, ERRgeneral, NT_STATUS_ACCOUNT_RESTRICTION}, {
331 ERRSRV, 2241, NT_STATUS_INVALID_LOGON_HOURS}, { 331 ERRSRV, 2241, NT_STATUS_INVALID_LOGON_HOURS}, {
332 ERRSRV, 2240, NT_STATUS_INVALID_WORKSTATION}, { 332 ERRSRV, 2240, NT_STATUS_INVALID_WORKSTATION}, {
333 ERRSRV, 2242, NT_STATUS_PASSWORD_EXPIRED}, { 333 ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_EXPIRED}, {
334 ERRSRV, 2239, NT_STATUS_ACCOUNT_DISABLED}, { 334 ERRSRV, 2239, NT_STATUS_ACCOUNT_DISABLED}, {
335 ERRHRD, ERRgeneral, NT_STATUS_NONE_MAPPED}, { 335 ERRHRD, ERRgeneral, NT_STATUS_NONE_MAPPED}, {
336 ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, { 336 ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, {
@@ -676,7 +676,7 @@ static const struct {
676 ERRDOS, 193, NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, { 676 ERRDOS, 193, NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, {
677 ERRHRD, ERRgeneral, NT_STATUS_LOST_WRITEBEHIND_DATA}, { 677 ERRHRD, ERRgeneral, NT_STATUS_LOST_WRITEBEHIND_DATA}, {
678 ERRHRD, ERRgeneral, NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, { 678 ERRHRD, ERRgeneral, NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, {
679 ERRSRV, 2242, NT_STATUS_PASSWORD_MUST_CHANGE}, { 679 ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_MUST_CHANGE}, {
680 ERRHRD, ERRgeneral, NT_STATUS_NOT_FOUND}, { 680 ERRHRD, ERRgeneral, NT_STATUS_NOT_FOUND}, {
681 ERRHRD, ERRgeneral, NT_STATUS_NOT_TINY_STREAM}, { 681 ERRHRD, ERRgeneral, NT_STATUS_NOT_TINY_STREAM}, {
682 ERRHRD, ERRgeneral, NT_STATUS_RECOVERY_FAILURE}, { 682 ERRHRD, ERRgeneral, NT_STATUS_RECOVERY_FAILURE}, {
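[note] Both netmisc.c hunks replace the bare constant 2242 with its named SMB error code, ERRpasswordExpired, in the NT-status-to-DOS-error mapping table. A trimmed sketch of the table shape; the hex values shown are the standard NTSTATUS numbers, and ERRSRV is SMB error class 2:

#define ERRSRV 2                        /* SMB "server" error class */
#define ERRpasswordExpired 2242         /* named form of the old constant */

struct status_map {
	unsigned int nt_status;
	int err_class;
	int dos_code;
};

static const struct status_map pw_entries[] = {
	{ 0xC0000071u, ERRSRV, ERRpasswordExpired }, /* PASSWORD_EXPIRED */
	{ 0xC0000224u, ERRSRV, ERRpasswordExpired }, /* PASSWORD_MUST_CHANGE */
};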
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a86bd1c0760..288cc048d37 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -142,6 +142,11 @@ static void fill_in_inode(struct inode *tmp_inode,
142 tmp_inode->i_gid = cifs_sb->mnt_gid; 142 tmp_inode->i_gid = cifs_sb->mnt_gid;
143 /* set default mode. will override for dirs below */ 143 /* set default mode. will override for dirs below */
144 tmp_inode->i_mode = cifs_sb->mnt_file_mode; 144 tmp_inode->i_mode = cifs_sb->mnt_file_mode;
145 } else {
146 /* mask off the type bits since it gets set
147 below and we do not want to get two type
148 bits set */
149 tmp_inode->i_mode &= ~S_IFMT;
145 } 150 }
146 151
147 if (attr & ATTR_DIRECTORY) { 152 if (attr & ATTR_DIRECTORY) {
@@ -152,12 +157,18 @@ static void fill_in_inode(struct inode *tmp_inode,
152 } 157 }
153 tmp_inode->i_mode |= S_IFDIR; 158 tmp_inode->i_mode |= S_IFDIR;
154 } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && 159 } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) &&
155 (attr & ATTR_SYSTEM) && (end_of_file == 0)) { 160 (attr & ATTR_SYSTEM)) {
156 *pobject_type = DT_FIFO; 161 if (end_of_file == 0) {
157 tmp_inode->i_mode |= S_IFIFO; 162 *pobject_type = DT_FIFO;
158/* BB Finish for SFU style symlinks and devies */ 163 tmp_inode->i_mode |= S_IFIFO;
159/* } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && 164 } else {
160 (attr & ATTR_SYSTEM) && ) { */ 165 /* rather than get the type here, we mark the
166 inode as needing revalidate and get the real type
167 (blk vs chr vs. symlink) later ie in lookup */
168 *pobject_type = DT_REG;
169 tmp_inode->i_mode |= S_IFREG;
170 cifsInfo->time = 0;
171 }
161/* we no longer mark these because we could not follow them */ 172/* we no longer mark these because we could not follow them */
162/* } else if (attr & ATTR_REPARSE) { 173/* } else if (attr & ATTR_REPARSE) {
163 *pobject_type = DT_LNK; 174 *pobject_type = DT_LNK;
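[note] The readdir change stops guessing: under SFU emulation an ATTR_SYSTEM file of length zero is a FIFO, but for non-empty system files the real type (char device, block device, or symlink) can only be learned by reading the file's contents, so the entry is reported as a regular file and its cache timestamp zeroed to force a revalidate at lookup time. Sketch:

#include <sys/stat.h>   /* S_IFIFO, S_IFREG, mode_t */

static void classify_sfu(mode_t *mode, unsigned long long end_of_file,
                         unsigned long *cache_time)
{
	if (end_of_file == 0) {
		*mode |= S_IFIFO;   /* SFU marks FIFOs as empty system files */
	} else {
		*mode |= S_IFREG;   /* provisional; real type needs file data */
		*cache_time = 0;    /* time == 0 forces revalidate in lookup */
	}
}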
@@ -193,12 +204,17 @@ static void fill_in_inode(struct inode *tmp_inode,
193 if (S_ISREG(tmp_inode->i_mode)) { 204 if (S_ISREG(tmp_inode->i_mode)) {
194 cFYI(1, ("File inode")); 205 cFYI(1, ("File inode"));
195 tmp_inode->i_op = &cifs_file_inode_ops; 206 tmp_inode->i_op = &cifs_file_inode_ops;
196 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) 207 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
197 tmp_inode->i_fop = &cifs_file_direct_ops; 208 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
209 tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
210 else
211 tmp_inode->i_fop = &cifs_file_direct_ops;
212
213 } else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
214 tmp_inode->i_fop = &cifs_file_nobrl_ops;
198 else 215 else
199 tmp_inode->i_fop = &cifs_file_ops; 216 tmp_inode->i_fop = &cifs_file_ops;
200 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) 217
201 tmp_inode->i_fop->lock = NULL;
202 tmp_inode->i_data.a_ops = &cifs_addr_ops; 218 tmp_inode->i_data.a_ops = &cifs_addr_ops;
203 if((cifs_sb->tcon) && (cifs_sb->tcon->ses) && 219 if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
204 (cifs_sb->tcon->ses->server->maxBuf < 220 (cifs_sb->tcon->ses->server->maxBuf <
@@ -258,6 +274,9 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
258 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange)); 274 cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange));
259 275
260 tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions); 276 tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions);
277 /* since we set the inode type below we need to mask off type
278 to avoid strange results if bits above were corrupt */
279 tmp_inode->i_mode &= ~S_IFMT;
261 if (type == UNIX_FILE) { 280 if (type == UNIX_FILE) {
262 *pobject_type = DT_REG; 281 *pobject_type = DT_REG;
263 tmp_inode->i_mode |= S_IFREG; 282 tmp_inode->i_mode |= S_IFREG;
@@ -283,6 +302,11 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
283 } else if (type == UNIX_SOCKET) { 302 } else if (type == UNIX_SOCKET) {
284 *pobject_type = DT_SOCK; 303 *pobject_type = DT_SOCK;
285 tmp_inode->i_mode |= S_IFSOCK; 304 tmp_inode->i_mode |= S_IFSOCK;
305 } else {
306 /* safest to just call it a file */
307 *pobject_type = DT_REG;
308 tmp_inode->i_mode |= S_IFREG;
309 cFYI(1,("unknown inode type %d",type));
286 } 310 }
287 311
288 tmp_inode->i_uid = le64_to_cpu(pfindData->Uid); 312 tmp_inode->i_uid = le64_to_cpu(pfindData->Uid);
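[note] Both fill_in routines now mask off S_IFMT before or-ing in the freshly determined type, so stale or corrupt mode bits from the server cannot leave two type bits set, and an unrecognized UNIX type degrades to a regular file instead of an inode with no type at all. As a small pure function:

#include <sys/stat.h>

static mode_t apply_type(mode_t server_perms, mode_t known_type_bit)
{
	mode_t mode = server_perms & ~S_IFMT; /* drop old/corrupt type */
	if (known_type_bit)
		mode |= known_type_bit;       /* S_IFDIR, S_IFLNK, ... */
	else
		mode |= S_IFREG;              /* safest default */
	return mode;
}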
@@ -302,12 +326,18 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
302 if (S_ISREG(tmp_inode->i_mode)) { 326 if (S_ISREG(tmp_inode->i_mode)) {
303 cFYI(1, ("File inode")); 327 cFYI(1, ("File inode"));
304 tmp_inode->i_op = &cifs_file_inode_ops; 328 tmp_inode->i_op = &cifs_file_inode_ops;
305 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) 329
306 tmp_inode->i_fop = &cifs_file_direct_ops; 330 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
331 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
332 tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
333 else
334 tmp_inode->i_fop = &cifs_file_direct_ops;
335
336 } else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
337 tmp_inode->i_fop = &cifs_file_nobrl_ops;
307 else 338 else
308 tmp_inode->i_fop = &cifs_file_ops; 339 tmp_inode->i_fop = &cifs_file_ops;
309 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) 340
310 tmp_inode->i_fop->lock = NULL;
311 tmp_inode->i_data.a_ops = &cifs_addr_ops; 341 tmp_inode->i_data.a_ops = &cifs_addr_ops;
312 if((cifs_sb->tcon) && (cifs_sb->tcon->ses) && 342 if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
313 (cifs_sb->tcon->ses->server->maxBuf < 343 (cifs_sb->tcon->ses->server->maxBuf <
@@ -699,7 +729,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
699 (__le16 *)filename, len/2, nlt); 729 (__le16 *)filename, len/2, nlt);
700 else 730 else
701 pqst->len = cifs_strfromUCS_le((char *)pqst->name, 731 pqst->len = cifs_strfromUCS_le((char *)pqst->name,
702 (wchar_t *)filename,len/2,nlt); 732 (__le16 *)filename,len/2,nlt);
703 } else { 733 } else {
704 pqst->name = filename; 734 pqst->name = filename;
705 pqst->len = len; 735 pqst->len = len;
diff --git a/fs/cifs/rfc1002pdu.h b/fs/cifs/rfc1002pdu.h
index 9222033cad8..aede606132a 100644
--- a/fs/cifs/rfc1002pdu.h
+++ b/fs/cifs/rfc1002pdu.h
@@ -24,11 +24,11 @@
24/* NB: unlike smb/cifs packets, the RFC1002 structures are big endian */ 24/* NB: unlike smb/cifs packets, the RFC1002 structures are big endian */
25 25
26 /* RFC 1002 session packet types */ 26 /* RFC 1002 session packet types */
27#define RFC1002_SESSION_MESASAGE 0x00 27#define RFC1002_SESSION_MESSAGE 0x00
28#define RFC1002_SESSION_REQUEST 0x81 28#define RFC1002_SESSION_REQUEST 0x81
29#define RFC1002_POSITIVE_SESSION_RESPONSE 0x82 29#define RFC1002_POSITIVE_SESSION_RESPONSE 0x82
30#define RFC1002_NEGATIVE_SESSION_RESPONSE 0x83 30#define RFC1002_NEGATIVE_SESSION_RESPONSE 0x83
31#define RFC1002_RETARGET_SESSION_RESPONSE 0x83 31#define RFC1002_RETARGET_SESSION_RESPONSE 0x84
32#define RFC1002_SESSION_KEEP_ALIVE 0x85 32#define RFC1002_SESSION_KEEP_ALIVE 0x85
33 33
 34 /* RFC 1002 flags (only one defined) */ 34 /* RFC 1002 flags (only one defined) */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 981ea0d8b9c..7b98792150e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -206,7 +206,6 @@ smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
206 return rc; 206 return rc;
207} 207}
208 208
209#ifdef CONFIG_CIFS_EXPERIMENTAL
210static int 209static int
211smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec, 210smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
212 struct sockaddr *sin) 211 struct sockaddr *sin)
@@ -299,7 +298,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
299 298
300int 299int
301SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, 300SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
302 struct kvec *iov, int n_vec, int *pbytes_returned, 301 struct kvec *iov, int n_vec, int * pRespBufType /* ret */,
303 const int long_op) 302 const int long_op)
304{ 303{
305 int rc = 0; 304 int rc = 0;
@@ -307,6 +306,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
307 unsigned long timeout; 306 unsigned long timeout;
308 struct mid_q_entry *midQ; 307 struct mid_q_entry *midQ;
309 struct smb_hdr *in_buf = iov[0].iov_base; 308 struct smb_hdr *in_buf = iov[0].iov_base;
309
310 *pRespBufType = CIFS_NO_BUFFER; /* no response buf yet */
310 311
311 if (ses == NULL) { 312 if (ses == NULL) {
312 cERROR(1,("Null smb session")); 313 cERROR(1,("Null smb session"));
@@ -392,8 +393,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
392 return -ENOMEM; 393 return -ENOMEM;
393 } 394 }
394 395
395/* BB FIXME */ 396 rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number);
396/* rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number); */
397 397
398 midQ->midState = MID_REQUEST_SUBMITTED; 398 midQ->midState = MID_REQUEST_SUBMITTED;
399#ifdef CONFIG_CIFS_STATS2 399#ifdef CONFIG_CIFS_STATS2
@@ -489,21 +489,23 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
489 receive_len, xid)); 489 receive_len, xid));
490 rc = -EIO; 490 rc = -EIO;
491 } else { /* rcvd frame is ok */ 491 } else { /* rcvd frame is ok */
492
493 if (midQ->resp_buf && 492 if (midQ->resp_buf &&
494 (midQ->midState == MID_RESPONSE_RECEIVED)) { 493 (midQ->midState == MID_RESPONSE_RECEIVED)) {
495 in_buf->smb_buf_length = receive_len;
496 /* BB verify that length would not overrun small buf */
497 memcpy((char *)in_buf + 4,
498 (char *)midQ->resp_buf + 4,
499 receive_len);
500 494
501 dump_smb(in_buf, 80); 495 iov[0].iov_base = (char *)midQ->resp_buf;
496 if(midQ->largeBuf)
497 *pRespBufType = CIFS_LARGE_BUFFER;
498 else
499 *pRespBufType = CIFS_SMALL_BUFFER;
500 iov[0].iov_len = receive_len + 4;
501 iov[1].iov_len = 0;
502
503 dump_smb(midQ->resp_buf, 80);
502 /* convert the length into a more usable form */ 504 /* convert the length into a more usable form */
503 if((receive_len > 24) && 505 if((receive_len > 24) &&
504 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 506 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
505 SECMODE_SIGN_ENABLED))) { 507 SECMODE_SIGN_ENABLED))) {
506 rc = cifs_verify_signature(in_buf, 508 rc = cifs_verify_signature(midQ->resp_buf,
507 ses->server->mac_signing_key, 509 ses->server->mac_signing_key,
508 midQ->sequence_number+1); 510 midQ->sequence_number+1);
509 if(rc) { 511 if(rc) {
@@ -512,17 +514,19 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
512 } 514 }
513 } 515 }
514 516
515 *pbytes_returned = in_buf->smb_buf_length;
516
517 /* BB special case reconnect tid and uid here? */ 517 /* BB special case reconnect tid and uid here? */
518 rc = map_smb_to_linux_error(in_buf); 518 /* BB special case Errbadpassword and pwdexpired here */
519 rc = map_smb_to_linux_error(midQ->resp_buf);
519 520
520 /* convert ByteCount if necessary */ 521 /* convert ByteCount if necessary */
521 if (receive_len >= 522 if (receive_len >=
522 sizeof (struct smb_hdr) - 523 sizeof (struct smb_hdr) -
523 4 /* do not count RFC1001 header */ + 524 4 /* do not count RFC1001 header */ +
524 (2 * in_buf->WordCount) + 2 /* bcc */ ) 525 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
525 BCC(in_buf) = le16_to_cpu(BCC(in_buf)); 526 BCC(midQ->resp_buf) =
527 le16_to_cpu(BCC_LE(midQ->resp_buf));
528 midQ->resp_buf = NULL; /* mark it so will not be freed
529 by DeleteMidQEntry */
526 } else { 530 } else {
527 rc = -EIO; 531 rc = -EIO;
528 cFYI(1,("Bad MID state?")); 532 cFYI(1,("Bad MID state?"));
@@ -548,7 +552,6 @@ out_unlock2:
548 552
549 return rc; 553 return rc;
550} 554}
551#endif /* CIFS_EXPERIMENTAL */
552 555
553int 556int
554SendReceive(const unsigned int xid, struct cifsSesInfo *ses, 557SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
@@ -786,10 +789,10 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
786 sizeof (struct smb_hdr) - 789 sizeof (struct smb_hdr) -
787 4 /* do not count RFC1001 header */ + 790 4 /* do not count RFC1001 header */ +
788 (2 * out_buf->WordCount) + 2 /* bcc */ ) 791 (2 * out_buf->WordCount) + 2 /* bcc */ )
789 BCC(out_buf) = le16_to_cpu(BCC(out_buf)); 792 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
790 } else { 793 } else {
791 rc = -EIO; 794 rc = -EIO;
792 cERROR(1,("Bad MID state? ")); 795 cERROR(1,("Bad MID state?"));
793 } 796 }
794 } 797 }
795cifs_no_response_exit: 798cifs_no_response_exit:
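[note] The SendReceive2 rework removes a whole copy: instead of memcpy-ing the response over the caller's request buffer, the mid queue entry's own receive buffer is handed back through iov[0], together with a tag (pRespBufType) telling the caller which pool to return it to, and the mid entry drops its pointer so DeleteMidQEntry will not free a buffer the caller still owns. It also enables cifs_sign_smb2 for real. A standalone sketch of the handover, with stand-in types:

#include <stddef.h>

enum resp_buf_type { CIFS_NO_BUFFER = 0, CIFS_SMALL_BUFFER,
                     CIFS_LARGE_BUFFER };

struct kvec      { void *iov_base; size_t iov_len; };
struct mid_entry { void *resp_buf; int large_buf; };

static void hand_over_response(struct mid_entry *mid, struct kvec iov[2],
                               size_t receive_len, int *resp_buf_type)
{
	iov[0].iov_base = mid->resp_buf;    /* caller reads in place */
	iov[0].iov_len  = receive_len + 4;  /* include the RFC1001 header */
	iov[1].iov_len  = 0;
	*resp_buf_type  = mid->large_buf ? CIFS_LARGE_BUFFER
	                                 : CIFS_SMALL_BUFFER;
	mid->resp_buf = NULL;               /* teardown must not free it */
}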
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index f375f87c7db..777e3363c2a 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -254,7 +254,8 @@ ssize_t cifs_getxattr(struct dentry * direntry, const char * ea_name,
254 rc = CIFSSMBQueryEA(xid,pTcon,full_path,ea_name,ea_value, 254 rc = CIFSSMBQueryEA(xid,pTcon,full_path,ea_name,ea_value,
255 buf_size, cifs_sb->local_nls, 255 buf_size, cifs_sb->local_nls,
256 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 256 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
257 } else if(strncmp(ea_name,POSIX_ACL_XATTR_ACCESS,strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { 257 } else if(strncmp(ea_name,POSIX_ACL_XATTR_ACCESS,
258 strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
258#ifdef CONFIG_CIFS_POSIX 259#ifdef CONFIG_CIFS_POSIX
259 if(sb->s_flags & MS_POSIXACL) 260 if(sb->s_flags & MS_POSIXACL)
260 rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, 261 rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
@@ -262,10 +263,27 @@ ssize_t cifs_getxattr(struct dentry * direntry, const char * ea_name,
262 cifs_sb->local_nls, 263 cifs_sb->local_nls,
263 cifs_sb->mnt_cifs_flags & 264 cifs_sb->mnt_cifs_flags &
264 CIFS_MOUNT_MAP_SPECIAL_CHR); 265 CIFS_MOUNT_MAP_SPECIAL_CHR);
266/* else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
267 __u16 fid;
268 int oplock = FALSE;
269 rc = CIFSSMBOpen(xid, pTcon, full_path,
270 FILE_OPEN, GENERIC_READ, 0, &fid,
271 &oplock, NULL, cifs_sb->local_nls,
272 cifs_sb->mnt_cifs_flags &
273 CIFS_MOUNT_MAP_SPECIAL_CHR);
274 if(rc == 0) {
275 rc = CIFSSMBGetCIFSACL(xid, pTcon, fid,
276 ea_value, buf_size,
277 ACL_TYPE_ACCESS);
 278 CIFSSMBClose(xid, pTcon, fid);
279 }
280 } */ /* BB enable after fixing up return data */
281
265#else 282#else
266 cFYI(1,("query POSIX ACL not supported yet")); 283 cFYI(1,("query POSIX ACL not supported yet"));
267#endif /* CONFIG_CIFS_POSIX */ 284#endif /* CONFIG_CIFS_POSIX */
268 } else if(strncmp(ea_name,POSIX_ACL_XATTR_DEFAULT,strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { 285 } else if(strncmp(ea_name,POSIX_ACL_XATTR_DEFAULT,
286 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
269#ifdef CONFIG_CIFS_POSIX 287#ifdef CONFIG_CIFS_POSIX
270 if(sb->s_flags & MS_POSIXACL) 288 if(sb->s_flags & MS_POSIXACL)
271 rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, 289 rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 80072fd9b7f..c607d923350 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -93,7 +93,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
93 spin_lock(&dcache_lock); 93 spin_lock(&dcache_lock);
94 list_for_each(child, &parent->d_subdirs) 94 list_for_each(child, &parent->d_subdirs)
95 { 95 {
96 de = list_entry(child, struct dentry, d_child); 96 de = list_entry(child, struct dentry, d_u.d_child);
97 /* don't know what to do with negative dentries */ 97 /* don't know what to do with negative dentries */
98 if ( ! de->d_inode ) 98 if ( ! de->d_inode )
99 continue; 99 continue;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 2391766e9c7..8f1a517f8b4 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -453,7 +453,7 @@ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir)
453 coda_vfs_stat.readdir++; 453 coda_vfs_stat.readdir++;
454 454
455 host_inode = host_file->f_dentry->d_inode; 455 host_inode = host_file->f_dentry->d_inode;
456 down(&host_inode->i_sem); 456 mutex_lock(&host_inode->i_mutex);
457 host_file->f_pos = coda_file->f_pos; 457 host_file->f_pos = coda_file->f_pos;
458 458
459 if (!host_file->f_op->readdir) { 459 if (!host_file->f_op->readdir) {
@@ -475,7 +475,7 @@ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir)
475 } 475 }
476out: 476out:
477 coda_file->f_pos = host_file->f_pos; 477 coda_file->f_pos = host_file->f_pos;
478 up(&host_inode->i_sem); 478 mutex_unlock(&host_inode->i_mutex);
479 479
480 return ret; 480 return ret;
481} 481}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index e6bc022568f..30b4630bd73 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -77,14 +77,14 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo
77 return -EINVAL; 77 return -EINVAL;
78 78
79 host_inode = host_file->f_dentry->d_inode; 79 host_inode = host_file->f_dentry->d_inode;
80 down(&coda_inode->i_sem); 80 mutex_lock(&coda_inode->i_mutex);
81 81
82 ret = host_file->f_op->write(host_file, buf, count, ppos); 82 ret = host_file->f_op->write(host_file, buf, count, ppos);
83 83
84 coda_inode->i_size = host_inode->i_size; 84 coda_inode->i_size = host_inode->i_size;
85 coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; 85 coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
86 coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; 86 coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
87 up(&coda_inode->i_sem); 87 mutex_unlock(&coda_inode->i_mutex);
88 88
89 return ret; 89 return ret;
90} 90}
@@ -272,9 +272,9 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
272 if (host_file->f_op && host_file->f_op->fsync) { 272 if (host_file->f_op && host_file->f_op->fsync) {
273 host_dentry = host_file->f_dentry; 273 host_dentry = host_file->f_dentry;
274 host_inode = host_dentry->d_inode; 274 host_inode = host_dentry->d_inode;
275 down(&host_inode->i_sem); 275 mutex_lock(&host_inode->i_mutex);
276 err = host_file->f_op->fsync(host_file, host_dentry, datasync); 276 err = host_file->f_op->fsync(host_file, host_dentry, datasync);
277 up(&host_inode->i_sem); 277 mutex_unlock(&host_inode->i_mutex);
278 } 278 }
279 279
280 if ( !err && !datasync ) { 280 if ( !err && !datasync ) {
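[note] The coda hunks are part of the tree-wide 2.6.16 conversion of the inode semaphore into a real mutex: every down(&inode->i_sem)/up(&inode->i_sem) pair becomes mutex_lock/mutex_unlock on i_mutex, one for one. Kernel-context sketch of the converted write path:

static ssize_t locked_host_write(struct file *host, struct inode *coda_inode,
                                 const char __user *buf, size_t n,
                                 loff_t *pos)
{
	ssize_t ret;

	mutex_lock(&coda_inode->i_mutex);   /* was down(&...->i_sem) */
	ret = host->f_op->write(host, buf, n, pos);
	mutex_unlock(&coda_inode->i_mutex); /* was up(&...->i_sem) */
	return ret;
}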
diff --git a/fs/compat.c b/fs/compat.c
index 8e71cdbecc7..ff0bafcff72 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -53,6 +53,8 @@
53#include <asm/mmu_context.h> 53#include <asm/mmu_context.h>
54#include <asm/ioctls.h> 54#include <asm/ioctls.h>
55 55
56extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
57
56/* 58/*
57 * Not all architectures have sys_utime, so implement this in terms 59 * Not all architectures have sys_utime, so implement this in terms
58 * of sys_utimes. 60 * of sys_utimes.
@@ -68,10 +70,10 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
68 tv[0].tv_usec = 0; 70 tv[0].tv_usec = 0;
69 tv[1].tv_usec = 0; 71 tv[1].tv_usec = 0;
70 } 72 }
71 return do_utimes(filename, t ? tv : NULL); 73 return do_utimes(AT_FDCWD, filename, t ? tv : NULL);
72} 74}
73 75
74asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t) 76asmlinkage long compat_sys_futimesat(int dfd, char __user *filename, struct compat_timeval __user *t)
75{ 77{
76 struct timeval tv[2]; 78 struct timeval tv[2];
77 79
@@ -82,14 +84,19 @@ asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval _
82 get_user(tv[1].tv_usec, &t[1].tv_usec)) 84 get_user(tv[1].tv_usec, &t[1].tv_usec))
83 return -EFAULT; 85 return -EFAULT;
84 } 86 }
85 return do_utimes(filename, t ? tv : NULL); 87 return do_utimes(dfd, filename, t ? tv : NULL);
88}
89
90asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t)
91{
92 return compat_sys_futimesat(AT_FDCWD, filename, t);
86} 93}
87 94
88asmlinkage long compat_sys_newstat(char __user * filename, 95asmlinkage long compat_sys_newstat(char __user * filename,
89 struct compat_stat __user *statbuf) 96 struct compat_stat __user *statbuf)
90{ 97{
91 struct kstat stat; 98 struct kstat stat;
92 int error = vfs_stat(filename, &stat); 99 int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
93 100
94 if (!error) 101 if (!error)
95 error = cp_compat_stat(&stat, statbuf); 102 error = cp_compat_stat(&stat, statbuf);
@@ -100,13 +107,34 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
100 struct compat_stat __user *statbuf) 107 struct compat_stat __user *statbuf)
101{ 108{
102 struct kstat stat; 109 struct kstat stat;
103 int error = vfs_lstat(filename, &stat); 110 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
104 111
105 if (!error) 112 if (!error)
106 error = cp_compat_stat(&stat, statbuf); 113 error = cp_compat_stat(&stat, statbuf);
107 return error; 114 return error;
108} 115}
109 116
117asmlinkage long compat_sys_newfstatat(int dfd, char __user *filename,
118 struct compat_stat __user *statbuf, int flag)
119{
120 struct kstat stat;
121 int error = -EINVAL;
122
123 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
124 goto out;
125
126 if (flag & AT_SYMLINK_NOFOLLOW)
127 error = vfs_lstat_fd(dfd, filename, &stat);
128 else
129 error = vfs_stat_fd(dfd, filename, &stat);
130
131 if (!error)
132 error = cp_compat_stat(&stat, statbuf);
133
134out:
135 return error;
136}
137
110asmlinkage long compat_sys_newfstat(unsigned int fd, 138asmlinkage long compat_sys_newfstat(unsigned int fd,
111 struct compat_stat __user * statbuf) 139 struct compat_stat __user * statbuf)
112{ 140{
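[note] These compat.c hunks follow the openat()-family conversion pattern: each legacy entry point keeps its ABI but delegates to a new dfd-aware core with AT_FDCWD, and the new *at syscalls validate their flag word up front (compat_sys_newfstatat rejects anything but AT_SYMLINK_NOFOLLOW). The shape of the pattern, with hypothetical names:

#define AT_FDCWD (-100)   /* ABI value: resolve relative to the cwd */

/* stand-in for the real pairs (utimes/futimesat, newstat/newfstatat,
 * open/openat, ...) */
static long do_thing_at(int dfd, const char *path, int flags)
{
	(void)dfd; (void)path; (void)flags;
	return 0;                       /* the real lookup work goes here */
}

static long sys_thing(const char *path) /* legacy entry, unchanged ABI */
{
	return do_thing_at(AT_FDCWD, path, 0);
}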
@@ -168,8 +196,8 @@ asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs
168 if (!error) { 196 if (!error) {
169 struct kstatfs tmp; 197 struct kstatfs tmp;
170 error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp); 198 error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
171 if (!error && put_compat_statfs(buf, &tmp)) 199 if (!error)
172 error = -EFAULT; 200 error = put_compat_statfs(buf, &tmp);
173 path_release(&nd); 201 path_release(&nd);
174 } 202 }
175 return error; 203 return error;
@@ -186,8 +214,8 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
186 if (!file) 214 if (!file)
187 goto out; 215 goto out;
188 error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp); 216 error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
189 if (!error && put_compat_statfs(buf, &tmp)) 217 if (!error)
190 error = -EFAULT; 218 error = put_compat_statfs(buf, &tmp);
191 fput(file); 219 fput(file);
192out: 220out:
193 return error; 221 return error;
@@ -236,8 +264,8 @@ asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, s
236 if (!error) { 264 if (!error) {
237 struct kstatfs tmp; 265 struct kstatfs tmp;
238 error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp); 266 error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
239 if (!error && put_compat_statfs64(buf, &tmp)) 267 if (!error)
240 error = -EFAULT; 268 error = put_compat_statfs64(buf, &tmp);
241 path_release(&nd); 269 path_release(&nd);
242 } 270 }
243 return error; 271 return error;
@@ -257,8 +285,8 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
257 if (!file) 285 if (!file)
258 goto out; 286 goto out;
259 error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp); 287 error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
260 if (!error && put_compat_statfs64(buf, &tmp)) 288 if (!error)
261 error = -EFAULT; 289 error = put_compat_statfs64(buf, &tmp);
262 fput(file); 290 fput(file);
263out: 291out:
264 return error; 292 return error;
@@ -268,7 +296,6 @@ out:
268 296
269#define IOCTL_HASHSIZE 256 297#define IOCTL_HASHSIZE 256
270static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE]; 298static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE];
271static DECLARE_RWSEM(ioctl32_sem);
272 299
273extern struct ioctl_trans ioctl_start[]; 300extern struct ioctl_trans ioctl_start[];
274extern int ioctl_table_size; 301extern int ioctl_table_size;
@@ -390,14 +417,10 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
390 break; 417 break;
391 } 418 }
392 419
393 /* When register_ioctl32_conversion is finally gone remove
394 this lock! -AK */
395 down_read(&ioctl32_sem);
396 for (t = ioctl32_hash_table[ioctl32_hash(cmd)]; t; t = t->next) { 420 for (t = ioctl32_hash_table[ioctl32_hash(cmd)]; t; t = t->next) {
397 if (t->cmd == cmd) 421 if (t->cmd == cmd)
398 goto found_handler; 422 goto found_handler;
399 } 423 }
400 up_read(&ioctl32_sem);
401 424
402 if (S_ISSOCK(filp->f_dentry->d_inode->i_mode) && 425 if (S_ISSOCK(filp->f_dentry->d_inode->i_mode) &&
403 cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) { 426 cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
@@ -417,11 +440,9 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
417 lock_kernel(); 440 lock_kernel();
418 error = t->handler(fd, cmd, arg, filp); 441 error = t->handler(fd, cmd, arg, filp);
419 unlock_kernel(); 442 unlock_kernel();
420 up_read(&ioctl32_sem);
421 goto out_fput; 443 goto out_fput;
422 } 444 }
423 445
424 up_read(&ioctl32_sem);
425 do_ioctl: 446 do_ioctl:
426 error = vfs_ioctl(filp, fd, cmd, arg); 447 error = vfs_ioctl(filp, fd, cmd, arg);
427 out_fput: 448 out_fput:
@@ -501,9 +522,21 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
501 ret = sys_fcntl(fd, cmd, (unsigned long)&f); 522 ret = sys_fcntl(fd, cmd, (unsigned long)&f);
502 set_fs(old_fs); 523 set_fs(old_fs);
503 if (cmd == F_GETLK && ret == 0) { 524 if (cmd == F_GETLK && ret == 0) {
 504 if ((f.l_start >= COMPAT_OFF_T_MAX) || 525 /* GETLK was successful and we need to return the data...
505 ((f.l_start + f.l_len) > COMPAT_OFF_T_MAX)) 526 * but it needs to fit in the compat structure.
527 * l_start shouldn't be too big, unless the original
528 * start + end is greater than COMPAT_OFF_T_MAX, in which
529 * case the app was asking for trouble, so we return
530 * -EOVERFLOW in that case.
531 * l_len could be too big, in which case we just truncate it,
532 * and only allow the app to see that part of the conflicting
533 * lock that might make sense to it anyway
534 */
535
536 if (f.l_start > COMPAT_OFF_T_MAX)
506 ret = -EOVERFLOW; 537 ret = -EOVERFLOW;
538 if (f.l_len > COMPAT_OFF_T_MAX)
539 f.l_len = COMPAT_OFF_T_MAX;
507 if (ret == 0) 540 if (ret == 0)
508 ret = put_compat_flock(&f, compat_ptr(arg)); 541 ret = put_compat_flock(&f, compat_ptr(arg));
509 } 542 }
@@ -522,9 +555,11 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
522 (unsigned long)&f); 555 (unsigned long)&f);
523 set_fs(old_fs); 556 set_fs(old_fs);
524 if (cmd == F_GETLK64 && ret == 0) { 557 if (cmd == F_GETLK64 && ret == 0) {
525 if ((f.l_start >= COMPAT_LOFF_T_MAX) || 558 /* need to return lock information - see above for commentary */
526 ((f.l_start + f.l_len) > COMPAT_LOFF_T_MAX)) 559 if (f.l_start > COMPAT_LOFF_T_MAX)
527 ret = -EOVERFLOW; 560 ret = -EOVERFLOW;
561 if (f.l_len > COMPAT_LOFF_T_MAX)
562 f.l_len = COMPAT_LOFF_T_MAX;
528 if (ret == 0) 563 if (ret == 0)
529 ret = put_compat_flock64(&f, compat_ptr(arg)); 564 ret = put_compat_flock64(&f, compat_ptr(arg));
530 } 565 }
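[note] The fcntl64 fix separates two overflow cases that the old code conflated: a conflicting lock whose start offset cannot be represented in a 32-bit off_t is a hard -EOVERFLOW, but a lock that merely extends too far is truncated so the application still sees the part it can act on. Sketch, with -1 standing in for -EOVERFLOW:

#include <stdint.h>

#define COMPAT_OFF_T_MAX 0x7fffffff     /* largest 32-bit signed off_t */

static int clamp_flock(int64_t *l_start, int64_t *l_len)
{
	if (*l_start > COMPAT_OFF_T_MAX)
		return -1;                  /* start itself unrepresentable */
	if (*l_len > COMPAT_OFF_T_MAX)
		*l_len = COMPAT_OFF_T_MAX;  /* show the part that fits */
	return 0;
}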
@@ -1177,7 +1212,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1177 } 1212 }
1178 1213
1179 ret = rw_verify_area(type, file, pos, tot_len); 1214 ret = rw_verify_area(type, file, pos, tot_len);
1180 if (ret) 1215 if (ret < 0)
1181 goto out; 1216 goto out;
1182 1217
1183 fnv = NULL; 1218 fnv = NULL;
@@ -1283,7 +1318,17 @@ out:
1283asmlinkage long 1318asmlinkage long
1284compat_sys_open(const char __user *filename, int flags, int mode) 1319compat_sys_open(const char __user *filename, int flags, int mode)
1285{ 1320{
1286 return do_sys_open(filename, flags, mode); 1321 return do_sys_open(AT_FDCWD, filename, flags, mode);
1322}
1323
1324/*
1325 * Exactly like fs/open.c:sys_openat(), except that it doesn't set the
1326 * O_LARGEFILE flag.
1327 */
1328asmlinkage long
1329compat_sys_openat(int dfd, const char __user *filename, int flags, int mode)
1330{
1331 return do_sys_open(dfd, filename, flags, mode);
1287} 1332}
1288 1333
1289/* 1334/*
@@ -1530,7 +1575,7 @@ out_ret:
1530 * Ooo, nasty. We need here to frob 32-bit unsigned longs to 1575 * Ooo, nasty. We need here to frob 32-bit unsigned longs to
1531 * 64-bit unsigned longs. 1576 * 64-bit unsigned longs.
1532 */ 1577 */
1533static inline 1578static
1534int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, 1579int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
1535 unsigned long *fdset) 1580 unsigned long *fdset)
1536{ 1581{
@@ -1563,7 +1608,7 @@ int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
1563 return 0; 1608 return 0;
1564} 1609}
1565 1610
1566static inline 1611static
1567void compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, 1612void compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
1568 unsigned long *fdset) 1613 unsigned long *fdset)
1569{ 1614{
@@ -1614,36 +1659,14 @@ static void select_bits_free(void *bits, int size)
1614#define MAX_SELECT_SECONDS \ 1659#define MAX_SELECT_SECONDS \
1615 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) 1660 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
1616 1661
1617asmlinkage long 1662int compat_core_sys_select(int n, compat_ulong_t __user *inp,
1618compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, 1663 compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout)
1619 compat_ulong_t __user *exp, struct compat_timeval __user *tvp)
1620{ 1664{
1621 fd_set_bits fds; 1665 fd_set_bits fds;
1622 char *bits; 1666 char *bits;
1623 long timeout;
1624 int size, max_fdset, ret = -EINVAL; 1667 int size, max_fdset, ret = -EINVAL;
1625 struct fdtable *fdt; 1668 struct fdtable *fdt;
1626 1669
1627 timeout = MAX_SCHEDULE_TIMEOUT;
1628 if (tvp) {
1629 time_t sec, usec;
1630
1631 if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp))
1632 || __get_user(sec, &tvp->tv_sec)
1633 || __get_user(usec, &tvp->tv_usec)) {
1634 ret = -EFAULT;
1635 goto out_nofds;
1636 }
1637
1638 if (sec < 0 || usec < 0)
1639 goto out_nofds;
1640
1641 if ((unsigned long) sec < MAX_SELECT_SECONDS) {
1642 timeout = ROUND_UP(usec, 1000000/HZ);
1643 timeout += sec * (unsigned long) HZ;
1644 }
1645 }
1646
1647 if (n < 0) 1670 if (n < 0)
1648 goto out_nofds; 1671 goto out_nofds;
1649 1672
@@ -1680,19 +1703,7 @@ compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp
1680 zero_fd_set(n, fds.res_out); 1703 zero_fd_set(n, fds.res_out);
1681 zero_fd_set(n, fds.res_ex); 1704 zero_fd_set(n, fds.res_ex);
1682 1705
1683 ret = do_select(n, &fds, &timeout); 1706 ret = do_select(n, &fds, timeout);
1684
1685 if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
1686 time_t sec = 0, usec = 0;
1687 if (timeout) {
1688 sec = timeout / HZ;
1689 usec = timeout % HZ;
1690 usec *= (1000000/HZ);
1691 }
1692 if (put_user(sec, &tvp->tv_sec) ||
1693 put_user(usec, &tvp->tv_usec))
1694 ret = -EFAULT;
1695 }
1696 1707
1697 if (ret < 0) 1708 if (ret < 0)
1698 goto out; 1709 goto out;
@@ -1713,6 +1724,224 @@ out_nofds:
1713 return ret; 1724 return ret;
1714} 1725}
1715 1726
1727asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
1728 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
1729 struct compat_timeval __user *tvp)
1730{
1731 s64 timeout = -1;
1732 struct compat_timeval tv;
1733 int ret;
1734
1735 if (tvp) {
1736 if (copy_from_user(&tv, tvp, sizeof(tv)))
1737 return -EFAULT;
1738
1739 if (tv.tv_sec < 0 || tv.tv_usec < 0)
1740 return -EINVAL;
1741
1742 /* Cast to u64 to make GCC stop complaining */
1743 if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
1744 timeout = -1; /* infinite */
1745 else {
1746 timeout = ROUND_UP(tv.tv_usec, 1000000/HZ);
1747 timeout += tv.tv_sec * HZ;
1748 }
1749 }
1750
1751 ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
1752
1753 if (tvp) {
1754 if (current->personality & STICKY_TIMEOUTS)
1755 goto sticky;
1756 tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
1757 tv.tv_sec = timeout;
1758 if (copy_to_user(tvp, &tv, sizeof(tv))) {
1759sticky:
1760 /*
1761 * If an application puts its timeval in read-only
1762 * memory, we don't want the Linux-specific update to
1763 * the timeval to cause a fault after the select has
1764 * completed successfully. However, because we're not
1765 * updating the timeval, we can't restart the system
1766 * call.
1767 */
1768 if (ret == -ERESTARTNOHAND)
1769 ret = -EINTR;
1770 }
1771 }
1772
1773 return ret;
1774}
1775
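[note] compat_sys_select is now a thin parser around compat_core_sys_select, which takes the timeout as a signed 64-bit jiffies count (-1 meaning infinite) and decrements it in place across the wait; on the way out the remainder is converted back to a timeval unless the STICKY_TIMEOUTS personality forbids touching it. The two unit conversions as standalone helpers, assuming HZ divides 1000000 evenly (true for the usual 100, 250 and 1000):

static long long timeval_to_jiffies64(long sec, long usec, int hz)
{
	int usec_per_tick = 1000000 / hz;
	/* the kernel's ROUND_UP here is a rounding-up division */
	return (long long)sec * hz
	       + ((long long)usec + usec_per_tick - 1) / usec_per_tick;
}

static void jiffies64_to_timeval(long long j, int hz, long *sec, long *usec)
{
	*sec  = j / hz;                     /* whole seconds */
	*usec = (j % hz) * (1000000 / hz);  /* remainder back to usecs */
}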
1776#ifdef TIF_RESTORE_SIGMASK
1777asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
1778 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
1779 struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
1780 compat_size_t sigsetsize)
1781{
1782 compat_sigset_t ss32;
1783 sigset_t ksigmask, sigsaved;
1784 long timeout = MAX_SCHEDULE_TIMEOUT;
1785 struct compat_timespec ts;
1786 int ret;
1787
1788 if (tsp) {
1789 if (copy_from_user(&ts, tsp, sizeof(ts)))
1790 return -EFAULT;
1791
1792 if (ts.tv_sec < 0 || ts.tv_nsec < 0)
1793 return -EINVAL;
1794 }
1795
1796 if (sigmask) {
1797 if (sigsetsize != sizeof(compat_sigset_t))
1798 return -EINVAL;
1799 if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
1800 return -EFAULT;
1801 sigset_from_compat(&ksigmask, &ss32);
1802
1803 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
1804 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
1805 }
1806
1807 do {
1808 if (tsp) {
1809 if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) {
1810 timeout = ROUND_UP(ts.tv_nsec, 1000000000/HZ);
1811 timeout += ts.tv_sec * (unsigned long)HZ;
1812 ts.tv_sec = 0;
1813 ts.tv_nsec = 0;
1814 } else {
1815 ts.tv_sec -= MAX_SELECT_SECONDS;
1816 timeout = MAX_SELECT_SECONDS * HZ;
1817 }
1818 }
1819
1820 ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
1821
1822 } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
1823
1824 if (tsp && !(current->personality & STICKY_TIMEOUTS)) {
1825 ts.tv_sec += timeout / HZ;
1826 ts.tv_nsec += (timeout % HZ) * (1000000000/HZ);
1827 if (ts.tv_nsec >= 1000000000) {
1828 ts.tv_sec++;
1829 ts.tv_nsec -= 1000000000;
1830 }
1831 (void)copy_to_user(tsp, &ts, sizeof(ts));
1832 }
1833
1834 if (ret == -ERESTARTNOHAND) {
1835 /*
1836 * Don't restore the signal mask yet. Let do_signal() deliver
1837 * the signal on the way back to userspace, before the signal
1838 * mask is restored.
1839 */
1840 if (sigmask) {
1841 memcpy(&current->saved_sigmask, &sigsaved,
1842 sizeof(sigsaved));
1843 set_thread_flag(TIF_RESTORE_SIGMASK);
1844 }
1845 } else if (sigmask)
1846 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1847
1848 return ret;
1849}
1850
1851asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
1852 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
1853 struct compat_timespec __user *tsp, void __user *sig)
1854{
1855 compat_size_t sigsetsize = 0;
1856 compat_uptr_t up = 0;
1857
1858 if (sig) {
1859 if (!access_ok(VERIFY_READ, sig,
1860 sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
1861 __get_user(up, (compat_uptr_t __user *)sig) ||
1862 __get_user(sigsetsize,
1863 (compat_size_t __user *)(sig+sizeof(up))))
1864 return -EFAULT;
1865 }
1866 return compat_sys_pselect7(n, inp, outp, exp, tsp, compat_ptr(up),
1867 sigsetsize);
1868}
1869
1870asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
1871 unsigned int nfds, struct compat_timespec __user *tsp,
1872 const compat_sigset_t __user *sigmask, compat_size_t sigsetsize)
1873{
1874 compat_sigset_t ss32;
1875 sigset_t ksigmask, sigsaved;
1876 struct compat_timespec ts;
1877 s64 timeout = -1;
1878 int ret;
1879
1880 if (tsp) {
1881 if (copy_from_user(&ts, tsp, sizeof(ts)))
1882 return -EFAULT;
1883
1884 /* We assume that ts.tv_sec is always lower than
1885 the number of seconds that can be expressed in
1886 an s64. Otherwise the compiler bitches at us */
1887 timeout = ROUND_UP(ts.tv_nsec, 1000000000/HZ);
1888 timeout += ts.tv_sec * HZ;
1889 }
1890
1891 if (sigmask) {
 1892 if (sigsetsize != sizeof(compat_sigset_t))
1893 return -EINVAL;
1894 if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
1895 return -EFAULT;
1896 sigset_from_compat(&ksigmask, &ss32);
1897
1898 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
1899 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
1900 }
1901
1902 ret = do_sys_poll(ufds, nfds, &timeout);
1903
1904 /* We can restart this syscall, usually */
1905 if (ret == -EINTR) {
1906 /*
1907 * Don't restore the signal mask yet. Let do_signal() deliver
1908 * the signal on the way back to userspace, before the signal
1909 * mask is restored.
1910 */
1911 if (sigmask) {
1912 memcpy(&current->saved_sigmask, &sigsaved,
1913 sizeof(sigsaved));
1914 set_thread_flag(TIF_RESTORE_SIGMASK);
1915 }
1916 ret = -ERESTARTNOHAND;
1917 } else if (sigmask)
1918 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1919
1920 if (tsp && timeout >= 0) {
1921 if (current->personality & STICKY_TIMEOUTS)
1922 goto sticky;
1923 /* Yes, we know it's actually an s64, but it's also positive. */
1924 ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
1925 ts.tv_sec = timeout;
1926 if (copy_to_user(tsp, &ts, sizeof(ts))) {
1927sticky:
1928 /*
1929 * If an application puts its timeval in read-only
1930 * memory, we don't want the Linux-specific update to
1931 * the timeval to cause a fault after the select has
1932 * completed successfully. However, because we're not
1933 * updating the timeval, we can't restart the system
1934 * call.
1935 */
1936 if (ret == -ERESTARTNOHAND && timeout >= 0)
1937 ret = -EINTR;
1938 }
1939 }
1940
1941 return ret;
1942}
1943#endif /* TIF_RESTORE_SIGMASK */
1944
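[note] compat_sys_pselect7 and compat_sys_ppoll share one signal-mask protocol: install the caller's mask (minus SIGKILL/SIGSTOP) for the duration of the wait, and if the wait is interrupted, do not restore the old mask yet; stash it in current->saved_sigmask, raise TIF_RESTORE_SIGMASK so do_signal() restores it only after the handler has run under the temporary mask, and return -ERESTARTNOHAND. Condensed kernel-context sketch, not a standalone program:

static int poll_with_mask(struct pollfd __user *ufds, unsigned int nfds,
                          s64 *timeout, sigset_t *ksigmask)
{
	sigset_t sigsaved;
	int ret;

	sigdelsetmask(ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
	sigprocmask(SIG_SETMASK, ksigmask, &sigsaved);

	ret = do_sys_poll(ufds, nfds, timeout);

	if (ret == -EINTR) {
		/* restore only after delivery, so the handler observes
		 * the caller-supplied mask */
		memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved));
		set_thread_flag(TIF_RESTORE_SIGMASK);
		ret = -ERESTARTNOHAND;
	} else {
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
	}
	return ret;
}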
1716#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) 1945#if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)
1717/* Stuff for NFS server syscalls... */ 1946/* Stuff for NFS server syscalls... */
1718struct compat_nfsctl_svc { 1947struct compat_nfsctl_svc {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 26300fccb4f..5dd0207ffd4 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -10,11 +10,11 @@
10 * ioctls. 10 * ioctls.
11 */ 11 */
12 12
13#ifdef INCLUDES
14#include <linux/config.h> 13#include <linux/config.h>
15#include <linux/types.h> 14#include <linux/types.h>
16#include <linux/compat.h> 15#include <linux/compat.h>
17#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/capability.h>
18#include <linux/compiler.h> 18#include <linux/compiler.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/smp.h> 20#include <linux/smp.h>
@@ -49,6 +49,8 @@
49#include <linux/vt_kern.h> 49#include <linux/vt_kern.h>
50#include <linux/fb.h> 50#include <linux/fb.h>
51#include <linux/ext2_fs.h> 51#include <linux/ext2_fs.h>
52#include <linux/ext3_jbd.h>
53#include <linux/ext3_fs.h>
52#include <linux/videodev.h> 54#include <linux/videodev.h>
53#include <linux/netdevice.h> 55#include <linux/netdevice.h>
54#include <linux/raw.h> 56#include <linux/raw.h>
@@ -79,13 +81,9 @@
79#include <linux/capi.h> 81#include <linux/capi.h>
80 82
81#include <scsi/scsi.h> 83#include <scsi/scsi.h>
82/* Ugly hack. */
83#undef __KERNEL__
84#include <scsi/scsi_ioctl.h> 84#include <scsi/scsi_ioctl.h>
85#define __KERNEL__
86#include <scsi/sg.h> 85#include <scsi/sg.h>
87 86
88#include <asm/types.h>
89#include <asm/uaccess.h> 87#include <asm/uaccess.h>
90#include <linux/ethtool.h> 88#include <linux/ethtool.h>
91#include <linux/mii.h> 89#include <linux/mii.h>
@@ -93,7 +91,6 @@
93#include <linux/watchdog.h> 91#include <linux/watchdog.h>
94#include <linux/dm-ioctl.h> 92#include <linux/dm-ioctl.h>
95 93
96#include <asm/module.h>
97#include <linux/soundcard.h> 94#include <linux/soundcard.h>
98#include <linux/lp.h> 95#include <linux/lp.h>
99#include <linux/ppdev.h> 96#include <linux/ppdev.h>
@@ -121,17 +118,33 @@
121 118
122#include <linux/hiddev.h> 119#include <linux/hiddev.h>
123 120
124#undef INCLUDES 121#include <linux/dvb/audio.h>
125#endif 122#include <linux/dvb/dmx.h>
126 123#include <linux/dvb/frontend.h>
127#ifdef CODE 124#include <linux/dvb/video.h>
125#include <linux/lp.h>
128 126
129/* Aiee. Someone does not find a difference between int and long */ 127/* Aiee. Someone does not find a difference between int and long */
130#define EXT2_IOC32_GETFLAGS _IOR('f', 1, int) 128#define EXT2_IOC32_GETFLAGS _IOR('f', 1, int)
131#define EXT2_IOC32_SETFLAGS _IOW('f', 2, int) 129#define EXT2_IOC32_SETFLAGS _IOW('f', 2, int)
130#define EXT3_IOC32_GETVERSION _IOR('f', 3, int)
131#define EXT3_IOC32_SETVERSION _IOW('f', 4, int)
132#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int)
133#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int)
134#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
135#ifdef CONFIG_JBD_DEBUG
136#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
137#endif
138
132#define EXT2_IOC32_GETVERSION _IOR('v', 1, int) 139#define EXT2_IOC32_GETVERSION _IOR('v', 1, int)
133#define EXT2_IOC32_SETVERSION _IOW('v', 2, int) 140#define EXT2_IOC32_SETVERSION _IOW('v', 2, int)
134 141
142static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd,
143 unsigned long arg, struct file *f)
144{
145 return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
146}
147
135static int w_long(unsigned int fd, unsigned int cmd, unsigned long arg) 148static int w_long(unsigned int fd, unsigned int cmd, unsigned long arg)
136{ 149{
137 mm_segment_t old_fs = get_fs(); 150 mm_segment_t old_fs = get_fs();
@@ -175,241 +188,141 @@ static int do_ext2_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
 }
 
-struct video_tuner32 {
-	compat_int_t tuner;
-	char name[32];
-	compat_ulong_t rangelow, rangehigh;
-	u32 flags;	/* It is really u32 in videodev.h */
-	u16 mode, signal;
-};
-
-static int get_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up)
-{
-	int i;
-
-	if(get_user(kp->tuner, &up->tuner))
-		return -EFAULT;
-	for(i = 0; i < 32; i++)
-		__get_user(kp->name[i], &up->name[i]);
-	__get_user(kp->rangelow, &up->rangelow);
-	__get_user(kp->rangehigh, &up->rangehigh);
-	__get_user(kp->flags, &up->flags);
-	__get_user(kp->mode, &up->mode);
-	__get_user(kp->signal, &up->signal);
-	return 0;
-}
-
-static int put_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up)
+static int do_ext3_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
-	int i;
-
-	if(put_user(kp->tuner, &up->tuner))
-		return -EFAULT;
-	for(i = 0; i < 32; i++)
-		__put_user(kp->name[i], &up->name[i]);
-	__put_user(kp->rangelow, &up->rangelow);
-	__put_user(kp->rangehigh, &up->rangehigh);
-	__put_user(kp->flags, &up->flags);
-	__put_user(kp->mode, &up->mode);
-	__put_user(kp->signal, &up->signal);
-	return 0;
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case EXT3_IOC32_GETVERSION: cmd = EXT3_IOC_GETVERSION; break;
+	case EXT3_IOC32_SETVERSION: cmd = EXT3_IOC_SETVERSION; break;
+	case EXT3_IOC32_GETRSVSZ: cmd = EXT3_IOC_GETRSVSZ; break;
+	case EXT3_IOC32_SETRSVSZ: cmd = EXT3_IOC_SETRSVSZ; break;
+	case EXT3_IOC32_GROUP_EXTEND: cmd = EXT3_IOC_GROUP_EXTEND; break;
+#ifdef CONFIG_JBD_DEBUG
+	case EXT3_IOC32_WAIT_FOR_READONLY: cmd = EXT3_IOC_WAIT_FOR_READONLY; break;
+#endif
+	}
+	return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
 }
 
-struct video_buffer32 {
-	compat_caddr_t base;
-	compat_int_t height, width, depth, bytesperline;
+struct compat_dmx_event {
+	dmx_event_t event;
+	compat_time_t timeStamp;
+	union
+	{
+		dmx_scrambling_status_t scrambling;
+	} u;
 };
 
-static int get_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up)
+static int do_dmx_get_event(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
-	u32 tmp;
-
-	if (get_user(tmp, &up->base))
-		return -EFAULT;
+	struct dmx_event kevent;
+	mm_segment_t old_fs = get_fs();
+	int err;
 
-	/* This is actually a physical address stored
-	 * as a void pointer.
-	 */
-	kp->base = (void *)(unsigned long) tmp;
+	set_fs(KERNEL_DS);
+	err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
+	set_fs(old_fs);
 
-	__get_user(kp->height, &up->height);
-	__get_user(kp->width, &up->width);
-	__get_user(kp->depth, &up->depth);
-	__get_user(kp->bytesperline, &up->bytesperline);
-	return 0;
-}
+	if (!err) {
+		struct compat_dmx_event __user *up = compat_ptr(arg);
 
-static int put_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up)
-{
-	u32 tmp = (u32)((unsigned long)kp->base);
+		err = put_user(kevent.event, &up->event);
+		err |= put_user(kevent.timeStamp, &up->timeStamp);
+		err |= put_user(kevent.u.scrambling, &up->u.scrambling);
+		if (err)
+			err = -EFAULT;
+	}
 
-	if(put_user(tmp, &up->base))
-		return -EFAULT;
-	__put_user(kp->height, &up->height);
-	__put_user(kp->width, &up->width);
-	__put_user(kp->depth, &up->depth);
-	__put_user(kp->bytesperline, &up->bytesperline);
-	return 0;
+	return err;
 }
 
-struct video_clip32 {
-	s32 x, y, width, height;	/* Its really s32 in videodev.h */
-	compat_caddr_t next;
-};
-
-struct video_window32 {
-	u32 x, y, width, height, chromakey, flags;
-	compat_caddr_t clips;
-	compat_int_t clipcount;
+struct compat_video_event {
+	int32_t type;
+	compat_time_t timestamp;
+	union {
+		video_size_t size;
+		unsigned int frame_rate;
+	} u;
 };
 
-/* You get back everything except the clips... */
-static int put_video_window32(struct video_window *kp, struct video_window32 __user *up)
+static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
-	if(put_user(kp->x, &up->x))
-		return -EFAULT;
-	__put_user(kp->y, &up->y);
-	__put_user(kp->width, &up->width);
-	__put_user(kp->height, &up->height);
-	__put_user(kp->chromakey, &up->chromakey);
-	__put_user(kp->flags, &up->flags);
-	__put_user(kp->clipcount, &up->clipcount);
-	return 0;
-}
+	struct video_event kevent;
+	mm_segment_t old_fs = get_fs();
+	int err;
 
-#define VIDIOCGTUNER32	_IOWR('v',4, struct video_tuner32)
-#define VIDIOCSTUNER32	_IOW('v',5, struct video_tuner32)
-#define VIDIOCGWIN32	_IOR('v',9, struct video_window32)
-#define VIDIOCSWIN32	_IOW('v',10, struct video_window32)
-#define VIDIOCGFBUF32	_IOR('v',11, struct video_buffer32)
-#define VIDIOCSFBUF32	_IOW('v',12, struct video_buffer32)
-#define VIDIOCGFREQ32	_IOR('v',14, u32)
-#define VIDIOCSFREQ32	_IOW('v',15, u32)
+	set_fs(KERNEL_DS);
+	err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
+	set_fs(old_fs);
 
-enum {
-	MaxClips = (~0U-sizeof(struct video_window))/sizeof(struct video_clip)
+	if (!err) {
+		struct compat_video_event __user *up = compat_ptr(arg);
+
+		err = put_user(kevent.type, &up->type);
+		err |= put_user(kevent.timestamp, &up->timestamp);
+		err |= put_user(kevent.u.size.w, &up->u.size.w);
+		err |= put_user(kevent.u.size.h, &up->u.size.h);
+		err |= put_user(kevent.u.size.aspect_ratio,
+				&up->u.size.aspect_ratio);
+		if (err)
+			err = -EFAULT;
+	}
+
+	return err;
+}
+
+struct compat_video_still_picture {
+	compat_uptr_t iFrame;
+	int32_t size;
 };
 
-static int do_set_window(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int do_video_stillpicture(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
-	struct video_window32 __user *up = compat_ptr(arg);
-	struct video_window __user *vw;
-	struct video_clip __user *p;
-	int nclips;
-	u32 n;
+	struct compat_video_still_picture __user *up;
+	struct video_still_picture __user *up_native;
+	compat_uptr_t fp;
+	int32_t size;
+	int err;
 
-	if (get_user(nclips, &up->clipcount))
+	up = (struct compat_video_still_picture __user *) arg;
+	err = get_user(fp, &up->iFrame);
+	err |= get_user(size, &up->size);
+	if (err)
 		return -EFAULT;
 
-	/* Peculiar interface... */
-	if (nclips < 0)
-		nclips = VIDEO_CLIPMAP_SIZE;
-
-	if (nclips > MaxClips)
-		return -ENOMEM;
+	up_native =
+		compat_alloc_user_space(sizeof(struct video_still_picture));
 
-	vw = compat_alloc_user_space(sizeof(struct video_window) +
-				     nclips * sizeof(struct video_clip));
+	put_user(compat_ptr(fp), &up_native->iFrame);
+	put_user(size, &up_native->size);
 
-	p = nclips ? (struct video_clip __user *)(vw + 1) : NULL;
+	err = sys_ioctl(fd, cmd, (unsigned long) up_native);
 
-	if (get_user(n, &up->x) || put_user(n, &vw->x) ||
-	    get_user(n, &up->y) || put_user(n, &vw->y) ||
-	    get_user(n, &up->width) || put_user(n, &vw->width) ||
-	    get_user(n, &up->height) || put_user(n, &vw->height) ||
-	    get_user(n, &up->chromakey) || put_user(n, &vw->chromakey) ||
-	    get_user(n, &up->flags) || put_user(n, &vw->flags) ||
-	    get_user(n, &up->clipcount) || put_user(n, &vw->clipcount) ||
-	    get_user(n, &up->clips) || put_user(p, &vw->clips))
-		return -EFAULT;
-
-	if (nclips) {
-		struct video_clip32 __user *u = compat_ptr(n);
-		int i;
-		if (!u)
-			return -EINVAL;
-		for (i = 0; i < nclips; i++, u++, p++) {
-			s32 v;
-			if (get_user(v, &u->x) ||
-			    put_user(v, &p->x) ||
-			    get_user(v, &u->y) ||
-			    put_user(v, &p->y) ||
-			    get_user(v, &u->width) ||
-			    put_user(v, &p->width) ||
-			    get_user(v, &u->height) ||
-			    put_user(v, &p->height) ||
-			    put_user(NULL, &p->next))
-				return -EFAULT;
-		}
-	}
-
-	return sys_ioctl(fd, VIDIOCSWIN, (unsigned long)p);
+	return err;
 }
 
-static int do_video_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	union {
-		struct video_tuner vt;
-		struct video_buffer vb;
-		struct video_window vw;
-		unsigned long vx;
-	} karg;
-	mm_segment_t old_fs = get_fs();
-	void __user *up = compat_ptr(arg);
-	int err = 0;
-
-	/* First, convert the command. */
-	switch(cmd) {
-	case VIDIOCGTUNER32: cmd = VIDIOCGTUNER; break;
-	case VIDIOCSTUNER32: cmd = VIDIOCSTUNER; break;
-	case VIDIOCGWIN32: cmd = VIDIOCGWIN; break;
-	case VIDIOCGFBUF32: cmd = VIDIOCGFBUF; break;
-	case VIDIOCSFBUF32: cmd = VIDIOCSFBUF; break;
-	case VIDIOCGFREQ32: cmd = VIDIOCGFREQ; break;
-	case VIDIOCSFREQ32: cmd = VIDIOCSFREQ; break;
-	};
-
-	switch(cmd) {
-	case VIDIOCSTUNER:
-	case VIDIOCGTUNER:
-		err = get_video_tuner32(&karg.vt, up);
-		break;
-
-	case VIDIOCSFBUF:
-		err = get_video_buffer32(&karg.vb, up);
-		break;
-
-	case VIDIOCSFREQ:
-		err = get_user(karg.vx, (u32 __user *)up);
-		break;
-	};
-	if(err)
-		goto out;
+struct compat_video_spu_palette {
+	int length;
+	compat_uptr_t palette;
+};
 
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&karg);
-	set_fs(old_fs);
+static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+	struct compat_video_spu_palette __user *up;
+	struct video_spu_palette __user *up_native;
+	compat_uptr_t palp;
+	int length, err;
 
-	if(err == 0) {
-		switch(cmd) {
-		case VIDIOCGTUNER:
-			err = put_video_tuner32(&karg.vt, up);
-			break;
+	up = (struct compat_video_spu_palette __user *) arg;
+	err = get_user(palp, &up->palette);
+	err |= get_user(length, &up->length);
 
-		case VIDIOCGWIN:
-			err = put_video_window32(&karg.vw, up);
-			break;
+	up_native = compat_alloc_user_space(sizeof(struct video_spu_palette));
+	put_user(compat_ptr(palp), &up_native->palette);
+	put_user(length, &up_native->length);
 
-		case VIDIOCGFBUF:
-			err = put_video_buffer32(&karg.vb, up);
-			break;
+	err = sys_ioctl(fd, cmd, (unsigned long) up_native);
 
-		case VIDIOCGFREQ:
-			err = put_user(((u32)karg.vx), (u32 __user *)up);
-			break;
-		};
-	}
-out:
 	return err;
 }
 
@@ -532,7 +445,8 @@ static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
 
 	ifr = ifc.ifc_req;
 	ifr32 = compat_ptr(ifc32.ifcbuf);
-	for (i = 0, j = 0; i < ifc32.ifc_len && j < ifc.ifc_len;
+	for (i = 0, j = 0;
+	     i + sizeof (struct ifreq32) < ifc32.ifc_len && j < ifc.ifc_len;
 	     i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) {
 		if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32)))
 			return -EFAULT;
@@ -548,10 +462,7 @@ static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
 		i = ((i / sizeof(struct ifreq)) * sizeof(struct ifreq32));
 		ifc32.ifc_len = i;
 	} else {
-		if (i <= ifc32.ifc_len)
-			ifc32.ifc_len = i;
-		else
-			ifc32.ifc_len = i - sizeof (struct ifreq32);
+		ifc32.ifc_len = i;
 	}
 	if (copy_to_user(compat_ptr(arg), &ifc32, sizeof(struct ifconf32)))
 		return -EFAULT;
@@ -1006,6 +917,40 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
+struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
+	char req_state;
+	char orphan;
+	char sg_io_owned;
+	char problem;
+	int pack_id;
+	compat_uptr_t usr_ptr;
+	unsigned int duration;
+	int unused;
+};
+
+static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+	int err, i;
+	sg_req_info_t *r;
+	struct compat_sg_req_info *o = (struct compat_sg_req_info *)arg;
+	r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
+	err = sys_ioctl(fd,cmd,(unsigned long)r);
+	if (err < 0)
+		return err;
+	for (i = 0; i < SG_MAX_QUEUE; i++) {
+		void __user *ptr;
+		int d;
+
+		if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) ||
+		    get_user(ptr, &r[i].usr_ptr) ||
+		    get_user(d, &r[i].duration) ||
+		    put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) ||
+		    put_user(d, &o[i].duration))
+			return -EFAULT;
+	}
+	return err;
+}
+
 struct sock_fprog32 {
 	unsigned short len;
 	compat_caddr_t filter;
@@ -2561,6 +2506,49 @@ static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg
 	return -EINVAL;
 }
 
+#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
+#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t)
+#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
+#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t)
+
+static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
+{
+	mm_segment_t oldfs = get_fs();
+	compat_ulong_t val32;
+	unsigned long kval;
+	int ret;
+
+	switch (cmd) {
+	case RTC_IRQP_READ32:
+	case RTC_EPOCH_READ32:
+		set_fs(KERNEL_DS);
+		ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ?
+					RTC_IRQP_READ : RTC_EPOCH_READ,
+					(unsigned long)&kval);
+		set_fs(oldfs);
+		if (ret)
+			return ret;
+		val32 = kval;
+		return put_user(val32, (unsigned int __user *)arg);
+	case RTC_IRQP_SET32:
+	case RTC_EPOCH_SET32:
+		ret = get_user(val32, (unsigned int __user *)arg);
+		if (ret)
+			return ret;
+		kval = val32;
+
+		set_fs(KERNEL_DS);
+		ret = sys_ioctl(fd, (cmd == RTC_IRQP_SET32) ?
+				RTC_IRQP_SET : RTC_EPOCH_SET,
+				(unsigned long)&kval);
+		set_fs(oldfs);
+		return ret;
+	default:
+		/* unreached */
+		return -ENOIOCTLCMD;
+	}
+}
+
 #if defined(CONFIG_NCP_FS) || defined(CONFIG_NCP_FS_MODULE)
 struct ncp_ioctl_request_32 {
 	u32 function;
@@ -2748,10 +2736,34 @@ static int do_ncp_setprivatedata(unsigned int fd, unsigned int cmd, unsigned lon
 }
 #endif
 
-#undef CODE
-#endif
+static int
+lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+	struct compat_timeval *tc = (struct compat_timeval *)arg;
+	struct timeval *tn = compat_alloc_user_space(sizeof(struct timeval));
+	struct timeval ts;
+	if (get_user(ts.tv_sec, &tc->tv_sec) ||
+	    get_user(ts.tv_usec, &tc->tv_usec) ||
+	    put_user(ts.tv_sec, &tn->tv_sec) ||
+	    put_user(ts.tv_usec, &tn->tv_usec))
+		return -EFAULT;
+	return sys_ioctl(fd, cmd, (unsigned long)tn);
+}
 
-#ifdef DECLARES
+#define HANDLE_IOCTL(cmd,handler) \
+	{ (cmd), (ioctl_trans_handler_t)(handler) },
+
+/* pointer to compatible structure or no argument */
+#define COMPATIBLE_IOCTL(cmd) \
+	{ (cmd), do_ioctl32_pointer },
+
+/* argument is an unsigned long integer, not a pointer */
+#define ULONG_IOCTL(cmd) \
+	{ (cmd), (ioctl_trans_handler_t)sys_ioctl },
+
+
+struct ioctl_trans ioctl_start[] = {
+#include <linux/compat_ioctl.h>
 HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob)
 HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob)
 #ifdef CONFIG_NET
@@ -2831,6 +2843,7 @@ HANDLE_IOCTL(FDPOLLDRVSTAT32, fd_ioctl_trans)
 HANDLE_IOCTL(FDGETFDCSTAT32, fd_ioctl_trans)
 HANDLE_IOCTL(FDWERRORGET32, fd_ioctl_trans)
 HANDLE_IOCTL(SG_IO,sg_ioctl_trans)
+HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans)
 HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans)
 HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans)
 HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans)
@@ -2854,14 +2867,15 @@ HANDLE_IOCTL(EXT2_IOC32_GETFLAGS, do_ext2_ioctl)
 HANDLE_IOCTL(EXT2_IOC32_SETFLAGS, do_ext2_ioctl)
 HANDLE_IOCTL(EXT2_IOC32_GETVERSION, do_ext2_ioctl)
 HANDLE_IOCTL(EXT2_IOC32_SETVERSION, do_ext2_ioctl)
-HANDLE_IOCTL(VIDIOCGTUNER32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCSTUNER32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCGWIN32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCSWIN32, do_set_window)
-HANDLE_IOCTL(VIDIOCGFBUF32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCSFBUF32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCGFREQ32, do_video_ioctl)
-HANDLE_IOCTL(VIDIOCSFREQ32, do_video_ioctl)
+HANDLE_IOCTL(EXT3_IOC32_GETVERSION, do_ext3_ioctl)
+HANDLE_IOCTL(EXT3_IOC32_SETVERSION, do_ext3_ioctl)
+HANDLE_IOCTL(EXT3_IOC32_GETRSVSZ, do_ext3_ioctl)
+HANDLE_IOCTL(EXT3_IOC32_SETRSVSZ, do_ext3_ioctl)
+HANDLE_IOCTL(EXT3_IOC32_GROUP_EXTEND, do_ext3_ioctl)
+COMPATIBLE_IOCTL(EXT3_IOC_GROUP_ADD)
+#ifdef CONFIG_JBD_DEBUG
+HANDLE_IOCTL(EXT3_IOC32_WAIT_FOR_READONLY, do_ext3_ioctl)
+#endif
 /* One SMB ioctl needs translations. */
 #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
 HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid)
@@ -2943,6 +2957,10 @@ HANDLE_IOCTL(SIOCSIWENCODE, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCGIWENCODE, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
 HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
+HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl)
+HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl)
+HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl)
+HANDLE_IOCTL(RTC_EPOCH_SET32, rtc_ioctl)
 
 #if defined(CONFIG_NCP_FS) || defined(CONFIG_NCP_FS_MODULE)
 HANDLE_IOCTL(NCP_IOC_NCPREQUEST_32, do_ncp_ncprequest)
@@ -2954,5 +2972,25 @@ HANDLE_IOCTL(NCP_IOC_GETPRIVATEDATA_32, do_ncp_getprivatedata)
 HANDLE_IOCTL(NCP_IOC_SETPRIVATEDATA_32, do_ncp_setprivatedata)
 #endif
 
-#undef DECLARES
-#endif
+/* dvb */
+HANDLE_IOCTL(DMX_GET_EVENT, do_dmx_get_event)
+HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event)
+HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture)
+HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette)
+
+/* parport */
+COMPATIBLE_IOCTL(LPTIME)
+COMPATIBLE_IOCTL(LPCHAR)
+COMPATIBLE_IOCTL(LPABORTOPEN)
+COMPATIBLE_IOCTL(LPCAREFUL)
+COMPATIBLE_IOCTL(LPWAIT)
+COMPATIBLE_IOCTL(LPSETIRQ)
+COMPATIBLE_IOCTL(LPGETSTATUS)
+COMPATIBLE_IOCTL(LPRESET)
+/* LPGETSTATS not implemented, but no kernels seem to compile it in anyway */
+COMPATIBLE_IOCTL(LPGETFLAGS)
+HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
+};
+
+int ioctl_table_size = ARRAY_SIZE(ioctl_start);
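The handlers this patch adds all reduce to two translation shapes: when only the command number differs between the 32-bit and native ABIs (do_ext3_ioctl), the command is renamed and the user pointer passed straight through compat_ptr(); when the argument struct itself has a different 32-bit layout (the dvb and lp handlers), a native copy is rebuilt with compat_alloc_user_space() and handed to sys_ioctl(). A minimal sketch of the second shape follows; struct foo, FOO_SET/FOO_SET32 and do_foo_ioctl are hypothetical names for illustration, not part of this patch:

	struct compat_foo {
		compat_uptr_t data;	/* 32-bit user pointer */
		compat_int_t len;
	};

	static int do_foo_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
	{
		struct compat_foo __user *up = compat_ptr(arg);
		struct foo __user *up_native;
		compat_uptr_t data;
		compat_int_t len;
		int err;

		/* read the 32-bit layout written by the compat caller */
		err = get_user(data, &up->data);
		err |= get_user(len, &up->len);
		if (err)
			return -EFAULT;

		/* rebuild the native layout in user-accessible scratch space */
		up_native = compat_alloc_user_space(sizeof(*up_native));
		err = put_user(compat_ptr(data), &up_native->data);
		err |= put_user(len, &up_native->len);
		if (err)
			return -EFAULT;

		/* let the driver's normal ioctl path do the real work */
		return sys_ioctl(fd, FOO_SET, (unsigned long)up_native);
	}

Registering such a handler would then be a single HANDLE_IOCTL(FOO_SET32, do_foo_ioctl) entry in the ioctl_start[] table above.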
diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile
new file mode 100644
index 00000000000..00ffb278e98
--- /dev/null
+++ b/fs/configfs/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the configfs virtual filesystem
3#
4
5obj-$(CONFIG_CONFIGFS_FS) += configfs.o
6
7configfs-objs := inode.o file.o dir.o symlink.o mount.o item.o
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
new file mode 100644
index 00000000000..8899d9c5f6b
--- /dev/null
+++ b/fs/configfs/configfs_internal.h
@@ -0,0 +1,142 @@
1/* -*- mode: c; c-basic-offset:8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * configfs_internal.h - Internal stuff for configfs
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */
26
27#include <linux/slab.h>
28#include <linux/list.h>
29
30struct configfs_dirent {
31 atomic_t s_count;
32 struct list_head s_sibling;
33 struct list_head s_children;
34 struct list_head s_links;
35 void * s_element;
36 int s_type;
37 umode_t s_mode;
38 struct dentry * s_dentry;
39};
40
41#define CONFIGFS_ROOT 0x0001
42#define CONFIGFS_DIR 0x0002
43#define CONFIGFS_ITEM_ATTR 0x0004
44#define CONFIGFS_ITEM_LINK 0x0020
45#define CONFIGFS_USET_DIR 0x0040
46#define CONFIGFS_USET_DEFAULT 0x0080
47#define CONFIGFS_USET_DROPPING 0x0100
48#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
49
50extern struct vfsmount * configfs_mount;
51
52extern int configfs_is_root(struct config_item *item);
53
54extern struct inode * configfs_new_inode(mode_t mode);
55extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
56
57extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
58extern int configfs_make_dirent(struct configfs_dirent *,
59 struct dentry *, void *, umode_t, int);
60
61extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
62extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
63
64extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
65extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
66
67extern int configfs_pin_fs(void);
68extern void configfs_release_fs(void);
69
70extern struct rw_semaphore configfs_rename_sem;
71extern struct super_block * configfs_sb;
72extern struct file_operations configfs_dir_operations;
73extern struct file_operations configfs_file_operations;
74extern struct file_operations bin_fops;
75extern struct inode_operations configfs_dir_inode_operations;
76extern struct inode_operations configfs_symlink_inode_operations;
77
78extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
79 const char *symname);
80extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
81
82struct configfs_symlink {
83 struct list_head sl_list;
84 struct config_item *sl_target;
85};
86
87extern int configfs_create_link(struct configfs_symlink *sl,
88 struct dentry *parent,
89 struct dentry *dentry);
90
91static inline struct config_item * to_item(struct dentry * dentry)
92{
93 struct configfs_dirent * sd = dentry->d_fsdata;
94 return ((struct config_item *) sd->s_element);
95}
96
97static inline struct configfs_attribute * to_attr(struct dentry * dentry)
98{
99 struct configfs_dirent * sd = dentry->d_fsdata;
100 return ((struct configfs_attribute *) sd->s_element);
101}
102
103static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
104{
105 struct config_item * item = NULL;
106
107 spin_lock(&dcache_lock);
108 if (!d_unhashed(dentry)) {
109 struct configfs_dirent * sd = dentry->d_fsdata;
110 if (sd->s_type & CONFIGFS_ITEM_LINK) {
111 struct configfs_symlink * sl = sd->s_element;
112 item = config_item_get(sl->sl_target);
113 } else
114 item = config_item_get(sd->s_element);
115 }
116 spin_unlock(&dcache_lock);
117
118 return item;
119}
120
121static inline void release_configfs_dirent(struct configfs_dirent * sd)
122{
123 if (!(sd->s_type & CONFIGFS_ROOT))
124 kfree(sd);
125}
126
127static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
128{
129 if (sd) {
130 WARN_ON(!atomic_read(&sd->s_count));
131 atomic_inc(&sd->s_count);
132 }
133 return sd;
134}
135
136static inline void configfs_put(struct configfs_dirent * sd)
137{
138 WARN_ON(!atomic_read(&sd->s_count));
139 if (atomic_dec_and_test(&sd->s_count))
140 release_configfs_dirent(sd);
141}
142
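The configfs_get()/configfs_put() helpers above carry the dirent lifetime rule used throughout dir.c below: every successful get must be balanced by a put, and the final put frees any non-root dirent. A short sketch of the intended pairing (example_peek_element() is an illustrative name, not code from this patch):

	static void example_peek_element(struct dentry *dentry)
	{
		/* take a reference so s_element cannot go away under us */
		struct configfs_dirent *sd = configfs_get(dentry->d_fsdata);

		/* ... inspect sd->s_type / sd->s_element here ... */

		configfs_put(sd);	/* the last put kfree()s a non-root dirent */
	}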
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
new file mode 100644
index 00000000000..b668ec61527
--- /dev/null
+++ b/fs/configfs/dir.c
@@ -0,0 +1,1102 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dir.c - Operations for configfs directories.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */
26
27#undef DEBUG
28
29#include <linux/fs.h>
30#include <linux/mount.h>
31#include <linux/module.h>
32#include <linux/slab.h>
33
34#include <linux/configfs.h>
35#include "configfs_internal.h"
36
37DECLARE_RWSEM(configfs_rename_sem);
38
39static void configfs_d_iput(struct dentry * dentry,
40 struct inode * inode)
41{
42 struct configfs_dirent * sd = dentry->d_fsdata;
43
44 if (sd) {
45 BUG_ON(sd->s_dentry != dentry);
46 sd->s_dentry = NULL;
47 configfs_put(sd);
48 }
49 iput(inode);
50}
51
52/*
53 * We _must_ delete our dentries on last dput, as the chain-to-parent
54 * behavior is required to clear the parents of default_groups.
55 */
56static int configfs_d_delete(struct dentry *dentry)
57{
58 return 1;
59}
60
61static struct dentry_operations configfs_dentry_ops = {
62 .d_iput = configfs_d_iput,
63 /* simple_delete_dentry() isn't exported */
64 .d_delete = configfs_d_delete,
65};
66
67/*
68 * Allocates a new configfs_dirent and links it to the parent configfs_dirent
69 */
70static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd,
71 void * element)
72{
73 struct configfs_dirent * sd;
74
75 sd = kmalloc(sizeof(*sd), GFP_KERNEL);
76 if (!sd)
77 return NULL;
78
79 memset(sd, 0, sizeof(*sd));
80 atomic_set(&sd->s_count, 1);
81 INIT_LIST_HEAD(&sd->s_links);
82 INIT_LIST_HEAD(&sd->s_children);
83 list_add(&sd->s_sibling, &parent_sd->s_children);
84 sd->s_element = element;
85
86 return sd;
87}
88
89int configfs_make_dirent(struct configfs_dirent * parent_sd,
90 struct dentry * dentry, void * element,
91 umode_t mode, int type)
92{
93 struct configfs_dirent * sd;
94
95 sd = configfs_new_dirent(parent_sd, element);
96 if (!sd)
97 return -ENOMEM;
98
99 sd->s_mode = mode;
100 sd->s_type = type;
101 sd->s_dentry = dentry;
102 if (dentry) {
103 dentry->d_fsdata = configfs_get(sd);
104 dentry->d_op = &configfs_dentry_ops;
105 }
106
107 return 0;
108}
109
110static int init_dir(struct inode * inode)
111{
112 inode->i_op = &configfs_dir_inode_operations;
113 inode->i_fop = &configfs_dir_operations;
114
115 /* directory inodes start off with i_nlink == 2 (for "." entry) */
116 inode->i_nlink++;
117 return 0;
118}
119
120static int init_file(struct inode * inode)
121{
122 inode->i_size = PAGE_SIZE;
123 inode->i_fop = &configfs_file_operations;
124 return 0;
125}
126
127static int init_symlink(struct inode * inode)
128{
129 inode->i_op = &configfs_symlink_inode_operations;
130 return 0;
131}
132
133static int create_dir(struct config_item * k, struct dentry * p,
134 struct dentry * d)
135{
136 int error;
137 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
138
139 error = configfs_create(d, mode, init_dir);
140 if (!error) {
141 error = configfs_make_dirent(p->d_fsdata, d, k, mode,
142 CONFIGFS_DIR);
143 if (!error) {
144 p->d_inode->i_nlink++;
145 (d)->d_op = &configfs_dentry_ops;
146 }
147 }
148 return error;
149}
150
151
152/**
153 * configfs_create_dir - create a directory for a config_item.
154 * @item: config_item we're creating directory for.
155 * @dentry: config_item's dentry.
156 */
157
158static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
159{
160 struct dentry * parent;
161 int error = 0;
162
163 BUG_ON(!item);
164
165 if (item->ci_parent)
166 parent = item->ci_parent->ci_dentry;
167 else if (configfs_mount && configfs_mount->mnt_sb)
168 parent = configfs_mount->mnt_sb->s_root;
169 else
170 return -EFAULT;
171
172 error = create_dir(item,parent,dentry);
173 if (!error)
174 item->ci_dentry = dentry;
175 return error;
176}
177
178int configfs_create_link(struct configfs_symlink *sl,
179 struct dentry *parent,
180 struct dentry *dentry)
181{
182 int err = 0;
183 umode_t mode = S_IFLNK | S_IRWXUGO;
184
185 err = configfs_create(dentry, mode, init_symlink);
186 if (!err) {
187 err = configfs_make_dirent(parent->d_fsdata, dentry, sl,
188 mode, CONFIGFS_ITEM_LINK);
189 if (!err)
190 dentry->d_op = &configfs_dentry_ops;
191 }
192 return err;
193}
194
195static void remove_dir(struct dentry * d)
196{
197 struct dentry * parent = dget(d->d_parent);
198 struct configfs_dirent * sd;
199
200 sd = d->d_fsdata;
201 list_del_init(&sd->s_sibling);
202 configfs_put(sd);
203 if (d->d_inode)
204 simple_rmdir(parent->d_inode,d);
205
206 pr_debug(" o %s removing done (%d)\n",d->d_name.name,
207 atomic_read(&d->d_count));
208
209 dput(parent);
210}
211
212/**
213 * configfs_remove_dir - remove a config_item's directory.
214 * @item: config_item we're removing.
215 *
216 * The only thing special about this is that we remove any files in
217 * the directory before we remove the directory, and we've inlined
218 * what used to be configfs_rmdir() below, instead of calling separately.
219 */
220
221static void configfs_remove_dir(struct config_item * item)
222{
223 struct dentry * dentry = dget(item->ci_dentry);
224
225 if (!dentry)
226 return;
227
228 remove_dir(dentry);
229 /**
230 * Drop reference from dget() on entrance.
231 */
232 dput(dentry);
233}
234
235
236/* attaches attribute's configfs_dirent to the dentry corresponding to the
237 * attribute file
238 */
239static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
240{
241 struct configfs_attribute * attr = sd->s_element;
242 int error;
243
244 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
245 if (error)
246 return error;
247
248 dentry->d_op = &configfs_dentry_ops;
249 dentry->d_fsdata = configfs_get(sd);
250 sd->s_dentry = dentry;
251 d_rehash(dentry);
252
253 return 0;
254}
255
256static struct dentry * configfs_lookup(struct inode *dir,
257 struct dentry *dentry,
258 struct nameidata *nd)
259{
260 struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
261 struct configfs_dirent * sd;
262 int found = 0;
263 int err = 0;
264
265 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
266 if (sd->s_type & CONFIGFS_NOT_PINNED) {
267 const unsigned char * name = configfs_get_name(sd);
268
269 if (strcmp(name, dentry->d_name.name))
270 continue;
271
272 found = 1;
273 err = configfs_attach_attr(sd, dentry);
274 break;
275 }
276 }
277
278 if (!found) {
279 /*
280 * If it doesn't exist and it isn't a NOT_PINNED item,
281 * it must be negative.
282 */
283 return simple_lookup(dir, dentry, nd);
284 }
285
286 return ERR_PTR(err);
287}
288
289/*
290 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
291 * attributes and are removed by rmdir(). We recurse, taking i_mutex
292 * on all children that are candidates for default detach. If the
293 * result is clean, then configfs_detach_group() will handle dropping
294 * i_mutex. If there is an error, the caller will clean up the i_mutex
295 * holders via configfs_detach_rollback().
296 */
297static int configfs_detach_prep(struct dentry *dentry)
298{
299 struct configfs_dirent *parent_sd = dentry->d_fsdata;
300 struct configfs_dirent *sd;
301 int ret;
302
303 ret = -EBUSY;
304 if (!list_empty(&parent_sd->s_links))
305 goto out;
306
307 ret = 0;
308 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
309 if (sd->s_type & CONFIGFS_NOT_PINNED)
310 continue;
311 if (sd->s_type & CONFIGFS_USET_DEFAULT) {
312 mutex_lock(&sd->s_dentry->d_inode->i_mutex);
313 /* Mark that we've taken i_mutex */
314 sd->s_type |= CONFIGFS_USET_DROPPING;
315
316 ret = configfs_detach_prep(sd->s_dentry);
317 if (!ret)
318 continue;
319 } else
320 ret = -ENOTEMPTY;
321
322 break;
323 }
324
325out:
326 return ret;
327}
328
329/*
330 * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is
331 * set.
332 */
333static void configfs_detach_rollback(struct dentry *dentry)
334{
335 struct configfs_dirent *parent_sd = dentry->d_fsdata;
336 struct configfs_dirent *sd;
337
338 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
339 if (sd->s_type & CONFIGFS_USET_DEFAULT) {
340 configfs_detach_rollback(sd->s_dentry);
341
342 if (sd->s_type & CONFIGFS_USET_DROPPING) {
343 sd->s_type &= ~CONFIGFS_USET_DROPPING;
344 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
345 }
346 }
347 }
348}
349
350static void detach_attrs(struct config_item * item)
351{
352 struct dentry * dentry = dget(item->ci_dentry);
353 struct configfs_dirent * parent_sd;
354 struct configfs_dirent * sd, * tmp;
355
356 if (!dentry)
357 return;
358
359 pr_debug("configfs %s: dropping attrs for dir\n",
360 dentry->d_name.name);
361
362 parent_sd = dentry->d_fsdata;
363 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
364 if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
365 continue;
366 list_del_init(&sd->s_sibling);
367 configfs_drop_dentry(sd, dentry);
368 configfs_put(sd);
369 }
370
371 /**
372 * Drop reference from dget() on entrance.
373 */
374 dput(dentry);
375}
376
377static int populate_attrs(struct config_item *item)
378{
379 struct config_item_type *t = item->ci_type;
380 struct configfs_attribute *attr;
381 int error = 0;
382 int i;
383
384 if (!t)
385 return -EINVAL;
386 if (t->ct_attrs) {
387 for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
388 if ((error = configfs_create_file(item, attr)))
389 break;
390 }
391 }
392
393 if (error)
394 detach_attrs(item);
395
396 return error;
397}
398
399static int configfs_attach_group(struct config_item *parent_item,
400 struct config_item *item,
401 struct dentry *dentry);
402static void configfs_detach_group(struct config_item *item);
403
404static void detach_groups(struct config_group *group)
405{
406 struct dentry * dentry = dget(group->cg_item.ci_dentry);
407 struct dentry *child;
408 struct configfs_dirent *parent_sd;
409 struct configfs_dirent *sd, *tmp;
410
411 if (!dentry)
412 return;
413
414 parent_sd = dentry->d_fsdata;
415 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
416 if (!sd->s_element ||
417 !(sd->s_type & CONFIGFS_USET_DEFAULT))
418 continue;
419
420 child = sd->s_dentry;
421
422 configfs_detach_group(sd->s_element);
423 child->d_inode->i_flags |= S_DEAD;
424
425 /*
426 * From rmdir/unregister, a configfs_detach_prep() pass
427 * has taken our i_mutex for us. Drop it.
428 * From mkdir/register cleanup, there is no sem held.
429 */
430 if (sd->s_type & CONFIGFS_USET_DROPPING)
431 mutex_unlock(&child->d_inode->i_mutex);
432
433 d_delete(child);
434 dput(child);
435 }
436
437 /**
438 * Drop reference from dget() on entrance.
439 */
440 dput(dentry);
441}
442
443/*
444 * This fakes mkdir(2) on a default_groups[] entry. It
445 * creates a dentry, attaches it, and then does fixup
446 * on the sd->s_type.
447 *
448 * We could, perhaps, tweak our parent's ->mkdir for a minute and
449 * try using vfs_mkdir. Just a thought.
450 */
451static int create_default_group(struct config_group *parent_group,
452 struct config_group *group)
453{
454 int ret;
455 struct qstr name;
456 struct configfs_dirent *sd;
457 /* We trust the caller holds a reference to parent */
458 struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
459
460 if (!group->cg_item.ci_name)
461 group->cg_item.ci_name = group->cg_item.ci_namebuf;
462 name.name = group->cg_item.ci_name;
463 name.len = strlen(name.name);
464 name.hash = full_name_hash(name.name, name.len);
465
466 ret = -ENOMEM;
467 child = d_alloc(parent, &name);
468 if (child) {
469 d_add(child, NULL);
470
471 ret = configfs_attach_group(&parent_group->cg_item,
472 &group->cg_item, child);
473 if (!ret) {
474 sd = child->d_fsdata;
475 sd->s_type |= CONFIGFS_USET_DEFAULT;
476 } else {
477 d_delete(child);
478 dput(child);
479 }
480 }
481
482 return ret;
483}
484
485static int populate_groups(struct config_group *group)
486{
487 struct config_group *new_group;
488 struct dentry *dentry = group->cg_item.ci_dentry;
489 int ret = 0;
490 int i;
491
492 if (group && group->default_groups) {
493 /* FYI, we're faking mkdir here
494 * I'm not sure we need this semaphore, as we're called
495 * from our parent's mkdir. That holds our parent's
496 * i_mutex, so afaik lookup cannot continue through our
497 * parent to find us, let alone mess with our tree.
498 * That said, taking our i_mutex is closer to mkdir
499 * emulation, and shouldn't hurt. */
500 mutex_lock(&dentry->d_inode->i_mutex);
501
502 for (i = 0; group->default_groups[i]; i++) {
503 new_group = group->default_groups[i];
504
505 ret = create_default_group(group, new_group);
506 if (ret)
507 break;
508 }
509
510 mutex_unlock(&dentry->d_inode->i_mutex);
511 }
512
513 if (ret)
514 detach_groups(group);
515
516 return ret;
517}
518
519/*
520 * All of link_obj/unlink_obj/link_group/unlink_group require that
521 * subsys->su_sem is held.
522 */
523
524static void unlink_obj(struct config_item *item)
525{
526 struct config_group *group;
527
528 group = item->ci_group;
529 if (group) {
530 list_del_init(&item->ci_entry);
531
532 item->ci_group = NULL;
533 item->ci_parent = NULL;
534 config_item_put(item);
535
536 config_group_put(group);
537 }
538}
539
540static void link_obj(struct config_item *parent_item, struct config_item *item)
541{
542 /* Parent seems redundant with group, but it makes certain
543 * traversals much nicer. */
544 item->ci_parent = parent_item;
545 item->ci_group = config_group_get(to_config_group(parent_item));
546 list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
547
548 config_item_get(item);
549}
550
551static void unlink_group(struct config_group *group)
552{
553 int i;
554 struct config_group *new_group;
555
556 if (group->default_groups) {
557 for (i = 0; group->default_groups[i]; i++) {
558 new_group = group->default_groups[i];
559 unlink_group(new_group);
560 }
561 }
562
563 group->cg_subsys = NULL;
564 unlink_obj(&group->cg_item);
565}
566
567static void link_group(struct config_group *parent_group, struct config_group *group)
568{
569 int i;
570 struct config_group *new_group;
571 struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
572
573 link_obj(&parent_group->cg_item, &group->cg_item);
574
575 if (parent_group->cg_subsys)
576 subsys = parent_group->cg_subsys;
577 else if (configfs_is_root(&parent_group->cg_item))
578 subsys = to_configfs_subsystem(group);
579 else
580 BUG();
581 group->cg_subsys = subsys;
582
583 if (group->default_groups) {
584 for (i = 0; group->default_groups[i]; i++) {
585 new_group = group->default_groups[i];
586 link_group(group, new_group);
587 }
588 }
589}
590
591/*
592 * The goal is that configfs_attach_item() (and
593 * configfs_attach_group()) can be called from either the VFS or this
594 * module. That is, they assume that the items have been created,
595 * the dentry allocated, and the dcache is all ready to go.
596 *
597 * If they fail, they must clean up after themselves as if they
598 * had never been called. The caller (VFS or local function) will
599 * handle cleaning up the dcache bits.
600 *
601 * configfs_detach_group() and configfs_detach_item() behave similarly on
602 * the way out. They assume that the proper semaphores are held, they
603 * clean up the configfs items, and they expect their callers will
604 * handle the dcache bits.
605 */
606static int configfs_attach_item(struct config_item *parent_item,
607 struct config_item *item,
608 struct dentry *dentry)
609{
610 int ret;
611
612 ret = configfs_create_dir(item, dentry);
613 if (!ret) {
614 ret = populate_attrs(item);
615 if (ret) {
616 configfs_remove_dir(item);
617 d_delete(dentry);
618 }
619 }
620
621 return ret;
622}
623
624static void configfs_detach_item(struct config_item *item)
625{
626 detach_attrs(item);
627 configfs_remove_dir(item);
628}
629
630static int configfs_attach_group(struct config_item *parent_item,
631 struct config_item *item,
632 struct dentry *dentry)
633{
634 int ret;
635 struct configfs_dirent *sd;
636
637 ret = configfs_attach_item(parent_item, item, dentry);
638 if (!ret) {
639 sd = dentry->d_fsdata;
640 sd->s_type |= CONFIGFS_USET_DIR;
641
642 ret = populate_groups(to_config_group(item));
643 if (ret) {
644 configfs_detach_item(item);
645 d_delete(dentry);
646 }
647 }
648
649 return ret;
650}
651
652static void configfs_detach_group(struct config_item *item)
653{
654 detach_groups(to_config_group(item));
655 configfs_detach_item(item);
656}
657
658/*
659 * Drop the initial reference from make_item()/make_group()
660 * This function assumes that a reference is held on the item
661 * and that the item holds a valid reference to the parent. Also, it
662 * assumes the caller has validated ci_type.
663 */
664static void client_drop_item(struct config_item *parent_item,
665 struct config_item *item)
666{
667 struct config_item_type *type;
668
669 type = parent_item->ci_type;
670 BUG_ON(!type);
671
672 if (type->ct_group_ops && type->ct_group_ops->drop_item)
673 type->ct_group_ops->drop_item(to_config_group(parent_item),
674 item);
675 else
676 config_item_put(item);
677}
678
679
680static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
681{
682 int ret;
683 struct config_group *group;
684 struct config_item *item;
685 struct config_item *parent_item;
686 struct configfs_subsystem *subsys;
687 struct configfs_dirent *sd;
688 struct config_item_type *type;
689 struct module *owner;
690 char *name;
691
692 if (dentry->d_parent == configfs_sb->s_root)
693 return -EPERM;
694
695 sd = dentry->d_parent->d_fsdata;
696 if (!(sd->s_type & CONFIGFS_USET_DIR))
697 return -EPERM;
698
699 parent_item = configfs_get_config_item(dentry->d_parent);
700 type = parent_item->ci_type;
701 subsys = to_config_group(parent_item)->cg_subsys;
702 BUG_ON(!subsys);
703
704 if (!type || !type->ct_group_ops ||
705 (!type->ct_group_ops->make_group &&
706 !type->ct_group_ops->make_item)) {
707 config_item_put(parent_item);
708 return -EPERM; /* What lack-of-mkdir returns */
709 }
710
711 name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
712 if (!name) {
713 config_item_put(parent_item);
714 return -ENOMEM;
715 }
716 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
717
718 down(&subsys->su_sem);
719 group = NULL;
720 item = NULL;
721 if (type->ct_group_ops->make_group) {
722 group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
723 if (group) {
724 link_group(to_config_group(parent_item), group);
725 item = &group->cg_item;
726 }
727 } else {
728 item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
729 if (item)
730 link_obj(parent_item, item);
731 }
732 up(&subsys->su_sem);
733
734 kfree(name);
735 if (!item) {
736 config_item_put(parent_item);
737 return -ENOMEM;
738 }
739
740 ret = -EINVAL;
741 type = item->ci_type;
742 if (type) {
743 owner = type->ct_owner;
744 if (try_module_get(owner)) {
745 if (group) {
746 ret = configfs_attach_group(parent_item,
747 item,
748 dentry);
749 } else {
750 ret = configfs_attach_item(parent_item,
751 item,
752 dentry);
753 }
754
755 if (ret) {
756 down(&subsys->su_sem);
757 if (group)
758 unlink_group(group);
759 else
760 unlink_obj(item);
761 client_drop_item(parent_item, item);
762 up(&subsys->su_sem);
763
764 config_item_put(parent_item);
765 module_put(owner);
766 }
767 }
768 }
769
770 return ret;
771}
772
773static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
774{
775 struct config_item *parent_item;
776 struct config_item *item;
777 struct configfs_subsystem *subsys;
778 struct configfs_dirent *sd;
779 struct module *owner = NULL;
780 int ret;
781
782 if (dentry->d_parent == configfs_sb->s_root)
783 return -EPERM;
784
785 sd = dentry->d_fsdata;
786 if (sd->s_type & CONFIGFS_USET_DEFAULT)
787 return -EPERM;
788
789 parent_item = configfs_get_config_item(dentry->d_parent);
790 subsys = to_config_group(parent_item)->cg_subsys;
791 BUG_ON(!subsys);
792
793 if (!parent_item->ci_type) {
794 config_item_put(parent_item);
795 return -EINVAL;
796 }
797
798 ret = configfs_detach_prep(dentry);
799 if (ret) {
800 configfs_detach_rollback(dentry);
801 config_item_put(parent_item);
802 return ret;
803 }
804
805 item = configfs_get_config_item(dentry);
806
807 /* Drop reference from above, item already holds one. */
808 config_item_put(parent_item);
809
810 if (item->ci_type)
811 owner = item->ci_type->ct_owner;
812
813 if (sd->s_type & CONFIGFS_USET_DIR) {
814 configfs_detach_group(item);
815
816 down(&subsys->su_sem);
817 unlink_group(to_config_group(item));
818 } else {
819 configfs_detach_item(item);
820
821 down(&subsys->su_sem);
822 unlink_obj(item);
823 }
824
825 client_drop_item(parent_item, item);
826 up(&subsys->su_sem);
827
828 /* Drop our reference from above */
829 config_item_put(item);
830
831 module_put(owner);
832
833 return 0;
834}
835
836struct inode_operations configfs_dir_inode_operations = {
837 .mkdir = configfs_mkdir,
838 .rmdir = configfs_rmdir,
839 .symlink = configfs_symlink,
840 .unlink = configfs_unlink,
841 .lookup = configfs_lookup,
842};
843
844#if 0
845int configfs_rename_dir(struct config_item * item, const char *new_name)
846{
847 int error = 0;
848 struct dentry * new_dentry, * parent;
849
850 if (!strcmp(config_item_name(item), new_name))
851 return -EINVAL;
852
853 if (!item->parent)
854 return -EINVAL;
855
856 down_write(&configfs_rename_sem);
857 parent = item->parent->dentry;
858
859 mutex_lock(&parent->d_inode->i_mutex);
860
861 new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
862 if (!IS_ERR(new_dentry)) {
863 if (!new_dentry->d_inode) {
864 error = config_item_set_name(item, "%s", new_name);
865 if (!error) {
866 d_add(new_dentry, NULL);
867 d_move(item->dentry, new_dentry);
868 }
869 else
870 d_delete(new_dentry);
871 } else
872 error = -EEXIST;
873 dput(new_dentry);
874 }
875 mutex_unlock(&parent->d_inode->i_mutex);
876 up_write(&configfs_rename_sem);
877
878 return error;
879}
880#endif
881
882static int configfs_dir_open(struct inode *inode, struct file *file)
883{
884 struct dentry * dentry = file->f_dentry;
885 struct configfs_dirent * parent_sd = dentry->d_fsdata;
886
887 mutex_lock(&dentry->d_inode->i_mutex);
888 file->private_data = configfs_new_dirent(parent_sd, NULL);
889 mutex_unlock(&dentry->d_inode->i_mutex);
890
891 return file->private_data ? 0 : -ENOMEM;
892
893}
894
895static int configfs_dir_close(struct inode *inode, struct file *file)
896{
897 struct dentry * dentry = file->f_dentry;
898 struct configfs_dirent * cursor = file->private_data;
899
900 mutex_lock(&dentry->d_inode->i_mutex);
901 list_del_init(&cursor->s_sibling);
902 mutex_unlock(&dentry->d_inode->i_mutex);
903
904 release_configfs_dirent(cursor);
905
906 return 0;
907}
908
909/* Relationship between s_mode and the DT_xxx types */
910static inline unsigned char dt_type(struct configfs_dirent *sd)
911{
912 return (sd->s_mode >> 12) & 15;
913}
914
915static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
916{
917 struct dentry *dentry = filp->f_dentry;
918 struct configfs_dirent * parent_sd = dentry->d_fsdata;
919 struct configfs_dirent *cursor = filp->private_data;
920 struct list_head *p, *q = &cursor->s_sibling;
921 ino_t ino;
922 int i = filp->f_pos;
923
924 switch (i) {
925 case 0:
926 ino = dentry->d_inode->i_ino;
927 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
928 break;
929 filp->f_pos++;
930 i++;
931 /* fallthrough */
932 case 1:
933 ino = parent_ino(dentry);
934 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
935 break;
936 filp->f_pos++;
937 i++;
938 /* fallthrough */
939 default:
940 if (filp->f_pos == 2) {
941 list_del(q);
942 list_add(q, &parent_sd->s_children);
943 }
944 for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
945 struct configfs_dirent *next;
946 const char * name;
947 int len;
948
949 next = list_entry(p, struct configfs_dirent,
950 s_sibling);
951 if (!next->s_element)
952 continue;
953
954 name = configfs_get_name(next);
955 len = strlen(name);
956 if (next->s_dentry)
957 ino = next->s_dentry->d_inode->i_ino;
958 else
959 ino = iunique(configfs_sb, 2);
960
961 if (filldir(dirent, name, len, filp->f_pos, ino,
962 dt_type(next)) < 0)
963 return 0;
964
965 list_del(q);
966 list_add(q, p);
967 p = q;
968 filp->f_pos++;
969 }
970 }
971 return 0;
972}
973
974static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
975{
976 struct dentry * dentry = file->f_dentry;
977
978 mutex_lock(&dentry->d_inode->i_mutex);
979 switch (origin) {
980 case 1:
981 offset += file->f_pos;
982 case 0:
983 if (offset >= 0)
984 break;
985 default:
986 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
987 return -EINVAL;
988 }
989 if (offset != file->f_pos) {
990 file->f_pos = offset;
991 if (file->f_pos >= 2) {
992 struct configfs_dirent *sd = dentry->d_fsdata;
993 struct configfs_dirent *cursor = file->private_data;
994 struct list_head *p;
995 loff_t n = file->f_pos - 2;
996
997 list_del(&cursor->s_sibling);
998 p = sd->s_children.next;
999 while (n && p != &sd->s_children) {
1000 struct configfs_dirent *next;
1001 next = list_entry(p, struct configfs_dirent,
1002 s_sibling);
1003 if (next->s_element)
1004 n--;
1005 p = p->next;
1006 }
1007 list_add_tail(&cursor->s_sibling, p);
1008 }
1009 }
1010 mutex_unlock(&dentry->d_inode->i_mutex);
1011 return offset;
1012}
1013
1014struct file_operations configfs_dir_operations = {
1015 .open = configfs_dir_open,
1016 .release = configfs_dir_close,
1017 .llseek = configfs_dir_lseek,
1018 .read = generic_read_dir,
1019 .readdir = configfs_readdir,
1020};
1021
1022int configfs_register_subsystem(struct configfs_subsystem *subsys)
1023{
1024 int err;
1025 struct config_group *group = &subsys->su_group;
1026 struct qstr name;
1027 struct dentry *dentry;
1028 struct configfs_dirent *sd;
1029
1030 err = configfs_pin_fs();
1031 if (err)
1032 return err;
1033
1034 if (!group->cg_item.ci_name)
1035 group->cg_item.ci_name = group->cg_item.ci_namebuf;
1036
1037 sd = configfs_sb->s_root->d_fsdata;
1038 link_group(to_config_group(sd->s_element), group);
1039
1040 mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
1041
1042 name.name = group->cg_item.ci_name;
1043 name.len = strlen(name.name);
1044 name.hash = full_name_hash(name.name, name.len);
1045
1046 err = -ENOMEM;
1047 dentry = d_alloc(configfs_sb->s_root, &name);
1048 if (!dentry)
1049 goto out_release;
1050
1051 d_add(dentry, NULL);
1052
1053 err = configfs_attach_group(sd->s_element, &group->cg_item,
1054 dentry);
1055 if (!err)
1056 dentry = NULL;
1057 else
1058 d_delete(dentry);
1059
1060 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
1061
1062 if (dentry) {
1063 dput(dentry);
1064out_release:
1065 unlink_group(group);
1066 configfs_release_fs();
1067 }
1068
1069 return err;
1070}
1071
1072void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
1073{
1074 struct config_group *group = &subsys->su_group;
1075 struct dentry *dentry = group->cg_item.ci_dentry;
1076
1077 if (dentry->d_parent != configfs_sb->s_root) {
1078 printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
1079 return;
1080 }
1081
1082 mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
1083 mutex_lock(&dentry->d_inode->i_mutex);
1084 if (configfs_detach_prep(dentry)) {
1085 printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
1086 }
1087 configfs_detach_group(&group->cg_item);
1088 dentry->d_inode->i_flags |= S_DEAD;
1089 mutex_unlock(&dentry->d_inode->i_mutex);
1090
1091 d_delete(dentry);
1092
1093 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
1094
1095 dput(dentry);
1096
1097 unlink_group(group);
1098 configfs_release_fs();
1099}
1100
1101EXPORT_SYMBOL(configfs_register_subsystem);
1102EXPORT_SYMBOL(configfs_unregister_subsystem);
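configfs_register_subsystem() is the hook a client module uses to expose a group tree; once registered, mkdir(2) inside that tree lands in configfs_mkdir() above, which calls back into the client's make_item()/make_group(). A hedged sketch of a minimal client follows: "myfs", its types, and the omitted attribute and release handling are illustrative assumptions, not code from this patch.

	#include <linux/module.h>
	#include <linux/slab.h>
	#include <linux/configfs.h>
	#include <asm/semaphore.h>

	static struct config_item_type myfs_item_type = {
		.ct_owner	= THIS_MODULE,
	};

	/* called back from configfs_mkdir() when userspace creates a directory */
	static struct config_item *myfs_make_item(struct config_group *group,
						  const char *name)
	{
		struct config_item *item;

		item = kmalloc(sizeof(*item), GFP_KERNEL);
		if (!item)
			return NULL;
		memset(item, 0, sizeof(*item));
		config_item_init_type_name(item, name, &myfs_item_type);
		return item;
	}

	static struct configfs_group_operations myfs_group_ops = {
		.make_item	= myfs_make_item,
	};

	static struct config_item_type myfs_group_type = {
		.ct_group_ops	= &myfs_group_ops,
		.ct_owner	= THIS_MODULE,
	};

	static struct configfs_subsystem myfs_subsys = {
		.su_group = {
			.cg_item = {
				.ci_namebuf	= "myfs",
				.ci_type	= &myfs_group_type,
			},
		},
	};

	static int __init myfs_init(void)
	{
		config_group_init(&myfs_subsys.su_group);
		init_MUTEX(&myfs_subsys.su_sem);
		return configfs_register_subsystem(&myfs_subsys);
	}

	static void __exit myfs_exit(void)
	{
		configfs_unregister_subsystem(&myfs_subsys);
	}

	module_init(myfs_init);
	module_exit(myfs_exit);

Note that su_sem must be initialized by the client, since configfs_mkdir() and configfs_rmdir() take it around the make_item()/drop_item() callbacks.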
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
new file mode 100644
index 00000000000..c26cd61f13a
--- /dev/null
+++ b/fs/configfs/file.c
@@ -0,0 +1,360 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * file.c - operations for regular (text) files.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */
26
27#include <linux/fs.h>
28#include <linux/module.h>
29#include <linux/dnotify.h>
30#include <linux/slab.h>
31#include <asm/uaccess.h>
32#include <asm/semaphore.h>
33
34#include <linux/configfs.h>
35#include "configfs_internal.h"
36
37
38struct configfs_buffer {
39 size_t count;
40 loff_t pos;
41 char * page;
42 struct configfs_item_operations * ops;
43 struct semaphore sem;
44 int needs_read_fill;
45};
46
47
48/**
49 * fill_read_buffer - allocate and fill buffer from item.
50 * @dentry: dentry pointer.
51 * @buffer: data buffer for file.
52 *
53 * Allocate @buffer->page, if it hasn't been already, then call the
54 * config_item's show() method to fill the buffer with this attribute's
55 * data.
56 * This is called only once, on the file's first read.
57 */
58static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
59{
60 struct configfs_attribute * attr = to_attr(dentry);
61 struct config_item * item = to_item(dentry->d_parent);
62 struct configfs_item_operations * ops = buffer->ops;
63 int ret = 0;
64 ssize_t count;
65
66 if (!buffer->page)
67 buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
68 if (!buffer->page)
69 return -ENOMEM;
70
71 count = ops->show_attribute(item,attr,buffer->page);
72 buffer->needs_read_fill = 0;
73 BUG_ON(count > (ssize_t)PAGE_SIZE);
74 if (count >= 0)
75 buffer->count = count;
76 else
77 ret = count;
78 return ret;
79}
80
81
82/**
83 * flush_read_buffer - push buffer to userspace.
84 * @buffer: data buffer for file.
85 * @userbuf: user-passed buffer.
86 * @count: number of bytes requested.
87 * @ppos: file position.
88 *
89 * Copy the buffer we filled in fill_read_buffer() to userspace.
90 * This is done at the reader's leisure, copying and advancing
91 * the amount they specify each time.
92 * This may be called continuously until the buffer is empty.
93 */
94static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
95 size_t count, loff_t * ppos)
96{
97 int error;
98
99 if (*ppos > buffer->count)
100 return 0;
101
102 if (count > (buffer->count - *ppos))
103 count = buffer->count - *ppos;
104
105 error = copy_to_user(buf,buffer->page + *ppos,count);
106 if (!error)
107 *ppos += count;
108 return error ? -EFAULT : count;
109}
110
111/**
112 * configfs_read_file - read an attribute.
113 * @file: file pointer.
114 * @buf: buffer to fill.
115 * @count: number of bytes to read.
116 * @ppos: starting offset in file.
117 *
118 * Userspace wants to read an attribute file. The attribute descriptor
119 * is in the file dentry's ->d_fsdata. The target item is in the parent
120 * directory dentry's ->d_fsdata.
121 *
122 * We call fill_read_buffer() to allocate and fill the buffer from the
123 * item's show() method exactly once (if the read is happening from
124 * the beginning of the file). That should fill the entire buffer with
125 * all the data the item has to offer for that attribute.
126 * We then call flush_read_buffer() to copy the buffer to userspace
127 * in the increments specified.
128 */
129
130static ssize_t
131configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
132{
133 struct configfs_buffer * buffer = file->private_data;
134 ssize_t retval = 0;
135
136 down(&buffer->sem);
137 if (buffer->needs_read_fill) {
138 if ((retval = fill_read_buffer(file->f_dentry,buffer)))
139 goto out;
140 }
141 pr_debug("%s: count = %d, ppos = %lld, buf = %s\n",
142 __FUNCTION__,count,*ppos,buffer->page);
143 retval = flush_read_buffer(buffer,buf,count,ppos);
144out:
145 up(&buffer->sem);
146 return retval;
147}
148
149
150/**
151 * fill_write_buffer - copy buffer from userspace.
152 * @buffer: data buffer for file.
153 * @userbuf: data from user.
154 * @count: number of bytes in @userbuf.
155 *
156 * Allocate @buffer->page if it hasn't been already, then
157 * copy the user-supplied buffer into it.
158 */
159
160static int
161fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
162{
163 int error;
164
165 if (!buffer->page)
166 buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
167 if (!buffer->page)
168 return -ENOMEM;
169
170 if (count > PAGE_SIZE)
171 count = PAGE_SIZE;
172 error = copy_from_user(buffer->page,buf,count);
173 buffer->needs_read_fill = 1;
174 return error ? -EFAULT : count;
175}
176
177
178/**
179 * flush_write_buffer - push buffer to config_item.
180 * @file: file pointer.
181 * @buffer: data buffer for file.
182 *
183 * Get the correct pointers for the config_item and the attribute we're
184 * dealing with, then call the store() method for the attribute,
185 * passing the buffer that we acquired in fill_write_buffer().
186 */
187
188static int
189flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
190{
191 struct configfs_attribute * attr = to_attr(dentry);
192 struct config_item * item = to_item(dentry->d_parent);
193 struct configfs_item_operations * ops = buffer->ops;
194
195 return ops->store_attribute(item,attr,buffer->page,count);
196}
197
198
199/**
200 * configfs_write_file - write an attribute.
201 * @file: file pointer
202 * @buf: data to write
203 * @count: number of bytes
204 * @ppos: starting offset
205 *
206 * Similar to configfs_read_file(), though working in the opposite direction.
207 * We allocate and fill the data from the user in fill_write_buffer(),
208 * then push it to the config_item in flush_write_buffer().
209 * There is no easy way for us to know if userspace is only doing a partial
210 * write, so we don't support partial writes. We expect the entire buffer
211 * to arrive on the first write.
212 * Hint: if you're writing a value, first read the file, modify only
213 * the value you're changing, then write the entire buffer back.
214 */
215
216static ssize_t
217configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
218{
219 struct configfs_buffer * buffer = file->private_data;
220
221 down(&buffer->sem);
222 count = fill_write_buffer(buffer,buf,count);
223 if (count > 0)
224 count = flush_write_buffer(file->f_dentry,buffer,count);
225 if (count > 0)
226 *ppos += count;
227 up(&buffer->sem);
228 return count;
229}
230
231static int check_perm(struct inode * inode, struct file * file)
232{
233 struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent);
234 struct configfs_attribute * attr = to_attr(file->f_dentry);
235 struct configfs_buffer * buffer;
236 struct configfs_item_operations * ops = NULL;
237 int error = 0;
238
239 if (!item || !attr)
240 goto Einval;
241
242 /* Grab the module reference for this attribute if we have one */
243 if (!try_module_get(attr->ca_owner)) {
244 error = -ENODEV;
245 goto Done;
246 }
247
248 if (item->ci_type)
249 ops = item->ci_type->ct_item_ops;
250 else
251 goto Eaccess;
252
253 /* File needs write support.
254 * The inode's perms must say it's ok,
255 * and we must have a store method.
256 */
257 if (file->f_mode & FMODE_WRITE) {
258
259 if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
260 goto Eaccess;
261
262 }
263
264 /* File needs read support.
265 * The inode's perms must say it's ok, and there
266 * must be a show method for it.
267 */
268 if (file->f_mode & FMODE_READ) {
269 if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
270 goto Eaccess;
271 }
272
273 /* No error? Great, allocate a buffer for the file, and store
274 * it in file->private_data for easy access.
275 */
276 buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
277 if (buffer) {
278 memset(buffer,0,sizeof(struct configfs_buffer));
279 init_MUTEX(&buffer->sem);
280 buffer->needs_read_fill = 1;
281 buffer->ops = ops;
282 file->private_data = buffer;
283 } else
284 error = -ENOMEM;
285 goto Done;
286
287 Einval:
288 error = -EINVAL;
289 goto Done;
290 Eaccess:
291 error = -EACCES;
292 module_put(attr->ca_owner);
293 Done:
294 if (error && item)
295 config_item_put(item);
296 return error;
297}
298
299static int configfs_open_file(struct inode * inode, struct file * filp)
300{
301 return check_perm(inode,filp);
302}
303
304static int configfs_release(struct inode * inode, struct file * filp)
305{
306 struct config_item * item = to_item(filp->f_dentry->d_parent);
307 struct configfs_attribute * attr = to_attr(filp->f_dentry);
308 struct module * owner = attr->ca_owner;
309 struct configfs_buffer * buffer = filp->private_data;
310
311 if (item)
312 config_item_put(item);
313 /* After this point, attr should not be accessed. */
314 module_put(owner);
315
316 if (buffer) {
317 if (buffer->page)
318 free_page((unsigned long)buffer->page);
319 kfree(buffer);
320 }
321 return 0;
322}
323
324struct file_operations configfs_file_operations = {
325 .read = configfs_read_file,
326 .write = configfs_write_file,
327 .llseek = generic_file_llseek,
328 .open = configfs_open_file,
329 .release = configfs_release,
330};
331
332
333int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type)
334{
335 struct configfs_dirent * parent_sd = dir->d_fsdata;
336 umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
337 int error = 0;
338
339 mutex_lock(&dir->d_inode->i_mutex);
340 error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
341 mutex_unlock(&dir->d_inode->i_mutex);
342
343 return error;
344}
345
346
347/**
348 * configfs_create_file - create an attribute file for an item.
349 * @item: item we're creating for.
350 * @attr: attribute descriptor.
351 */
352
353int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
354{
355 BUG_ON(!item || !item->ci_dentry || !attr);
356
357 return configfs_add_file(item->ci_dentry, attr,
358 CONFIGFS_ITEM_ATTR);
359}
360
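To make the show/store plumbing above concrete, here is a sketch of the client side that fill_read_buffer() and flush_write_buffer() dispatch into. Only the structure layouts and call signatures come from this file; the example_* names and the integer attribute are assumptions:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stat.h>
#include <linux/configfs.h>

/* Hypothetical item exposing one integer attribute, "value". */
struct example_item {
	struct config_item item;
	int value;
};

static inline struct example_item *to_example(struct config_item *item)
{
	return container_of(item, struct example_item, item);
}

static struct configfs_attribute example_attr_value = {
	.ca_name  = "value",
	.ca_owner = THIS_MODULE,
	.ca_mode  = S_IRUGO | S_IWUSR,
};

static ssize_t example_show(struct config_item *item,
			    struct configfs_attribute *attr, char *page)
{
	/* fill_read_buffer() hands us a single zeroed page to fill */
	return sprintf(page, "%d\n", to_example(item)->value);
}

static ssize_t example_store(struct config_item *item,
			     struct configfs_attribute *attr,
			     const char *page, size_t count)
{
	/* flush_write_buffer() delivers the whole write in one call */
	to_example(item)->value = simple_strtol(page, NULL, 10);
	return count;
}

static struct configfs_item_operations example_item_ops = {
	.show_attribute  = example_show,
	.store_attribute = example_store,
};

/* Both would be referenced from a config_item_type's ct_item_ops and
 * ct_attrs so that check_perm() and configfs_add_file() find them. */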
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
new file mode 100644
index 00000000000..6577c588de9
--- /dev/null
+++ b/fs/configfs/inode.c
@@ -0,0 +1,162 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * inode.c - basic inode and dentry operations.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 *
26 * Please see Documentation/filesystems/configfs.txt for more information.
27 */
28
29#undef DEBUG
30
31#include <linux/pagemap.h>
32#include <linux/namei.h>
33#include <linux/backing-dev.h>
34
35#include <linux/configfs.h>
36#include "configfs_internal.h"
37
38extern struct super_block * configfs_sb;
39
40static struct address_space_operations configfs_aops = {
41 .readpage = simple_readpage,
42 .prepare_write = simple_prepare_write,
43 .commit_write = simple_commit_write
44};
45
46static struct backing_dev_info configfs_backing_dev_info = {
47 .ra_pages = 0, /* No readahead */
48 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
49};
50
51struct inode * configfs_new_inode(mode_t mode)
52{
53 struct inode * inode = new_inode(configfs_sb);
54 if (inode) {
55 inode->i_mode = mode;
56 inode->i_uid = 0;
57 inode->i_gid = 0;
58 inode->i_blksize = PAGE_CACHE_SIZE;
59 inode->i_blocks = 0;
60 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
61 inode->i_mapping->a_ops = &configfs_aops;
62 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
63 }
64 return inode;
65}
66
67int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
68{
69 int error = 0;
70 struct inode * inode = NULL;
71 if (dentry) {
72 if (!dentry->d_inode) {
73 if ((inode = configfs_new_inode(mode))) {
74 if (dentry->d_parent && dentry->d_parent->d_inode) {
75 struct inode *p_inode = dentry->d_parent->d_inode;
76 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
77 }
78 goto Proceed;
79 }
80 else
81 error = -ENOMEM;
82 } else
83 error = -EEXIST;
84 } else
85 error = -ENOENT;
86 goto Done;
87
88 Proceed:
89 if (init)
90 error = init(inode);
91 if (!error) {
92 d_instantiate(dentry, inode);
93 if (S_ISDIR(mode) || S_ISLNK(mode))
94 dget(dentry); /* pin link and directory dentries in core */
95 } else
96 iput(inode);
97 Done:
98 return error;
99}
100
101/*
102 * Get the name of the element represented by the given configfs_dirent
103 */
104const unsigned char * configfs_get_name(struct configfs_dirent *sd)
105{
106 struct attribute * attr;
107
108 if (!sd || !sd->s_element)
109 BUG();
110
111 /* These always have a dentry, so use that */
112 if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
113 return sd->s_dentry->d_name.name;
114
115 if (sd->s_type & CONFIGFS_ITEM_ATTR) {
116 attr = sd->s_element;
117 return attr->name;
118 }
119 return NULL;
120}
121
122
123/*
124 * Unhashes the dentry corresponding to the given configfs_dirent.
125 * Called with parent inode's i_mutex held.
126 */
127void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
128{
129 struct dentry * dentry = sd->s_dentry;
130
131 if (dentry) {
132 spin_lock(&dcache_lock);
133 if (!(d_unhashed(dentry) && dentry->d_inode)) {
134 dget_locked(dentry);
135 __d_drop(dentry);
136 spin_unlock(&dcache_lock);
137 simple_unlink(parent->d_inode, dentry);
138 } else
139 spin_unlock(&dcache_lock);
140 }
141}
142
143void configfs_hash_and_remove(struct dentry * dir, const char * name)
144{
145 struct configfs_dirent * sd;
146 struct configfs_dirent * parent_sd = dir->d_fsdata;
147
148 mutex_lock(&dir->d_inode->i_mutex);
149 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
150 if (!sd->s_element)
151 continue;
152 if (!strcmp(configfs_get_name(sd), name)) {
153 list_del_init(&sd->s_sibling);
154 configfs_drop_dentry(sd, dir);
155 configfs_put(sd);
156 break;
157 }
158 }
159 mutex_unlock(&dir->d_inode->i_mutex);
160}
161
162
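The init callback that configfs_create() takes is supplied by its callers in dir.c, which this excerpt does not show; a hedged sketch of the shape such a callback has:

/* Hypothetical init callback; the real file/dir/symlink variants live
 * in fs/configfs/dir.c and are not part of this excerpt. */
static int example_init_file(struct inode *inode)
{
	inode->i_size = PAGE_SIZE;                 /* attributes fit in one page */
	inode->i_fop  = &configfs_file_operations; /* defined in file.c above */
	return 0;
}

It would be driven as configfs_create(dentry, mode, example_init_file); on success the dentry is instantiated with the new inode, and directory and symlink dentries are additionally pinned with dget().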
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
new file mode 100644
index 00000000000..e07485ac50a
--- /dev/null
+++ b/fs/configfs/item.c
@@ -0,0 +1,227 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * item.c - library routines for handling generic config items
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 * Based on kobject:
22 * kobject is Copyright (c) 2002-2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 *
26 * Please see the file Documentation/filesystems/configfs.txt for
27 * critical information about using the config_item interface.
28 */
29
30#include <linux/string.h>
31#include <linux/module.h>
32#include <linux/stat.h>
33#include <linux/slab.h>
34
35#include <linux/configfs.h>
36
37
38static inline struct config_item * to_item(struct list_head * entry)
39{
40 return container_of(entry,struct config_item,ci_entry);
41}
42
43/* Evil kernel */
44static void config_item_release(struct kref *kref);
45
46/**
47 * config_item_init - initialize item.
48 * @item: item in question.
49 */
50void config_item_init(struct config_item * item)
51{
52 kref_init(&item->ci_kref);
53 INIT_LIST_HEAD(&item->ci_entry);
54}
55
56/**
57 * config_item_set_name - Set the name of an item
58 * @item: item.
59 * @name: name.
60 *
61 * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
62 * dynamically allocated string that @item->ci_name points to.
63 * Otherwise, use the static @item->ci_namebuf array.
64 */
65
66int config_item_set_name(struct config_item * item, const char * fmt, ...)
67{
68 int error = 0;
69 int limit = CONFIGFS_ITEM_NAME_LEN;
70 int need;
71 va_list args;
72 char * name;
73
74 /*
75 * First, try the static array
76 */
77 va_start(args,fmt);
78 need = vsnprintf(item->ci_namebuf,limit,fmt,args);
79 va_end(args);
80 if (need < limit)
81 name = item->ci_namebuf;
82 else {
83 /*
84 * Need more space? Allocate it and try again
85 */
86 limit = need + 1;
87 name = kmalloc(limit,GFP_KERNEL);
88 if (!name) {
89 error = -ENOMEM;
90 goto Done;
91 }
92 va_start(args,fmt);
93 need = vsnprintf(name,limit,fmt,args);
94 va_end(args);
95
96 /* Still? Give up. */
97 if (need >= limit) {
98 kfree(name);
99 error = -EFAULT;
100 goto Done;
101 }
102 }
103
104 /* Free the old name, if necessary. */
105 if (item->ci_name && item->ci_name != item->ci_namebuf)
106 kfree(item->ci_name);
107
108 /* Now, set the new name */
109 item->ci_name = name;
110 Done:
111 return error;
112}
113
114EXPORT_SYMBOL(config_item_set_name);
115
116void config_item_init_type_name(struct config_item *item,
117 const char *name,
118 struct config_item_type *type)
119{
120 config_item_set_name(item, name);
121 item->ci_type = type;
122 config_item_init(item);
123}
124EXPORT_SYMBOL(config_item_init_type_name);
125
126void config_group_init_type_name(struct config_group *group, const char *name,
127 struct config_item_type *type)
128{
129 config_item_set_name(&group->cg_item, name);
130 group->cg_item.ci_type = type;
131 config_group_init(group);
132}
133EXPORT_SYMBOL(config_group_init_type_name);
134
135struct config_item * config_item_get(struct config_item * item)
136{
137 if (item)
138 kref_get(&item->ci_kref);
139 return item;
140}
141
142/**
143 * config_item_cleanup - free config_item resources.
144 * @item: item.
145 */
146
147void config_item_cleanup(struct config_item * item)
148{
149 struct config_item_type * t = item->ci_type;
150 struct config_group * s = item->ci_group;
151 struct config_item * parent = item->ci_parent;
152
153 pr_debug("config_item %s: cleaning up\n",config_item_name(item));
154 if (item->ci_name != item->ci_namebuf)
155 kfree(item->ci_name);
156 item->ci_name = NULL;
157 if (t && t->ct_item_ops && t->ct_item_ops->release)
158 t->ct_item_ops->release(item);
159 if (s)
160 config_group_put(s);
161 if (parent)
162 config_item_put(parent);
163}
164
165static void config_item_release(struct kref *kref)
166{
167 config_item_cleanup(container_of(kref, struct config_item, ci_kref));
168}
169
170/**
171 * config_item_put - decrement refcount for item.
172 * @item: item.
173 *
174 * Decrement the refcount, and if 0, call config_item_cleanup().
175 */
176void config_item_put(struct config_item * item)
177{
178 if (item)
179 kref_put(&item->ci_kref, config_item_release);
180}
181
182
183/**
184 * config_group_init - initialize a group for use
185 * @group: group to initialize.
186 */
187
188void config_group_init(struct config_group *group)
189{
190 config_item_init(&group->cg_item);
191 INIT_LIST_HEAD(&group->cg_children);
192}
193
194
195/**
196 * config_group_find_obj - search for item in group.
197 * @group: group we're looking in.
198 * @name: item's name.
199 *
200 * Iterate over @group->cg_children (locking, via @group->cg_subsys,
201 * is the caller's responsibility), looking for a matching config_item.
202 * If a matching item is found, take a reference and return the item.
203 */
204
205struct config_item * config_group_find_obj(struct config_group * group, const char * name)
206{
207 struct list_head * entry;
208 struct config_item * ret = NULL;
209
210 /* XXX LOCKING! */
211 list_for_each(entry,&group->cg_children) {
212 struct config_item * item = to_item(entry);
213 if (config_item_name(item) &&
214 !strcmp(config_item_name(item), name)) {
215 ret = config_item_get(item);
216 break;
217 }
218 }
219 return ret;
220}
221
222
223EXPORT_SYMBOL(config_item_init);
224EXPORT_SYMBOL(config_group_init);
225EXPORT_SYMBOL(config_item_get);
226EXPORT_SYMBOL(config_item_put);
227
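A sketch of the embedding pattern these helpers assume (the widget names are illustrative): a client wraps config_item in its own structure, initializes it with config_item_init_type_name(), and recovers the container with container_of():

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/configfs.h>

struct widget {
	struct config_item item;
	int threshold;
};

static inline struct widget *to_widget(struct config_item *item)
{
	return container_of(item, struct widget, item);
}

static struct config_item_type widget_type = {
	.ct_owner = THIS_MODULE,
	/* a real client would also set .ct_item_ops and .ct_attrs */
};

static void widget_setup(struct widget *w, const char *name)
{
	memset(w, 0, sizeof(*w));
	/* sets the name, the type and the kref (to 1) in one go */
	config_item_init_type_name(&w->item, name, &widget_type);
}

Lifetime afterwards is purely reference counted: config_item_get(&w->item) and config_item_put(&w->item) bracket each user, and the final put ends up in widget_type's release method, if one is set.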
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
new file mode 100644
index 00000000000..1a2f6f6a4d9
--- /dev/null
+++ b/fs/configfs/mount.c
@@ -0,0 +1,159 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * mount.c - operations for initializing and mounting configfs.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */
26
27#include <linux/fs.h>
28#include <linux/module.h>
29#include <linux/mount.h>
30#include <linux/pagemap.h>
31#include <linux/init.h>
32
33#include <linux/configfs.h>
34#include "configfs_internal.h"
35
36/* Random magic number */
37#define CONFIGFS_MAGIC 0x62656570
38
39struct vfsmount * configfs_mount = NULL;
40struct super_block * configfs_sb = NULL;
41static int configfs_mnt_count = 0;
42
43static struct super_operations configfs_ops = {
44 .statfs = simple_statfs,
45 .drop_inode = generic_delete_inode,
46};
47
48static struct config_group configfs_root_group = {
49 .cg_item = {
50 .ci_namebuf = "root",
51 .ci_name = configfs_root_group.cg_item.ci_namebuf,
52 },
53};
54
55int configfs_is_root(struct config_item *item)
56{
57 return item == &configfs_root_group.cg_item;
58}
59
60static struct configfs_dirent configfs_root = {
61 .s_sibling = LIST_HEAD_INIT(configfs_root.s_sibling),
62 .s_children = LIST_HEAD_INIT(configfs_root.s_children),
63 .s_element = &configfs_root_group.cg_item,
64 .s_type = CONFIGFS_ROOT,
65};
66
67static int configfs_fill_super(struct super_block *sb, void *data, int silent)
68{
69 struct inode *inode;
70 struct dentry *root;
71
72 sb->s_blocksize = PAGE_CACHE_SIZE;
73 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
74 sb->s_magic = CONFIGFS_MAGIC;
75 sb->s_op = &configfs_ops;
76 configfs_sb = sb;
77
78 inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
79 if (inode) {
80 inode->i_op = &configfs_dir_inode_operations;
81 inode->i_fop = &configfs_dir_operations;
82 /* directory inodes start off with i_nlink == 2 (for "." entry) */
83 inode->i_nlink++;
84 } else {
85 pr_debug("configfs: could not get root inode\n");
86 return -ENOMEM;
87 }
88
89 root = d_alloc_root(inode);
90 if (!root) {
91 pr_debug("%s: could not get root dentry!\n",__FUNCTION__);
92 iput(inode);
93 return -ENOMEM;
94 }
95 config_group_init(&configfs_root_group);
96 configfs_root_group.cg_item.ci_dentry = root;
97 root->d_fsdata = &configfs_root;
98 sb->s_root = root;
99 return 0;
100}
101
102static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
103 int flags, const char *dev_name, void *data)
104{
105 return get_sb_single(fs_type, flags, data, configfs_fill_super);
106}
107
108static struct file_system_type configfs_fs_type = {
109 .owner = THIS_MODULE,
110 .name = "configfs",
111 .get_sb = configfs_get_sb,
112 .kill_sb = kill_litter_super,
113};
114
115int configfs_pin_fs(void)
116{
117 return simple_pin_fs("configfs", &configfs_mount,
118 &configfs_mnt_count);
119}
120
121void configfs_release_fs(void)
122{
123 simple_release_fs(&configfs_mount, &configfs_mnt_count);
124}
125
126
127static decl_subsys(config, NULL, NULL);
128
129static int __init configfs_init(void)
130{
131 int err;
132
133 kset_set_kset_s(&config_subsys, kernel_subsys);
134 err = subsystem_register(&config_subsys);
135 if (err)
136 return err;
137
138 err = register_filesystem(&configfs_fs_type);
139 if (err) {
140 printk(KERN_ERR "configfs: Unable to register filesystem!\n");
141 subsystem_unregister(&config_subsys);
142 }
143
144 return err;
145}
146
147static void __exit configfs_exit(void)
148{
149 unregister_filesystem(&configfs_fs_type);
150 subsystem_unregister(&config_subsys);
151}
152
153MODULE_AUTHOR("Oracle");
154MODULE_LICENSE("GPL");
155MODULE_VERSION("0.0.1");
156MODULE_DESCRIPTION("Simple RAM filesystem for user-driven kernel subsystem configuration.");
157
158module_init(configfs_init);
159module_exit(configfs_exit);
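From userspace the filesystem registered above is reached with an ordinary mount; the /config mount point is conventional rather than mandated by the code. A minimal sketch:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* equivalent to: mount -t configfs none /config */
	if (mount("none", "/config", "configfs", 0, NULL) != 0) {
		perror("mount configfs");
		return 1;
	}
	return 0;
}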
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
new file mode 100644
index 00000000000..50f5840521a
--- /dev/null
+++ b/fs/configfs/symlink.c
@@ -0,0 +1,281 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * symlink.c - operations for configfs symlinks.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */
26
27#include <linux/fs.h>
28#include <linux/module.h>
29#include <linux/namei.h>
30
31#include <linux/configfs.h>
32#include "configfs_internal.h"
33
34static int item_depth(struct config_item * item)
35{
36 struct config_item * p = item;
37 int depth = 0;
38 do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p));
39 return depth;
40}
41
42static int item_path_length(struct config_item * item)
43{
44 struct config_item * p = item;
45 int length = 1;
46 do {
47 length += strlen(config_item_name(p)) + 1;
48 p = p->ci_parent;
49 } while (p && !configfs_is_root(p));
50 return length;
51}
52
53static void fill_item_path(struct config_item * item, char * buffer, int length)
54{
55 struct config_item * p;
56
57 --length;
58 for (p = item; p && !configfs_is_root(p); p = p->ci_parent) {
59 int cur = strlen(config_item_name(p));
60
61 /* back up enough to print this item's name with '/' */
62 length -= cur;
63 strncpy(buffer + length,config_item_name(p),cur);
64 *(buffer + --length) = '/';
65 }
66}
67
68static int create_link(struct config_item *parent_item,
69 struct config_item *item,
70 struct dentry *dentry)
71{
72 struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
73 struct configfs_symlink *sl;
74 int ret;
75
76 ret = -ENOMEM;
77 sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
78 if (sl) {
79 sl->sl_target = config_item_get(item);
80 /* FIXME: needs a lock, I'd bet */
81 list_add(&sl->sl_list, &target_sd->s_links);
82 ret = configfs_create_link(sl, parent_item->ci_dentry,
83 dentry);
84 if (ret) {
85 list_del_init(&sl->sl_list);
86 config_item_put(item);
87 kfree(sl);
88 }
89 }
90
91 return ret;
92}
93
94
95static int get_target(const char *symname, struct nameidata *nd,
96 struct config_item **target)
97{
98 int ret;
99
100 ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
101 if (!ret) {
102 if (nd->dentry->d_sb == configfs_sb) {
103 *target = configfs_get_config_item(nd->dentry);
104 if (!*target) {
105 ret = -ENOENT;
106 path_release(nd);
107 }
108 } else
109 ret = -EPERM;
110 }
111
112 return ret;
113}
114
115
116int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
117{
118 int ret;
119 struct nameidata nd;
120 struct config_item *parent_item;
121 struct config_item *target_item;
122 struct config_item_type *type;
123
124 ret = -EPERM; /* What lack-of-symlink returns */
125 if (dentry->d_parent == configfs_sb->s_root)
126 goto out;
127
128 parent_item = configfs_get_config_item(dentry->d_parent);
129 type = parent_item->ci_type;
130
131 if (!type || !type->ct_item_ops ||
132 !type->ct_item_ops->allow_link)
133 goto out_put;
134
135 ret = get_target(symname, &nd, &target_item);
136 if (ret)
137 goto out_put;
138
139 ret = type->ct_item_ops->allow_link(parent_item, target_item);
140 if (!ret)
141 ret = create_link(parent_item, target_item, dentry);
142
143 config_item_put(target_item);
144 path_release(&nd);
145
146out_put:
147 config_item_put(parent_item);
148
149out:
150 return ret;
151}
152
153int configfs_unlink(struct inode *dir, struct dentry *dentry)
154{
155 struct configfs_dirent *sd = dentry->d_fsdata;
156 struct configfs_symlink *sl;
157 struct config_item *parent_item;
158 struct config_item_type *type;
159 int ret;
160
161 ret = -EPERM; /* What lack-of-symlink returns */
162 if (!(sd->s_type & CONFIGFS_ITEM_LINK))
163 goto out;
164
165 if (dentry->d_parent == configfs_sb->s_root)
166 BUG();
167
168 sl = sd->s_element;
169
170 parent_item = configfs_get_config_item(dentry->d_parent);
171 type = parent_item->ci_type;
172
173 list_del_init(&sd->s_sibling);
174 configfs_drop_dentry(sd, dentry->d_parent);
175 dput(dentry);
176 configfs_put(sd);
177
178 /*
179 * drop_link() must be called before
180 * list_del_init(&sl->sl_list), so that the order of
181 * drop_link(this, target) and drop_item(target) is preserved.
182 */
183 if (type && type->ct_item_ops &&
184 type->ct_item_ops->drop_link)
185 type->ct_item_ops->drop_link(parent_item,
186 sl->sl_target);
187
188 /* FIXME: Needs lock */
189 list_del_init(&sl->sl_list);
190
191 /* Put reference from create_link() */
192 config_item_put(sl->sl_target);
193 kfree(sl);
194
195 config_item_put(parent_item);
196
197 ret = 0;
198
199out:
200 return ret;
201}
202
203static int configfs_get_target_path(struct config_item * item, struct config_item * target,
204 char *path)
205{
206 char * s;
207 int depth, size;
208
209 depth = item_depth(item);
210 size = item_path_length(target) + depth * 3 - 1;
211 if (size > PATH_MAX)
212 return -ENAMETOOLONG;
213
214 pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
215
216 for (s = path; depth--; s += 3)
217 strcpy(s,"../");
218
219 fill_item_path(target, path, size);
220 pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
221
222 return 0;
223}
224
225static int configfs_getlink(struct dentry *dentry, char * path)
226{
227 struct config_item *item, *target_item;
228 int error = 0;
229
230 item = configfs_get_config_item(dentry->d_parent);
231 if (!item)
232 return -EINVAL;
233
234 target_item = configfs_get_config_item(dentry);
235 if (!target_item) {
236 config_item_put(item);
237 return -EINVAL;
238 }
239
240 down_read(&configfs_rename_sem);
241 error = configfs_get_target_path(item, target_item, path);
242 up_read(&configfs_rename_sem);
243
244 config_item_put(item);
245 config_item_put(target_item);
246 return error;
247
248}
249
250static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
251{
252 int error = -ENOMEM;
253 unsigned long page = get_zeroed_page(GFP_KERNEL);
254
255 if (page) {
256 error = configfs_getlink(dentry, (char *)page);
257 if (!error) {
258 nd_set_link(nd, (char *)page);
259 return (void *)page;
260 }
261 }
262
263 nd_set_link(nd, ERR_PTR(error));
264 return NULL;
265}
266
267static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
268 void *cookie)
269{
270 if (cookie) {
271 unsigned long page = (unsigned long)cookie;
272 free_page(page);
273 }
274}
275
276struct inode_operations configfs_symlink_inode_operations = {
277 .follow_link = configfs_follow_link,
278 .readlink = generic_readlink,
279 .put_link = configfs_put_link,
280};
281
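Seen from userspace, configfs_symlink() and configfs_unlink() above service ordinary symlink(2) and unlink(2) calls between items; a sketch with illustrative paths:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Both paths must live on configfs, and the parent item's type
	 * must provide ->allow_link(), otherwise this fails with EPERM. */
	if (symlink("/config/example/target", "/config/example/parent/link") != 0) {
		perror("symlink");
		return 1;
	}

	/* Removal runs configfs_unlink(), which calls ->drop_link(). */
	if (unlink("/config/example/parent/link") != 0) {
		perror("unlink");
		return 1;
	}
	return 0;
}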
diff --git a/fs/dcache.c b/fs/dcache.c
index 17e43913868..86bdb93789c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -71,7 +71,7 @@ struct dentry_stat_t dentry_stat = {
71 71
72static void d_callback(struct rcu_head *head) 72static void d_callback(struct rcu_head *head)
73{ 73{
74 struct dentry * dentry = container_of(head, struct dentry, d_rcu); 74 struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
75 75
76 if (dname_external(dentry)) 76 if (dname_external(dentry))
77 kfree(dentry->d_name.name); 77 kfree(dentry->d_name.name);
@@ -86,7 +86,7 @@ static void d_free(struct dentry *dentry)
86{ 86{
87 if (dentry->d_op && dentry->d_op->d_release) 87 if (dentry->d_op && dentry->d_op->d_release)
88 dentry->d_op->d_release(dentry); 88 dentry->d_op->d_release(dentry);
89 call_rcu(&dentry->d_rcu, d_callback); 89 call_rcu(&dentry->d_u.d_rcu, d_callback);
90} 90}
91 91
92/* 92/*
@@ -94,7 +94,7 @@ static void d_free(struct dentry *dentry)
94 * d_iput() operation if defined. 94 * d_iput() operation if defined.
95 * Called with dcache_lock and per dentry lock held, drops both. 95 * Called with dcache_lock and per dentry lock held, drops both.
96 */ 96 */
97static inline void dentry_iput(struct dentry * dentry) 97static void dentry_iput(struct dentry * dentry)
98{ 98{
99 struct inode *inode = dentry->d_inode; 99 struct inode *inode = dentry->d_inode;
100 if (inode) { 100 if (inode) {
@@ -193,7 +193,7 @@ kill_it: {
193 list_del(&dentry->d_lru); 193 list_del(&dentry->d_lru);
194 dentry_stat.nr_unused--; 194 dentry_stat.nr_unused--;
195 } 195 }
196 list_del(&dentry->d_child); 196 list_del(&dentry->d_u.d_child);
197 dentry_stat.nr_dentry--; /* For d_free, below */ 197 dentry_stat.nr_dentry--; /* For d_free, below */
198 /*drops the locks, at that point nobody can reach this dentry */ 198 /*drops the locks, at that point nobody can reach this dentry */
199 dentry_iput(dentry); 199 dentry_iput(dentry);
@@ -367,7 +367,7 @@ static inline void prune_one_dentry(struct dentry * dentry)
367 struct dentry * parent; 367 struct dentry * parent;
368 368
369 __d_drop(dentry); 369 __d_drop(dentry);
370 list_del(&dentry->d_child); 370 list_del(&dentry->d_u.d_child);
371 dentry_stat.nr_dentry--; /* For d_free, below */ 371 dentry_stat.nr_dentry--; /* For d_free, below */
372 dentry_iput(dentry); 372 dentry_iput(dentry);
373 parent = dentry->d_parent; 373 parent = dentry->d_parent;
@@ -518,7 +518,7 @@ repeat:
518resume: 518resume:
519 while (next != &this_parent->d_subdirs) { 519 while (next != &this_parent->d_subdirs) {
520 struct list_head *tmp = next; 520 struct list_head *tmp = next;
521 struct dentry *dentry = list_entry(tmp, struct dentry, d_child); 521 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
522 next = tmp->next; 522 next = tmp->next;
523 /* Have we found a mount point ? */ 523 /* Have we found a mount point ? */
524 if (d_mountpoint(dentry)) 524 if (d_mountpoint(dentry))
@@ -532,7 +532,7 @@ resume:
532 * All done at this level ... ascend and resume the search. 532 * All done at this level ... ascend and resume the search.
533 */ 533 */
534 if (this_parent != parent) { 534 if (this_parent != parent) {
535 next = this_parent->d_child.next; 535 next = this_parent->d_u.d_child.next;
536 this_parent = this_parent->d_parent; 536 this_parent = this_parent->d_parent;
537 goto resume; 537 goto resume;
538 } 538 }
@@ -569,7 +569,7 @@ repeat:
569resume: 569resume:
570 while (next != &this_parent->d_subdirs) { 570 while (next != &this_parent->d_subdirs) {
571 struct list_head *tmp = next; 571 struct list_head *tmp = next;
572 struct dentry *dentry = list_entry(tmp, struct dentry, d_child); 572 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
573 next = tmp->next; 573 next = tmp->next;
574 574
575 if (!list_empty(&dentry->d_lru)) { 575 if (!list_empty(&dentry->d_lru)) {
@@ -610,7 +610,7 @@ dentry->d_parent->d_name.name, dentry->d_name.name, found);
610 * All done at this level ... ascend and resume the search. 610 * All done at this level ... ascend and resume the search.
611 */ 611 */
612 if (this_parent != parent) { 612 if (this_parent != parent) {
613 next = this_parent->d_child.next; 613 next = this_parent->d_u.d_child.next;
614 this_parent = this_parent->d_parent; 614 this_parent = this_parent->d_parent;
615#ifdef DCACHE_DEBUG 615#ifdef DCACHE_DEBUG
616printk(KERN_DEBUG "select_parent: ascending to %s/%s, found=%d\n", 616printk(KERN_DEBUG "select_parent: ascending to %s/%s, found=%d\n",
@@ -753,12 +753,12 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
753 dentry->d_parent = dget(parent); 753 dentry->d_parent = dget(parent);
754 dentry->d_sb = parent->d_sb; 754 dentry->d_sb = parent->d_sb;
755 } else { 755 } else {
756 INIT_LIST_HEAD(&dentry->d_child); 756 INIT_LIST_HEAD(&dentry->d_u.d_child);
757 } 757 }
758 758
759 spin_lock(&dcache_lock); 759 spin_lock(&dcache_lock);
760 if (parent) 760 if (parent)
761 list_add(&dentry->d_child, &parent->d_subdirs); 761 list_add(&dentry->d_u.d_child, &parent->d_subdirs);
762 dentry_stat.nr_dentry++; 762 dentry_stat.nr_dentry++;
763 spin_unlock(&dcache_lock); 763 spin_unlock(&dcache_lock);
764 764
@@ -808,10 +808,14 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
808 * 808 *
809 * Fill in inode information in the entry. On success, it returns NULL. 809 * Fill in inode information in the entry. On success, it returns NULL.
810 * If an unhashed alias of "entry" already exists, then we return the 810 * If an unhashed alias of "entry" already exists, then we return the
811 * aliased dentry instead. 811 * aliased dentry instead and drop one reference to inode.
812 * 812 *
813 * Note that in order to avoid conflicts with rename() etc, the caller 813 * Note that in order to avoid conflicts with rename() etc, the caller
814 * had better be holding the parent directory semaphore. 814 * had better be holding the parent directory semaphore.
815 *
816 * This also assumes that the inode count has been incremented
817 * (or otherwise set) by the caller to indicate that it is now
818 * in use by the dcache.
815 */ 819 */
816struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) 820struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
817{ 821{
@@ -838,6 +842,7 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
838 dget_locked(alias); 842 dget_locked(alias);
839 spin_unlock(&dcache_lock); 843 spin_unlock(&dcache_lock);
840 BUG_ON(!d_unhashed(alias)); 844 BUG_ON(!d_unhashed(alias));
845 iput(inode);
841 return alias; 846 return alias;
842 } 847 }
843 list_add(&entry->d_alias, &inode->i_dentry); 848 list_add(&entry->d_alias, &inode->i_dentry);
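The expanded comment changes the contract: d_instantiate_unique() now consumes the caller's inode reference when it returns a pre-existing alias. A hedged sketch of the resulting caller pattern (the function name is illustrative):

static struct dentry *example_splice(struct dentry *dentry, struct inode *inode)
{
	struct dentry *alias;

	/* the caller already holds an inode reference, e.g. from iget() */
	alias = d_instantiate_unique(dentry, inode);
	if (alias)
		return alias;	/* our inode reference was dropped inside */

	return NULL;		/* the dcache now owns the reference */
}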
@@ -1310,8 +1315,8 @@ already_unhashed:
1310 /* Unhash the target: dput() will then get rid of it */ 1315 /* Unhash the target: dput() will then get rid of it */
1311 __d_drop(target); 1316 __d_drop(target);
1312 1317
1313 list_del(&dentry->d_child); 1318 list_del(&dentry->d_u.d_child);
1314 list_del(&target->d_child); 1319 list_del(&target->d_u.d_child);
1315 1320
1316 /* Switch the names.. */ 1321 /* Switch the names.. */
1317 switch_names(dentry, target); 1322 switch_names(dentry, target);
@@ -1322,15 +1327,15 @@ already_unhashed:
1322 if (IS_ROOT(dentry)) { 1327 if (IS_ROOT(dentry)) {
1323 dentry->d_parent = target->d_parent; 1328 dentry->d_parent = target->d_parent;
1324 target->d_parent = target; 1329 target->d_parent = target;
1325 INIT_LIST_HEAD(&target->d_child); 1330 INIT_LIST_HEAD(&target->d_u.d_child);
1326 } else { 1331 } else {
1327 do_switch(dentry->d_parent, target->d_parent); 1332 do_switch(dentry->d_parent, target->d_parent);
1328 1333
1329 /* And add them back to the (new) parent lists */ 1334 /* And add them back to the (new) parent lists */
1330 list_add(&target->d_child, &target->d_parent->d_subdirs); 1335 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
1331 } 1336 }
1332 1337
1333 list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); 1338 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
1334 spin_unlock(&target->d_lock); 1339 spin_unlock(&target->d_lock);
1335 spin_unlock(&dentry->d_lock); 1340 spin_unlock(&dentry->d_lock);
1336 write_sequnlock(&rename_lock); 1341 write_sequnlock(&rename_lock);
@@ -1568,7 +1573,7 @@ repeat:
1568resume: 1573resume:
1569 while (next != &this_parent->d_subdirs) { 1574 while (next != &this_parent->d_subdirs) {
1570 struct list_head *tmp = next; 1575 struct list_head *tmp = next;
1571 struct dentry *dentry = list_entry(tmp, struct dentry, d_child); 1576 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
1572 next = tmp->next; 1577 next = tmp->next;
1573 if (d_unhashed(dentry)||!dentry->d_inode) 1578 if (d_unhashed(dentry)||!dentry->d_inode)
1574 continue; 1579 continue;
@@ -1579,7 +1584,7 @@ resume:
1579 atomic_dec(&dentry->d_count); 1584 atomic_dec(&dentry->d_count);
1580 } 1585 }
1581 if (this_parent != root) { 1586 if (this_parent != root) {
1582 next = this_parent->d_child.next; 1587 next = this_parent->d_u.d_child.next;
1583 atomic_dec(&this_parent->d_count); 1588 atomic_dec(&this_parent->d_count);
1584 this_parent = this_parent->d_parent; 1589 this_parent = this_parent->d_parent;
1585 goto resume; 1590 goto resume;
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 02aa0ddc582..f8274a8f83b 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/list.h> 19#include <linux/list.h>
20#include <linux/mount.h> 20#include <linux/mount.h>
21#include <linux/capability.h>
21#include <linux/dcache.h> 22#include <linux/dcache.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
23#include <linux/errno.h> 24#include <linux/errno.h>
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index a86ac4aeaed..d4f1a2cddd4 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -146,7 +146,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
146 } 146 }
147 147
148 *dentry = NULL; 148 *dentry = NULL;
149 down(&parent->d_inode->i_sem); 149 mutex_lock(&parent->d_inode->i_mutex);
150 *dentry = lookup_one_len(name, parent, strlen(name)); 150 *dentry = lookup_one_len(name, parent, strlen(name));
151 if (!IS_ERR(dentry)) { 151 if (!IS_ERR(dentry)) {
152 if ((mode & S_IFMT) == S_IFDIR) 152 if ((mode & S_IFMT) == S_IFDIR)
@@ -155,7 +155,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
155 error = debugfs_create(parent->d_inode, *dentry, mode); 155 error = debugfs_create(parent->d_inode, *dentry, mode);
156 } else 156 } else
157 error = PTR_ERR(dentry); 157 error = PTR_ERR(dentry);
158 up(&parent->d_inode->i_sem); 158 mutex_unlock(&parent->d_inode->i_mutex);
159 159
160 return error; 160 return error;
161} 161}
@@ -273,7 +273,7 @@ void debugfs_remove(struct dentry *dentry)
273 if (!parent || !parent->d_inode) 273 if (!parent || !parent->d_inode)
274 return; 274 return;
275 275
276 down(&parent->d_inode->i_sem); 276 mutex_lock(&parent->d_inode->i_mutex);
277 if (debugfs_positive(dentry)) { 277 if (debugfs_positive(dentry)) {
278 if (dentry->d_inode) { 278 if (dentry->d_inode) {
279 if (S_ISDIR(dentry->d_inode->i_mode)) 279 if (S_ISDIR(dentry->d_inode->i_mode))
@@ -283,7 +283,7 @@ void debugfs_remove(struct dentry *dentry)
283 dput(dentry); 283 dput(dentry);
284 } 284 }
285 } 285 }
286 up(&parent->d_inode->i_sem); 286 mutex_unlock(&parent->d_inode->i_mutex);
287 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 287 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
288} 288}
289EXPORT_SYMBOL_GPL(debugfs_remove); 289EXPORT_SYMBOL_GPL(debugfs_remove);
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
index 1274422a538..b621521e09d 100644
--- a/fs/devfs/base.c
+++ b/fs/devfs/base.c
@@ -2162,27 +2162,27 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd)
2162 * 2162 *
2163 * make sure that 2163 * make sure that
2164 * d_instantiate always runs under lock 2164 * d_instantiate always runs under lock
2165 * we release i_sem lock before going to sleep 2165 * we release i_mutex lock before going to sleep
2166 * 2166 *
2167 * unfortunately sometimes d_revalidate is called with 2167 * unfortunately sometimes d_revalidate is called with
2168 * and sometimes without i_sem lock held. The following checks 2168 * and sometimes without i_mutex lock held. The following checks
2169 * attempt to deduce when we need to add (and drop resp.) lock 2169 * attempt to deduce when we need to add (and drop resp.) lock
2170 * here. This relies on current (2.6.2) calling conventions: 2170 * here. This relies on current (2.6.2) calling conventions:
2171 * 2171 *
2172 * lookup_hash is always run under i_sem and is passing NULL 2172 * lookup_hash is always run under i_mutex and is passing NULL
2173 * as nd 2173 * as nd
2174 * 2174 *
2175 * open(...,O_CREATE,...) calls _lookup_hash under i_sem 2175 * open(...,O_CREATE,...) calls _lookup_hash under i_mutex
2176 * and sets flags to LOOKUP_OPEN|LOOKUP_CREATE 2176 * and sets flags to LOOKUP_OPEN|LOOKUP_CREATE
2177 * 2177 *
2178 * all other invocations of ->d_revalidate seem to happen 2178 * all other invocations of ->d_revalidate seem to happen
2179 * outside of i_sem 2179 * outside of i_mutex
2180 */ 2180 */
2181 need_lock = nd && 2181 need_lock = nd &&
2182 (!(nd->flags & LOOKUP_CREATE) || (nd->flags & LOOKUP_PARENT)); 2182 (!(nd->flags & LOOKUP_CREATE) || (nd->flags & LOOKUP_PARENT));
2183 2183
2184 if (need_lock) 2184 if (need_lock)
2185 down(&dir->i_sem); 2185 mutex_lock(&dir->i_mutex);
2186 2186
2187 if (is_devfsd_or_child(fs_info)) { 2187 if (is_devfsd_or_child(fs_info)) {
2188 devfs_handle_t de = lookup_info->de; 2188 devfs_handle_t de = lookup_info->de;
@@ -2221,9 +2221,9 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd)
2221 add_wait_queue(&lookup_info->wait_queue, &wait); 2221 add_wait_queue(&lookup_info->wait_queue, &wait);
2222 read_unlock(&parent->u.dir.lock); 2222 read_unlock(&parent->u.dir.lock);
2223 /* at this point it is always (hopefully) locked */ 2223 /* at this point it is always (hopefully) locked */
2224 up(&dir->i_sem); 2224 mutex_unlock(&dir->i_mutex);
2225 schedule(); 2225 schedule();
2226 down(&dir->i_sem); 2226 mutex_lock(&dir->i_mutex);
2227 /* 2227 /*
2228 * This does not need nor should remove wait from wait_queue. 2228 * This does not need nor should remove wait from wait_queue.
2229 * Wait queue head is never reused - nothing is ever added to it 2229 * Wait queue head is never reused - nothing is ever added to it
@@ -2238,7 +2238,7 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd)
2238 2238
2239 out: 2239 out:
2240 if (need_lock) 2240 if (need_lock)
2241 up(&dir->i_sem); 2241 mutex_unlock(&dir->i_mutex);
2242 return 1; 2242 return 1;
2243} /* End Function devfs_d_revalidate_wait */ 2243} /* End Function devfs_d_revalidate_wait */
2244 2244
@@ -2284,9 +2284,9 @@ static struct dentry *devfs_lookup(struct inode *dir, struct dentry *dentry,
2284 /* Unlock directory semaphore, which will release any waiters. They 2284 /* Unlock directory semaphore, which will release any waiters. They
2285 will get the hashed dentry, and may be forced to wait for 2285 will get the hashed dentry, and may be forced to wait for
2286 revalidation */ 2286 revalidation */
2287 up(&dir->i_sem); 2287 mutex_unlock(&dir->i_mutex);
2288 wait_for_devfsd_finished(fs_info); /* If I'm not devfsd, must wait */ 2288 wait_for_devfsd_finished(fs_info); /* If I'm not devfsd, must wait */
2289 down(&dir->i_sem); /* Grab it again because them's the rules */ 2289 mutex_lock(&dir->i_mutex); /* Grab it again because them's the rules */
2290 de = lookup_info.de; 2290 de = lookup_info.de;
2291 /* If someone else has been so kind as to make the inode, we go home 2291 /* If someone else has been so kind as to make the inode, we go home
2292 early */ 2292 early */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index f2be44d4491..bfb8a230bac 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -130,7 +130,7 @@ static struct dentry *get_node(int num)
130{ 130{
131 char s[12]; 131 char s[12];
132 struct dentry *root = devpts_root; 132 struct dentry *root = devpts_root;
133 down(&root->d_inode->i_sem); 133 mutex_lock(&root->d_inode->i_mutex);
134 return lookup_one_len(s, root, sprintf(s, "%d", num)); 134 return lookup_one_len(s, root, sprintf(s, "%d", num));
135} 135}
136 136
@@ -161,7 +161,7 @@ int devpts_pty_new(struct tty_struct *tty)
161 if (!IS_ERR(dentry) && !dentry->d_inode) 161 if (!IS_ERR(dentry) && !dentry->d_inode)
162 d_instantiate(dentry, inode); 162 d_instantiate(dentry, inode);
163 163
164 up(&devpts_root->d_inode->i_sem); 164 mutex_unlock(&devpts_root->d_inode->i_mutex);
165 165
166 return 0; 166 return 0;
167} 167}
@@ -178,7 +178,7 @@ struct tty_struct *devpts_get_tty(int number)
178 dput(dentry); 178 dput(dentry);
179 } 179 }
180 180
181 up(&devpts_root->d_inode->i_sem); 181 mutex_unlock(&devpts_root->d_inode->i_mutex);
182 182
183 return tty; 183 return tty;
184} 184}
@@ -196,7 +196,7 @@ void devpts_pty_kill(int number)
196 } 196 }
197 dput(dentry); 197 dput(dentry);
198 } 198 }
199 up(&devpts_root->d_inode->i_sem); 199 mutex_unlock(&devpts_root->d_inode->i_mutex);
200} 200}
201 201
202static int __init init_devpts_fs(void) 202static int __init init_devpts_fs(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3931e7f1e6b..30dbbd1df51 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -56,7 +56,7 @@
56 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. 56 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
57 * This determines whether we need to do the fancy locking which prevents 57 * This determines whether we need to do the fancy locking which prevents
58 * direct-IO from being able to read uninitialised disk blocks. If its zero 58 * direct-IO from being able to read uninitialised disk blocks. If its zero
59 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_sem is 59 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
60 * not held for the entire direct write (taken briefly, initially, during a 60 * not held for the entire direct write (taken briefly, initially, during a
61 * direct read though, but its never held for the duration of a direct-IO). 61 * direct read though, but its never held for the duration of a direct-IO).
62 */ 62 */
@@ -930,7 +930,7 @@ out:
930} 930}
931 931
932/* 932/*
933 * Releases both i_sem and i_alloc_sem 933 * Releases both i_mutex and i_alloc_sem
934 */ 934 */
935static ssize_t 935static ssize_t
936direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 936direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1062,11 +1062,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1062 1062
1063 /* 1063 /*
1064 * All block lookups have been performed. For READ requests 1064 * All block lookups have been performed. For READ requests
1065 * we can let i_sem go now that its achieved its purpose 1065 * we can let i_mutex go now that its achieved its purpose
1066 * of protecting us from looking up uninitialized blocks. 1066 * of protecting us from looking up uninitialized blocks.
1067 */ 1067 */
1068 if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) 1068 if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
1069 up(&dio->inode->i_sem); 1069 mutex_unlock(&dio->inode->i_mutex);
1070 1070
1071 /* 1071 /*
1072 * OK, all BIOs are submitted, so we can decrement bio_count to truly 1072 * OK, all BIOs are submitted, so we can decrement bio_count to truly
@@ -1145,18 +1145,18 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1145 * The locking rules are governed by the dio_lock_type parameter. 1145 * The locking rules are governed by the dio_lock_type parameter.
1146 * 1146 *
1147 * DIO_NO_LOCKING (no locking, for raw block device access) 1147 * DIO_NO_LOCKING (no locking, for raw block device access)
1148 * For writes, i_sem is not held on entry; it is never taken. 1148 * For writes, i_mutex is not held on entry; it is never taken.
1149 * 1149 *
1150 * DIO_LOCKING (simple locking for regular files) 1150 * DIO_LOCKING (simple locking for regular files)
1151 * For writes we are called under i_sem and return with i_sem held, even though 1151 * For writes we are called under i_mutex and return with i_mutex held, even though
1152 * it is internally dropped. 1152 * it is internally dropped.
1153 * For reads, i_sem is not held on entry, but it is taken and dropped before 1153 * For reads, i_mutex is not held on entry, but it is taken and dropped before
1154 * returning. 1154 * returning.
1155 * 1155 *
1156 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of 1156 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
1157 * uninitialised data, allowing parallel direct readers and writers) 1157 * uninitialised data, allowing parallel direct readers and writers)
1158 * For writes we are called without i_sem, return without it, never touch it. 1158 * For writes we are called without i_mutex, return without it, never touch it.
1159 * For reads, i_sem is held on entry and will be released before returning. 1159 * For reads, i_mutex is held on entry and will be released before returning.
1160 * 1160 *
1161 * Additional i_alloc_sem locking requirements described inline below. 1161 * Additional i_alloc_sem locking requirements described inline below.
1162 */ 1162 */
@@ -1214,11 +1214,11 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1214 * For block device access DIO_NO_LOCKING is used, 1214 * For block device access DIO_NO_LOCKING is used,
1215 * neither readers nor writers do any locking at all 1215 * neither readers nor writers do any locking at all
1216 * For regular files using DIO_LOCKING, 1216 * For regular files using DIO_LOCKING,
1217 * readers need to grab i_sem and i_alloc_sem 1217 * readers need to grab i_mutex and i_alloc_sem
1218 * writers need to grab i_alloc_sem only (i_sem is already held) 1218 * writers need to grab i_alloc_sem only (i_mutex is already held)
1219 * For regular files using DIO_OWN_LOCKING, 1219 * For regular files using DIO_OWN_LOCKING,
1220 * neither readers nor writers take any locks here 1220 * neither readers nor writers take any locks here
1221 * (i_sem is already held and release for writers here) 1221 * (i_mutex is already held and release for writers here)
1222 */ 1222 */
1223 dio->lock_type = dio_lock_type; 1223 dio->lock_type = dio_lock_type;
1224 if (dio_lock_type != DIO_NO_LOCKING) { 1224 if (dio_lock_type != DIO_NO_LOCKING) {
@@ -1228,7 +1228,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1228 1228
1229 mapping = iocb->ki_filp->f_mapping; 1229 mapping = iocb->ki_filp->f_mapping;
1230 if (dio_lock_type != DIO_OWN_LOCKING) { 1230 if (dio_lock_type != DIO_OWN_LOCKING) {
1231 down(&inode->i_sem); 1231 mutex_lock(&inode->i_mutex);
1232 reader_with_isem = 1; 1232 reader_with_isem = 1;
1233 } 1233 }
1234 1234
@@ -1240,7 +1240,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1240 } 1240 }
1241 1241
1242 if (dio_lock_type == DIO_OWN_LOCKING) { 1242 if (dio_lock_type == DIO_OWN_LOCKING) {
1243 up(&inode->i_sem); 1243 mutex_unlock(&inode->i_mutex);
1244 reader_with_isem = 0; 1244 reader_with_isem = 0;
1245 } 1245 }
1246 } 1246 }
@@ -1266,7 +1266,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1266 1266
1267out: 1267out:
1268 if (reader_with_isem) 1268 if (reader_with_isem)
1269 up(&inode->i_sem); 1269 mutex_unlock(&inode->i_mutex);
1270 if (rw & WRITE) 1270 if (rw & WRITE)
1271 current->flags &= ~PF_SYNCWRITE; 1271 current->flags &= ~PF_SYNCWRITE;
1272 return retval; 1272 return retval;
diff --git a/fs/dquot.c b/fs/dquot.c
index 05b60283c9c..1966c890b48 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -77,6 +77,7 @@
77#include <linux/kmod.h> 77#include <linux/kmod.h>
78#include <linux/namei.h> 78#include <linux/namei.h>
79#include <linux/buffer_head.h> 79#include <linux/buffer_head.h>
80#include <linux/capability.h>
80#include <linux/quotaops.h> 81#include <linux/quotaops.h>
81 82
82#include <asm/uaccess.h> 83#include <asm/uaccess.h>
@@ -100,7 +101,7 @@
100 * operation is just reading pointers from inode (or not using them at all) the 101 * operation is just reading pointers from inode (or not using them at all) the
101 * read lock is enough. If pointers are altered function must hold write lock 102 * read lock is enough. If pointers are altered function must hold write lock
102 * (these locking rules also apply for S_NOQUOTA flag in the inode - note that 103 * (these locking rules also apply for S_NOQUOTA flag in the inode - note that
103 * for altering the flag i_sem is also needed). If operation is holding 104 * for altering the flag i_mutex is also needed). If operation is holding
104 * reference to dquot in other way (e.g. quotactl ops) it must be guarded by 105 * reference to dquot in other way (e.g. quotactl ops) it must be guarded by
105 * dqonoff_sem. 106 * dqonoff_sem.
106 * This locking assures that: 107 * This locking assures that:
@@ -117,9 +118,9 @@
117 * spinlock to internal buffers before writing. 118 * spinlock to internal buffers before writing.
118 * 119 *
119 * Lock ordering (including related VFS locks) is the following: 120 * Lock ordering (including related VFS locks) is the following:
120 * i_sem > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem > 121 * i_mutex > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem >
121 * > dquot->dq_lock > dqio_sem 122 * > dquot->dq_lock > dqio_sem
122 * i_sem on quota files is special (it's below dqio_sem) 123 * i_mutex on quota files is special (it's below dqio_sem)
123 */ 124 */
124 125
125static DEFINE_SPINLOCK(dq_list_lock); 126static DEFINE_SPINLOCK(dq_list_lock);
@@ -1369,11 +1370,11 @@ int vfs_quota_off(struct super_block *sb, int type)
1369 /* If quota was reenabled in the meantime, we have 1370 /* If quota was reenabled in the meantime, we have
1370 * nothing to do */ 1371 * nothing to do */
1371 if (!sb_has_quota_enabled(sb, cnt)) { 1372 if (!sb_has_quota_enabled(sb, cnt)) {
1372 down(&toputinode[cnt]->i_sem); 1373 mutex_lock(&toputinode[cnt]->i_mutex);
1373 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | 1374 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
1374 S_NOATIME | S_NOQUOTA); 1375 S_NOATIME | S_NOQUOTA);
1375 truncate_inode_pages(&toputinode[cnt]->i_data, 0); 1376 truncate_inode_pages(&toputinode[cnt]->i_data, 0);
1376 up(&toputinode[cnt]->i_sem); 1377 mutex_unlock(&toputinode[cnt]->i_mutex);
1377 mark_inode_dirty(toputinode[cnt]); 1378 mark_inode_dirty(toputinode[cnt]);
1378 iput(toputinode[cnt]); 1379 iput(toputinode[cnt]);
1379 } 1380 }
@@ -1417,7 +1418,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1417 write_inode_now(inode, 1); 1418 write_inode_now(inode, 1);
1418 /* And now flush the block cache so that kernel sees the changes */ 1419 /* And now flush the block cache so that kernel sees the changes */
1419 invalidate_bdev(sb->s_bdev, 0); 1420 invalidate_bdev(sb->s_bdev, 0);
1420 down(&inode->i_sem); 1421 mutex_lock(&inode->i_mutex);
1421 down(&dqopt->dqonoff_sem); 1422 down(&dqopt->dqonoff_sem);
1422 if (sb_has_quota_enabled(sb, type)) { 1423 if (sb_has_quota_enabled(sb, type)) {
1423 error = -EBUSY; 1424 error = -EBUSY;
@@ -1449,7 +1450,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1449 goto out_file_init; 1450 goto out_file_init;
1450 } 1451 }
1451 up(&dqopt->dqio_sem); 1452 up(&dqopt->dqio_sem);
1452 up(&inode->i_sem); 1453 mutex_unlock(&inode->i_mutex);
1453 set_enable_flags(dqopt, type); 1454 set_enable_flags(dqopt, type);
1454 1455
1455 add_dquot_ref(sb, type); 1456 add_dquot_ref(sb, type);
@@ -1470,7 +1471,7 @@ out_lock:
1470 inode->i_flags |= oldflags; 1471 inode->i_flags |= oldflags;
1471 up_write(&dqopt->dqptr_sem); 1472 up_write(&dqopt->dqptr_sem);
1472 } 1473 }
1473 up(&inode->i_sem); 1474 mutex_unlock(&inode->i_mutex);
1474out_fmt: 1475out_fmt:
1475 put_quota_format(fmt); 1476 put_quota_format(fmt);
1476 1477
@@ -1513,10 +1514,16 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
1513 if (IS_ERR(dentry)) 1514 if (IS_ERR(dentry))
1514 return PTR_ERR(dentry); 1515 return PTR_ERR(dentry);
1515 1516
1517 if (!dentry->d_inode) {
1518 error = -ENOENT;
1519 goto out;
1520 }
1521
1516 error = security_quota_on(dentry); 1522 error = security_quota_on(dentry);
1517 if (!error) 1523 if (!error)
1518 error = vfs_quota_on_inode(dentry->d_inode, type, format_id); 1524 error = vfs_quota_on_inode(dentry->d_inode, type, format_id);
1519 1525
1526out:
1520 dput(dentry); 1527 dput(dentry);
1521 return error; 1528 return error;
1522} 1529}
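
The vfs_quota_on_mount() hunk guards against a negative dentry: lookup_one_len() can succeed yet hand back a dentry with no inode when the quota file does not exist, and dereferencing dentry->d_inode would then oops. A sketch of the pattern, with qf_name and sb->s_root as in the function above:

	dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);
	if (!dentry->d_inode) {		/* negative dentry: no such file */
		dput(dentry);
		return -ENOENT;
	}
	/* safe to use dentry->d_inode from here on */
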
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
new file mode 100644
index 00000000000..4e4762389bd
--- /dev/null
+++ b/fs/drop_caches.c
@@ -0,0 +1,68 @@
1/*
2 * Implement the manual drop-all-pagecache function
3 */
4
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/fs.h>
8#include <linux/writeback.h>
9#include <linux/sysctl.h>
10#include <linux/gfp.h>
11
12/* A global variable is a bit ugly, but it keeps the code simple */
13int sysctl_drop_caches;
14
15static void drop_pagecache_sb(struct super_block *sb)
16{
17 struct inode *inode;
18
19 spin_lock(&inode_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE))
22 continue;
23 invalidate_inode_pages(inode->i_mapping);
24 }
25 spin_unlock(&inode_lock);
26}
27
28void drop_pagecache(void)
29{
30 struct super_block *sb;
31
32 spin_lock(&sb_lock);
33restart:
34 list_for_each_entry(sb, &super_blocks, s_list) {
35 sb->s_count++;
36 spin_unlock(&sb_lock);
37 down_read(&sb->s_umount);
38 if (sb->s_root)
39 drop_pagecache_sb(sb);
40 up_read(&sb->s_umount);
41 spin_lock(&sb_lock);
42 if (__put_super_and_need_restart(sb))
43 goto restart;
44 }
45 spin_unlock(&sb_lock);
46}
47
48void drop_slab(void)
49{
50 int nr_objects;
51
52 do {
53 nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
54 } while (nr_objects > 10);
55}
56
57int drop_caches_sysctl_handler(ctl_table *table, int write,
58 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
59{
60 proc_dointvec_minmax(table, write, file, buffer, length, ppos);
61 if (write) {
62 if (sysctl_drop_caches & 1)
63 drop_pagecache();
64 if (sysctl_drop_caches & 2)
65 drop_slab();
66 }
67 return 0;
68}
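
The new file is driven from userspace via /proc/sys/vm/drop_caches: writing 1 drops the pagecache, 2 shrinks the slab caches, 3 does both. A hedged sketch of the matching kernel/sysctl.c vm_table entry, which is not part of this diff (the field layout and the VM_DROP_PAGECACHE ctl_name are assumptions based on this era's sysctl conventions):

	{
		.ctl_name	= VM_DROP_PAGECACHE,	/* assumed enum value */
		.procname	= "drop_caches",
		.data		= &sysctl_drop_caches,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= drop_caches_sysctl_handler,
	},
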
diff --git a/fs/efs/super.c b/fs/efs/super.c
index d8d5ea9a999..afc4891feb3 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -222,12 +222,13 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) {
222 sblock); 222 sblock);
223#endif 223#endif
224 } 224 }
225 return(sblock); 225 return sblock;
226} 226}
227 227
228static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) { 228static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) {
229 229
230 if (!IS_EFS_MAGIC(be32_to_cpu(super->fs_magic))) return -1; 230 if (!IS_EFS_MAGIC(be32_to_cpu(super->fs_magic)))
231 return -1;
231 232
232 sb->fs_magic = be32_to_cpu(super->fs_magic); 233 sb->fs_magic = be32_to_cpu(super->fs_magic);
233 sb->total_blocks = be32_to_cpu(super->fs_size); 234 sb->total_blocks = be32_to_cpu(super->fs_size);
diff --git a/fs/exec.c b/fs/exec.c
index c466fec5de2..055378d2513 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -306,9 +306,6 @@ void install_arg_page(struct vm_area_struct *vma,
306 struct page *page, unsigned long address) 306 struct page *page, unsigned long address)
307{ 307{
308 struct mm_struct *mm = vma->vm_mm; 308 struct mm_struct *mm = vma->vm_mm;
309 pgd_t * pgd;
310 pud_t * pud;
311 pmd_t * pmd;
312 pte_t * pte; 309 pte_t * pte;
313 spinlock_t *ptl; 310 spinlock_t *ptl;
314 311
@@ -316,14 +313,7 @@ void install_arg_page(struct vm_area_struct *vma,
316 goto out; 313 goto out;
317 314
318 flush_dcache_page(page); 315 flush_dcache_page(page);
319 pgd = pgd_offset(mm, address); 316 pte = get_locked_pte(mm, address, &ptl);
320 pud = pud_alloc(mm, pgd, address);
321 if (!pud)
322 goto out;
323 pmd = pmd_alloc(mm, pud, address);
324 if (!pmd)
325 goto out;
326 pte = pte_alloc_map_lock(mm, pmd, address, &ptl);
327 if (!pte) 317 if (!pte)
328 goto out; 318 goto out;
329 if (!pte_none(*pte)) { 319 if (!pte_none(*pte)) {
@@ -334,7 +324,7 @@ void install_arg_page(struct vm_area_struct *vma,
334 lru_cache_add_active(page); 324 lru_cache_add_active(page);
335 set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( 325 set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
336 page, vma->vm_page_prot)))); 326 page, vma->vm_page_prot))));
337 page_add_anon_rmap(page, vma, address); 327 page_add_new_anon_rmap(page, vma, address);
338 pte_unmap_unlock(pte, ptl); 328 pte_unmap_unlock(pte, ptl);
339 329
340 /* no need for flush_tlb */ 330 /* no need for flush_tlb */
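
The install_arg_page() hunk replaces an open-coded four-level page-table walk with the new get_locked_pte() helper. A sketch of what the helper condenses, reconstructed from the removed lines (the real helper lives in mm/memory.c; this is an illustration, not its exact body):

	static pte_t *get_locked_pte_sketch(struct mm_struct *mm,
					    unsigned long addr, spinlock_t **ptl)
	{
		pgd_t *pgd;
		pud_t *pud;
		pmd_t *pmd;

		pgd = pgd_offset(mm, addr);
		pud = pud_alloc(mm, pgd, addr);
		if (!pud)
			return NULL;
		pmd = pmd_alloc(mm, pud, addr);
		if (!pmd)
			return NULL;
		/* maps the PTE and takes the page-table lock via *ptl */
		return pte_alloc_map_lock(mm, pmd, addr, ptl);
	}
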
@@ -487,7 +477,7 @@ struct file *open_exec(const char *name)
487 int err; 477 int err;
488 struct file *file; 478 struct file *file;
489 479
490 err = path_lookup_open(name, LOOKUP_FOLLOW, &nd, FMODE_READ); 480 err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ);
491 file = ERR_PTR(err); 481 file = ERR_PTR(err);
492 482
493 if (!err) { 483 if (!err) {
@@ -585,7 +575,7 @@ static int exec_mmap(struct mm_struct *mm)
585 * disturbing other processes. (Other processes might share the signal 575 * disturbing other processes. (Other processes might share the signal
586 * table via the CLONE_SIGHAND option to clone().) 576 * table via the CLONE_SIGHAND option to clone().)
587 */ 577 */
588static inline int de_thread(struct task_struct *tsk) 578static int de_thread(struct task_struct *tsk)
589{ 579{
590 struct signal_struct *sig = tsk->signal; 580 struct signal_struct *sig = tsk->signal;
591 struct sighand_struct *newsighand, *oldsighand = tsk->sighand; 581 struct sighand_struct *newsighand, *oldsighand = tsk->sighand;
@@ -642,10 +632,10 @@ static inline int de_thread(struct task_struct *tsk)
642 * synchronize with any firing (by calling del_timer_sync) 632 * synchronize with any firing (by calling del_timer_sync)
643 * before we can safely let the old group leader die. 633 * before we can safely let the old group leader die.
644 */ 634 */
645 sig->real_timer.data = (unsigned long)current; 635 sig->real_timer.data = current;
646 spin_unlock_irq(lock); 636 spin_unlock_irq(lock);
647 if (del_timer_sync(&sig->real_timer)) 637 if (hrtimer_cancel(&sig->real_timer))
648 add_timer(&sig->real_timer); 638 hrtimer_restart(&sig->real_timer);
649 spin_lock_irq(lock); 639 spin_lock_irq(lock);
650 } 640 }
651 while (atomic_read(&sig->count) > count) { 641 while (atomic_read(&sig->count) > count) {
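
This hunk tracks the ITIMER_REAL conversion to hrtimers: sig->real_timer is now a struct hrtimer, so the del_timer_sync()/add_timer() pair becomes hrtimer_cancel()/hrtimer_restart(), and .data carries the task_struct pointer directly instead of a cast unsigned long. The shape of the idiom (a sketch; hrtimer_restart() re-arms the timer with its stored expiry):

	/* cancel a possibly-running hrtimer; if it was active, re-arm it */
	if (hrtimer_cancel(&sig->real_timer))
		hrtimer_restart(&sig->real_timer);
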
@@ -668,7 +658,7 @@ static inline int de_thread(struct task_struct *tsk)
668 if (!thread_group_leader(current)) { 658 if (!thread_group_leader(current)) {
669 struct task_struct *parent; 659 struct task_struct *parent;
670 struct dentry *proc_dentry1, *proc_dentry2; 660 struct dentry *proc_dentry1, *proc_dentry2;
671 unsigned long exit_state, ptrace; 661 unsigned long ptrace;
672 662
673 /* 663 /*
674 * Wait for the thread group leader to be a zombie. 664 * Wait for the thread group leader to be a zombie.
@@ -726,15 +716,15 @@ static inline int de_thread(struct task_struct *tsk)
726 list_del(&current->tasks); 716 list_del(&current->tasks);
727 list_add_tail(&current->tasks, &init_task.tasks); 717 list_add_tail(&current->tasks, &init_task.tasks);
728 current->exit_signal = SIGCHLD; 718 current->exit_signal = SIGCHLD;
729 exit_state = leader->exit_state; 719
720 BUG_ON(leader->exit_state != EXIT_ZOMBIE);
721 leader->exit_state = EXIT_DEAD;
730 722
731 write_unlock_irq(&tasklist_lock); 723 write_unlock_irq(&tasklist_lock);
732 spin_unlock(&leader->proc_lock); 724 spin_unlock(&leader->proc_lock);
733 spin_unlock(&current->proc_lock); 725 spin_unlock(&current->proc_lock);
734 proc_pid_flush(proc_dentry1); 726 proc_pid_flush(proc_dentry1);
735 proc_pid_flush(proc_dentry2); 727 proc_pid_flush(proc_dentry2);
736
737 BUG_ON(exit_state != EXIT_ZOMBIE);
738 } 728 }
739 729
740 /* 730 /*
@@ -770,7 +760,7 @@ no_thread_group:
770 spin_lock(&oldsighand->siglock); 760 spin_lock(&oldsighand->siglock);
771 spin_lock(&newsighand->siglock); 761 spin_lock(&newsighand->siglock);
772 762
773 current->sighand = newsighand; 763 rcu_assign_pointer(current->sighand, newsighand);
774 recalc_sigpending(); 764 recalc_sigpending();
775 765
776 spin_unlock(&newsighand->siglock); 766 spin_unlock(&newsighand->siglock);
@@ -778,7 +768,7 @@ no_thread_group:
778 write_unlock_irq(&tasklist_lock); 768 write_unlock_irq(&tasklist_lock);
779 769
780 if (atomic_dec_and_test(&oldsighand->count)) 770 if (atomic_dec_and_test(&oldsighand->count))
781 kmem_cache_free(sighand_cachep, oldsighand); 771 sighand_free(oldsighand);
782 } 772 }
783 773
784 BUG_ON(!thread_group_leader(current)); 774 BUG_ON(!thread_group_leader(current));
@@ -790,7 +780,7 @@ no_thread_group:
790 * so that a new one can be started 780 * so that a new one can be started
791 */ 781 */
792 782
793static inline void flush_old_files(struct files_struct * files) 783static void flush_old_files(struct files_struct * files)
794{ 784{
795 long j = -1; 785 long j = -1;
796 struct fdtable *fdt; 786 struct fdtable *fdt;
@@ -974,7 +964,7 @@ int prepare_binprm(struct linux_binprm *bprm)
974 964
975EXPORT_SYMBOL(prepare_binprm); 965EXPORT_SYMBOL(prepare_binprm);
976 966
977static inline int unsafe_exec(struct task_struct *p) 967static int unsafe_exec(struct task_struct *p)
978{ 968{
979 int unsafe = 0; 969 int unsafe = 0;
980 if (p->ptrace & PT_PTRACED) { 970 if (p->ptrace & PT_PTRACED) {
@@ -1472,6 +1462,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1472 if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { 1462 if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
1473 current->signal->flags = SIGNAL_GROUP_EXIT; 1463 current->signal->flags = SIGNAL_GROUP_EXIT;
1474 current->signal->group_exit_code = exit_code; 1464 current->signal->group_exit_code = exit_code;
1465 current->signal->group_stop_count = 0;
1475 retval = 0; 1466 retval = 0;
1476 } 1467 }
1477 spin_unlock_irq(&current->sighand->siglock); 1468 spin_unlock_irq(&current->sighand->siglock);
@@ -1487,7 +1478,6 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1487 * Clear any false indication of pending signals that might 1478 * Clear any false indication of pending signals that might
1488 * be seen by the filesystem code called to write the core file. 1479 * be seen by the filesystem code called to write the core file.
1489 */ 1480 */
1490 current->signal->group_stop_count = 0;
1491 clear_thread_flag(TIF_SIGPENDING); 1481 clear_thread_flag(TIF_SIGPENDING);
1492 1482
1493 if (current->signal->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) 1483 if (current->signal->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
@@ -1515,7 +1505,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1515 goto close_fail; 1505 goto close_fail;
1516 if (!file->f_op->write) 1506 if (!file->f_op->write)
1517 goto close_fail; 1507 goto close_fail;
1518 if (do_truncate(file->f_dentry, 0, file) != 0) 1508 if (do_truncate(file->f_dentry, 0, 0, file) != 0)
1519 goto close_fail; 1509 goto close_fail;
1520 1510
1521 retval = binfmt->core_dump(signr, regs, file); 1511 retval = binfmt->core_dump(signr, regs, file);
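
The final exec.c hunk follows a do_truncate() signature change: the function now takes an extra argument between the length and the file pointer. Passing 0 appears to keep the default timestamp handling (an assumption about the new parameter; the prototype change itself is outside this diff):

	/* truncate the core file to zero length before dumping */
	if (do_truncate(file->f_dentry, 0 /* length */,
			0 /* assumed time_attrs */, file) != 0)
		goto close_fail;
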
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index c49d6254379..b06b54f1bbb 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -11,6 +11,33 @@ struct export_operations export_op_default;
11 11
12#define dprintk(fmt, args...) do{}while(0) 12#define dprintk(fmt, args...) do{}while(0)
13 13
14static struct dentry *
15find_acceptable_alias(struct dentry *result,
16 int (*acceptable)(void *context, struct dentry *dentry),
17 void *context)
18{
19 struct dentry *dentry, *toput = NULL;
20
21 spin_lock(&dcache_lock);
22 list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) {
23 dget_locked(dentry);
24 spin_unlock(&dcache_lock);
25 if (toput)
26 dput(toput);
27 if (dentry != result && acceptable(context, dentry)) {
28 dput(result);
29 return dentry;
30 }
31 spin_lock(&dcache_lock);
32 toput = dentry;
33 }
34 spin_unlock(&dcache_lock);
35
36 if (toput)
37 dput(toput);
38 return NULL;
39}
40
14/** 41/**
15 * find_exported_dentry - helper routine to implement export_operations->decode_fh 42 * find_exported_dentry - helper routine to implement export_operations->decode_fh
16 * @sb: The &super_block identifying the filesystem 43 * @sb: The &super_block identifying the filesystem
@@ -52,8 +79,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
52 struct dentry *target_dir; 79 struct dentry *target_dir;
53 int err; 80 int err;
54 struct export_operations *nops = sb->s_export_op; 81 struct export_operations *nops = sb->s_export_op;
55 struct list_head *le, *head; 82 struct dentry *alias;
56 struct dentry *toput = NULL;
57 int noprogress; 83 int noprogress;
58 char nbuf[NAME_MAX+1]; 84 char nbuf[NAME_MAX+1];
59 85
@@ -79,27 +105,10 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
79 /* there is no other dentry, so fail */ 105 /* there is no other dentry, so fail */
80 goto err_result; 106 goto err_result;
81 } 107 }
82 /* try any other aliases */ 108
83 spin_lock(&dcache_lock); 109 alias = find_acceptable_alias(result, acceptable, context);
84 head = &result->d_inode->i_dentry; 110 if (alias)
85 list_for_each(le, head) { 111 return alias;
86 struct dentry *dentry = list_entry(le, struct dentry, d_alias);
87 dget_locked(dentry);
88 spin_unlock(&dcache_lock);
89 if (toput)
90 dput(toput);
91 toput = NULL;
92 if (dentry != result &&
93 acceptable(context, dentry)) {
94 dput(result);
95 return dentry;
96 }
97 spin_lock(&dcache_lock);
98 toput = dentry;
99 }
100 spin_unlock(&dcache_lock);
101 if (toput)
102 dput(toput);
103 } 112 }
104 113
105 /* It's a directory, or we are required to confirm the file's 114 /* It's a directory, or we are required to confirm the file's
@@ -177,9 +186,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
177 struct dentry *ppd; 186 struct dentry *ppd;
178 struct dentry *npd; 187 struct dentry *npd;
179 188
180 down(&pd->d_inode->i_sem); 189 mutex_lock(&pd->d_inode->i_mutex);
181 ppd = CALL(nops,get_parent)(pd); 190 ppd = CALL(nops,get_parent)(pd);
182 up(&pd->d_inode->i_sem); 191 mutex_unlock(&pd->d_inode->i_mutex);
183 192
184 if (IS_ERR(ppd)) { 193 if (IS_ERR(ppd)) {
185 err = PTR_ERR(ppd); 194 err = PTR_ERR(ppd);
@@ -201,9 +210,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
201 break; 210 break;
202 } 211 }
203 dprintk("find_exported_dentry: found name: %s\n", nbuf); 212 dprintk("find_exported_dentry: found name: %s\n", nbuf);
204 down(&ppd->d_inode->i_sem); 213 mutex_lock(&ppd->d_inode->i_mutex);
205 npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); 214 npd = lookup_one_len(nbuf, ppd, strlen(nbuf));
206 up(&ppd->d_inode->i_sem); 215 mutex_unlock(&ppd->d_inode->i_mutex);
207 if (IS_ERR(npd)) { 216 if (IS_ERR(npd)) {
208 err = PTR_ERR(npd); 217 err = PTR_ERR(npd);
209 dprintk("find_exported_dentry: lookup failed: %d\n", err); 218 dprintk("find_exported_dentry: lookup failed: %d\n", err);
@@ -242,9 +251,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
242 struct dentry *nresult; 251 struct dentry *nresult;
243 err = CALL(nops,get_name)(target_dir, nbuf, result); 252 err = CALL(nops,get_name)(target_dir, nbuf, result);
244 if (!err) { 253 if (!err) {
245 down(&target_dir->d_inode->i_sem); 254 mutex_lock(&target_dir->d_inode->i_mutex);
246 nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); 255 nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf));
247 up(&target_dir->d_inode->i_sem); 256 mutex_unlock(&target_dir->d_inode->i_mutex);
248 if (!IS_ERR(nresult)) { 257 if (!IS_ERR(nresult)) {
249 if (nresult->d_inode) { 258 if (nresult->d_inode) {
250 dput(result); 259 dput(result);
@@ -258,26 +267,10 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
258 /* now result is properly connected, it is our best bet */ 267 /* now result is properly connected, it is our best bet */
259 if (acceptable(context, result)) 268 if (acceptable(context, result))
260 return result; 269 return result;
261 /* one last try of the aliases.. */ 270
262 spin_lock(&dcache_lock); 271 alias = find_acceptable_alias(result, acceptable, context);
263 toput = NULL; 272 if (alias)
264 head = &result->d_inode->i_dentry; 273 return alias;
265 list_for_each(le, head) {
266 struct dentry *dentry = list_entry(le, struct dentry, d_alias);
267 dget_locked(dentry);
268 spin_unlock(&dcache_lock);
269 if (toput) dput(toput);
270 if (dentry != result &&
271 acceptable(context, dentry)) {
272 dput(result);
273 return dentry;
274 }
275 spin_lock(&dcache_lock);
276 toput = dentry;
277 }
278 spin_unlock(&dcache_lock);
279 if (toput)
280 dput(toput);
281 274
282 /* drat - I just cannot find anything acceptable */ 275 /* drat - I just cannot find anything acceptable */
283 dput(result); 276 dput(result);
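
The exportfs refactor folds two identical alias scans into find_acceptable_alias(). The loop's lock choreography is the interesting part: dput() may sleep, so it must never run under dcache_lock. Each pass therefore pins the current dentry with dget_locked(), drops the spinlock, releases the previous pass's reference, and re-takes the lock before advancing. Skeleton of the pattern, with the acceptability test elided:

	struct dentry *dentry, *toput = NULL;

	spin_lock(&dcache_lock);
	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
		dget_locked(dentry);		/* pin before unlocking */
		spin_unlock(&dcache_lock);
		if (toput)
			dput(toput);		/* safe: lock is dropped */
		/* ... test the alias, possibly return it ... */
		spin_lock(&dcache_lock);
		toput = dentry;			/* release on the next pass */
	}
	spin_unlock(&dcache_lock);
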
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 6af2f413029..35acc43b897 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> 4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */ 5 */
6 6
7#include <linux/capability.h>
7#include <linux/init.h> 8#include <linux/init.h>
8#include <linux/sched.h> 9#include <linux/sched.h>
9#include <linux/slab.h> 10#include <linux/slab.h>
@@ -149,7 +150,7 @@ ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl,
149} 150}
150 151
151/* 152/*
152 * inode->i_sem: don't care 153 * inode->i_mutex: don't care
153 */ 154 */
154static struct posix_acl * 155static struct posix_acl *
155ext2_get_acl(struct inode *inode, int type) 156ext2_get_acl(struct inode *inode, int type)
@@ -211,7 +212,7 @@ ext2_get_acl(struct inode *inode, int type)
211} 212}
212 213
213/* 214/*
214 * inode->i_sem: down 215 * inode->i_mutex: down
215 */ 216 */
216static int 217static int
217ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) 218ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
@@ -301,8 +302,8 @@ ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
301/* 302/*
302 * Initialize the ACLs of a new inode. Called from ext2_new_inode. 303 * Initialize the ACLs of a new inode. Called from ext2_new_inode.
303 * 304 *
304 * dir->i_sem: down 305 * dir->i_mutex: down
305 * inode->i_sem: up (access to inode is still exclusive) 306 * inode->i_mutex: up (access to inode is still exclusive)
306 */ 307 */
307int 308int
308ext2_init_acl(struct inode *inode, struct inode *dir) 309ext2_init_acl(struct inode *inode, struct inode *dir)
@@ -361,7 +362,7 @@ cleanup:
361 * for directories) are added. There are no more bits available in the 362 * for directories) are added. There are no more bits available in the
362 * file mode. 363 * file mode.
363 * 364 *
364 * inode->i_sem: down 365 * inode->i_mutex: down
365 */ 366 */
366int 367int
367ext2_acl_chmod(struct inode *inode) 368ext2_acl_chmod(struct inode *inode)
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index bb690806649..2c00953d4b0 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -16,6 +16,7 @@
16#include <linux/quotaops.h> 16#include <linux/quotaops.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
19#include <linux/capability.h>
19 20
20/* 21/*
21 * balloc.c contains the blocks allocation and deallocation routines 22 * balloc.c contains the blocks allocation and deallocation routines
diff --git a/fs/ext2/bitmap.c b/fs/ext2/bitmap.c
index 20145b74623..e9983a0dd39 100644
--- a/fs/ext2/bitmap.c
+++ b/fs/ext2/bitmap.c
@@ -7,8 +7,12 @@
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 */ 8 */
9 9
10#ifdef EXT2FS_DEBUG
11
10#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
11 13
14#include "ext2.h"
15
12static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
13 17
14unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars) 18unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
@@ -23,3 +27,6 @@ unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
23 nibblemap[(map->b_data[i] >> 4) & 0xf]; 27 nibblemap[(map->b_data[i] >> 4) & 0xf];
24 return (sum); 28 return (sum);
25} 29}
30
31#endif /* EXT2FS_DEBUG */
32
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 5b5f52876b4..7442bdd1267 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -592,7 +592,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
592 goto fail; 592 goto fail;
593 } 593 }
594 kaddr = kmap_atomic(page, KM_USER0); 594 kaddr = kmap_atomic(page, KM_USER0);
595 memset(kaddr, 0, chunk_size); 595 memset(kaddr, 0, chunk_size);
596 de = (struct ext2_dir_entry_2 *)kaddr; 596 de = (struct ext2_dir_entry_2 *)kaddr;
597 de->name_len = 1; 597 de->name_len = 1;
598 de->rec_len = cpu_to_le16(EXT2_DIR_REC_LEN(1)); 598 de->rec_len = cpu_to_le16(EXT2_DIR_REC_LEN(1));
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e977f8566d1..00de0a7312a 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -53,7 +53,7 @@ struct ext2_inode_info {
53#ifdef CONFIG_EXT2_FS_XATTR 53#ifdef CONFIG_EXT2_FS_XATTR
54 /* 54 /*
55 * Extended attributes can be read independently of the main file 55 * Extended attributes can be read independently of the main file
56 * data. Taking i_sem even when reading would cause contention 56 * data. Taking i_mutex even when reading would cause contention
57 * between readers of EAs and writers of regular file data, so 57 * between readers of EAs and writers of regular file data, so
58 * instead we synchronize on xattr_sem when reading or changing 58 * instead we synchronize on xattr_sem when reading or changing
59 * EAs. 59 * EAs.
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 709d8676b96..3ca9afdf713 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include "ext2.h" 10#include "ext2.h"
11#include <linux/capability.h>
11#include <linux/time.h> 12#include <linux/time.h>
12#include <linux/sched.h> 13#include <linux/sched.h>
13#include <asm/current.h> 14#include <asm/current.h>
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index c5513953c82..ad1432a2a62 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -83,10 +83,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
83 if (!inode) 83 if (!inode)
84 return ERR_PTR(-EACCES); 84 return ERR_PTR(-EACCES);
85 } 85 }
86 if (inode) 86 return d_splice_alias(inode, dentry);
87 return d_splice_alias(inode, dentry);
88 d_add(dentry, inode);
89 return NULL;
90} 87}
91 88
92struct dentry *ext2_get_parent(struct dentry *child) 89struct dentry *ext2_get_parent(struct dentry *child)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index e4ed4b31a43..8d6819846fc 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -881,7 +881,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
881 } 881 }
882 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) 882 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
883 ext2_warning(sb, __FUNCTION__, 883 ext2_warning(sb, __FUNCTION__,
884 "mounting ext3 filesystem as ext2\n"); 884 "mounting ext3 filesystem as ext2");
885 ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY); 885 ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
886 percpu_counter_mod(&sbi->s_freeblocks_counter, 886 percpu_counter_mod(&sbi->s_freeblocks_counter,
887 ext2_count_free_blocks(sb)); 887 ext2_count_free_blocks(sb));
@@ -1152,7 +1152,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
1152 struct buffer_head tmp_bh; 1152 struct buffer_head tmp_bh;
1153 struct buffer_head *bh; 1153 struct buffer_head *bh;
1154 1154
1155 down(&inode->i_sem); 1155 mutex_lock(&inode->i_mutex);
1156 while (towrite > 0) { 1156 while (towrite > 0) {
1157 tocopy = sb->s_blocksize - offset < towrite ? 1157 tocopy = sb->s_blocksize - offset < towrite ?
1158 sb->s_blocksize - offset : towrite; 1158 sb->s_blocksize - offset : towrite;
@@ -1189,7 +1189,7 @@ out:
1189 inode->i_version++; 1189 inode->i_version++;
1190 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1190 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1191 mark_inode_dirty(inode); 1191 mark_inode_dirty(inode);
1192 up(&inode->i_sem); 1192 mutex_unlock(&inode->i_mutex);
1193 return len - towrite; 1193 return len - towrite;
1194} 1194}
1195 1195
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 0099462d427..a2ca3107d47 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -325,7 +325,7 @@ cleanup:
325/* 325/*
326 * Inode operation listxattr() 326 * Inode operation listxattr()
327 * 327 *
328 * dentry->d_inode->i_sem: don't care 328 * dentry->d_inode->i_mutex: don't care
329 */ 329 */
330ssize_t 330ssize_t
331ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) 331ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -389,10 +389,6 @@ ext2_xattr_set(struct inode *inode, int name_index, const char *name,
389 ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", 389 ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
390 name_index, name, value, (long)value_len); 390 name_index, name, value, (long)value_len);
391 391
392 if (IS_RDONLY(inode))
393 return -EROFS;
394 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
395 return -EPERM;
396 if (value == NULL) 392 if (value == NULL)
397 value_len = 0; 393 value_len = 0;
398 if (name == NULL) 394 if (name == NULL)
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 52b30ee6a25..f28a6a499c9 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/string.h> 9#include <linux/string.h>
10#include <linux/capability.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
12#include <linux/ext2_fs.h> 13#include <linux/ext2_fs.h>
@@ -38,8 +39,6 @@ ext2_xattr_trusted_get(struct inode *inode, const char *name,
38{ 39{
39 if (strcmp(name, "") == 0) 40 if (strcmp(name, "") == 0)
40 return -EINVAL; 41 return -EINVAL;
41 if (!capable(CAP_SYS_ADMIN))
42 return -EPERM;
43 return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name, 42 return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name,
44 buffer, size); 43 buffer, size);
45} 44}
@@ -50,8 +49,6 @@ ext2_xattr_trusted_set(struct inode *inode, const char *name,
50{ 49{
51 if (strcmp(name, "") == 0) 50 if (strcmp(name, "") == 0)
52 return -EINVAL; 51 return -EINVAL;
53 if (!capable(CAP_SYS_ADMIN))
54 return -EPERM;
55 return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, 52 return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name,
56 value, size, flags); 53 value, size, flags);
57} 54}
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 0c03ea131a9..f383e7c3a7b 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -35,16 +35,10 @@ static int
35ext2_xattr_user_get(struct inode *inode, const char *name, 35ext2_xattr_user_get(struct inode *inode, const char *name,
36 void *buffer, size_t size) 36 void *buffer, size_t size)
37{ 37{
38 int error;
39
40 if (strcmp(name, "") == 0) 38 if (strcmp(name, "") == 0)
41 return -EINVAL; 39 return -EINVAL;
42 if (!test_opt(inode->i_sb, XATTR_USER)) 40 if (!test_opt(inode->i_sb, XATTR_USER))
43 return -EOPNOTSUPP; 41 return -EOPNOTSUPP;
44 error = permission(inode, MAY_READ, NULL);
45 if (error)
46 return error;
47
48 return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, buffer, size); 42 return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, buffer, size);
49} 43}
50 44
@@ -52,18 +46,10 @@ static int
52ext2_xattr_user_set(struct inode *inode, const char *name, 46ext2_xattr_user_set(struct inode *inode, const char *name,
53 const void *value, size_t size, int flags) 47 const void *value, size_t size, int flags)
54{ 48{
55 int error;
56
57 if (strcmp(name, "") == 0) 49 if (strcmp(name, "") == 0)
58 return -EINVAL; 50 return -EINVAL;
59 if (!test_opt(inode->i_sb, XATTR_USER)) 51 if (!test_opt(inode->i_sb, XATTR_USER))
60 return -EOPNOTSUPP; 52 return -EOPNOTSUPP;
61 if ( !S_ISREG(inode->i_mode) &&
62 (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
63 return -EPERM;
64 error = permission(inode, MAY_WRITE, NULL);
65 if (error)
66 return error;
67 53
68 return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, 54 return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name,
69 value, size, flags); 55 value, size, flags);
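
The ext2 xattr hunks delete per-filesystem permission checks (IS_RDONLY, IS_IMMUTABLE/IS_APPEND, CAP_SYS_ADMIN for the trusted namespace, permission() for the user namespace); the same removals repeat for ext3 further down. The implication is that these checks are now made once at the VFS layer before the handler is called. A hedged sketch of such a centralized gate (the actual fs/xattr.c code is not part of this diff; the helper name is hypothetical):

	/* hypothetical helper mirroring the checks removed from each handler */
	static int xattr_may_write(struct inode *inode)
	{
		if (IS_RDONLY(inode))
			return -EROFS;
		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
			return -EPERM;
		return 0;
	}
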
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 3ac38266fc9..47a9da2dfb4 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -7,6 +7,7 @@
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/capability.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/ext3_jbd.h> 12#include <linux/ext3_jbd.h>
12#include <linux/ext3_fs.h> 13#include <linux/ext3_fs.h>
@@ -152,7 +153,7 @@ ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
152/* 153/*
153 * Inode operation get_posix_acl(). 154 * Inode operation get_posix_acl().
154 * 155 *
155 * inode->i_sem: don't care 156 * inode->i_mutex: don't care
156 */ 157 */
157static struct posix_acl * 158static struct posix_acl *
158ext3_get_acl(struct inode *inode, int type) 159ext3_get_acl(struct inode *inode, int type)
@@ -216,7 +217,7 @@ ext3_get_acl(struct inode *inode, int type)
216/* 217/*
217 * Set the access or default ACL of an inode. 218 * Set the access or default ACL of an inode.
218 * 219 *
219 * inode->i_sem: down unless called from ext3_new_inode 220 * inode->i_mutex: down unless called from ext3_new_inode
220 */ 221 */
221static int 222static int
222ext3_set_acl(handle_t *handle, struct inode *inode, int type, 223ext3_set_acl(handle_t *handle, struct inode *inode, int type,
@@ -306,8 +307,8 @@ ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
306/* 307/*
307 * Initialize the ACLs of a new inode. Called from ext3_new_inode. 308 * Initialize the ACLs of a new inode. Called from ext3_new_inode.
308 * 309 *
309 * dir->i_sem: down 310 * dir->i_mutex: down
310 * inode->i_sem: up (access to inode is still exclusive) 311 * inode->i_mutex: up (access to inode is still exclusive)
311 */ 312 */
312int 313int
313ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) 314ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
@@ -368,7 +369,7 @@ cleanup:
368 * for directories) are added. There are no more bits available in the 369 * for directories) are added. There are no more bits available in the
369 * file mode. 370 * file mode.
370 * 371 *
371 * inode->i_sem: down 372 * inode->i_mutex: down
372 */ 373 */
373int 374int
374ext3_acl_chmod(struct inode *inode) 375ext3_acl_chmod(struct inode *inode)
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index ae1148c24c5..6250fcdf14a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/config.h> 14#include <linux/config.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/capability.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17#include <linux/jbd.h> 18#include <linux/jbd.h>
18#include <linux/ext3_fs.h> 19#include <linux/ext3_fs.h>
@@ -20,8 +21,6 @@
20#include <linux/quotaops.h> 21#include <linux/quotaops.h>
21#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
22 23
23#include "bitmap.h"
24
25/* 24/*
26 * balloc.c contains the blocks allocation and deallocation routines 25 * balloc.c contains the blocks allocation and deallocation routines
27 */ 26 */
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
index 5b4ba3e246e..cb16b4c5d5d 100644
--- a/fs/ext3/bitmap.c
+++ b/fs/ext3/bitmap.c
@@ -7,8 +7,11 @@
7 * Universite Pierre et Marie Curie (Paris VI) 7 * Universite Pierre et Marie Curie (Paris VI)
8 */ 8 */
9 9
10#ifdef EXT3FS_DEBUG
11
10#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
11#include "bitmap.h" 13
14#include "ext3_fs.h"
12 15
13static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
14 17
@@ -24,3 +27,6 @@ unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
24 nibblemap[(map->b_data[i] >> 4) & 0xf]; 27 nibblemap[(map->b_data[i] >> 4) & 0xf];
25 return (sum); 28 return (sum);
26} 29}
30
31#endif /* EXT3FS_DEBUG */
32
diff --git a/fs/ext3/bitmap.h b/fs/ext3/bitmap.h
deleted file mode 100644
index 6ee503a6bb4..00000000000
--- a/fs/ext3/bitmap.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/* linux/fs/ext3/bitmap.c
2 *
3 * Copyright (C) 2005 Simtec Electronics
4 * Ben Dooks <ben@simtec.co.uk>
5 *
6*/
7
8extern unsigned long ext3_count_free (struct buffer_head *, unsigned int );
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 9e4a2437621..dc826464f31 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -26,7 +26,6 @@
26 26
27#include <asm/byteorder.h> 27#include <asm/byteorder.h>
28 28
29#include "bitmap.h"
30#include "xattr.h" 29#include "xattr.h"
31#include "acl.h" 30#include "acl.h"
32 31
@@ -651,7 +650,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
651 /* Error cases - e2fsck has already cleaned up for us */ 650 /* Error cases - e2fsck has already cleaned up for us */
652 if (ino > max_ino) { 651 if (ino > max_ino) {
653 ext3_warning(sb, __FUNCTION__, 652 ext3_warning(sb, __FUNCTION__,
654 "bad orphan ino %lu! e2fsck was run?\n", ino); 653 "bad orphan ino %lu! e2fsck was run?", ino);
655 goto out; 654 goto out;
656 } 655 }
657 656
@@ -660,7 +659,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
660 bitmap_bh = read_inode_bitmap(sb, block_group); 659 bitmap_bh = read_inode_bitmap(sb, block_group);
661 if (!bitmap_bh) { 660 if (!bitmap_bh) {
662 ext3_warning(sb, __FUNCTION__, 661 ext3_warning(sb, __FUNCTION__,
663 "inode bitmap error for orphan %lu\n", ino); 662 "inode bitmap error for orphan %lu", ino);
664 goto out; 663 goto out;
665 } 664 }
666 665
@@ -672,7 +671,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
672 !(inode = iget(sb, ino)) || is_bad_inode(inode) || 671 !(inode = iget(sb, ino)) || is_bad_inode(inode) ||
673 NEXT_ORPHAN(inode) > max_ino) { 672 NEXT_ORPHAN(inode) > max_ino) {
674 ext3_warning(sb, __FUNCTION__, 673 ext3_warning(sb, __FUNCTION__,
675 "bad orphan inode %lu! e2fsck was run?\n", ino); 674 "bad orphan inode %lu! e2fsck was run?", ino);
676 printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", 675 printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
677 bit, (unsigned long long)bitmap_bh->b_blocknr, 676 bit, (unsigned long long)bitmap_bh->b_blocknr,
678 ext3_test_bit(bit, bitmap_bh->b_data)); 677 ext3_test_bit(bit, bitmap_bh->b_data));
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5d9b00e2883..8824e84f8a5 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1384,8 +1384,10 @@ static int ext3_journalled_writepage(struct page *page,
1384 ClearPageChecked(page); 1384 ClearPageChecked(page);
1385 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 1385 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1386 ext3_get_block); 1386 ext3_get_block);
1387 if (ret != 0) 1387 if (ret != 0) {
1388 ext3_journal_stop(handle);
1388 goto out_unlock; 1389 goto out_unlock;
1390 }
1389 ret = walk_page_buffers(handle, page_buffers(page), 0, 1391 ret = walk_page_buffers(handle, page_buffers(page), 0,
1390 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 1392 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1391 1393
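
The ext3_journalled_writepage() fix plugs a journal-handle leak: the early exit on block_prepare_write() failure skipped ext3_journal_stop(), leaving the transaction open. The general shape of the rule, with do_work() as a placeholder for the real steps:

	handle = ext3_journal_start(inode, needed_blocks);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	ret = do_work(page);			/* placeholder */
	if (ret) {
		ext3_journal_stop(handle);	/* every exit path stops it */
		goto out_unlock;
	}
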
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 706d6860838..556cd551007 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/jbd.h> 11#include <linux/jbd.h>
12#include <linux/capability.h>
12#include <linux/ext3_fs.h> 13#include <linux/ext3_fs.h>
13#include <linux/ext3_jbd.h> 14#include <linux/ext3_jbd.h>
14#include <linux/time.h> 15#include <linux/time.h>
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b3c690a3b54..8bd8ac07770 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1005,10 +1005,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1005 if (!inode) 1005 if (!inode)
1006 return ERR_PTR(-EACCES); 1006 return ERR_PTR(-EACCES);
1007 } 1007 }
1008 if (inode) 1008 return d_splice_alias(inode, dentry);
1009 return d_splice_alias(inode, dentry);
1010 d_add(dentry, inode);
1011 return NULL;
1012} 1009}
1013 1010
1014 1011
@@ -1476,7 +1473,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1476 if (levels && (dx_get_count(frames->entries) == 1473 if (levels && (dx_get_count(frames->entries) ==
1477 dx_get_limit(frames->entries))) { 1474 dx_get_limit(frames->entries))) {
1478 ext3_warning(sb, __FUNCTION__, 1475 ext3_warning(sb, __FUNCTION__,
1479 "Directory index full!\n"); 1476 "Directory index full!");
1480 err = -ENOSPC; 1477 err = -ENOSPC;
1481 goto cleanup; 1478 goto cleanup;
1482 } 1479 }
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 1be78b4b4de..1041dab6de2 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -31,7 +31,7 @@ static int verify_group_input(struct super_block *sb,
31 unsigned start = le32_to_cpu(es->s_blocks_count); 31 unsigned start = le32_to_cpu(es->s_blocks_count);
32 unsigned end = start + input->blocks_count; 32 unsigned end = start + input->blocks_count;
33 unsigned group = input->group; 33 unsigned group = input->group;
34 unsigned itend = input->inode_table + EXT3_SB(sb)->s_itb_per_group; 34 unsigned itend = input->inode_table + sbi->s_itb_per_group;
35 unsigned overhead = ext3_bg_has_super(sb, group) ? 35 unsigned overhead = ext3_bg_has_super(sb, group) ?
36 (1 + ext3_bg_num_gdb(sb, group) + 36 (1 + ext3_bg_num_gdb(sb, group) +
37 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; 37 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
@@ -340,7 +340,7 @@ static int verify_reserved_gdb(struct super_block *sb,
340 while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { 340 while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
341 if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ 341 if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
342 ext3_warning(sb, __FUNCTION__, 342 ext3_warning(sb, __FUNCTION__,
343 "reserved GDT %ld missing grp %d (%ld)\n", 343 "reserved GDT %ld missing grp %d (%ld)",
344 blk, grp, 344 blk, grp,
345 grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); 345 grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
346 return -EINVAL; 346 return -EINVAL;
@@ -393,7 +393,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
393 if (EXT3_SB(sb)->s_sbh->b_blocknr != 393 if (EXT3_SB(sb)->s_sbh->b_blocknr !=
394 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { 394 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) {
395 ext3_warning(sb, __FUNCTION__, 395 ext3_warning(sb, __FUNCTION__,
396 "won't resize using backup superblock at %llu\n", 396 "won't resize using backup superblock at %llu",
397 (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); 397 (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr);
398 return -EPERM; 398 return -EPERM;
399 } 399 }
@@ -417,7 +417,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
417 data = (__u32 *)dind->b_data; 417 data = (__u32 *)dind->b_data;
418 if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { 418 if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
419 ext3_warning(sb, __FUNCTION__, 419 ext3_warning(sb, __FUNCTION__,
420 "new group %u GDT block %lu not reserved\n", 420 "new group %u GDT block %lu not reserved",
421 input->group, gdblock); 421 input->group, gdblock);
422 err = -EINVAL; 422 err = -EINVAL;
423 goto exit_dind; 423 goto exit_dind;
@@ -540,7 +540,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
540 for (res = 0; res < reserved_gdb; res++, blk++) { 540 for (res = 0; res < reserved_gdb; res++, blk++) {
541 if (le32_to_cpu(*data) != blk) { 541 if (le32_to_cpu(*data) != blk) {
542 ext3_warning(sb, __FUNCTION__, 542 ext3_warning(sb, __FUNCTION__,
543 "reserved block %lu not at offset %ld\n", 543 "reserved block %lu not at offset %ld",
544 blk, (long)(data - (__u32 *)dind->b_data)); 544 blk, (long)(data - (__u32 *)dind->b_data));
545 err = -EINVAL; 545 err = -EINVAL;
546 goto exit_bh; 546 goto exit_bh;
@@ -683,7 +683,7 @@ exit_err:
683 if (err) { 683 if (err) {
684 ext3_warning(sb, __FUNCTION__, 684 ext3_warning(sb, __FUNCTION__,
685 "can't update backup for group %d (err %d), " 685 "can't update backup for group %d (err %d), "
686 "forcing fsck on next reboot\n", group, err); 686 "forcing fsck on next reboot", group, err);
687 sbi->s_mount_state &= ~EXT3_VALID_FS; 687 sbi->s_mount_state &= ~EXT3_VALID_FS;
688 sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS); 688 sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS);
689 mark_buffer_dirty(sbi->s_sbh); 689 mark_buffer_dirty(sbi->s_sbh);
@@ -722,7 +722,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
722 if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, 722 if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb,
723 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { 723 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
724 ext3_warning(sb, __FUNCTION__, 724 ext3_warning(sb, __FUNCTION__,
725 "Can't resize non-sparse filesystem further\n"); 725 "Can't resize non-sparse filesystem further");
726 return -EPERM; 726 return -EPERM;
727 } 727 }
728 728
@@ -730,13 +730,13 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
730 if (!EXT3_HAS_COMPAT_FEATURE(sb, 730 if (!EXT3_HAS_COMPAT_FEATURE(sb,
731 EXT3_FEATURE_COMPAT_RESIZE_INODE)){ 731 EXT3_FEATURE_COMPAT_RESIZE_INODE)){
732 ext3_warning(sb, __FUNCTION__, 732 ext3_warning(sb, __FUNCTION__,
733 "No reserved GDT blocks, can't resize\n"); 733 "No reserved GDT blocks, can't resize");
734 return -EPERM; 734 return -EPERM;
735 } 735 }
736 inode = iget(sb, EXT3_RESIZE_INO); 736 inode = iget(sb, EXT3_RESIZE_INO);
737 if (!inode || is_bad_inode(inode)) { 737 if (!inode || is_bad_inode(inode)) {
738 ext3_warning(sb, __FUNCTION__, 738 ext3_warning(sb, __FUNCTION__,
739 "Error opening resize inode\n"); 739 "Error opening resize inode");
740 iput(inode); 740 iput(inode);
741 return -ENOENT; 741 return -ENOENT;
742 } 742 }
@@ -764,9 +764,10 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
764 } 764 }
765 765
766 lock_super(sb); 766 lock_super(sb);
767 if (input->group != EXT3_SB(sb)->s_groups_count) { 767 if (input->group != sbi->s_groups_count) {
768 ext3_warning(sb, __FUNCTION__, 768 ext3_warning(sb, __FUNCTION__,
769 "multiple resizers run on filesystem!\n"); 769 "multiple resizers run on filesystem!");
770 err = -EBUSY;
770 goto exit_journal; 771 goto exit_journal;
771 } 772 }
772 773
@@ -798,7 +799,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
798 * data. So we need to be careful to set all of the relevant 799 * data. So we need to be careful to set all of the relevant
799 * group descriptor data etc. *before* we enable the group. 800 * group descriptor data etc. *before* we enable the group.
800 * 801 *
801 * The key field here is EXT3_SB(sb)->s_groups_count: as long as 802 * The key field here is sbi->s_groups_count: as long as
802 * that retains its old value, nobody is going to access the new 803 * that retains its old value, nobody is going to access the new
803 * group. 804 * group.
804 * 805 *
@@ -858,7 +859,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
858 smp_wmb(); 859 smp_wmb();
859 860
860 /* Update the global fs size fields */ 861 /* Update the global fs size fields */
861 EXT3_SB(sb)->s_groups_count++; 862 sbi->s_groups_count++;
862 863
863 ext3_journal_dirty_metadata(handle, primary); 864 ext3_journal_dirty_metadata(handle, primary);
864 865
@@ -873,7 +874,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
873 percpu_counter_mod(&sbi->s_freeinodes_counter, 874 percpu_counter_mod(&sbi->s_freeinodes_counter,
874 EXT3_INODES_PER_GROUP(sb)); 875 EXT3_INODES_PER_GROUP(sb));
875 876
876 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); 877 ext3_journal_dirty_metadata(handle, sbi->s_sbh);
877 sb->s_dirt = 1; 878 sb->s_dirt = 1;
878 879
879exit_journal: 880exit_journal:
@@ -936,7 +937,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
936 937
937 if (last == 0) { 938 if (last == 0) {
938 ext3_warning(sb, __FUNCTION__, 939 ext3_warning(sb, __FUNCTION__,
939 "need to use ext2online to resize further\n"); 940 "need to use ext2online to resize further");
940 return -EPERM; 941 return -EPERM;
941 } 942 }
942 943
@@ -972,7 +973,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
972 lock_super(sb); 973 lock_super(sb);
973 if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { 974 if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
974 ext3_warning(sb, __FUNCTION__, 975 ext3_warning(sb, __FUNCTION__,
975 "multiple resizers run on filesystem!\n"); 976 "multiple resizers run on filesystem!");
976 err = -EBUSY; 977 err = -EBUSY;
977 goto exit_put; 978 goto exit_put;
978 } 979 }
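
Two things happen in the resize.c hunks: repeated EXT3_SB(sb) lookups collapse onto the local sbi, and the "multiple resizers" bail-out in ext3_group_add() gains err = -EBUSY. Before that line, err still held 0 at the goto, so a racing resizer fell through exit_journal reporting success. Sketch of the fixed bail-out:

	lock_super(sb);
	if (input->group != sbi->s_groups_count) {
		ext3_warning(sb, __FUNCTION__,
			     "multiple resizers run on filesystem!");
		err = -EBUSY;		/* previously left at 0 */
		goto exit_journal;
	}
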
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 4e6730622d9..56bf7658601 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -43,7 +43,8 @@
43#include "acl.h" 43#include "acl.h"
44#include "namei.h" 44#include "namei.h"
45 45
46static int ext3_load_journal(struct super_block *, struct ext3_super_block *); 46static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
47 unsigned long journal_devnum);
47static int ext3_create_journal(struct super_block *, struct ext3_super_block *, 48static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
48 int); 49 int);
49static void ext3_commit_super (struct super_block * sb, 50static void ext3_commit_super (struct super_block * sb,
@@ -628,7 +629,7 @@ enum {
628 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, 629 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
629 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 630 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
630 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, 631 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh,
631 Opt_commit, Opt_journal_update, Opt_journal_inum, 632 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
632 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 633 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
633 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 634 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
634 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 635 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
@@ -666,6 +667,7 @@ static match_table_t tokens = {
666 {Opt_commit, "commit=%u"}, 667 {Opt_commit, "commit=%u"},
667 {Opt_journal_update, "journal=update"}, 668 {Opt_journal_update, "journal=update"},
668 {Opt_journal_inum, "journal=%u"}, 669 {Opt_journal_inum, "journal=%u"},
670 {Opt_journal_dev, "journal_dev=%u"},
669 {Opt_abort, "abort"}, 671 {Opt_abort, "abort"},
670 {Opt_data_journal, "data=journal"}, 672 {Opt_data_journal, "data=journal"},
671 {Opt_data_ordered, "data=ordered"}, 673 {Opt_data_ordered, "data=ordered"},
@@ -705,8 +707,9 @@ static unsigned long get_sb_block(void **data)
705 return sb_block; 707 return sb_block;
706} 708}
707 709
708static int parse_options (char * options, struct super_block *sb, 710static int parse_options (char *options, struct super_block *sb,
709 unsigned long * inum, unsigned long *n_blocks_count, int is_remount) 711 unsigned long *inum, unsigned long *journal_devnum,
712 unsigned long *n_blocks_count, int is_remount)
710{ 713{
711 struct ext3_sb_info *sbi = EXT3_SB(sb); 714 struct ext3_sb_info *sbi = EXT3_SB(sb);
712 char * p; 715 char * p;
@@ -839,6 +842,16 @@ static int parse_options (char * options, struct super_block *sb,
839 return 0; 842 return 0;
840 *inum = option; 843 *inum = option;
841 break; 844 break;
845 case Opt_journal_dev:
846 if (is_remount) {
847 printk(KERN_ERR "EXT3-fs: cannot specify "
848 "journal on remount\n");
849 return 0;
850 }
851 if (match_int(&args[0], &option))
852 return 0;
853 *journal_devnum = option;
854 break;
842 case Opt_noload: 855 case Opt_noload:
843 set_opt (sbi->s_mount_opt, NOLOAD); 856 set_opt (sbi->s_mount_opt, NOLOAD);
844 break; 857 break;
@@ -1331,6 +1344,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1331 unsigned long logic_sb_block; 1344 unsigned long logic_sb_block;
1332 unsigned long offset = 0; 1345 unsigned long offset = 0;
1333 unsigned long journal_inum = 0; 1346 unsigned long journal_inum = 0;
1347 unsigned long journal_devnum = 0;
1334 unsigned long def_mount_opts; 1348 unsigned long def_mount_opts;
1335 struct inode *root; 1349 struct inode *root;
1336 int blocksize; 1350 int blocksize;
@@ -1411,7 +1425,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1411 1425
1412 set_opt(sbi->s_mount_opt, RESERVATION); 1426 set_opt(sbi->s_mount_opt, RESERVATION);
1413 1427
1414 if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) 1428 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1429 NULL, 0))
1415 goto failed_mount; 1430 goto failed_mount;
1416 1431
1417 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 1432 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -1622,7 +1637,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1622 */ 1637 */
1623 if (!test_opt(sb, NOLOAD) && 1638 if (!test_opt(sb, NOLOAD) &&
1624 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { 1639 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
1625 if (ext3_load_journal(sb, es)) 1640 if (ext3_load_journal(sb, es, journal_devnum))
1626 goto failed_mount2; 1641 goto failed_mount2;
1627 } else if (journal_inum) { 1642 } else if (journal_inum) {
1628 if (ext3_create_journal(sb, es, journal_inum)) 1643 if (ext3_create_journal(sb, es, journal_inum))
@@ -1902,15 +1917,24 @@ out_bdev:
1902 return NULL; 1917 return NULL;
1903} 1918}
1904 1919
1905static int ext3_load_journal(struct super_block * sb, 1920static int ext3_load_journal(struct super_block *sb,
1906 struct ext3_super_block * es) 1921 struct ext3_super_block *es,
1922 unsigned long journal_devnum)
1907{ 1923{
1908 journal_t *journal; 1924 journal_t *journal;
1909 int journal_inum = le32_to_cpu(es->s_journal_inum); 1925 int journal_inum = le32_to_cpu(es->s_journal_inum);
1910 dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); 1926 dev_t journal_dev;
1911 int err = 0; 1927 int err = 0;
1912 int really_read_only; 1928 int really_read_only;
1913 1929
1930 if (journal_devnum &&
1931 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
1932 printk(KERN_INFO "EXT3-fs: external journal device major/minor "
1933 "numbers have changed\n");
1934 journal_dev = new_decode_dev(journal_devnum);
1935 } else
1936 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
1937
1914 really_read_only = bdev_read_only(sb->s_bdev); 1938 really_read_only = bdev_read_only(sb->s_bdev);
1915 1939
1916 /* 1940 /*
@@ -1969,6 +1993,16 @@ static int ext3_load_journal(struct super_block * sb,
1969 1993
1970 EXT3_SB(sb)->s_journal = journal; 1994 EXT3_SB(sb)->s_journal = journal;
1971 ext3_clear_journal_err(sb, es); 1995 ext3_clear_journal_err(sb, es);
1996
1997 if (journal_devnum &&
1998 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
1999 es->s_journal_dev = cpu_to_le32(journal_devnum);
2000 sb->s_dirt = 1;
2001
2002 /* Make sure we flush the recovery flag to disk. */
2003 ext3_commit_super(sb, es, 1);
2004 }
2005
1972 return 0; 2006 return 0;
1973} 2007}
1974 2008
@@ -2116,7 +2150,7 @@ int ext3_force_commit(struct super_block *sb)
2116 2150
2117static void ext3_write_super (struct super_block * sb) 2151static void ext3_write_super (struct super_block * sb)
2118{ 2152{
2119 if (down_trylock(&sb->s_lock) == 0) 2153 if (mutex_trylock(&sb->s_lock) != 0)
2120 BUG(); 2154 BUG();
2121 sb->s_dirt = 0; 2155 sb->s_dirt = 0;
2122} 2156}
@@ -2197,7 +2231,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2197 /* 2231 /*
2198 * Allow the "check" option to be passed as a remount option. 2232 * Allow the "check" option to be passed as a remount option.
2199 */ 2233 */
2200 if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) { 2234 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
2201 err = -EINVAL; 2235 err = -EINVAL;
2202 goto restore_opts; 2236 goto restore_opts;
2203 } 2237 }
@@ -2567,7 +2601,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
2567 struct buffer_head *bh; 2601 struct buffer_head *bh;
2568 handle_t *handle = journal_current_handle(); 2602 handle_t *handle = journal_current_handle();
2569 2603
2570 down(&inode->i_sem); 2604 mutex_lock(&inode->i_mutex);
2571 while (towrite > 0) { 2605 while (towrite > 0) {
2572 tocopy = sb->s_blocksize - offset < towrite ? 2606 tocopy = sb->s_blocksize - offset < towrite ?
2573 sb->s_blocksize - offset : towrite; 2607 sb->s_blocksize - offset : towrite;
@@ -2610,7 +2644,7 @@ out:
2610 inode->i_version++; 2644 inode->i_version++;
2611 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2645 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2612 ext3_mark_inode_dirty(handle, inode); 2646 ext3_mark_inode_dirty(handle, inode);
2613 up(&inode->i_sem); 2647 mutex_unlock(&inode->i_mutex);
2614 return len - towrite; 2648 return len - towrite;
2615} 2649}
2616 2650
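
The new journal_dev=%u mount option lets ext3 find an external journal whose device number changed since it was recorded in the superblock; on a successful load the hunk above rewrites es->s_journal_dev and commits the superblock, so later mounts need no option. The value is a kernel dev_t encoding decoded by new_decode_dev(); for small major/minor pairs it is simply (major << 8) | minor. Sketch of computing it for a device at major 8, minor 17 (illustrative numbers):

	unsigned int devnum = (8 << 8) | 17;	/* -> 2065 */
	/* usage sketch: mount -t ext3 -o journal_dev=2065 /dev/sda1 /mnt */
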
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 430de9f63be..e8d60bf6b7d 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -140,7 +140,7 @@ ext3_xattr_handler(int name_index)
140/* 140/*
141 * Inode operation listxattr() 141 * Inode operation listxattr()
142 * 142 *
143 * dentry->d_inode->i_sem: don't care 143 * dentry->d_inode->i_mutex: don't care
144 */ 144 */
145ssize_t 145ssize_t
146ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) 146ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -946,10 +946,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
946 }; 946 };
947 int error; 947 int error;
948 948
949 if (IS_RDONLY(inode))
950 return -EROFS;
951 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
952 return -EPERM;
953 if (!name) 949 if (!name)
954 return -EINVAL; 950 return -EINVAL;
955 if (strlen(name) > 255) 951 if (strlen(name) > 255)
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index f68bfd1cf51..86d91f1186d 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/string.h> 9#include <linux/string.h>
10#include <linux/capability.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
12#include <linux/ext3_jbd.h> 13#include <linux/ext3_jbd.h>
@@ -39,8 +40,6 @@ ext3_xattr_trusted_get(struct inode *inode, const char *name,
39{ 40{
40 if (strcmp(name, "") == 0) 41 if (strcmp(name, "") == 0)
41 return -EINVAL; 42 return -EINVAL;
42 if (!capable(CAP_SYS_ADMIN))
43 return -EPERM;
44 return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name, 43 return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name,
45 buffer, size); 44 buffer, size);
46} 45}
@@ -51,8 +50,6 @@ ext3_xattr_trusted_set(struct inode *inode, const char *name,
51{ 50{
52 if (strcmp(name, "") == 0) 51 if (strcmp(name, "") == 0)
53 return -EINVAL; 52 return -EINVAL;
54 if (!capable(CAP_SYS_ADMIN))
55 return -EPERM;
56 return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name, 53 return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name,
57 value, size, flags); 54 value, size, flags);
58} 55}
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index e907cae7a07..a85a0a17c4f 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -37,16 +37,10 @@ static int
37ext3_xattr_user_get(struct inode *inode, const char *name, 37ext3_xattr_user_get(struct inode *inode, const char *name,
38 void *buffer, size_t size) 38 void *buffer, size_t size)
39{ 39{
40 int error;
41
42 if (strcmp(name, "") == 0) 40 if (strcmp(name, "") == 0)
43 return -EINVAL; 41 return -EINVAL;
44 if (!test_opt(inode->i_sb, XATTR_USER)) 42 if (!test_opt(inode->i_sb, XATTR_USER))
45 return -EOPNOTSUPP; 43 return -EOPNOTSUPP;
46 error = permission(inode, MAY_READ, NULL);
47 if (error)
48 return error;
49
50 return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size); 44 return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size);
51} 45}
52 46
@@ -54,19 +48,10 @@ static int
54ext3_xattr_user_set(struct inode *inode, const char *name, 48ext3_xattr_user_set(struct inode *inode, const char *name,
55 const void *value, size_t size, int flags) 49 const void *value, size_t size, int flags)
56{ 50{
57 int error;
58
59 if (strcmp(name, "") == 0) 51 if (strcmp(name, "") == 0)
60 return -EINVAL; 52 return -EINVAL;
61 if (!test_opt(inode->i_sb, XATTR_USER)) 53 if (!test_opt(inode->i_sb, XATTR_USER))
62 return -EOPNOTSUPP; 54 return -EOPNOTSUPP;
63 if ( !S_ISREG(inode->i_mode) &&
64 (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
65 return -EPERM;
66 error = permission(inode, MAY_WRITE, NULL);
67 if (error)
68 return error;
69
70 return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name, 55 return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name,
71 value, size, flags); 56 value, size, flags);
72} 57}
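
All three ext3 xattr hunks above delete per-filesystem permission checks: the read-only and immutable tests in ext3_xattr_set_handle(), the CAP_SYS_ADMIN gate on the trusted namespace, and the permission(MAY_READ/MAY_WRITE) calls on the user namespace. The apparent intent is that these checks are now performed once in the generic VFS xattr entry points rather than duplicated in every filesystem; the following is only an illustrative reconstruction of such a shared gate (name hypothetical), not code from this diff:

/* Hypothetical sketch of a VFS-level write gate, mirroring the
 * checks removed from the ext3 handlers above. */
static int xattr_write_permission_sketch(struct inode *inode)
{
        if (IS_RDONLY(inode))
                return -EROFS;
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
                return -EPERM;
        return permission(inode, MAY_WRITE, NULL);
}
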
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 77c24fcf712..1acc941245f 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -295,7 +295,8 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
295 return dclus; 295 return dclus;
296} 296}
297 297
298int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys) 298int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
299 unsigned long *mapped_blocks)
299{ 300{
300 struct super_block *sb = inode->i_sb; 301 struct super_block *sb = inode->i_sb;
301 struct msdos_sb_info *sbi = MSDOS_SB(sb); 302 struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -303,9 +304,12 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys)
303 int cluster, offset; 304 int cluster, offset;
304 305
305 *phys = 0; 306 *phys = 0;
307 *mapped_blocks = 0;
306 if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) { 308 if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) {
307 if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) 309 if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) {
308 *phys = sector + sbi->dir_start; 310 *phys = sector + sbi->dir_start;
311 *mapped_blocks = 1;
312 }
309 return 0; 313 return 0;
310 } 314 }
311 last_block = (MSDOS_I(inode)->mmu_private + (sb->s_blocksize - 1)) 315 last_block = (MSDOS_I(inode)->mmu_private + (sb->s_blocksize - 1))
@@ -318,7 +322,11 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys)
318 cluster = fat_bmap_cluster(inode, cluster); 322 cluster = fat_bmap_cluster(inode, cluster);
319 if (cluster < 0) 323 if (cluster < 0)
320 return cluster; 324 return cluster;
321 else if (cluster) 325 else if (cluster) {
322 *phys = fat_clus_to_blknr(sbi, cluster) + offset; 326 *phys = fat_clus_to_blknr(sbi, cluster) + offset;
327 *mapped_blocks = sbi->sec_per_clus - offset;
328 if (*mapped_blocks > last_block - sector)
329 *mapped_blocks = last_block - sector;
330 }
323 return 0; 331 return 0;
324} 332}
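
fat_bmap() now reports, through the new mapped_blocks out-parameter, how many blocks are contiguously mapped starting at the requested sector: one block for the fixed FAT12/16 root directory, and the remainder of the current cluster (clamped to the file's last block) otherwise. A hedged example of a caller using the new signature:

/* Sketch: probe the length of the contiguous extent at iblock.
 * Returns 0 with *len set, -ENXIO for a hole or EOF, or an error. */
static int fat_extent_len_sketch(struct inode *inode, sector_t iblock,
                                 unsigned long *len)
{
        sector_t phys;
        int err = fat_bmap(inode, iblock, &phys, len);

        if (err)
                return err;
        return phys ? 0 : -ENXIO;
}
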
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index ba824964b9b..db0de5c621c 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -45,8 +45,8 @@ static inline void fat_dir_readahead(struct inode *dir, sector_t iblock,
45 if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO)) 45 if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO))
46 return; 46 return;
47 47
48 bh = sb_getblk(sb, phys); 48 bh = sb_find_get_block(sb, phys);
49 if (bh && !buffer_uptodate(bh)) { 49 if (bh == NULL || !buffer_uptodate(bh)) {
50 for (sec = 0; sec < sbi->sec_per_clus; sec++) 50 for (sec = 0; sec < sbi->sec_per_clus; sec++)
51 sb_breadahead(sb, phys + sec); 51 sb_breadahead(sb, phys + sec);
52 } 52 }
@@ -68,8 +68,8 @@ static int fat__get_entry(struct inode *dir, loff_t *pos,
68{ 68{
69 struct super_block *sb = dir->i_sb; 69 struct super_block *sb = dir->i_sb;
70 sector_t phys, iblock; 70 sector_t phys, iblock;
71 int offset; 71 unsigned long mapped_blocks;
72 int err; 72 int err, offset;
73 73
74next: 74next:
75 if (*bh) 75 if (*bh)
@@ -77,7 +77,7 @@ next:
77 77
78 *bh = NULL; 78 *bh = NULL;
79 iblock = *pos >> sb->s_blocksize_bits; 79 iblock = *pos >> sb->s_blocksize_bits;
80 err = fat_bmap(dir, iblock, &phys); 80 err = fat_bmap(dir, iblock, &phys, &mapped_blocks);
81 if (err || !phys) 81 if (err || !phys)
82 return -1; /* beyond EOF or error */ 82 return -1; /* beyond EOF or error */
83 83
@@ -418,7 +418,7 @@ EODir:
418 return err; 418 return err;
419} 419}
420 420
421EXPORT_SYMBOL(fat_search_long); 421EXPORT_SYMBOL_GPL(fat_search_long);
422 422
423struct fat_ioctl_filldir_callback { 423struct fat_ioctl_filldir_callback {
424 struct dirent __user *dirent; 424 struct dirent __user *dirent;
@@ -729,13 +729,13 @@ static int fat_dir_ioctl(struct inode * inode, struct file * filp,
729 729
730 buf.dirent = d1; 730 buf.dirent = d1;
731 buf.result = 0; 731 buf.result = 0;
732 down(&inode->i_sem); 732 mutex_lock(&inode->i_mutex);
733 ret = -ENOENT; 733 ret = -ENOENT;
734 if (!IS_DEADDIR(inode)) { 734 if (!IS_DEADDIR(inode)) {
735 ret = __fat_readdir(inode, filp, &buf, fat_ioctl_filldir, 735 ret = __fat_readdir(inode, filp, &buf, fat_ioctl_filldir,
736 short_only, both); 736 short_only, both);
737 } 737 }
738 up(&inode->i_sem); 738 mutex_unlock(&inode->i_mutex);
739 if (ret >= 0) 739 if (ret >= 0)
740 ret = buf.result; 740 ret = buf.result;
741 return ret; 741 return ret;
@@ -780,7 +780,7 @@ int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
780 return -ENOENT; 780 return -ENOENT;
781} 781}
782 782
783EXPORT_SYMBOL(fat_get_dotdot_entry); 783EXPORT_SYMBOL_GPL(fat_get_dotdot_entry);
784 784
785/* See if directory is empty */ 785/* See if directory is empty */
786int fat_dir_empty(struct inode *dir) 786int fat_dir_empty(struct inode *dir)
@@ -803,7 +803,7 @@ int fat_dir_empty(struct inode *dir)
803 return result; 803 return result;
804} 804}
805 805
806EXPORT_SYMBOL(fat_dir_empty); 806EXPORT_SYMBOL_GPL(fat_dir_empty);
807 807
808/* 808/*
809 * fat_subdirs counts the number of sub-directories of dir. It can be run 809 * fat_subdirs counts the number of sub-directories of dir. It can be run
@@ -849,7 +849,7 @@ int fat_scan(struct inode *dir, const unsigned char *name,
849 return -ENOENT; 849 return -ENOENT;
850} 850}
851 851
852EXPORT_SYMBOL(fat_scan); 852EXPORT_SYMBOL_GPL(fat_scan);
853 853
854static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) 854static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
855{ 855{
@@ -936,7 +936,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
936 return 0; 936 return 0;
937} 937}
938 938
939EXPORT_SYMBOL(fat_remove_entries); 939EXPORT_SYMBOL_GPL(fat_remove_entries);
940 940
941static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, 941static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
942 struct buffer_head **bhs, int nr_bhs) 942 struct buffer_head **bhs, int nr_bhs)
@@ -1048,7 +1048,7 @@ error:
1048 return err; 1048 return err;
1049} 1049}
1050 1050
1051EXPORT_SYMBOL(fat_alloc_new_dir); 1051EXPORT_SYMBOL_GPL(fat_alloc_new_dir);
1052 1052
1053static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, 1053static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
1054 int *nr_cluster, struct msdos_dir_entry **de, 1054 int *nr_cluster, struct msdos_dir_entry **de,
@@ -1264,4 +1264,4 @@ error_remove:
1264 return err; 1264 return err;
1265} 1265}
1266 1266
1267EXPORT_SYMBOL(fat_add_entries); 1267EXPORT_SYMBOL_GPL(fat_add_entries);
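
The remaining churn in fs/fat/dir.c (and in fatent.c, misc.c and inode.c below) is a blanket conversion of the FAT helper exports from EXPORT_SYMBOL to EXPORT_SYMBOL_GPL, so only GPL-compatible modules such as msdos and vfat may link against them. The pattern is uniform:

int fat_dir_empty(struct inode *dir);
/* was: EXPORT_SYMBOL(fat_dir_empty); */
EXPORT_SYMBOL_GPL(fat_dir_empty);
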
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 4164cd54c4d..a1a9e045121 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -476,6 +476,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster)
476 sbi->prev_free = entry; 476 sbi->prev_free = entry;
477 if (sbi->free_clusters != -1) 477 if (sbi->free_clusters != -1)
478 sbi->free_clusters--; 478 sbi->free_clusters--;
479 sb->s_dirt = 1;
479 480
480 cluster[idx_clus] = entry; 481 cluster[idx_clus] = entry;
481 idx_clus++; 482 idx_clus++;
@@ -496,6 +497,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster)
496 497
497 /* Couldn't allocate the free entries */ 498 /* Couldn't allocate the free entries */
498 sbi->free_clusters = 0; 499 sbi->free_clusters = 0;
500 sb->s_dirt = 1;
499 err = -ENOSPC; 501 err = -ENOSPC;
500 502
501out: 503out:
@@ -509,7 +511,6 @@ out:
509 } 511 }
510 for (i = 0; i < nr_bhs; i++) 512 for (i = 0; i < nr_bhs; i++)
511 brelse(bhs[i]); 513 brelse(bhs[i]);
512 fat_clusters_flush(sb);
513 514
514 if (err && idx_clus) 515 if (err && idx_clus)
515 fat_free_clusters(inode, cluster[0]); 516 fat_free_clusters(inode, cluster[0]);
@@ -542,8 +543,10 @@ int fat_free_clusters(struct inode *inode, int cluster)
542 } 543 }
543 544
544 ops->ent_put(&fatent, FAT_ENT_FREE); 545 ops->ent_put(&fatent, FAT_ENT_FREE);
545 if (sbi->free_clusters != -1) 546 if (sbi->free_clusters != -1) {
546 sbi->free_clusters++; 547 sbi->free_clusters++;
548 sb->s_dirt = 1;
549 }
547 550
548 if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { 551 if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) {
549 if (sb->s_flags & MS_SYNCHRONOUS) { 552 if (sb->s_flags & MS_SYNCHRONOUS) {
@@ -578,7 +581,7 @@ error:
578 return err; 581 return err;
579} 582}
580 583
581EXPORT_SYMBOL(fat_free_clusters); 584EXPORT_SYMBOL_GPL(fat_free_clusters);
582 585
583int fat_count_free_clusters(struct super_block *sb) 586int fat_count_free_clusters(struct super_block *sb)
584{ 587{
@@ -605,6 +608,7 @@ int fat_count_free_clusters(struct super_block *sb)
605 } while (fat_ent_next(sbi, &fatent)); 608 } while (fat_ent_next(sbi, &fatent));
606 } 609 }
607 sbi->free_clusters = free; 610 sbi->free_clusters = free;
611 sb->s_dirt = 1;
608 fatent_brelse(&fatent); 612 fatent_brelse(&fatent);
609out: 613out:
610 unlock_fat(sbi); 614 unlock_fat(sbi);
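
Instead of calling fat_clusters_flush() synchronously on every allocation, the fatent.c hunks above now only mark the superblock dirty; the actual FSINFO write is deferred to the new fat_write_super() added in fs/fat/inode.c further down. Sketched together, the two halves of the pattern:

/* Fast path (allocation): just note that the counters changed. */
static void note_cluster_alloc(struct super_block *sb,
                               struct msdos_sb_info *sbi)
{
        if (sbi->free_clusters != -1)
                sbi->free_clusters--;
        sb->s_dirt = 1;         /* defer the FSINFO update */
}

/* Sync path, as added in the inode.c hunk below: one flush, later. */
static void fat_write_super(struct super_block *sb)
{
        sb->s_dirt = 0;
        if (!(sb->s_flags & MS_RDONLY))
                fat_clusters_flush(sb);
}
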
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 7134403d5be..e99c5a73b39 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -6,11 +6,13 @@
6 * regular file handling primitives for fat-based filesystems 6 * regular file handling primitives for fat-based filesystems
7 */ 7 */
8 8
9#include <linux/capability.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/time.h> 11#include <linux/time.h>
11#include <linux/msdos_fs.h> 12#include <linux/msdos_fs.h>
12#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/writeback.h>
14 16
15int fat_generic_ioctl(struct inode *inode, struct file *filp, 17int fat_generic_ioctl(struct inode *inode, struct file *filp,
16 unsigned int cmd, unsigned long arg) 18 unsigned int cmd, unsigned long arg)
@@ -40,7 +42,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
40 if (err) 42 if (err)
41 return err; 43 return err;
42 44
43 down(&inode->i_sem); 45 mutex_lock(&inode->i_mutex);
44 46
45 if (IS_RDONLY(inode)) { 47 if (IS_RDONLY(inode)) {
46 err = -EROFS; 48 err = -EROFS;
@@ -102,7 +104,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
102 MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED; 104 MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
103 mark_inode_dirty(inode); 105 mark_inode_dirty(inode);
104 up: 106 up:
105 up(&inode->i_sem); 107 mutex_unlock(&inode->i_mutex);
106 return err; 108 return err;
107 } 109 }
108 default: 110 default:
@@ -124,6 +126,24 @@ struct file_operations fat_file_operations = {
124 .sendfile = generic_file_sendfile, 126 .sendfile = generic_file_sendfile,
125}; 127};
126 128
129static int fat_cont_expand(struct inode *inode, loff_t size)
130{
131 struct address_space *mapping = inode->i_mapping;
132 loff_t start = inode->i_size, count = size - inode->i_size;
133 int err;
134
135 err = generic_cont_expand_simple(inode, size);
136 if (err)
137 goto out;
138
139 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
140 mark_inode_dirty(inode);
141 if (IS_SYNC(inode))
142 err = sync_page_range_nolock(inode, mapping, start, count);
143out:
144 return err;
145}
146
127int fat_notify_change(struct dentry *dentry, struct iattr *attr) 147int fat_notify_change(struct dentry *dentry, struct iattr *attr)
128{ 148{
129 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); 149 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
@@ -132,11 +152,17 @@ int fat_notify_change(struct dentry *dentry, struct iattr *attr)
132 152
133 lock_kernel(); 153 lock_kernel();
134 154
135 /* FAT cannot truncate to a longer file */ 155 /*
 156	 * Expand the file. inode_setattr() updates ->i_size before
 157	 * calling ->truncate(), so FAT needs to fill the hole before
 158	 * that happens.
159 */
136 if (attr->ia_valid & ATTR_SIZE) { 160 if (attr->ia_valid & ATTR_SIZE) {
137 if (attr->ia_size > inode->i_size) { 161 if (attr->ia_size > inode->i_size) {
138 error = -EPERM; 162 error = fat_cont_expand(inode, attr->ia_size);
139 goto out; 163 if (error || attr->ia_valid == ATTR_SIZE)
164 goto out;
165 attr->ia_valid &= ~ATTR_SIZE;
140 } 166 }
141 } 167 }
142 168
@@ -173,7 +199,7 @@ out:
173 return error; 199 return error;
174} 200}
175 201
176EXPORT_SYMBOL(fat_notify_change); 202EXPORT_SYMBOL_GPL(fat_notify_change);
177 203
178/* Free all clusters after the skip'th cluster. */ 204/* Free all clusters after the skip'th cluster. */
179static int fat_free(struct inode *inode, int skip) 205static int fat_free(struct inode *inode, int skip)
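
Behaviorally, the fat_notify_change() hunk above lifts an old restriction: truncating a FAT file to a larger size used to fail with -EPERM, and fat_cont_expand() now fills the hole via generic_cont_expand_simple() (syncing the new range when the inode is marked for synchronous I/O). From userspace the change shows up as a plain ftruncate() that starts succeeding; a small illustration:

#include <fcntl.h>
#include <unistd.h>

/* Extend a file; on pre-patch kernels this failed with EPERM on FAT. */
static int grow_file(const char *path, off_t new_size)
{
        int rc, fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        rc = ftruncate(fd, new_size);   /* hole is zero-filled on FAT */
        close(fd);
        return rc;
}
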
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index a0f9b9fe130..e7f4aa7fc68 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -18,10 +18,12 @@
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <linux/msdos_fs.h> 19#include <linux/msdos_fs.h>
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/mpage.h>
21#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
22#include <linux/mount.h> 23#include <linux/mount.h>
23#include <linux/vfs.h> 24#include <linux/vfs.h>
24#include <linux/parser.h> 25#include <linux/parser.h>
26#include <linux/uio.h>
25#include <asm/unaligned.h> 27#include <asm/unaligned.h>
26 28
27#ifndef CONFIG_FAT_DEFAULT_IOCHARSET 29#ifndef CONFIG_FAT_DEFAULT_IOCHARSET
@@ -48,51 +50,97 @@ static int fat_add_cluster(struct inode *inode)
48 return err; 50 return err;
49} 51}
50 52
51static int fat_get_block(struct inode *inode, sector_t iblock, 53static int __fat_get_blocks(struct inode *inode, sector_t iblock,
52 struct buffer_head *bh_result, int create) 54 unsigned long *max_blocks,
55 struct buffer_head *bh_result, int create)
53{ 56{
54 struct super_block *sb = inode->i_sb; 57 struct super_block *sb = inode->i_sb;
58 struct msdos_sb_info *sbi = MSDOS_SB(sb);
55 sector_t phys; 59 sector_t phys;
56 int err; 60 unsigned long mapped_blocks;
61 int err, offset;
57 62
58 err = fat_bmap(inode, iblock, &phys); 63 err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
59 if (err) 64 if (err)
60 return err; 65 return err;
61 if (phys) { 66 if (phys) {
62 map_bh(bh_result, sb, phys); 67 map_bh(bh_result, sb, phys);
68 *max_blocks = min(mapped_blocks, *max_blocks);
63 return 0; 69 return 0;
64 } 70 }
65 if (!create) 71 if (!create)
66 return 0; 72 return 0;
73
67 if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { 74 if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
68 fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)", 75 fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)",
69 MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); 76 MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
70 return -EIO; 77 return -EIO;
71 } 78 }
72 if (!((unsigned long)iblock & (MSDOS_SB(sb)->sec_per_clus - 1))) { 79
80 offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
81 if (!offset) {
82 /* TODO: multiple cluster allocation would be desirable. */
73 err = fat_add_cluster(inode); 83 err = fat_add_cluster(inode);
74 if (err) 84 if (err)
75 return err; 85 return err;
76 } 86 }
77 MSDOS_I(inode)->mmu_private += sb->s_blocksize; 87 /* available blocks on this cluster */
78 err = fat_bmap(inode, iblock, &phys); 88 mapped_blocks = sbi->sec_per_clus - offset;
89
90 *max_blocks = min(mapped_blocks, *max_blocks);
91 MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
92
93 err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
79 if (err) 94 if (err)
80 return err; 95 return err;
81 if (!phys) 96 BUG_ON(!phys);
82 BUG(); 97 BUG_ON(*max_blocks != mapped_blocks);
83 set_buffer_new(bh_result); 98 set_buffer_new(bh_result);
84 map_bh(bh_result, sb, phys); 99 map_bh(bh_result, sb, phys);
85 return 0; 100 return 0;
86} 101}
87 102
103static int fat_get_blocks(struct inode *inode, sector_t iblock,
104 unsigned long max_blocks,
105 struct buffer_head *bh_result, int create)
106{
107 struct super_block *sb = inode->i_sb;
108 int err;
109
110 err = __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create);
111 if (err)
112 return err;
113 bh_result->b_size = max_blocks << sb->s_blocksize_bits;
114 return 0;
115}
116
117static int fat_get_block(struct inode *inode, sector_t iblock,
118 struct buffer_head *bh_result, int create)
119{
120 unsigned long max_blocks = 1;
121 return __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create);
122}
123
88static int fat_writepage(struct page *page, struct writeback_control *wbc) 124static int fat_writepage(struct page *page, struct writeback_control *wbc)
89{ 125{
90 return block_write_full_page(page, fat_get_block, wbc); 126 return block_write_full_page(page, fat_get_block, wbc);
91} 127}
92 128
129static int fat_writepages(struct address_space *mapping,
130 struct writeback_control *wbc)
131{
132 return mpage_writepages(mapping, wbc, fat_get_block);
133}
134
93static int fat_readpage(struct file *file, struct page *page) 135static int fat_readpage(struct file *file, struct page *page)
94{ 136{
95 return block_read_full_page(page, fat_get_block); 137 return mpage_readpage(page, fat_get_block);
138}
139
140static int fat_readpages(struct file *file, struct address_space *mapping,
141 struct list_head *pages, unsigned nr_pages)
142{
143 return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
96} 144}
97 145
98static int fat_prepare_write(struct file *file, struct page *page, 146static int fat_prepare_write(struct file *file, struct page *page,
@@ -115,6 +163,34 @@ static int fat_commit_write(struct file *file, struct page *page,
115 return err; 163 return err;
116} 164}
117 165
166static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
167 const struct iovec *iov,
168 loff_t offset, unsigned long nr_segs)
169{
170 struct file *file = iocb->ki_filp;
171 struct inode *inode = file->f_mapping->host;
172
173 if (rw == WRITE) {
174 /*
 175	 * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(),
 176	 * so we would need to update ->mmu_private to the block
 177	 * boundary ourselves.
 178	 * But the remaining area or hole must be filled with NULs
 179	 * before updating ->mmu_private, so extending writes are rejected.
180 */
181 loff_t size = offset + iov_length(iov, nr_segs);
182 if (MSDOS_I(inode)->mmu_private < size)
183 return -EINVAL;
184 }
185
186 /*
 187	 * FAT needs to use DIO_LOCKING to avoid a race between
 188	 * fat_get_block() and ->truncate().
189 */
190 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
191 offset, nr_segs, fat_get_blocks, NULL);
192}
193
118static sector_t _fat_bmap(struct address_space *mapping, sector_t block) 194static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
119{ 195{
120 return generic_block_bmap(mapping, block, fat_get_block); 196 return generic_block_bmap(mapping, block, fat_get_block);
@@ -122,10 +198,13 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
122 198
123static struct address_space_operations fat_aops = { 199static struct address_space_operations fat_aops = {
124 .readpage = fat_readpage, 200 .readpage = fat_readpage,
201 .readpages = fat_readpages,
125 .writepage = fat_writepage, 202 .writepage = fat_writepage,
203 .writepages = fat_writepages,
126 .sync_page = block_sync_page, 204 .sync_page = block_sync_page,
127 .prepare_write = fat_prepare_write, 205 .prepare_write = fat_prepare_write,
128 .commit_write = fat_commit_write, 206 .commit_write = fat_commit_write,
207 .direct_IO = fat_direct_IO,
129 .bmap = _fat_bmap 208 .bmap = _fat_bmap
130}; 209};
131 210
@@ -182,7 +261,7 @@ void fat_attach(struct inode *inode, loff_t i_pos)
182 spin_unlock(&sbi->inode_hash_lock); 261 spin_unlock(&sbi->inode_hash_lock);
183} 262}
184 263
185EXPORT_SYMBOL(fat_attach); 264EXPORT_SYMBOL_GPL(fat_attach);
186 265
187void fat_detach(struct inode *inode) 266void fat_detach(struct inode *inode)
188{ 267{
@@ -193,7 +272,7 @@ void fat_detach(struct inode *inode)
193 spin_unlock(&sbi->inode_hash_lock); 272 spin_unlock(&sbi->inode_hash_lock);
194} 273}
195 274
196EXPORT_SYMBOL(fat_detach); 275EXPORT_SYMBOL_GPL(fat_detach);
197 276
198struct inode *fat_iget(struct super_block *sb, loff_t i_pos) 277struct inode *fat_iget(struct super_block *sb, loff_t i_pos)
199{ 278{
@@ -347,7 +426,7 @@ out:
347 return inode; 426 return inode;
348} 427}
349 428
350EXPORT_SYMBOL(fat_build_inode); 429EXPORT_SYMBOL_GPL(fat_build_inode);
351 430
352static void fat_delete_inode(struct inode *inode) 431static void fat_delete_inode(struct inode *inode)
353{ 432{
@@ -374,12 +453,17 @@ static void fat_clear_inode(struct inode *inode)
374 unlock_kernel(); 453 unlock_kernel();
375} 454}
376 455
377static void fat_put_super(struct super_block *sb) 456static void fat_write_super(struct super_block *sb)
378{ 457{
379 struct msdos_sb_info *sbi = MSDOS_SB(sb); 458 sb->s_dirt = 0;
380 459
381 if (!(sb->s_flags & MS_RDONLY)) 460 if (!(sb->s_flags & MS_RDONLY))
382 fat_clusters_flush(sb); 461 fat_clusters_flush(sb);
462}
463
464static void fat_put_super(struct super_block *sb)
465{
466 struct msdos_sb_info *sbi = MSDOS_SB(sb);
383 467
384 if (sbi->nls_disk) { 468 if (sbi->nls_disk) {
385 unload_nls(sbi->nls_disk); 469 unload_nls(sbi->nls_disk);
@@ -537,7 +621,7 @@ int fat_sync_inode(struct inode *inode)
537 return fat_write_inode(inode, 1); 621 return fat_write_inode(inode, 1);
538} 622}
539 623
540EXPORT_SYMBOL(fat_sync_inode); 624EXPORT_SYMBOL_GPL(fat_sync_inode);
541 625
542static int fat_show_options(struct seq_file *m, struct vfsmount *mnt); 626static int fat_show_options(struct seq_file *m, struct vfsmount *mnt);
543static struct super_operations fat_sops = { 627static struct super_operations fat_sops = {
@@ -546,6 +630,7 @@ static struct super_operations fat_sops = {
546 .write_inode = fat_write_inode, 630 .write_inode = fat_write_inode,
547 .delete_inode = fat_delete_inode, 631 .delete_inode = fat_delete_inode,
548 .put_super = fat_put_super, 632 .put_super = fat_put_super,
633 .write_super = fat_write_super,
549 .statfs = fat_statfs, 634 .statfs = fat_statfs,
550 .clear_inode = fat_clear_inode, 635 .clear_inode = fat_clear_inode,
551 .remount_fs = fat_remount, 636 .remount_fs = fat_remount,
@@ -1347,7 +1432,7 @@ out_fail:
1347 return error; 1432 return error;
1348} 1433}
1349 1434
1350EXPORT_SYMBOL(fat_fill_super); 1435EXPORT_SYMBOL_GPL(fat_fill_super);
1351 1436
1352int __init fat_cache_init(void); 1437int __init fat_cache_init(void);
1353void fat_cache_destroy(void); 1438void fat_cache_destroy(void);
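
The fs/fat/inode.c rework above splits block mapping into __fat_get_blocks(), which can map a whole run of blocks at once, and two wrappers: fat_get_blocks(), which reports the run length through bh_result->b_size for direct I/O, and fat_get_block(), the classic single-block form used by the mpage read/write paths. The contract a multi-block callback must honor, sketched with names from this diff:

/* Sketch of the get_blocks contract: map up to max_blocks contiguous
 * blocks starting at iblock, reporting the mapped length in b_size. */
static int get_blocks_sketch(struct inode *inode, sector_t iblock,
                             unsigned long max_blocks,
                             struct buffer_head *bh_result, int create)
{
        sector_t phys;
        unsigned long mapped;
        int err = fat_bmap(inode, iblock, &phys, &mapped);

        if (err)
                return err;
        if (phys) {
                map_bh(bh_result, inode->i_sb, phys);
                bh_result->b_size = min(mapped, max_blocks)
                                        << inode->i_sb->s_blocksize_bits;
        }
        /* leaving bh_result unmapped signals a hole when !create */
        return 0;
}
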
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 2a0df2122f5..32fb0a3f1da 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -33,7 +33,7 @@ void fat_fs_panic(struct super_block *s, const char *fmt, ...)
33 } 33 }
34} 34}
35 35
36EXPORT_SYMBOL(fat_fs_panic); 36EXPORT_SYMBOL_GPL(fat_fs_panic);
37 37
38/* Flushes the number of free clusters on FAT32 */ 38/* Flushes the number of free clusters on FAT32 */
39/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 39/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
@@ -67,8 +67,6 @@ void fat_clusters_flush(struct super_block *sb)
67 if (sbi->prev_free != -1) 67 if (sbi->prev_free != -1)
68 fsinfo->next_cluster = cpu_to_le32(sbi->prev_free); 68 fsinfo->next_cluster = cpu_to_le32(sbi->prev_free);
69 mark_buffer_dirty(bh); 69 mark_buffer_dirty(bh);
70 if (sb->s_flags & MS_SYNCHRONOUS)
71 sync_dirty_buffer(bh);
72 } 70 }
73 brelse(bh); 71 brelse(bh);
74} 72}
@@ -194,7 +192,7 @@ void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date)
194 *date = cpu_to_le16(nl_day-day_n[month-1]+1+(month << 5)+(year << 9)); 192 *date = cpu_to_le16(nl_day-day_n[month-1]+1+(month << 5)+(year << 9));
195} 193}
196 194
197EXPORT_SYMBOL(fat_date_unix2dos); 195EXPORT_SYMBOL_GPL(fat_date_unix2dos);
198 196
199int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) 197int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
200{ 198{
@@ -222,4 +220,4 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
222 return err; 220 return err;
223} 221}
224 222
225EXPORT_SYMBOL(fat_sync_bhs); 223EXPORT_SYMBOL_GPL(fat_sync_bhs);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 863b46e0d78..5f96786d1c7 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/file.h> 11#include <linux/file.h>
12#include <linux/capability.h>
12#include <linux/dnotify.h> 13#include <linux/dnotify.h>
13#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
@@ -35,7 +36,7 @@ void fastcall set_close_on_exec(unsigned int fd, int flag)
35 spin_unlock(&files->file_lock); 36 spin_unlock(&files->file_lock);
36} 37}
37 38
38static inline int get_close_on_exec(unsigned int fd) 39static int get_close_on_exec(unsigned int fd)
39{ 40{
40 struct files_struct *files = current->files; 41 struct files_struct *files = current->files;
41 struct fdtable *fdt; 42 struct fdtable *fdt;
@@ -457,11 +458,11 @@ static void send_sigio_to_task(struct task_struct *p,
457 else 458 else
458 si.si_band = band_table[reason - POLL_IN]; 459 si.si_band = band_table[reason - POLL_IN];
459 si.si_fd = fd; 460 si.si_fd = fd;
460 if (!send_group_sig_info(fown->signum, &si, p)) 461 if (!group_send_sig_info(fown->signum, &si, p))
461 break; 462 break;
462 /* fall-through: fall back on the old plain SIGIO signal */ 463 /* fall-through: fall back on the old plain SIGIO signal */
463 case 0: 464 case 0:
464 send_group_sig_info(SIGIO, SEND_SIG_PRIV, p); 465 group_send_sig_info(SIGIO, SEND_SIG_PRIV, p);
465 } 466 }
466} 467}
467 468
@@ -495,7 +496,7 @@ static void send_sigurg_to_task(struct task_struct *p,
495 struct fown_struct *fown) 496 struct fown_struct *fown)
496{ 497{
497 if (sigio_perm(p, fown, SIGURG)) 498 if (sigio_perm(p, fown, SIGURG))
498 send_group_sig_info(SIGURG, SEND_SIG_PRIV, p); 499 group_send_sig_info(SIGURG, SEND_SIG_PRIV, p);
499} 500}
500 501
501int send_sigurg(struct fown_struct *fown) 502int send_sigurg(struct fown_struct *fown)
diff --git a/fs/fifo.c b/fs/fifo.c
index 5455916241f..923371b753a 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -35,7 +35,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
35 int ret; 35 int ret;
36 36
37 ret = -ERESTARTSYS; 37 ret = -ERESTARTSYS;
38 if (down_interruptible(PIPE_SEM(*inode))) 38 if (mutex_lock_interruptible(PIPE_MUTEX(*inode)))
39 goto err_nolock_nocleanup; 39 goto err_nolock_nocleanup;
40 40
41 if (!inode->i_pipe) { 41 if (!inode->i_pipe) {
@@ -119,7 +119,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
119 } 119 }
120 120
121 /* Ok! */ 121 /* Ok! */
122 up(PIPE_SEM(*inode)); 122 mutex_unlock(PIPE_MUTEX(*inode));
123 return 0; 123 return 0;
124 124
125err_rd: 125err_rd:
@@ -139,7 +139,7 @@ err:
139 free_pipe_info(inode); 139 free_pipe_info(inode);
140 140
141err_nocleanup: 141err_nocleanup:
142 up(PIPE_SEM(*inode)); 142 mutex_unlock(PIPE_MUTEX(*inode));
143 143
144err_nolock_nocleanup: 144err_nolock_nocleanup:
145 return ret; 145 return ret;
diff --git a/fs/file_table.c b/fs/file_table.c
index c3a5e2fd663..768b5816754 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -16,6 +16,7 @@
16#include <linux/eventpoll.h> 16#include <linux/eventpoll.h>
17#include <linux/rcupdate.h> 17#include <linux/rcupdate.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/capability.h>
19#include <linux/cdev.h> 20#include <linux/cdev.h>
20#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
21 22
@@ -117,7 +118,7 @@ EXPORT_SYMBOL(get_empty_filp);
117 118
118void fastcall fput(struct file *file) 119void fastcall fput(struct file *file)
119{ 120{
120 if (rcuref_dec_and_test(&file->f_count)) 121 if (atomic_dec_and_test(&file->f_count))
121 __fput(file); 122 __fput(file);
122} 123}
123 124
@@ -166,7 +167,7 @@ struct file fastcall *fget(unsigned int fd)
166 rcu_read_lock(); 167 rcu_read_lock();
167 file = fcheck_files(files, fd); 168 file = fcheck_files(files, fd);
168 if (file) { 169 if (file) {
169 if (!rcuref_inc_lf(&file->f_count)) { 170 if (!atomic_inc_not_zero(&file->f_count)) {
170 /* File object ref couldn't be taken */ 171 /* File object ref couldn't be taken */
171 rcu_read_unlock(); 172 rcu_read_unlock();
172 return NULL; 173 return NULL;
@@ -198,7 +199,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed)
198 rcu_read_lock(); 199 rcu_read_lock();
199 file = fcheck_files(files, fd); 200 file = fcheck_files(files, fd);
200 if (file) { 201 if (file) {
201 if (rcuref_inc_lf(&file->f_count)) 202 if (atomic_inc_not_zero(&file->f_count))
202 *fput_needed = 1; 203 *fput_needed = 1;
203 else 204 else
204 /* Didn't get the reference, someone's freed */ 205 /* Didn't get the reference, someone's freed */
@@ -213,7 +214,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed)
213 214
214void put_filp(struct file *file) 215void put_filp(struct file *file)
215{ 216{
216 if (rcuref_dec_and_test(&file->f_count)) { 217 if (atomic_dec_and_test(&file->f_count)) {
217 security_file_free(file); 218 security_file_free(file);
218 file_kill(file); 219 file_kill(file);
219 file_free(file); 220 file_free(file);
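
The fs/file_table.c hunks replace the rcuref primitives with plain atomics; the interesting case is fget(), where atomic_inc_not_zero() is what keeps the RCU-protected lookup safe: if the final fput() has already dropped f_count to zero, the reference must not be resurrected. The idiom, condensed from the hunks above:

/* Lock-free file lookup, as in fget()/fget_light() above. */
static struct file *fget_sketch(struct files_struct *files, unsigned int fd)
{
        struct file *file;

        rcu_read_lock();
        file = fcheck_files(files, fd);
        /* Take a reference only if the count is still nonzero; zero
         * means the final fput() already won the race. */
        if (file && !atomic_inc_not_zero(&file->f_count))
                file = NULL;
        rcu_read_unlock();
        return file;
}
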
diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c
index d3f6b2835bc..2d71128bd8d 100644
--- a/fs/freevxfs/vxfs_bmap.c
+++ b/fs/freevxfs/vxfs_bmap.c
@@ -36,6 +36,7 @@
36 36
37#include "vxfs.h" 37#include "vxfs.h"
38#include "vxfs_inode.h" 38#include "vxfs_inode.h"
39#include "vxfs_extern.h"
39 40
40 41
41#ifdef DIAGNOSTIC 42#ifdef DIAGNOSTIC
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index d0401dc68d4..6f5df1700e9 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -99,8 +99,8 @@ static int
99vxfs_immed_readpage(struct file *fp, struct page *pp) 99vxfs_immed_readpage(struct file *fp, struct page *pp)
100{ 100{
101 struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); 101 struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host);
102 u_int64_t offset = pp->index << PAGE_CACHE_SHIFT; 102 u_int64_t offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT;
103 caddr_t kaddr; 103 caddr_t kaddr;
104 104
105 kaddr = kmap(pp); 105 kaddr = kmap(pp);
106 memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE); 106 memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE);
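
The one-character vxfs_immed_readpage() change is a 32-bit overflow fix: pp->index is an unsigned long, so on 32-bit machines the shift happens in 32 bits and the byte offset wraps for pages at or beyond 4 GiB; casting to u_int64_t first widens the arithmetic. Concretely, with 4 KiB pages (PAGE_CACHE_SHIFT == 12):

/* Demonstrates the wrap when unsigned long is 32 bits wide. */
static u_int64_t immed_offset(unsigned long index, int fixed)
{
        if (fixed)      /* 0x100000 -> 0x100000000 (4 GiB), correct  */
                return (u_int64_t)index << PAGE_CACHE_SHIFT;
        return index << PAGE_CACHE_SHIFT;  /* 0x100000 -> wraps to 0 */
}
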
diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c
index 133476201d8..76a0708ae97 100644
--- a/fs/freevxfs/vxfs_olt.c
+++ b/fs/freevxfs/vxfs_olt.c
@@ -36,6 +36,7 @@
36 36
37#include "vxfs.h" 37#include "vxfs.h"
38#include "vxfs_olt.h" 38#include "vxfs_olt.h"
39#include "vxfs_extern.h"
39 40
40 41
41static inline void 42static inline void
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8f873e621f4..4526da8907c 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -21,18 +21,18 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR);
21 21
22static kmem_cache_t *fuse_req_cachep; 22static kmem_cache_t *fuse_req_cachep;
23 23
24static inline struct fuse_conn *fuse_get_conn(struct file *file) 24static struct fuse_conn *fuse_get_conn(struct file *file)
25{ 25{
26 struct fuse_conn *fc; 26 struct fuse_conn *fc;
27 spin_lock(&fuse_lock); 27 spin_lock(&fuse_lock);
28 fc = file->private_data; 28 fc = file->private_data;
29 if (fc && !fc->mounted) 29 if (fc && !fc->connected)
30 fc = NULL; 30 fc = NULL;
31 spin_unlock(&fuse_lock); 31 spin_unlock(&fuse_lock);
32 return fc; 32 return fc;
33} 33}
34 34
35static inline void fuse_request_init(struct fuse_req *req) 35static void fuse_request_init(struct fuse_req *req)
36{ 36{
37 memset(req, 0, sizeof(*req)); 37 memset(req, 0, sizeof(*req));
38 INIT_LIST_HEAD(&req->list); 38 INIT_LIST_HEAD(&req->list);
@@ -53,7 +53,7 @@ void fuse_request_free(struct fuse_req *req)
53 kmem_cache_free(fuse_req_cachep, req); 53 kmem_cache_free(fuse_req_cachep, req);
54} 54}
55 55
56static inline void block_sigs(sigset_t *oldset) 56static void block_sigs(sigset_t *oldset)
57{ 57{
58 sigset_t mask; 58 sigset_t mask;
59 59
@@ -61,7 +61,7 @@ static inline void block_sigs(sigset_t *oldset)
61 sigprocmask(SIG_BLOCK, &mask, oldset); 61 sigprocmask(SIG_BLOCK, &mask, oldset);
62} 62}
63 63
64static inline void restore_sigs(sigset_t *oldset) 64static void restore_sigs(sigset_t *oldset)
65{ 65{
66 sigprocmask(SIG_SETMASK, oldset, NULL); 66 sigprocmask(SIG_SETMASK, oldset, NULL);
67} 67}
@@ -109,18 +109,24 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc)
109 int intr; 109 int intr;
110 sigset_t oldset; 110 sigset_t oldset;
111 111
112 atomic_inc(&fc->num_waiting);
112 block_sigs(&oldset); 113 block_sigs(&oldset);
113 intr = down_interruptible(&fc->outstanding_sem); 114 intr = down_interruptible(&fc->outstanding_sem);
114 restore_sigs(&oldset); 115 restore_sigs(&oldset);
115 return intr ? NULL : do_get_request(fc); 116 if (intr) {
117 atomic_dec(&fc->num_waiting);
118 return NULL;
119 }
120 return do_get_request(fc);
116} 121}
117 122
118static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) 123static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
119{ 124{
120 spin_lock(&fuse_lock); 125 spin_lock(&fuse_lock);
121 if (req->preallocated) 126 if (req->preallocated) {
127 atomic_dec(&fc->num_waiting);
122 list_add(&req->list, &fc->unused_list); 128 list_add(&req->list, &fc->unused_list);
123 else 129 } else
124 fuse_request_free(req); 130 fuse_request_free(req);
125 131
126 /* If we are in debt decrease that first */ 132 /* If we are in debt decrease that first */
@@ -151,19 +157,20 @@ void fuse_release_background(struct fuse_req *req)
151/* 157/*
152 * This function is called when a request is finished. Either a reply 158 * This function is called when a request is finished. Either a reply
153 * has arrived or it was interrupted (and not yet sent) or some error 159 * has arrived or it was interrupted (and not yet sent) or some error
154 * occurred during communication with userspace, or the device file was 160 * occurred during communication with userspace, or the device file
 155 * closed. It decreases the reference count for the request. In case 161 * was closed. In case of a background request the references to the
 156 * of a background request the reference to the stored objects are 162 * stored objects are released. The requester thread is woken up (if
 157 * released. The requester thread is woken up (if still waiting), and 163 * still waiting), the 'end' callback is called if given; otherwise the
 158 * finally the request is either freed or put on the unused_list 164 * reference to the request is released.
159 * 165 *
160 * Called with fuse_lock, unlocks it 166 * Called with fuse_lock, unlocks it
161 */ 167 */
162static void request_end(struct fuse_conn *fc, struct fuse_req *req) 168static void request_end(struct fuse_conn *fc, struct fuse_req *req)
163{ 169{
164 int putback; 170 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
165 req->finished = 1; 171 req->end = NULL;
166 putback = atomic_dec_and_test(&req->count); 172 list_del(&req->list);
173 req->state = FUSE_REQ_FINISHED;
167 spin_unlock(&fuse_lock); 174 spin_unlock(&fuse_lock);
168 if (req->background) { 175 if (req->background) {
169 down_read(&fc->sbput_sem); 176 down_read(&fc->sbput_sem);
@@ -172,28 +179,10 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
172 up_read(&fc->sbput_sem); 179 up_read(&fc->sbput_sem);
173 } 180 }
174 wake_up(&req->waitq); 181 wake_up(&req->waitq);
175 if (req->in.h.opcode == FUSE_INIT) { 182 if (end)
176 int i; 183 end(fc, req);
177 184 else
178 if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION) 185 fuse_put_request(fc, req);
179 fc->conn_error = 1;
180
181 /* After INIT reply is received other requests can go
182 out. So do (FUSE_MAX_OUTSTANDING - 1) number of
183 up()s on outstanding_sem. The last up() is done in
184 fuse_putback_request() */
185 for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
186 up(&fc->outstanding_sem);
187 } else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
188 /* Special case for failed iget in CREATE */
189 u64 nodeid = req->in.h.nodeid;
190 __fuse_get_request(req);
191 fuse_reset_request(req);
192 fuse_send_forget(fc, req, nodeid, 1);
193 putback = 0;
194 }
195 if (putback)
196 fuse_putback_request(fc, req);
197} 186}
198 187
199/* 188/*
@@ -244,14 +233,16 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
244 233
245 spin_unlock(&fuse_lock); 234 spin_unlock(&fuse_lock);
246 block_sigs(&oldset); 235 block_sigs(&oldset);
247 wait_event_interruptible(req->waitq, req->finished); 236 wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
248 restore_sigs(&oldset); 237 restore_sigs(&oldset);
249 spin_lock(&fuse_lock); 238 spin_lock(&fuse_lock);
250 if (req->finished) 239 if (req->state == FUSE_REQ_FINISHED && !req->interrupted)
251 return; 240 return;
252 241
253 req->out.h.error = -EINTR; 242 if (!req->interrupted) {
254 req->interrupted = 1; 243 req->out.h.error = -EINTR;
244 req->interrupted = 1;
245 }
255 if (req->locked) { 246 if (req->locked) {
256 /* This is uninterruptible sleep, because data is 247 /* This is uninterruptible sleep, because data is
257 being copied to/from the buffers of req. During 248 being copied to/from the buffers of req. During
@@ -262,10 +253,10 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
262 wait_event(req->waitq, !req->locked); 253 wait_event(req->waitq, !req->locked);
263 spin_lock(&fuse_lock); 254 spin_lock(&fuse_lock);
264 } 255 }
265 if (!req->sent && !list_empty(&req->list)) { 256 if (req->state == FUSE_REQ_PENDING) {
266 list_del(&req->list); 257 list_del(&req->list);
267 __fuse_put_request(req); 258 __fuse_put_request(req);
268 } else if (!req->finished && req->sent) 259 } else if (req->state == FUSE_REQ_SENT)
269 background_request(fc, req); 260 background_request(fc, req);
270} 261}
271 262
@@ -300,6 +291,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
300 fc->outstanding_debt++; 291 fc->outstanding_debt++;
301 } 292 }
302 list_add_tail(&req->list, &fc->pending); 293 list_add_tail(&req->list, &fc->pending);
294 req->state = FUSE_REQ_PENDING;
303 wake_up(&fc->waitq); 295 wake_up(&fc->waitq);
304} 296}
305 297
@@ -352,30 +344,12 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
352 request_send_nowait(fc, req); 344 request_send_nowait(fc, req);
353} 345}
354 346
355void fuse_send_init(struct fuse_conn *fc)
356{
357 /* This is called from fuse_read_super() so there's guaranteed
358 to be a request available */
359 struct fuse_req *req = do_get_request(fc);
360 struct fuse_init_in_out *arg = &req->misc.init_in_out;
361 arg->major = FUSE_KERNEL_VERSION;
362 arg->minor = FUSE_KERNEL_MINOR_VERSION;
363 req->in.h.opcode = FUSE_INIT;
364 req->in.numargs = 1;
365 req->in.args[0].size = sizeof(*arg);
366 req->in.args[0].value = arg;
367 req->out.numargs = 1;
368 req->out.args[0].size = sizeof(*arg);
369 req->out.args[0].value = arg;
370 request_send_background(fc, req);
371}
372
373/* 347/*
374 * Lock the request. Up to the next unlock_request() there mustn't be 348 * Lock the request. Up to the next unlock_request() there mustn't be
375 * anything that could cause a page-fault. If the request was already 349 * anything that could cause a page-fault. If the request was already
376 * interrupted bail out. 350 * interrupted bail out.
377 */ 351 */
378static inline int lock_request(struct fuse_req *req) 352static int lock_request(struct fuse_req *req)
379{ 353{
380 int err = 0; 354 int err = 0;
381 if (req) { 355 if (req) {
@@ -394,7 +368,7 @@ static inline int lock_request(struct fuse_req *req)
394 * requester thread is currently waiting for it to be unlocked, so 368 * requester thread is currently waiting for it to be unlocked, so
395 * wake it up. 369 * wake it up.
396 */ 370 */
397static inline void unlock_request(struct fuse_req *req) 371static void unlock_request(struct fuse_req *req)
398{ 372{
399 if (req) { 373 if (req) {
400 spin_lock(&fuse_lock); 374 spin_lock(&fuse_lock);
@@ -430,7 +404,7 @@ static void fuse_copy_init(struct fuse_copy_state *cs, int write,
430} 404}
431 405
432/* Unmap and put previous page of userspace buffer */ 406/* Unmap and put previous page of userspace buffer */
433static inline void fuse_copy_finish(struct fuse_copy_state *cs) 407static void fuse_copy_finish(struct fuse_copy_state *cs)
434{ 408{
435 if (cs->mapaddr) { 409 if (cs->mapaddr) {
436 kunmap_atomic(cs->mapaddr, KM_USER0); 410 kunmap_atomic(cs->mapaddr, KM_USER0);
@@ -479,8 +453,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
479} 453}
480 454
481/* Do as much copy to/from userspace buffer as we can */ 455/* Do as much copy to/from userspace buffer as we can */
482static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val, 456static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
483 unsigned *size)
484{ 457{
485 unsigned ncpy = min(*size, cs->len); 458 unsigned ncpy = min(*size, cs->len);
486 if (val) { 459 if (val) {
@@ -500,8 +473,8 @@ static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
500 * Copy a page in the request to/from the userspace buffer. Must be 473 * Copy a page in the request to/from the userspace buffer. Must be
501 * done atomically 474 * done atomically
502 */ 475 */
503static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, 476static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
504 unsigned offset, unsigned count, int zeroing) 477 unsigned offset, unsigned count, int zeroing)
505{ 478{
506 if (page && zeroing && count < PAGE_SIZE) { 479 if (page && zeroing && count < PAGE_SIZE) {
507 void *mapaddr = kmap_atomic(page, KM_USER1); 480 void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -583,7 +556,7 @@ static void request_wait(struct fuse_conn *fc)
583 DECLARE_WAITQUEUE(wait, current); 556 DECLARE_WAITQUEUE(wait, current);
584 557
585 add_wait_queue_exclusive(&fc->waitq, &wait); 558 add_wait_queue_exclusive(&fc->waitq, &wait);
586 while (fc->mounted && list_empty(&fc->pending)) { 559 while (fc->connected && list_empty(&fc->pending)) {
587 set_current_state(TASK_INTERRUPTIBLE); 560 set_current_state(TASK_INTERRUPTIBLE);
588 if (signal_pending(current)) 561 if (signal_pending(current))
589 break; 562 break;
@@ -615,6 +588,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
615 struct fuse_copy_state cs; 588 struct fuse_copy_state cs;
616 unsigned reqsize; 589 unsigned reqsize;
617 590
591 restart:
618 spin_lock(&fuse_lock); 592 spin_lock(&fuse_lock);
619 fc = file->private_data; 593 fc = file->private_data;
620 err = -EPERM; 594 err = -EPERM;
@@ -622,28 +596,34 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
622 goto err_unlock; 596 goto err_unlock;
623 request_wait(fc); 597 request_wait(fc);
624 err = -ENODEV; 598 err = -ENODEV;
625 if (!fc->mounted) 599 if (!fc->connected)
626 goto err_unlock; 600 goto err_unlock;
627 err = -ERESTARTSYS; 601 err = -ERESTARTSYS;
628 if (list_empty(&fc->pending)) 602 if (list_empty(&fc->pending))
629 goto err_unlock; 603 goto err_unlock;
630 604
631 req = list_entry(fc->pending.next, struct fuse_req, list); 605 req = list_entry(fc->pending.next, struct fuse_req, list);
632 list_del_init(&req->list); 606 req->state = FUSE_REQ_READING;
633 spin_unlock(&fuse_lock); 607 list_move(&req->list, &fc->io);
634 608
635 in = &req->in; 609 in = &req->in;
636 reqsize = req->in.h.len; 610 reqsize = in->h.len;
637 fuse_copy_init(&cs, 1, req, iov, nr_segs); 611 /* If request is too large, reply with an error and restart the read */
638 err = -EINVAL; 612 if (iov_length(iov, nr_segs) < reqsize) {
639 if (iov_length(iov, nr_segs) >= reqsize) { 613 req->out.h.error = -EIO;
640 err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); 614 /* SETXATTR is special, since it may contain too large data */
641 if (!err) 615 if (in->h.opcode == FUSE_SETXATTR)
642 err = fuse_copy_args(&cs, in->numargs, in->argpages, 616 req->out.h.error = -E2BIG;
643 (struct fuse_arg *) in->args, 0); 617 request_end(fc, req);
618 goto restart;
644 } 619 }
620 spin_unlock(&fuse_lock);
621 fuse_copy_init(&cs, 1, req, iov, nr_segs);
622 err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
623 if (!err)
624 err = fuse_copy_args(&cs, in->numargs, in->argpages,
625 (struct fuse_arg *) in->args, 0);
645 fuse_copy_finish(&cs); 626 fuse_copy_finish(&cs);
646
647 spin_lock(&fuse_lock); 627 spin_lock(&fuse_lock);
648 req->locked = 0; 628 req->locked = 0;
649 if (!err && req->interrupted) 629 if (!err && req->interrupted)
@@ -657,8 +637,8 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
657 if (!req->isreply) 637 if (!req->isreply)
658 request_end(fc, req); 638 request_end(fc, req);
659 else { 639 else {
660 req->sent = 1; 640 req->state = FUSE_REQ_SENT;
661 list_add_tail(&req->list, &fc->processing); 641 list_move_tail(&req->list, &fc->processing);
662 spin_unlock(&fuse_lock); 642 spin_unlock(&fuse_lock);
663 } 643 }
664 return reqsize; 644 return reqsize;
@@ -746,17 +726,23 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
746 goto err_finish; 726 goto err_finish;
747 727
748 spin_lock(&fuse_lock); 728 spin_lock(&fuse_lock);
729 err = -ENOENT;
730 if (!fc->connected)
731 goto err_unlock;
732
749 req = request_find(fc, oh.unique); 733 req = request_find(fc, oh.unique);
750 err = -EINVAL; 734 err = -EINVAL;
751 if (!req) 735 if (!req)
752 goto err_unlock; 736 goto err_unlock;
753 737
754 list_del_init(&req->list);
755 if (req->interrupted) { 738 if (req->interrupted) {
756 request_end(fc, req); 739 spin_unlock(&fuse_lock);
757 fuse_copy_finish(&cs); 740 fuse_copy_finish(&cs);
741 spin_lock(&fuse_lock);
742 request_end(fc, req);
758 return -ENOENT; 743 return -ENOENT;
759 } 744 }
745 list_move(&req->list, &fc->io);
760 req->out.h = oh; 746 req->out.h = oh;
761 req->locked = 1; 747 req->locked = 1;
762 cs.req = req; 748 cs.req = req;
@@ -810,19 +796,90 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
810 return mask; 796 return mask;
811} 797}
812 798
813/* Abort all requests on the given list (pending or processing) */ 799/*
800 * Abort all requests on the given list (pending or processing)
801 *
802 * This function releases and reacquires fuse_lock
803 */
814static void end_requests(struct fuse_conn *fc, struct list_head *head) 804static void end_requests(struct fuse_conn *fc, struct list_head *head)
815{ 805{
816 while (!list_empty(head)) { 806 while (!list_empty(head)) {
817 struct fuse_req *req; 807 struct fuse_req *req;
818 req = list_entry(head->next, struct fuse_req, list); 808 req = list_entry(head->next, struct fuse_req, list);
819 list_del_init(&req->list);
820 req->out.h.error = -ECONNABORTED; 809 req->out.h.error = -ECONNABORTED;
821 request_end(fc, req); 810 request_end(fc, req);
822 spin_lock(&fuse_lock); 811 spin_lock(&fuse_lock);
823 } 812 }
824} 813}
825 814
815/*
816 * Abort requests under I/O
817 *
818 * The requests are set to interrupted and finished, and the request
819 * waiter is woken up. This will make request_wait_answer() wait
820 * until the request is unlocked and then return.
821 *
822 * If the request is asynchronous, then the end function needs to be
823 * called after waiting for the request to be unlocked (if it was
824 * locked).
825 */
826static void end_io_requests(struct fuse_conn *fc)
827{
828 while (!list_empty(&fc->io)) {
829 struct fuse_req *req =
830 list_entry(fc->io.next, struct fuse_req, list);
831 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
832
833 req->interrupted = 1;
834 req->out.h.error = -ECONNABORTED;
835 req->state = FUSE_REQ_FINISHED;
836 list_del_init(&req->list);
837 wake_up(&req->waitq);
838 if (end) {
839 req->end = NULL;
840 /* The end function will consume this reference */
841 __fuse_get_request(req);
842 spin_unlock(&fuse_lock);
843 wait_event(req->waitq, !req->locked);
844 end(fc, req);
845 spin_lock(&fuse_lock);
846 }
847 }
848}
849
850/*
851 * Abort all requests.
852 *
853 * Emergency exit in case of a malicious or accidental deadlock, or
854 * just a hung filesystem.
855 *
856 * The same effect is usually achievable through killing the
857 * filesystem daemon and all users of the filesystem. The exception
858 * is the combination of an asynchronous request and the tricky
859 * deadlock (see Documentation/filesystems/fuse.txt).
860 *
861 * During the aborting, progression of requests from the pending and
862 * processing lists onto the io list, and progression of new requests
 863 * onto the pending list are prevented by fc->connected being false.
864 *
865 * Progression of requests under I/O to the processing list is
866 * prevented by the req->interrupted flag being true for these
867 * requests. For this reason requests on the io list must be aborted
868 * first.
869 */
870void fuse_abort_conn(struct fuse_conn *fc)
871{
872 spin_lock(&fuse_lock);
873 if (fc->connected) {
874 fc->connected = 0;
875 end_io_requests(fc);
876 end_requests(fc, &fc->pending);
877 end_requests(fc, &fc->processing);
878 wake_up_all(&fc->waitq);
879 }
880 spin_unlock(&fuse_lock);
881}
882
826static int fuse_dev_release(struct inode *inode, struct file *file) 883static int fuse_dev_release(struct inode *inode, struct file *file)
827{ 884{
828 struct fuse_conn *fc; 885 struct fuse_conn *fc;
@@ -833,9 +890,11 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
833 fc->connected = 0; 890 fc->connected = 0;
834 end_requests(fc, &fc->pending); 891 end_requests(fc, &fc->pending);
835 end_requests(fc, &fc->processing); 892 end_requests(fc, &fc->processing);
836 fuse_release_conn(fc);
837 } 893 }
838 spin_unlock(&fuse_lock); 894 spin_unlock(&fuse_lock);
895 if (fc)
896 kobject_put(&fc->kobj);
897
839 return 0; 898 return 0;
840} 899}
841 900
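
Summarizing the fs/fuse/dev.c changes above: the ad-hoc req->finished/req->sent flags are replaced by an explicit req->state, requests being copied to or from the daemon sit on a new fc->io list, replies may run an optional req->end callback, and fuse_abort_conn() can now tear down a wedged connection. The lifecycle, as an annotated enum (state names are taken from the hunks above; the initial state is an assumption):

enum fuse_req_state_sketch {
        FUSE_REQ_INIT = 0,      /* assumed: freshly allocated        */
        FUSE_REQ_PENDING,       /* queued on fc->pending             */
        FUSE_REQ_READING,       /* on fc->io, being read by daemon   */
        FUSE_REQ_SENT,          /* on fc->processing, awaiting reply */
        FUSE_REQ_FINISHED,      /* replied, interrupted, or aborted  */
};
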
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c045cc70c74..21fd59c7bc2 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -13,15 +13,66 @@
13#include <linux/gfp.h> 13#include <linux/gfp.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/namei.h> 15#include <linux/namei.h>
16#include <linux/mount.h>
17 16
18static inline unsigned long time_to_jiffies(unsigned long sec, 17/*
 19 unsigned long nsec) 18 * FUSE caches dentries and attributes with separate timeouts. The
19 * time in jiffies until the dentry/attributes are valid is stored in
20 * dentry->d_time and fuse_inode->i_time respectively.
21 */
22
23/*
24 * Calculate the time in jiffies until a dentry/attributes are valid
25 */
26static unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec)
20{ 27{
21 struct timespec ts = {sec, nsec}; 28 struct timespec ts = {sec, nsec};
22 return jiffies + timespec_to_jiffies(&ts); 29 return jiffies + timespec_to_jiffies(&ts);
23} 30}
24 31
32/*
33 * Set dentry and possibly attribute timeouts from the lookup/mk*
34 * replies
35 */
36static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o)
37{
38 entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec);
39 if (entry->d_inode)
40 get_fuse_inode(entry->d_inode)->i_time =
41 time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
42}
43
44/*
45 * Mark the attributes as stale, so that at the next call to
46 * ->getattr() they will be fetched from userspace
47 */
48void fuse_invalidate_attr(struct inode *inode)
49{
50 get_fuse_inode(inode)->i_time = jiffies - 1;
51}
52
53/*
54 * Just mark the entry as stale, so that a next attempt to look it up
55 * will result in a new lookup call to userspace
56 *
57 * This is called when a dentry is about to become negative and the
58 * timeout is unknown (unlink, rmdir, rename and in some cases
59 * lookup)
60 */
61static void fuse_invalidate_entry_cache(struct dentry *entry)
62{
63 entry->d_time = jiffies - 1;
64}
65
66/*
67 * Same as fuse_invalidate_entry_cache(), but also try to remove the
68 * dentry from the hash
69 */
70static void fuse_invalidate_entry(struct dentry *entry)
71{
72 d_invalidate(entry);
73 fuse_invalidate_entry_cache(entry);
74}
75
25static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, 76static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
26 struct dentry *entry, 77 struct dentry *entry,
27 struct fuse_entry_out *outarg) 78 struct fuse_entry_out *outarg)
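
The new comments in fs/fuse/dir.c describe the caching scheme: validity is simply a future jiffies value stored in dentry->d_time (entries) and fuse_inode->i_time (attributes), so invalidation amounts to moving the stamp into the past. A hedged sketch of both operations:

/* Valid while the stored jiffies stamp lies in the future. */
static int entry_valid_sketch(struct dentry *entry)
{
        return time_before(jiffies, entry->d_time);
}

/* Invalidate: any past stamp forces a fresh lookup to userspace. */
static void entry_stale_sketch(struct dentry *entry)
{
        entry->d_time = jiffies - 1;
}
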
@@ -37,17 +88,34 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
37 req->out.args[0].value = outarg; 88 req->out.args[0].value = outarg;
38} 89}
39 90
91/*
92 * Check whether the dentry is still valid
93 *
94 * If the entry validity timeout has expired and the dentry is
95 * positive, try to redo the lookup. If the lookup results in a
96 * different inode, then let the VFS invalidate the dentry and redo
97 * the lookup once more. If the lookup results in the same inode,
98 * then refresh the attributes, timeouts and mark the dentry valid.
99 */
40static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) 100static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
41{ 101{
42 if (!entry->d_inode || is_bad_inode(entry->d_inode)) 102 struct inode *inode = entry->d_inode;
103
104 if (inode && is_bad_inode(inode))
43 return 0; 105 return 0;
44 else if (time_after(jiffies, entry->d_time)) { 106 else if (time_after(jiffies, entry->d_time)) {
45 int err; 107 int err;
46 struct fuse_entry_out outarg; 108 struct fuse_entry_out outarg;
47 struct inode *inode = entry->d_inode; 109 struct fuse_conn *fc;
48 struct fuse_inode *fi = get_fuse_inode(inode); 110 struct fuse_req *req;
49 struct fuse_conn *fc = get_fuse_conn(inode); 111
50 struct fuse_req *req = fuse_get_request(fc); 112 /* Doesn't hurt to "reset" the validity timeout */
113 fuse_invalidate_entry_cache(entry);
114 if (!inode)
115 return 0;
116
117 fc = get_fuse_conn(inode);
118 req = fuse_get_request(fc);
51 if (!req) 119 if (!req)
52 return 0; 120 return 0;
53 121
@@ -55,6 +123,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
55 request_send(fc, req); 123 request_send(fc, req);
56 err = req->out.h.error; 124 err = req->out.h.error;
57 if (!err) { 125 if (!err) {
126 struct fuse_inode *fi = get_fuse_inode(inode);
58 if (outarg.nodeid != get_node_id(inode)) { 127 if (outarg.nodeid != get_node_id(inode)) {
59 fuse_send_forget(fc, req, outarg.nodeid, 1); 128 fuse_send_forget(fc, req, outarg.nodeid, 1);
60 return 0; 129 return 0;
@@ -66,20 +135,44 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
66 return 0; 135 return 0;
67 136
68 fuse_change_attributes(inode, &outarg.attr); 137 fuse_change_attributes(inode, &outarg.attr);
69 entry->d_time = time_to_jiffies(outarg.entry_valid, 138 fuse_change_timeout(entry, &outarg);
70 outarg.entry_valid_nsec);
71 fi->i_time = time_to_jiffies(outarg.attr_valid,
72 outarg.attr_valid_nsec);
73 } 139 }
74 return 1; 140 return 1;
75} 141}
76 142
143/*
144 * Check if there's already a hashed alias of this directory inode.
145 * If yes, then lookup and mkdir must not create a new alias.
146 */
147static int dir_alias(struct inode *inode)
148{
149 if (S_ISDIR(inode->i_mode)) {
150 struct dentry *alias = d_find_alias(inode);
151 if (alias) {
152 dput(alias);
153 return 1;
154 }
155 }
156 return 0;
157}
158
159static int invalid_nodeid(u64 nodeid)
160{
161 return !nodeid || nodeid == FUSE_ROOT_ID;
162}
163
77static struct dentry_operations fuse_dentry_operations = { 164static struct dentry_operations fuse_dentry_operations = {
78 .d_revalidate = fuse_dentry_revalidate, 165 .d_revalidate = fuse_dentry_revalidate,
79}; 166};
80 167
81static int fuse_lookup_iget(struct inode *dir, struct dentry *entry, 168static int valid_mode(int m)
82 struct inode **inodep) 169{
170 return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
171 S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
172}
173
174static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
175 struct nameidata *nd)
83{ 176{
84 int err; 177 int err;
85 struct fuse_entry_out outarg; 178 struct fuse_entry_out outarg;
@@ -88,53 +181,49 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
88 struct fuse_req *req; 181 struct fuse_req *req;
89 182
90 if (entry->d_name.len > FUSE_NAME_MAX) 183 if (entry->d_name.len > FUSE_NAME_MAX)
91 return -ENAMETOOLONG; 184 return ERR_PTR(-ENAMETOOLONG);
92 185
93 req = fuse_get_request(fc); 186 req = fuse_get_request(fc);
94 if (!req) 187 if (!req)
95 return -EINTR; 188 return ERR_PTR(-EINTR);
96 189
97 fuse_lookup_init(req, dir, entry, &outarg); 190 fuse_lookup_init(req, dir, entry, &outarg);
98 request_send(fc, req); 191 request_send(fc, req);
99 err = req->out.h.error; 192 err = req->out.h.error;
100 if (!err && (!outarg.nodeid || outarg.nodeid == FUSE_ROOT_ID)) 193 if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) ||
194 !valid_mode(outarg.attr.mode)))
101 err = -EIO; 195 err = -EIO;
102 if (!err) { 196 if (!err && outarg.nodeid) {
103 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 197 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
104 &outarg.attr); 198 &outarg.attr);
105 if (!inode) { 199 if (!inode) {
106 fuse_send_forget(fc, req, outarg.nodeid, 1); 200 fuse_send_forget(fc, req, outarg.nodeid, 1);
107 return -ENOMEM; 201 return ERR_PTR(-ENOMEM);
108 } 202 }
109 } 203 }
110 fuse_put_request(fc, req); 204 fuse_put_request(fc, req);
111 if (err && err != -ENOENT) 205 if (err && err != -ENOENT)
112 return err; 206 return ERR_PTR(err);
113 207
114 if (inode) { 208 if (inode && dir_alias(inode)) {
115 struct fuse_inode *fi = get_fuse_inode(inode); 209 iput(inode);
116 entry->d_time = time_to_jiffies(outarg.entry_valid, 210 return ERR_PTR(-EIO);
117 outarg.entry_valid_nsec);
118 fi->i_time = time_to_jiffies(outarg.attr_valid,
119 outarg.attr_valid_nsec);
120 } 211 }
121 212 d_add(entry, inode);
122 entry->d_op = &fuse_dentry_operations; 213 entry->d_op = &fuse_dentry_operations;
123 *inodep = inode; 214 if (!err)
124 return 0; 215 fuse_change_timeout(entry, &outarg);
125} 216 else
126 217 fuse_invalidate_entry_cache(entry);
127void fuse_invalidate_attr(struct inode *inode) 218 return NULL;
128{
129 get_fuse_inode(inode)->i_time = jiffies - 1;
130}
131
132static void fuse_invalidate_entry(struct dentry *entry)
133{
134 d_invalidate(entry);
135 entry->d_time = jiffies - 1;
136} 219}
137 220
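
The reworked fuse_lookup() above folds the old fuse_lookup_iget() into the ->lookup() method proper, reporting errors via ERR_PTR(). A small userspace model of that convention, with helpers mirroring <linux/err.h> (the encoded errno value is illustrative):

#include <stdio.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	void *res = ERR_PTR(-36);       /* -ENAMETOOLONG, as returned above */
	if (IS_ERR(res))
		printf("lookup failed: errno %ld\n", -PTR_ERR(res));

	res = NULL;     /* NULL from ->lookup() means: use the passed-in dentry */
	if (!IS_ERR(res))
		printf("lookup ok\n");
	return 0;
}
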
221/*
222 * Atomic create+open operation
223 *
224 * If the filesystem doesn't support this, then fall back to separate
225 * 'mknod' + 'open' requests.
226 */
138static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, 227static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
139 struct nameidata *nd) 228 struct nameidata *nd)
140{ 229{
@@ -145,7 +234,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
145 struct fuse_open_in inarg; 234 struct fuse_open_in inarg;
146 struct fuse_open_out outopen; 235 struct fuse_open_out outopen;
147 struct fuse_entry_out outentry; 236 struct fuse_entry_out outentry;
148 struct fuse_inode *fi;
149 struct fuse_file *ff; 237 struct fuse_file *ff;
150 struct file *file; 238 struct file *file;
151 int flags = nd->intent.open.flags - 1; 239 int flags = nd->intent.open.flags - 1;
@@ -154,10 +242,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
154 if (fc->no_create) 242 if (fc->no_create)
155 goto out; 243 goto out;
156 244
157 err = -ENAMETOOLONG;
158 if (entry->d_name.len > FUSE_NAME_MAX)
159 goto out;
160
161 err = -EINTR; 245 err = -EINTR;
162 req = fuse_get_request(fc); 246 req = fuse_get_request(fc);
163 if (!req) 247 if (!req)
@@ -193,7 +277,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
193 } 277 }
194 278
195 err = -EIO; 279 err = -EIO;
196 if (!S_ISREG(outentry.attr.mode)) 280 if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
197 goto out_free_ff; 281 goto out_free_ff;
198 282
199 inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, 283 inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
@@ -202,17 +286,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
202 if (!inode) { 286 if (!inode) {
203 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 287 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
204 ff->fh = outopen.fh; 288 ff->fh = outopen.fh;
289                /* Special release, with inode = NULL; this will
290 trigger a 'forget' request when the release is
291 complete */
205 fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0); 292 fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0);
206 goto out_put_request; 293 goto out_put_request;
207 } 294 }
208 fuse_put_request(fc, req); 295 fuse_put_request(fc, req);
209 entry->d_time = time_to_jiffies(outentry.entry_valid,
210 outentry.entry_valid_nsec);
211 fi = get_fuse_inode(inode);
212 fi->i_time = time_to_jiffies(outentry.attr_valid,
213 outentry.attr_valid_nsec);
214
215 d_instantiate(entry, inode); 296 d_instantiate(entry, inode);
297 fuse_change_timeout(entry, &outentry);
216 file = lookup_instantiate_filp(nd, entry, generic_file_open); 298 file = lookup_instantiate_filp(nd, entry, generic_file_open);
217 if (IS_ERR(file)) { 299 if (IS_ERR(file)) {
218 ff->fh = outopen.fh; 300 ff->fh = outopen.fh;
@@ -230,13 +312,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
230 return err; 312 return err;
231} 313}
232 314
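
fuse_create_open() above bails out early when fc->no_create is set, and its comment notes the fallback to separate 'mknod' + 'open' requests. A hedged userspace sketch of that negotiate-once fallback shape (the caller here is a model, not the fuse_create() from this patch, which lies outside these hunks):

#include <errno.h>
#include <stdio.h>

static int no_create;                      /* models fc->no_create */

static int create_open(const char *name)   /* models the FUSE_CREATE request */
{
	(void)name;
	return -ENOSYS;                    /* server doesn't implement it */
}

static int mknod_then_open(const char *name)
{
	printf("fallback: mknod+open %s\n", name);
	return 0;
}

static int do_create(const char *name)
{
	if (!no_create) {
		int err = create_open(name);
		if (err != -ENOSYS)
			return err;
		no_create = 1;             /* remember; don't retry next time */
	}
	return mknod_then_open(name);
}

int main(void)
{
	return do_create("f1") || do_create("f2");
}
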
315/*
316 * Code shared between mknod, mkdir, symlink and link
317 */
233static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, 318static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
234 struct inode *dir, struct dentry *entry, 319 struct inode *dir, struct dentry *entry,
235 int mode) 320 int mode)
236{ 321{
237 struct fuse_entry_out outarg; 322 struct fuse_entry_out outarg;
238 struct inode *inode; 323 struct inode *inode;
239 struct fuse_inode *fi;
240 int err; 324 int err;
241 325
242 req->in.h.nodeid = get_node_id(dir); 326 req->in.h.nodeid = get_node_id(dir);
@@ -250,10 +334,13 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
250 fuse_put_request(fc, req); 334 fuse_put_request(fc, req);
251 return err; 335 return err;
252 } 336 }
253 if (!outarg.nodeid || outarg.nodeid == FUSE_ROOT_ID) { 337 err = -EIO;
254 fuse_put_request(fc, req); 338 if (invalid_nodeid(outarg.nodeid))
255 return -EIO; 339 goto out_put_request;
256 } 340
341 if ((outarg.attr.mode ^ mode) & S_IFMT)
342 goto out_put_request;
343
257 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 344 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
258 &outarg.attr); 345 &outarg.attr);
259 if (!inode) { 346 if (!inode) {
@@ -262,22 +349,19 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
262 } 349 }
263 fuse_put_request(fc, req); 350 fuse_put_request(fc, req);
264 351
265 /* Don't allow userspace to do really stupid things... */ 352 if (dir_alias(inode)) {
266 if ((inode->i_mode ^ mode) & S_IFMT) {
267 iput(inode); 353 iput(inode);
268 return -EIO; 354 return -EIO;
269 } 355 }
270 356
271 entry->d_time = time_to_jiffies(outarg.entry_valid,
272 outarg.entry_valid_nsec);
273
274 fi = get_fuse_inode(inode);
275 fi->i_time = time_to_jiffies(outarg.attr_valid,
276 outarg.attr_valid_nsec);
277
278 d_instantiate(entry, inode); 357 d_instantiate(entry, inode);
358 fuse_change_timeout(entry, &outarg);
279 fuse_invalidate_attr(dir); 359 fuse_invalidate_attr(dir);
280 return 0; 360 return 0;
361
362 out_put_request:
363 fuse_put_request(fc, req);
364 return err;
281} 365}
282 366
283static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, 367static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
@@ -337,12 +421,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
337{ 421{
338 struct fuse_conn *fc = get_fuse_conn(dir); 422 struct fuse_conn *fc = get_fuse_conn(dir);
339 unsigned len = strlen(link) + 1; 423 unsigned len = strlen(link) + 1;
340 struct fuse_req *req; 424 struct fuse_req *req = fuse_get_request(fc);
341
342 if (len > FUSE_SYMLINK_MAX)
343 return -ENAMETOOLONG;
344
345 req = fuse_get_request(fc);
346 if (!req) 425 if (!req)
347 return -EINTR; 426 return -EINTR;
348 427
@@ -381,6 +460,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
381 inode->i_nlink = 0; 460 inode->i_nlink = 0;
382 fuse_invalidate_attr(inode); 461 fuse_invalidate_attr(inode);
383 fuse_invalidate_attr(dir); 462 fuse_invalidate_attr(dir);
463 fuse_invalidate_entry_cache(entry);
384 } else if (err == -EINTR) 464 } else if (err == -EINTR)
385 fuse_invalidate_entry(entry); 465 fuse_invalidate_entry(entry);
386 return err; 466 return err;
@@ -406,6 +486,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
406 if (!err) { 486 if (!err) {
407 entry->d_inode->i_nlink = 0; 487 entry->d_inode->i_nlink = 0;
408 fuse_invalidate_attr(dir); 488 fuse_invalidate_attr(dir);
489 fuse_invalidate_entry_cache(entry);
409 } else if (err == -EINTR) 490 } else if (err == -EINTR)
410 fuse_invalidate_entry(entry); 491 fuse_invalidate_entry(entry);
411 return err; 492 return err;
@@ -441,6 +522,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
441 fuse_invalidate_attr(olddir); 522 fuse_invalidate_attr(olddir);
442 if (olddir != newdir) 523 if (olddir != newdir)
443 fuse_invalidate_attr(newdir); 524 fuse_invalidate_attr(newdir);
525
526 /* newent will end up negative */
527 if (newent->d_inode)
528 fuse_invalidate_entry_cache(newent);
444 } else if (err == -EINTR) { 529 } else if (err == -EINTR) {
445 /* If request was interrupted, DEITY only knows if the 530 /* If request was interrupted, DEITY only knows if the
446 rename actually took place. If the invalidation 531 rename actually took place. If the invalidation
@@ -548,6 +633,15 @@ static int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
548 return 0; 633 return 0;
549} 634}
550 635
636/*
637 * Check whether the inode attributes are still valid
638 *
639 * If the attribute validity timeout has expired, then fetch the fresh
640 * attributes with a 'getattr' request
641 *
642 * I'm not sure why cached attributes are never returned for the root
643 * inode; this is probably being too cautious.
644 */
551static int fuse_revalidate(struct dentry *entry) 645static int fuse_revalidate(struct dentry *entry)
552{ 646{
553 struct inode *inode = entry->d_inode; 647 struct inode *inode = entry->d_inode;
@@ -595,6 +689,19 @@ static int fuse_access(struct inode *inode, int mask)
595 return err; 689 return err;
596} 690}
597 691
692/*
693 * Check permission. The two basic access models of FUSE are:
694 *
695 * 1) Local access checking ('default_permissions' mount option) based
696 * on file mode. This is the plain old disk filesystem permission
697 * model.
698 *
699 * 2) "Remote" access checking, where the server is responsible for
700 * checking permission in each inode operation. An exception to this
701 * is if ->permission() was invoked from sys_access() in which case an
702 * access request is sent. Execute permission is still checked
703 * locally based on file mode.
704 */
598static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) 705static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
599{ 706{
600 struct fuse_conn *fc = get_fuse_conn(inode); 707 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -613,14 +720,10 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
613 err = generic_permission(inode, mask, NULL); 720 err = generic_permission(inode, mask, NULL);
614 } 721 }
615 722
616 /* FIXME: Need some mechanism to revoke permissions: 723 /* Note: the opposite of the above test does not
617 currently if the filesystem suddenly changes the 724 exist. So if permissions are revoked this won't be
618 file mode, we will not be informed about it, and 725 noticed immediately, only after the attribute
619 continue to allow access to the file/directory. 726 timeout has expired */
620
621 This is actually not so grave, since the user can
622 simply keep access to the file/directory anyway by
623 keeping it open... */
624 727
625 return err; 728 return err;
626 } else { 729 } else {
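
A userspace model of the two access models in the comment above; the MAY_* values match <linux/fs.h>, while the crude owner/other split stands in for the full owner/group/other logic of generic_permission():

#include <stdio.h>

#define MAY_EXEC  1
#define MAY_WRITE 2
#define MAY_READ  4

static int default_permissions = 1;        /* models the mount option */

static int check_access(unsigned mode, unsigned mask, int is_owner)
{
	if (default_permissions) {
		unsigned bits = is_owner ? mode >> 6 : mode;
		return ((bits & 7) & mask) == mask ? 0 : -1 /* EACCES */;
	}
	/* remote model: the server vetoes each operation itself, but
	   exec is still refused locally if no x bit is set at all */
	if ((mask & MAY_EXEC) && !(mode & 0111))
		return -1;
	return 0;
}

int main(void)
{
	printf("write as other: %d\n", check_access(0644, MAY_WRITE, 0));
	printf("read as other:  %d\n", check_access(0644, MAY_READ, 0));
	return 0;
}
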
@@ -659,13 +762,6 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
659 return 0; 762 return 0;
660} 763}
661 764
662static inline size_t fuse_send_readdir(struct fuse_req *req, struct file *file,
663 struct inode *inode, loff_t pos,
664 size_t count)
665{
666 return fuse_send_read_common(req, file, inode, pos, count, 1);
667}
668
669static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) 765static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
670{ 766{
671 int err; 767 int err;
@@ -673,7 +769,12 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
673 struct page *page; 769 struct page *page;
674 struct inode *inode = file->f_dentry->d_inode; 770 struct inode *inode = file->f_dentry->d_inode;
675 struct fuse_conn *fc = get_fuse_conn(inode); 771 struct fuse_conn *fc = get_fuse_conn(inode);
676 struct fuse_req *req = fuse_get_request(fc); 772 struct fuse_req *req;
773
774 if (is_bad_inode(inode))
775 return -EIO;
776
777 req = fuse_get_request(fc);
677 if (!req) 778 if (!req)
678 return -EINTR; 779 return -EINTR;
679 780
@@ -684,7 +785,9 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
684 } 785 }
685 req->num_pages = 1; 786 req->num_pages = 1;
686 req->pages[0] = page; 787 req->pages[0] = page;
687 nbytes = fuse_send_readdir(req, file, inode, file->f_pos, PAGE_SIZE); 788 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
789 request_send(fc, req);
790 nbytes = req->out.args[0].size;
688 err = req->out.h.error; 791 err = req->out.h.error;
689 fuse_put_request(fc, req); 792 fuse_put_request(fc, req);
690 if (!err) 793 if (!err)
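
The readdir path above fills one page via FUSE_READDIR and hands it to parse_dirfile(). A userspace sketch of walking such a buffer; the record layout follows struct fuse_dirent from the FUSE protocol header, and the sample buffer is hand-built for illustration:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fuse_dirent {
	uint64_t ino;
	uint64_t off;
	uint32_t namelen;
	uint32_t type;
	char     name[];
};

#define FUSE_NAME_OFFSET     offsetof(struct fuse_dirent, name)
#define FUSE_DIRENT_ALIGN(x) (((x) + 7) & ~(size_t)7)

static void walk(const char *buf, size_t nbytes)
{
	while (nbytes >= FUSE_NAME_OFFSET) {
		const struct fuse_dirent *d = (const struct fuse_dirent *)buf;
		size_t reclen = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + d->namelen);
		if (reclen > nbytes)
			break;                    /* truncated record: stop */
		printf("ino=%llu %.*s\n",
		       (unsigned long long)d->ino, (int)d->namelen, d->name);
		buf += reclen;
		nbytes -= reclen;
	}
}

int main(void)
{
	unsigned long long storage[8] = {0};      /* keeps the buffer aligned */
	struct fuse_dirent *d = (struct fuse_dirent *)storage;

	d->ino = 2;
	d->off = 1;
	d->namelen = 4;
	d->type = 4;                              /* DT_DIR */
	memcpy(d->name, "adir", 4);
	walk((char *)storage, FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + 4));
	return 0;
}
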
@@ -788,6 +891,15 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
788 } 891 }
789} 892}
790 893
894/*
895 * Set attributes, and at the same time refresh them.
896 *
897 * Truncation is slightly complicated, because the 'truncate' request
898 * may fail, in which case we don't want to touch the mapping.
899 * vmtruncate() doesn't allow for this case. So do the rlimit
900 * checking by hand and call vmtruncate() only after the file has
901 * actually been truncated.
902 */
791static int fuse_setattr(struct dentry *entry, struct iattr *attr) 903static int fuse_setattr(struct dentry *entry, struct iattr *attr)
792{ 904{
793 struct inode *inode = entry->d_inode; 905 struct inode *inode = entry->d_inode;
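
A userspace model of the ordering constraint in the fuse_setattr() comment above: the cached mapping state may only change after the remote truncate succeeds, which is why vmtruncate() is called last:

#include <stdio.h>

static long cached_size = 4096;            /* models the inode's mapping */

static int remote_truncate(long size, int fail)
{
	(void)size;
	return fail ? -5 /* EIO */ : 0;    /* models the FUSE_SETATTR request */
}

static int truncate_model(long size, int fail)
{
	int err = remote_truncate(size, fail);
	if (err)
		return err;                /* mapping left untouched */
	cached_size = size;                /* only now: the vmtruncate() step */
	return 0;
}

int main(void)
{
	truncate_model(0, 1);
	printf("after failed truncate: %ld\n", cached_size);   /* still 4096 */
	truncate_model(0, 0);
	printf("after good truncate:   %ld\n", cached_size);   /* now 0 */
	return 0;
}
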
@@ -865,28 +977,6 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
865 return err; 977 return err;
866} 978}
867 979
868static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
869 struct nameidata *nd)
870{
871 struct inode *inode;
872 int err;
873
874 err = fuse_lookup_iget(dir, entry, &inode);
875 if (err)
876 return ERR_PTR(err);
877 if (inode && S_ISDIR(inode->i_mode)) {
878 /* Don't allow creating an alias to a directory */
879 struct dentry *alias = d_find_alias(inode);
880 if (alias) {
881 dput(alias);
882 iput(inode);
883 return ERR_PTR(-EIO);
884 }
885 }
886 d_add(entry, inode);
887 return NULL;
888}
889
890static int fuse_setxattr(struct dentry *entry, const char *name, 980static int fuse_setxattr(struct dentry *entry, const char *name,
891 const void *value, size_t size, int flags) 981 const void *value, size_t size, int flags)
892{ 982{
@@ -896,9 +986,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
896 struct fuse_setxattr_in inarg; 986 struct fuse_setxattr_in inarg;
897 int err; 987 int err;
898 988
899 if (size > FUSE_XATTR_SIZE_MAX)
900 return -E2BIG;
901
902 if (fc->no_setxattr) 989 if (fc->no_setxattr)
903 return -EOPNOTSUPP; 990 return -EOPNOTSUPP;
904 991
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2ca86141d13..a7ef5e716f3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -113,6 +113,14 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
113 return err; 113 return err;
114} 114}
115 115
116/* Special case for failed iget in CREATE */
117static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
118{
119 u64 nodeid = req->in.h.nodeid;
120 fuse_reset_request(req);
121 fuse_send_forget(fc, req, nodeid, 1);
122}
123
116void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff, 124void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
117 u64 nodeid, struct inode *inode, int flags, int isdir) 125 u64 nodeid, struct inode *inode, int flags, int isdir)
118{ 126{
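
fuse_release_end() above is the first user of the new req->end completion hook: a backgrounded RELEASE with no inode recycles its own request to send the FORGET. A userspace model of that callback chaining:

#include <stdio.h>

struct req {
	unsigned long nodeid;
	void (*end)(struct req *);
};

static void send_forget(struct req *r)
{
	printf("FORGET nodeid=%lu\n", r->nodeid);
}

static void release_end(struct req *r)     /* models fuse_release_end() */
{
	send_forget(r);                    /* the same request is reused */
}

static void complete(struct req *r)        /* models request completion */
{
	if (r->end)
		r->end(r);
}

int main(void)
{
	struct req r = { .nodeid = 42, .end = release_end };
	complete(&r);                      /* prints: FORGET nodeid=42 */
	return 0;
}
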
@@ -128,6 +136,8 @@ void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
128 req->in.args[0].size = sizeof(struct fuse_release_in); 136 req->in.args[0].size = sizeof(struct fuse_release_in);
129 req->in.args[0].value = inarg; 137 req->in.args[0].value = inarg;
130 request_send_background(fc, req); 138 request_send_background(fc, req);
139 if (!inode)
140 req->end = fuse_release_end;
131 kfree(ff); 141 kfree(ff);
132} 142}
133 143
@@ -163,6 +173,9 @@ static int fuse_flush(struct file *file)
163 struct fuse_flush_in inarg; 173 struct fuse_flush_in inarg;
164 int err; 174 int err;
165 175
176 if (is_bad_inode(inode))
177 return -EIO;
178
166 if (fc->no_flush) 179 if (fc->no_flush)
167 return 0; 180 return 0;
168 181
@@ -199,6 +212,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
199 struct fuse_fsync_in inarg; 212 struct fuse_fsync_in inarg;
200 int err; 213 int err;
201 214
215 if (is_bad_inode(inode))
216 return -EIO;
217
202 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) 218 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
203 return 0; 219 return 0;
204 220
@@ -234,54 +250,57 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
234 return fuse_fsync_common(file, de, datasync, 0); 250 return fuse_fsync_common(file, de, datasync, 0);
235} 251}
236 252
237size_t fuse_send_read_common(struct fuse_req *req, struct file *file, 253void fuse_read_fill(struct fuse_req *req, struct file *file,
238 struct inode *inode, loff_t pos, size_t count, 254 struct inode *inode, loff_t pos, size_t count, int opcode)
239 int isdir)
240{ 255{
241 struct fuse_conn *fc = get_fuse_conn(inode);
242 struct fuse_file *ff = file->private_data; 256 struct fuse_file *ff = file->private_data;
243 struct fuse_read_in inarg; 257 struct fuse_read_in *inarg = &req->misc.read_in;
244 258
245 memset(&inarg, 0, sizeof(struct fuse_read_in)); 259 inarg->fh = ff->fh;
246 inarg.fh = ff->fh; 260 inarg->offset = pos;
247 inarg.offset = pos; 261 inarg->size = count;
248 inarg.size = count; 262 req->in.h.opcode = opcode;
249 req->in.h.opcode = isdir ? FUSE_READDIR : FUSE_READ;
250 req->in.h.nodeid = get_node_id(inode); 263 req->in.h.nodeid = get_node_id(inode);
251 req->inode = inode; 264 req->inode = inode;
252 req->file = file; 265 req->file = file;
253 req->in.numargs = 1; 266 req->in.numargs = 1;
254 req->in.args[0].size = sizeof(struct fuse_read_in); 267 req->in.args[0].size = sizeof(struct fuse_read_in);
255 req->in.args[0].value = &inarg; 268 req->in.args[0].value = inarg;
256 req->out.argpages = 1; 269 req->out.argpages = 1;
257 req->out.argvar = 1; 270 req->out.argvar = 1;
258 req->out.numargs = 1; 271 req->out.numargs = 1;
259 req->out.args[0].size = count; 272 req->out.args[0].size = count;
260 request_send(fc, req);
261 return req->out.args[0].size;
262} 273}
263 274
264static inline size_t fuse_send_read(struct fuse_req *req, struct file *file, 275static size_t fuse_send_read(struct fuse_req *req, struct file *file,
265 struct inode *inode, loff_t pos, 276 struct inode *inode, loff_t pos, size_t count)
266 size_t count)
267{ 277{
268 return fuse_send_read_common(req, file, inode, pos, count, 0); 278 struct fuse_conn *fc = get_fuse_conn(inode);
279 fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
280 request_send(fc, req);
281 return req->out.args[0].size;
269} 282}
270 283
271static int fuse_readpage(struct file *file, struct page *page) 284static int fuse_readpage(struct file *file, struct page *page)
272{ 285{
273 struct inode *inode = page->mapping->host; 286 struct inode *inode = page->mapping->host;
274 struct fuse_conn *fc = get_fuse_conn(inode); 287 struct fuse_conn *fc = get_fuse_conn(inode);
275 loff_t pos = (loff_t) page->index << PAGE_CACHE_SHIFT; 288 struct fuse_req *req;
276 struct fuse_req *req = fuse_get_request(fc); 289 int err;
277 int err = -EINTR; 290
291 err = -EIO;
292 if (is_bad_inode(inode))
293 goto out;
294
295 err = -EINTR;
296 req = fuse_get_request(fc);
278 if (!req) 297 if (!req)
279 goto out; 298 goto out;
280 299
281 req->out.page_zeroing = 1; 300 req->out.page_zeroing = 1;
282 req->num_pages = 1; 301 req->num_pages = 1;
283 req->pages[0] = page; 302 req->pages[0] = page;
284 fuse_send_read(req, file, inode, pos, PAGE_CACHE_SIZE); 303 fuse_send_read(req, file, inode, page_offset(page), PAGE_CACHE_SIZE);
285 err = req->out.h.error; 304 err = req->out.h.error;
286 fuse_put_request(fc, req); 305 fuse_put_request(fc, req);
287 if (!err) 306 if (!err)
@@ -292,21 +311,33 @@ static int fuse_readpage(struct file *file, struct page *page)
292 return err; 311 return err;
293} 312}
294 313
295static int fuse_send_readpages(struct fuse_req *req, struct file *file, 314static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
296 struct inode *inode)
297{ 315{
298 loff_t pos = (loff_t) req->pages[0]->index << PAGE_CACHE_SHIFT; 316 int i;
299 size_t count = req->num_pages << PAGE_CACHE_SHIFT; 317
300 unsigned i; 318 fuse_invalidate_attr(req->pages[0]->mapping->host); /* atime changed */
301 req->out.page_zeroing = 1; 319
302 fuse_send_read(req, file, inode, pos, count);
303 for (i = 0; i < req->num_pages; i++) { 320 for (i = 0; i < req->num_pages; i++) {
304 struct page *page = req->pages[i]; 321 struct page *page = req->pages[i];
305 if (!req->out.h.error) 322 if (!req->out.h.error)
306 SetPageUptodate(page); 323 SetPageUptodate(page);
324 else
325 SetPageError(page);
307 unlock_page(page); 326 unlock_page(page);
308 } 327 }
309 return req->out.h.error; 328 fuse_put_request(fc, req);
329}
330
331static void fuse_send_readpages(struct fuse_req *req, struct file *file,
332 struct inode *inode)
333{
334 struct fuse_conn *fc = get_fuse_conn(inode);
335 loff_t pos = page_offset(req->pages[0]);
336 size_t count = req->num_pages << PAGE_CACHE_SHIFT;
337 req->out.page_zeroing = 1;
338 req->end = fuse_readpages_end;
339 fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
340 request_send_background(fc, req);
310} 341}
311 342
312struct fuse_readpages_data { 343struct fuse_readpages_data {
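
A userspace model of the completion side above: whatever the outcome, each page is flagged and unlocked, and the request is released exactly once:

#include <stdio.h>

struct page { int uptodate, error, locked; };

static void readpages_end(struct page *pages, int npages, int err)
{
	int i;
	for (i = 0; i < npages; i++) {
		if (!err)
			pages[i].uptodate = 1;    /* SetPageUptodate() */
		else
			pages[i].error = 1;       /* SetPageError() */
		pages[i].locked = 0;              /* unlock_page() */
	}
	/* fuse_put_request() would drop the request reference here */
}

int main(void)
{
	struct page p[2] = { {0, 0, 1}, {0, 0, 1} };
	readpages_end(p, 2, 0);
	printf("page0: uptodate=%d locked=%d\n", p[0].uptodate, p[0].locked);
	return 0;
}
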
@@ -326,12 +357,12 @@ static int fuse_readpages_fill(void *_data, struct page *page)
326 (req->num_pages == FUSE_MAX_PAGES_PER_REQ || 357 (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
327 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || 358 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
328 req->pages[req->num_pages - 1]->index + 1 != page->index)) { 359 req->pages[req->num_pages - 1]->index + 1 != page->index)) {
329 int err = fuse_send_readpages(req, data->file, inode); 360 fuse_send_readpages(req, data->file, inode);
330 if (err) { 361 data->req = req = fuse_get_request(fc);
362 if (!req) {
331 unlock_page(page); 363 unlock_page(page);
332 return err; 364 return -EINTR;
333 } 365 }
334 fuse_reset_request(req);
335 } 366 }
336 req->pages[req->num_pages] = page; 367 req->pages[req->num_pages] = page;
337 req->num_pages ++; 368 req->num_pages ++;
@@ -345,6 +376,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
345 struct fuse_conn *fc = get_fuse_conn(inode); 376 struct fuse_conn *fc = get_fuse_conn(inode);
346 struct fuse_readpages_data data; 377 struct fuse_readpages_data data;
347 int err; 378 int err;
379
380 if (is_bad_inode(inode))
381 return -EIO;
382
348 data.file = file; 383 data.file = file;
349 data.inode = inode; 384 data.inode = inode;
350 data.req = fuse_get_request(fc); 385 data.req = fuse_get_request(fc);
@@ -352,10 +387,8 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
352 return -EINTR; 387 return -EINTR;
353 388
354 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); 389 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
355 if (!err && data.req->num_pages) 390 if (!err)
356 err = fuse_send_readpages(data.req, file, inode); 391 fuse_send_readpages(data.req, file, inode);
357 fuse_put_request(fc, data.req);
358 fuse_invalidate_attr(inode); /* atime changed */
359 return err; 392 return err;
360} 393}
361 394
@@ -402,8 +435,13 @@ static int fuse_commit_write(struct file *file, struct page *page,
402 unsigned count = to - offset; 435 unsigned count = to - offset;
403 struct inode *inode = page->mapping->host; 436 struct inode *inode = page->mapping->host;
404 struct fuse_conn *fc = get_fuse_conn(inode); 437 struct fuse_conn *fc = get_fuse_conn(inode);
405 loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + offset; 438 loff_t pos = page_offset(page) + offset;
406 struct fuse_req *req = fuse_get_request(fc); 439 struct fuse_req *req;
440
441 if (is_bad_inode(inode))
442 return -EIO;
443
444 req = fuse_get_request(fc);
407 if (!req) 445 if (!req)
408 return -EINTR; 446 return -EINTR;
409 447
@@ -454,7 +492,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
454 492
455 nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); 493 nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
456 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 494 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
457 npages = min(npages, FUSE_MAX_PAGES_PER_REQ); 495 npages = min(max(npages, 1), FUSE_MAX_PAGES_PER_REQ);
458 down_read(&current->mm->mmap_sem); 496 down_read(&current->mm->mmap_sem);
459 npages = get_user_pages(current, current->mm, user_addr, npages, write, 497 npages = get_user_pages(current, current->mm, user_addr, npages, write,
460 0, req->pages, NULL); 498 0, req->pages, NULL);
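
The min(max(npages, 1), ...) change above guards the zero-byte case. A small model of the page-count math being fixed:

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

static unsigned npages_for(unsigned long addr, unsigned long nbytes)
{
	unsigned long offset = addr % PAGE_SIZE;
	unsigned long n = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return n < 1 ? 1 : n;              /* the max(npages, 1) from the patch */
}

int main(void)
{
	printf("%u\n", npages_for(0x1000, 0));    /* 1; was 0 before the fix */
	printf("%u\n", npages_for(0x1ff0, 32));   /* 2: spans a page boundary */
	return 0;
}
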
@@ -475,12 +513,16 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
475 size_t nmax = write ? fc->max_write : fc->max_read; 513 size_t nmax = write ? fc->max_write : fc->max_read;
476 loff_t pos = *ppos; 514 loff_t pos = *ppos;
477 ssize_t res = 0; 515 ssize_t res = 0;
478 struct fuse_req *req = fuse_get_request(fc); 516 struct fuse_req *req;
517
518 if (is_bad_inode(inode))
519 return -EIO;
520
521 req = fuse_get_request(fc);
479 if (!req) 522 if (!req)
480 return -EINTR; 523 return -EINTR;
481 524
482 while (count) { 525 while (count) {
483 size_t tmp;
484 size_t nres; 526 size_t nres;
485 size_t nbytes = min(count, nmax); 527 size_t nbytes = min(count, nmax);
486 int err = fuse_get_user_pages(req, buf, nbytes, !write); 528 int err = fuse_get_user_pages(req, buf, nbytes, !write);
@@ -488,8 +530,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
488 res = err; 530 res = err;
489 break; 531 break;
490 } 532 }
491 tmp = (req->num_pages << PAGE_SHIFT) - req->page_offset; 533 nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
492 nbytes = min(nbytes, tmp); 534 nbytes = min(count, nbytes);
493 if (write) 535 if (write)
494 nres = fuse_send_write(req, file, inode, pos, nbytes); 536 nres = fuse_send_write(req, file, inode, pos, nbytes);
495 else 537 else
@@ -535,9 +577,9 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
535 struct inode *inode = file->f_dentry->d_inode; 577 struct inode *inode = file->f_dentry->d_inode;
536 ssize_t res; 578 ssize_t res;
537 /* Don't allow parallel writes to the same file */ 579 /* Don't allow parallel writes to the same file */
538 down(&inode->i_sem); 580 mutex_lock(&inode->i_mutex);
539 res = fuse_direct_io(file, buf, count, ppos, 1); 581 res = fuse_direct_io(file, buf, count, ppos, 1);
540 up(&inode->i_sem); 582 mutex_unlock(&inode->i_mutex);
541 return res; 583 return res;
542} 584}
543 585
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0ea5301f86b..46cf933aa3b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,9 @@
21/** If more requests are outstanding, then the operation will block */ 21/** If more requests are outstanding, then the operation will block */
22#define FUSE_MAX_OUTSTANDING 10 22#define FUSE_MAX_OUTSTANDING 10
23 23
24/** It could be as large as PATH_MAX, but would that have any uses? */
25#define FUSE_NAME_MAX 1024
26
24/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem 27/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
25 module will check permissions based on the file mode. Otherwise no 28 module will check permissions based on the file mode. Otherwise no
26 permission checking is done in the kernel */ 29 permission checking is done in the kernel */
@@ -91,6 +94,11 @@ struct fuse_out {
91 /** Header returned from userspace */ 94 /** Header returned from userspace */
92 struct fuse_out_header h; 95 struct fuse_out_header h;
93 96
97 /*
98 * The following bitfields are not changed during the request
99 * processing
100 */
101
94 /** Last argument is variable length (can be shorter than 102 /** Last argument is variable length (can be shorter than
95 arg->size) */ 103 arg->size) */
96 unsigned argvar:1; 104 unsigned argvar:1;
@@ -108,15 +116,23 @@ struct fuse_out {
108 struct fuse_arg args[3]; 116 struct fuse_arg args[3];
109}; 117};
110 118
111struct fuse_req; 119/** The request state */
120enum fuse_req_state {
121 FUSE_REQ_INIT = 0,
122 FUSE_REQ_PENDING,
123 FUSE_REQ_READING,
124 FUSE_REQ_SENT,
125 FUSE_REQ_FINISHED
126};
127
112struct fuse_conn; 128struct fuse_conn;
113 129
114/** 130/**
115 * A request to the client 131 * A request to the client
116 */ 132 */
117struct fuse_req { 133struct fuse_req {
118        /** This can be on either unused_list, pending or processing        134        /** This can be on either unused_list, pending, processing or
119 lists in fuse_conn */ 135 io lists in fuse_conn */
120 struct list_head list; 136 struct list_head list;
121 137
122 /** Entry on the background list */ 138 /** Entry on the background list */
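
The new state field replaces the old sent/finished bit pair, making the intermediate states representable. The full lifecycle, as a compilable sketch:

#include <stdio.h>

enum fuse_req_state {
	FUSE_REQ_INIT = 0,
	FUSE_REQ_PENDING,
	FUSE_REQ_READING,
	FUSE_REQ_SENT,
	FUSE_REQ_FINISHED
};

static const char *name[] = {
	"INIT", "PENDING", "READING", "SENT", "FINISHED"
};

int main(void)
{
	enum fuse_req_state s;
	for (s = FUSE_REQ_INIT; s <= FUSE_REQ_FINISHED; s++)
		printf("%s%s", name[s], s == FUSE_REQ_FINISHED ? "\n" : " -> ");
	return 0;
}
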
@@ -125,6 +141,12 @@ struct fuse_req {
125 /** refcount */ 141 /** refcount */
126 atomic_t count; 142 atomic_t count;
127 143
144 /*
145 * The following bitfields are either set once before the
146 * request is queued or setting/clearing them is protected by
147 * fuse_lock
148 */
149
128 /** True if the request has reply */ 150 /** True if the request has reply */
129 unsigned isreply:1; 151 unsigned isreply:1;
130 152
@@ -140,11 +162,8 @@ struct fuse_req {
140 /** Data is being copied to/from the request */ 162 /** Data is being copied to/from the request */
141 unsigned locked:1; 163 unsigned locked:1;
142 164
143 /** Request has been sent to userspace */ 165 /** State of the request */
144 unsigned sent:1; 166 enum fuse_req_state state;
145
146 /** The request is finished */
147 unsigned finished:1;
148 167
149 /** The request input */ 168 /** The request input */
150 struct fuse_in in; 169 struct fuse_in in;
@@ -159,7 +178,9 @@ struct fuse_req {
159 union { 178 union {
160 struct fuse_forget_in forget_in; 179 struct fuse_forget_in forget_in;
161 struct fuse_release_in release_in; 180 struct fuse_release_in release_in;
162 struct fuse_init_in_out init_in_out; 181 struct fuse_init_in init_in;
182 struct fuse_init_out init_out;
183 struct fuse_read_in read_in;
163 } misc; 184 } misc;
164 185
165 /** page vector */ 186 /** page vector */
@@ -179,6 +200,9 @@ struct fuse_req {
179 200
180 /** File used in the request (or NULL) */ 201 /** File used in the request (or NULL) */
181 struct file *file; 202 struct file *file;
203
204 /** Request completion callback */
205 void (*end)(struct fuse_conn *, struct fuse_req *);
182}; 206};
183 207
184/** 208/**
@@ -189,9 +213,6 @@ struct fuse_req {
189 * unmounted. 213 * unmounted.
190 */ 214 */
191struct fuse_conn { 215struct fuse_conn {
192 /** Reference count */
193 int count;
194
195 /** The user id for this mount */ 216 /** The user id for this mount */
196 uid_t user_id; 217 uid_t user_id;
197 218
@@ -216,6 +237,9 @@ struct fuse_conn {
216 /** The list of requests being processed */ 237 /** The list of requests being processed */
217 struct list_head processing; 238 struct list_head processing;
218 239
240 /** The list of requests under I/O */
241 struct list_head io;
242
219 /** Requests put in the background (RELEASE or any other 243 /** Requests put in the background (RELEASE or any other
220 interrupted request) */ 244 interrupted request) */
221 struct list_head background; 245 struct list_head background;
@@ -237,14 +261,22 @@ struct fuse_conn {
237 u64 reqctr; 261 u64 reqctr;
238 262
239 /** Mount is active */ 263 /** Mount is active */
240 unsigned mounted : 1; 264 unsigned mounted;
241 265
242 /** Connection established */ 266 /** Connection established, cleared on umount, connection
243 unsigned connected : 1; 267 abort and device release */
268 unsigned connected;
244 269
245 /** Connection failed (version mismatch) */ 270 /** Connection failed (version mismatch). Cannot race with
271 setting other bitfields since it is only set once in INIT
272 reply, before any other request, and never cleared */
246 unsigned conn_error : 1; 273 unsigned conn_error : 1;
247 274
275 /*
276 * The following bitfields are only for optimization purposes
277 * and hence races in setting them will not cause malfunction
278 */
279
248 /** Is fsync not implemented by fs? */ 280 /** Is fsync not implemented by fs? */
249 unsigned no_fsync : 1; 281 unsigned no_fsync : 1;
250 282
@@ -272,18 +304,22 @@ struct fuse_conn {
272 /** Is create not implemented by fs? */ 304 /** Is create not implemented by fs? */
273 unsigned no_create : 1; 305 unsigned no_create : 1;
274 306
307 /** The number of requests waiting for completion */
308 atomic_t num_waiting;
309
310 /** Negotiated minor version */
311 unsigned minor;
312
275 /** Backing dev info */ 313 /** Backing dev info */
276 struct backing_dev_info bdi; 314 struct backing_dev_info bdi;
277};
278 315
279static inline struct fuse_conn **get_fuse_conn_super_p(struct super_block *sb) 316 /** kobject */
280{ 317 struct kobject kobj;
281 return (struct fuse_conn **) &sb->s_fs_info; 318};
282}
283 319
284static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) 320static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
285{ 321{
286 return *get_fuse_conn_super_p(sb); 322 return sb->s_fs_info;
287} 323}
288 324
289static inline struct fuse_conn *get_fuse_conn(struct inode *inode) 325static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
@@ -291,6 +327,11 @@ static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
291 return get_fuse_conn_super(inode->i_sb); 327 return get_fuse_conn_super(inode->i_sb);
292} 328}
293 329
330static inline struct fuse_conn *get_fuse_conn_kobj(struct kobject *obj)
331{
332 return container_of(obj, struct fuse_conn, kobj);
333}
334
294static inline struct fuse_inode *get_fuse_inode(struct inode *inode) 335static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
295{ 336{
296 return container_of(inode, struct fuse_inode, inode); 337 return container_of(inode, struct fuse_inode, inode);
@@ -332,11 +373,10 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
332 unsigned long nodeid, u64 nlookup); 373 unsigned long nodeid, u64 nlookup);
333 374
334/** 375/**
335 * Send READ or READDIR request 376 * Initialize READ or READDIR request
336 */ 377 */
337size_t fuse_send_read_common(struct fuse_req *req, struct file *file, 378void fuse_read_fill(struct fuse_req *req, struct file *file,
338 struct inode *inode, loff_t pos, size_t count, 379 struct inode *inode, loff_t pos, size_t count, int opcode);
339 int isdir);
340 380
341/** 381/**
342 * Send OPEN or OPENDIR request 382 * Send OPEN or OPENDIR request
@@ -391,12 +431,6 @@ void fuse_init_symlink(struct inode *inode);
391void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr); 431void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr);
392 432
393/** 433/**
394 * Check if the connection can be released, and if yes, then free the
395 * connection structure
396 */
397void fuse_release_conn(struct fuse_conn *fc);
398
399/**
400 * Initialize the client device 434 * Initialize the client device
401 */ 435 */
402int fuse_dev_init(void); 436int fuse_dev_init(void);
@@ -452,6 +486,9 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
452 */ 486 */
453void fuse_release_background(struct fuse_req *req); 487void fuse_release_background(struct fuse_req *req);
454 488
489/* Abort all requests */
490void fuse_abort_conn(struct fuse_conn *fc);
491
455/** 492/**
456 * Get the attributes of a file 493 * Get the attributes of a file
457 */ 494 */
@@ -461,8 +498,3 @@ int fuse_do_getattr(struct inode *inode);
461 * Invalidate inode attributes 498 * Invalidate inode attributes
462 */ 499 */
463void fuse_invalidate_attr(struct inode *inode); 500void fuse_invalidate_attr(struct inode *inode);
464
465/**
466 * Send the INIT message
467 */
468void fuse_send_init(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e69a546844d..c755a0440a6 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -24,6 +24,13 @@ MODULE_LICENSE("GPL");
24 24
25spinlock_t fuse_lock; 25spinlock_t fuse_lock;
26static kmem_cache_t *fuse_inode_cachep; 26static kmem_cache_t *fuse_inode_cachep;
27static struct subsystem connections_subsys;
28
29struct fuse_conn_attr {
30 struct attribute attr;
31 ssize_t (*show)(struct fuse_conn *, char *);
32 ssize_t (*store)(struct fuse_conn *, const char *, size_t);
33};
27 34
28#define FUSE_SUPER_MAGIC 0x65735546 35#define FUSE_SUPER_MAGIC 0x65735546
29 36
@@ -135,12 +142,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
135 fuse_init_common(inode); 142 fuse_init_common(inode);
136 init_special_inode(inode, inode->i_mode, 143 init_special_inode(inode, inode->i_mode,
137 new_decode_dev(attr->rdev)); 144 new_decode_dev(attr->rdev));
138 } else { 145 } else
139 /* Don't let user create weird files */ 146 BUG();
140 inode->i_mode = S_IFREG;
141 fuse_init_common(inode);
142 fuse_init_file_inode(inode);
143 }
144} 147}
145 148
146static int fuse_inode_eq(struct inode *inode, void *_nodeidp) 149static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
@@ -193,6 +196,11 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
193 return inode; 196 return inode;
194} 197}
195 198
199static void fuse_umount_begin(struct super_block *sb)
200{
201 fuse_abort_conn(get_fuse_conn_super(sb));
202}
203
196static void fuse_put_super(struct super_block *sb) 204static void fuse_put_super(struct super_block *sb)
197{ 205{
198 struct fuse_conn *fc = get_fuse_conn_super(sb); 206 struct fuse_conn *fc = get_fuse_conn_super(sb);
@@ -204,20 +212,20 @@ static void fuse_put_super(struct super_block *sb)
204 212
205 spin_lock(&fuse_lock); 213 spin_lock(&fuse_lock);
206 fc->mounted = 0; 214 fc->mounted = 0;
207 fc->user_id = 0; 215 fc->connected = 0;
208 fc->group_id = 0; 216 spin_unlock(&fuse_lock);
209 fc->flags = 0; 217 up_write(&fc->sbput_sem);
210 /* Flush all readers on this fs */ 218 /* Flush all readers on this fs */
211 wake_up_all(&fc->waitq); 219 wake_up_all(&fc->waitq);
212 up_write(&fc->sbput_sem); 220 kobject_del(&fc->kobj);
213 fuse_release_conn(fc); 221 kobject_put(&fc->kobj);
214 spin_unlock(&fuse_lock);
215} 222}
216 223
217static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) 224static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
218{ 225{
219 stbuf->f_type = FUSE_SUPER_MAGIC; 226 stbuf->f_type = FUSE_SUPER_MAGIC;
220 stbuf->f_bsize = attr->bsize; 227 stbuf->f_bsize = attr->bsize;
228 stbuf->f_frsize = attr->frsize;
221 stbuf->f_blocks = attr->blocks; 229 stbuf->f_blocks = attr->blocks;
222 stbuf->f_bfree = attr->bfree; 230 stbuf->f_bfree = attr->bfree;
223 stbuf->f_bavail = attr->bavail; 231 stbuf->f_bavail = attr->bavail;
@@ -238,10 +246,12 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
238 if (!req) 246 if (!req)
239 return -EINTR; 247 return -EINTR;
240 248
249 memset(&outarg, 0, sizeof(outarg));
241 req->in.numargs = 0; 250 req->in.numargs = 0;
242 req->in.h.opcode = FUSE_STATFS; 251 req->in.h.opcode = FUSE_STATFS;
243 req->out.numargs = 1; 252 req->out.numargs = 1;
244 req->out.args[0].size = sizeof(outarg); 253 req->out.args[0].size =
254 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
245 req->out.args[0].value = &outarg; 255 req->out.args[0].value = &outarg;
246 request_send(fc, req); 256 request_send(fc, req);
247 err = req->out.h.error; 257 err = req->out.h.error;
@@ -357,8 +367,10 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
357 return 0; 367 return 0;
358} 368}
359 369
360static void free_conn(struct fuse_conn *fc) 370static void fuse_conn_release(struct kobject *kobj)
361{ 371{
372 struct fuse_conn *fc = get_fuse_conn_kobj(kobj);
373
362 while (!list_empty(&fc->unused_list)) { 374 while (!list_empty(&fc->unused_list)) {
363 struct fuse_req *req; 375 struct fuse_req *req;
364 req = list_entry(fc->unused_list.next, struct fuse_req, list); 376 req = list_entry(fc->unused_list.next, struct fuse_req, list);
@@ -368,33 +380,28 @@ static void free_conn(struct fuse_conn *fc)
368 kfree(fc); 380 kfree(fc);
369} 381}
370 382
371/* Must be called with the fuse lock held */
372void fuse_release_conn(struct fuse_conn *fc)
373{
374 fc->count--;
375 if (!fc->count)
376 free_conn(fc);
377}
378
379static struct fuse_conn *new_conn(void) 383static struct fuse_conn *new_conn(void)
380{ 384{
381 struct fuse_conn *fc; 385 struct fuse_conn *fc;
382 386
383 fc = kmalloc(sizeof(*fc), GFP_KERNEL); 387 fc = kzalloc(sizeof(*fc), GFP_KERNEL);
384 if (fc != NULL) { 388 if (fc) {
385 int i; 389 int i;
386 memset(fc, 0, sizeof(*fc));
387 init_waitqueue_head(&fc->waitq); 390 init_waitqueue_head(&fc->waitq);
388 INIT_LIST_HEAD(&fc->pending); 391 INIT_LIST_HEAD(&fc->pending);
389 INIT_LIST_HEAD(&fc->processing); 392 INIT_LIST_HEAD(&fc->processing);
393 INIT_LIST_HEAD(&fc->io);
390 INIT_LIST_HEAD(&fc->unused_list); 394 INIT_LIST_HEAD(&fc->unused_list);
391 INIT_LIST_HEAD(&fc->background); 395 INIT_LIST_HEAD(&fc->background);
392 sema_init(&fc->outstanding_sem, 0); 396 sema_init(&fc->outstanding_sem, 1); /* One for INIT */
393 init_rwsem(&fc->sbput_sem); 397 init_rwsem(&fc->sbput_sem);
398 kobj_set_kset_s(fc, connections_subsys);
399 kobject_init(&fc->kobj);
400 atomic_set(&fc->num_waiting, 0);
394 for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) { 401 for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) {
395 struct fuse_req *req = fuse_request_alloc(); 402 struct fuse_req *req = fuse_request_alloc();
396 if (!req) { 403 if (!req) {
397 free_conn(fc); 404 kobject_put(&fc->kobj);
398 return NULL; 405 return NULL;
399 } 406 }
400 list_add(&req->list, &fc->unused_list); 407 list_add(&req->list, &fc->unused_list);
@@ -409,25 +416,32 @@ static struct fuse_conn *new_conn(void)
409static struct fuse_conn *get_conn(struct file *file, struct super_block *sb) 416static struct fuse_conn *get_conn(struct file *file, struct super_block *sb)
410{ 417{
411 struct fuse_conn *fc; 418 struct fuse_conn *fc;
419 int err;
412 420
421 err = -EINVAL;
413 if (file->f_op != &fuse_dev_operations) 422 if (file->f_op != &fuse_dev_operations)
414 return ERR_PTR(-EINVAL); 423 goto out_err;
424
425 err = -ENOMEM;
415 fc = new_conn(); 426 fc = new_conn();
416 if (fc == NULL) 427 if (!fc)
417 return ERR_PTR(-ENOMEM); 428 goto out_err;
429
418 spin_lock(&fuse_lock); 430 spin_lock(&fuse_lock);
419 if (file->private_data) { 431 err = -EINVAL;
420 free_conn(fc); 432 if (file->private_data)
421 fc = ERR_PTR(-EINVAL); 433 goto out_unlock;
422 } else { 434
423 file->private_data = fc; 435 kobject_get(&fc->kobj);
424 *get_fuse_conn_super_p(sb) = fc; 436 file->private_data = fc;
425 fc->mounted = 1;
426 fc->connected = 1;
427 fc->count = 2;
428 }
429 spin_unlock(&fuse_lock); 437 spin_unlock(&fuse_lock);
430 return fc; 438 return fc;
439
440 out_unlock:
441 spin_unlock(&fuse_lock);
442 kobject_put(&fc->kobj);
443 out_err:
444 return ERR_PTR(err);
431} 445}
432 446
433static struct inode *get_root_inode(struct super_block *sb, unsigned mode) 447static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
@@ -446,16 +460,74 @@ static struct super_operations fuse_super_operations = {
446 .read_inode = fuse_read_inode, 460 .read_inode = fuse_read_inode,
447 .clear_inode = fuse_clear_inode, 461 .clear_inode = fuse_clear_inode,
448 .put_super = fuse_put_super, 462 .put_super = fuse_put_super,
463 .umount_begin = fuse_umount_begin,
449 .statfs = fuse_statfs, 464 .statfs = fuse_statfs,
450 .show_options = fuse_show_options, 465 .show_options = fuse_show_options,
451}; 466};
452 467
468static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
469{
470 int i;
471 struct fuse_init_out *arg = &req->misc.init_out;
472
473 if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
474 fc->conn_error = 1;
475 else {
476 fc->minor = arg->minor;
477 fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
478 }
479
480 /* After INIT reply is received other requests can go
481 out. So do (FUSE_MAX_OUTSTANDING - 1) number of
482 up()s on outstanding_sem. The last up() is done in
483 fuse_putback_request() */
484 for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
485 up(&fc->outstanding_sem);
486
487 fuse_put_request(fc, req);
488}
489
490static void fuse_send_init(struct fuse_conn *fc)
491{
492 /* This is called from fuse_read_super() so there's guaranteed
493 to be exactly one request available */
494 struct fuse_req *req = fuse_get_request(fc);
495 struct fuse_init_in *arg = &req->misc.init_in;
496
497 arg->major = FUSE_KERNEL_VERSION;
498 arg->minor = FUSE_KERNEL_MINOR_VERSION;
499 req->in.h.opcode = FUSE_INIT;
500 req->in.numargs = 1;
501 req->in.args[0].size = sizeof(*arg);
502 req->in.args[0].value = arg;
503 req->out.numargs = 1;
504        /* Variable length argument used for backward compatibility
505 with interface version < 7.5. Rest of init_out is zeroed
506 by do_get_request(), so a short reply is not a problem */
507 req->out.argvar = 1;
508 req->out.args[0].size = sizeof(struct fuse_init_out);
509 req->out.args[0].value = &req->misc.init_out;
510 req->end = process_init_reply;
511 request_send_background(fc, req);
512}
513
514static unsigned long long conn_id(void)
515{
516 static unsigned long long ctr = 1;
517 unsigned long long val;
518 spin_lock(&fuse_lock);
519 val = ctr++;
520 spin_unlock(&fuse_lock);
521 return val;
522}
523
453static int fuse_fill_super(struct super_block *sb, void *data, int silent) 524static int fuse_fill_super(struct super_block *sb, void *data, int silent)
454{ 525{
455 struct fuse_conn *fc; 526 struct fuse_conn *fc;
456 struct inode *root; 527 struct inode *root;
457 struct fuse_mount_data d; 528 struct fuse_mount_data d;
458 struct file *file; 529 struct file *file;
530 struct dentry *root_dentry;
459 int err; 531 int err;
460 532
461 if (!parse_fuse_opt((char *) data, &d)) 533 if (!parse_fuse_opt((char *) data, &d))
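
process_init_reply() and the fuse_statfs() change above gate sizes on the negotiated minor version. A userspace model of those cutoffs (the 4096 fallback and the minor-4/minor-5 thresholds come from the patch; the compat statfs size of 48 is an assumption standing in for FUSE_COMPAT_STATFS_SIZE, whose value is not shown in these hunks):

#include <stdio.h>

static unsigned max_write(unsigned minor, unsigned advertised)
{
	return minor < 5 ? 4096 : advertised;     /* cutoff from the patch */
}

static unsigned statfs_reply_size(unsigned minor, unsigned full)
{
	return minor < 4 ? 48 /* assumed FUSE_COMPAT_STATFS_SIZE */ : full;
}

int main(void)
{
	printf("minor=3: max_write=%u statfs=%u\n",
	       max_write(3, 65536), statfs_reply_size(3, 80));
	printf("minor=5: max_write=%u statfs=%u\n",
	       max_write(5, 65536), statfs_reply_size(5, 80));
	return 0;
}
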
@@ -482,25 +554,43 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
482 fc->max_read = d.max_read; 554 fc->max_read = d.max_read;
483 if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages) 555 if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
484 fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE; 556 fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
485 fc->max_write = FUSE_MAX_IN / 2; 557
558 /* Used by get_root_inode() */
559 sb->s_fs_info = fc;
486 560
487 err = -ENOMEM; 561 err = -ENOMEM;
488 root = get_root_inode(sb, d.rootmode); 562 root = get_root_inode(sb, d.rootmode);
489 if (root == NULL) 563 if (!root)
490 goto err; 564 goto err;
491 565
492 sb->s_root = d_alloc_root(root); 566 root_dentry = d_alloc_root(root);
493 if (!sb->s_root) { 567 if (!root_dentry) {
494 iput(root); 568 iput(root);
495 goto err; 569 goto err;
496 } 570 }
571
572 err = kobject_set_name(&fc->kobj, "%llu", conn_id());
573 if (err)
574 goto err_put_root;
575
576 err = kobject_add(&fc->kobj);
577 if (err)
578 goto err_put_root;
579
580 sb->s_root = root_dentry;
581 spin_lock(&fuse_lock);
582 fc->mounted = 1;
583 fc->connected = 1;
584 spin_unlock(&fuse_lock);
585
497 fuse_send_init(fc); 586 fuse_send_init(fc);
587
498 return 0; 588 return 0;
499 589
590 err_put_root:
591 dput(root_dentry);
500 err: 592 err:
501 spin_lock(&fuse_lock); 593 kobject_put(&fc->kobj);
502 fuse_release_conn(fc);
503 spin_unlock(&fuse_lock);
504 return err; 594 return err;
505} 595}
506 596
@@ -518,6 +608,69 @@ static struct file_system_type fuse_fs_type = {
518 .kill_sb = kill_anon_super, 608 .kill_sb = kill_anon_super,
519}; 609};
520 610
611static ssize_t fuse_conn_waiting_show(struct fuse_conn *fc, char *page)
612{
613 return sprintf(page, "%i\n", atomic_read(&fc->num_waiting));
614}
615
616static ssize_t fuse_conn_abort_store(struct fuse_conn *fc, const char *page,
617 size_t count)
618{
619 fuse_abort_conn(fc);
620 return count;
621}
622
623static struct fuse_conn_attr fuse_conn_waiting =
624 __ATTR(waiting, 0400, fuse_conn_waiting_show, NULL);
625static struct fuse_conn_attr fuse_conn_abort =
626 __ATTR(abort, 0600, NULL, fuse_conn_abort_store);
627
628static struct attribute *fuse_conn_attrs[] = {
629 &fuse_conn_waiting.attr,
630 &fuse_conn_abort.attr,
631 NULL,
632};
633
634static ssize_t fuse_conn_attr_show(struct kobject *kobj,
635 struct attribute *attr,
636 char *page)
637{
638 struct fuse_conn_attr *fca =
639 container_of(attr, struct fuse_conn_attr, attr);
640
641 if (fca->show)
642 return fca->show(get_fuse_conn_kobj(kobj), page);
643 else
644 return -EACCES;
645}
646
647static ssize_t fuse_conn_attr_store(struct kobject *kobj,
648 struct attribute *attr,
649 const char *page, size_t count)
650{
651 struct fuse_conn_attr *fca =
652 container_of(attr, struct fuse_conn_attr, attr);
653
654 if (fca->store)
655 return fca->store(get_fuse_conn_kobj(kobj), page, count);
656 else
657 return -EACCES;
658}
659
660static struct sysfs_ops fuse_conn_sysfs_ops = {
661 .show = &fuse_conn_attr_show,
662 .store = &fuse_conn_attr_store,
663};
664
665static struct kobj_type ktype_fuse_conn = {
666 .release = fuse_conn_release,
667 .sysfs_ops = &fuse_conn_sysfs_ops,
668 .default_attrs = fuse_conn_attrs,
669};
670
671static decl_subsys(fuse, NULL, NULL);
672static decl_subsys(connections, &ktype_fuse_conn, NULL);
673
521static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep, 674static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep,
522 unsigned long flags) 675 unsigned long flags)
523{ 676{
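
The attributes registered above surface each connection under sysfs. A hedged userspace example of using them; the /sys/fs/fuse/connections/<id>/ path follows from the 'fuse' and 'connections' subsystems plus the conn_id() naming, and the id value 1 below is illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	int fd;
	ssize_t n;

	fd = open("/sys/fs/fuse/connections/1/waiting", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("requests waiting: %s", buf);
		}
		close(fd);
	}

	/* writing anything to .../abort forces the connection down */
	fd = open("/sys/fs/fuse/connections/1/abort", O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "1", 1) != 1)
			perror("abort");
		close(fd);
	}
	return 0;
}
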
@@ -555,6 +708,34 @@ static void fuse_fs_cleanup(void)
555 kmem_cache_destroy(fuse_inode_cachep); 708 kmem_cache_destroy(fuse_inode_cachep);
556} 709}
557 710
711static int fuse_sysfs_init(void)
712{
713 int err;
714
715 kset_set_kset_s(&fuse_subsys, fs_subsys);
716 err = subsystem_register(&fuse_subsys);
717 if (err)
718 goto out_err;
719
720 kset_set_kset_s(&connections_subsys, fuse_subsys);
721 err = subsystem_register(&connections_subsys);
722 if (err)
723 goto out_fuse_unregister;
724
725 return 0;
726
727 out_fuse_unregister:
728 subsystem_unregister(&fuse_subsys);
729 out_err:
730 return err;
731}
732
733static void fuse_sysfs_cleanup(void)
734{
735 subsystem_unregister(&connections_subsys);
736 subsystem_unregister(&fuse_subsys);
737}
738
558static int __init fuse_init(void) 739static int __init fuse_init(void)
559{ 740{
560 int res; 741 int res;
@@ -571,8 +752,14 @@ static int __init fuse_init(void)
571 if (res) 752 if (res)
572 goto err_fs_cleanup; 753 goto err_fs_cleanup;
573 754
755 res = fuse_sysfs_init();
756 if (res)
757 goto err_dev_cleanup;
758
574 return 0; 759 return 0;
575 760
761 err_dev_cleanup:
762 fuse_dev_cleanup();
576 err_fs_cleanup: 763 err_fs_cleanup:
577 fuse_fs_cleanup(); 764 fuse_fs_cleanup();
578 err: 765 err:
@@ -583,6 +770,7 @@ static void __exit fuse_exit(void)
583{ 770{
584 printk(KERN_DEBUG "fuse exit\n"); 771 printk(KERN_DEBUG "fuse exit\n");
585 772
773 fuse_sysfs_cleanup();
586 fuse_fs_cleanup(); 774 fuse_fs_cleanup();
587 fuse_dev_cleanup(); 775 fuse_dev_cleanup();
588} 776}
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 89450ae3222..f13f1494d4f 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -64,7 +64,6 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
64 else 64 else
65 e = rec - 1; 65 e = rec - 1;
66 } while (b <= e); 66 } while (b <= e);
67 //printk("%d: %d,%d,%d\n", bnode->this, b, e, rec);
68 if (rec != e && e >= 0) { 67 if (rec != e && e >= 0) {
69 len = hfs_brec_lenoff(bnode, e, &off); 68 len = hfs_brec_lenoff(bnode, e, &off);
70 keylen = hfs_brec_keylen(bnode, e); 69 keylen = hfs_brec_keylen(bnode, e);
@@ -127,7 +126,7 @@ int hfs_brec_find(struct hfs_find_data *fd)
127 return res; 126 return res;
128 127
129invalid: 128invalid:
130 printk("HFS: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", 129 printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
131 height, bnode->height, bnode->type, nidx, parent); 130 height, bnode->height, bnode->type, nidx, parent);
132 res = -EIO; 131 res = -EIO;
133release: 132release:
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 3d5cdc6847c..a7a7d77f3fd 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -198,7 +198,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 
 	// move down?
 	if (!node->prev && !node->next) {
-		printk("hfs_btree_del_level\n");
+		printk(KERN_DEBUG "hfs_btree_del_level\n");
 	}
 	if (!node->parent) {
 		tree->root = 0;
@@ -219,7 +219,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
 	struct hfs_bnode *node;
 
 	if (cnid >= tree->node_count) {
-		printk("HFS: request for non-existent node %d in B*Tree\n", cnid);
+		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
 		return NULL;
 	}
 
@@ -242,7 +242,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	loff_t off;
 
 	if (cnid >= tree->node_count) {
-		printk("HFS: request for non-existent node %d in B*Tree\n", cnid);
+		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
 		return NULL;
 	}
 
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 7d8fff2c25f..5c87cf4801f 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -362,7 +362,7 @@ again:
 	end_off = hfs_bnode_read_u16(parent, end_rec_off);
 	if (end_rec_off - end_off < diff) {
 
-		printk("splitting index node...\n");
+		printk(KERN_DEBUG "hfs: splitting index node...\n");
 		fd->bnode = parent;
 		new_node = hfs_bnode_split(fd);
 		if (IS_ERR(new_node))
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 394725efa1c..7bb11edd148 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -111,7 +111,7 @@ void hfs_btree_close(struct hfs_btree *tree)
 		while ((node = tree->node_hash[i])) {
 			tree->node_hash[i] = node->next_hash;
 			if (atomic_read(&node->refcnt))
-				printk("HFS: node %d:%d still has %d user(s)!\n",
+				printk(KERN_ERR "hfs: node %d:%d still has %d user(s)!\n",
 					node->tree->cnid, node->this, atomic_read(&node->refcnt));
 			hfs_bnode_free(node);
 			tree->node_hash_cnt--;
@@ -252,7 +252,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 		kunmap(*pagep);
 		nidx = node->next;
 		if (!nidx) {
-			printk("create new bmap node...\n");
+			printk(KERN_DEBUG "hfs: create new bmap node...\n");
 			next_node = hfs_bmap_new_bmap(node, idx);
 		} else
 			next_node = hfs_bnode_find(tree, nidx);
@@ -292,7 +292,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
 		hfs_bnode_put(node);
 		if (!i) {
 			/* panic */;
-			printk("HFS: unable to free bnode %u. bmap not found!\n", node->this);
+			printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this);
 			return;
 		}
 		node = hfs_bnode_find(tree, i);
@@ -300,7 +300,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
 			return;
 		if (node->type != HFS_NODE_MAP) {
 			/* panic */;
-			printk("HFS: invalid bmap found! (%u,%d)\n", node->this, node->type);
+			printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type);
 			hfs_bnode_put(node);
 			return;
 		}
@@ -313,7 +313,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
 		m = 1 << (~nidx & 7);
 		byte = data[off];
 		if (!(byte & m)) {
-			printk("HFS: trying to free free bnode %u(%d)\n", node->this, node->type);
+			printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type);
 			kunmap(page);
 			hfs_bnode_put(node);
 			return;
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 2fcd679f023..ba851576ebb 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -184,7 +184,7 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
 
 	type = rec.type;
 	if (type != HFS_CDR_THD && type != HFS_CDR_FTH) {
-		printk("HFS-fs: Found bad thread record in catalog\n");
+		printk(KERN_ERR "hfs: found bad thread record in catalog\n");
 		return -EIO;
 	}
 
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index e1f24befba5..534e5a7480e 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -81,12 +81,12 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	case 1:
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
 		if (entry.type != HFS_CDR_THD) {
-			printk("HFS: bad catalog folder thread\n");
+			printk(KERN_ERR "hfs: bad catalog folder thread\n");
 			err = -EIO;
 			goto out;
 		}
 		//if (fd.entrylength < HFS_MIN_THREAD_SZ) {
-		//	printk("HFS: truncated catalog thread\n");
+		//	printk(KERN_ERR "hfs: truncated catalog thread\n");
 		//	err = -EIO;
 		//	goto out;
 		//}
@@ -105,7 +105,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	for (;;) {
 		if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) {
-			printk("HFS: walked past end of dir\n");
+			printk(KERN_ERR "hfs: walked past end of dir\n");
 			err = -EIO;
 			goto out;
 		}
@@ -114,7 +114,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName);
 		if (type == HFS_CDR_DIR) {
 			if (fd.entrylength < sizeof(struct hfs_cat_dir)) {
-				printk("HFS: small dir entry\n");
+				printk(KERN_ERR "hfs: small dir entry\n");
 				err = -EIO;
 				goto out;
 			}
@@ -123,7 +123,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				break;
 		} else if (type == HFS_CDR_FIL) {
 			if (fd.entrylength < sizeof(struct hfs_cat_file)) {
-				printk("HFS: small file entry\n");
+				printk(KERN_ERR "hfs: small file entry\n");
 				err = -EIO;
 				goto out;
 			}
@@ -131,7 +131,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				be32_to_cpu(entry.file.FlNum), DT_REG))
 				break;
 		} else {
-			printk("HFS: bad catalog entry type %d\n", type);
+			printk(KERN_ERR "hfs: bad catalog entry type %d\n", type);
 			err = -EIO;
 			goto out;
 		}
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index cc5dcd52e23..18ce47ab1b7 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -35,9 +35,6 @@
 #define dprint(flg, fmt, args...) \
 	if (flg & DBG_MASK) printk(fmt , ## args)
 
-#define hfs_warn(format, args...) printk(KERN_WARNING format , ## args)
-#define hfs_error(format, args...) printk(KERN_ERR format , ## args)
-
 /*
  * struct hfs_inode_info
 *
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index d499393a8ae..39fd85b9b91 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -95,7 +95,6 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
 		} while (--i && nidx < tree->node_count);
 		spin_unlock(&tree->hash_lock);
 	}
-	//printk("releasepage: %lu,%x = %d\n", page->index, mask, res);
 	return res ? try_to_free_buffers(page) : 0;
 }
 
@@ -547,13 +546,13 @@ static int hfs_file_release(struct inode *inode, struct file *file)
 	if (atomic_read(&file->f_count) != 0)
 		return 0;
 	if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		hfs_file_truncate(inode);
 		//if (inode->i_flags & S_DEAD) {
 		//	hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
 		//	hfs_delete_inode(inode);
 		//}
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	return 0;
 }
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 0a473f79c89..b4651e128d7 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -47,7 +47,7 @@ static int hfs_get_last_session(struct super_block *sb,
 			*start = (sector_t)te.cdte_addr.lba << 2;
 			return 0;
 		}
-		printk(KERN_ERR "HFS: Invalid session number or type of track\n");
+		printk(KERN_ERR "hfs: invalid session number or type of track\n");
 		return -EINVAL;
 	}
 	ms_info.addr_format = CDROM_LBA;
@@ -100,7 +100,7 @@ int hfs_mdb_get(struct super_block *sb)
 
 	HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz);
 	if (!size || (size & (HFS_SECTOR_SIZE - 1))) {
-		hfs_warn("hfs_fs: bad allocation block size %d\n", size);
+		printk(KERN_ERR "hfs: bad allocation block size %d\n", size);
 		goto out_bh;
 	}
 
@@ -117,7 +117,7 @@ int hfs_mdb_get(struct super_block *sb)
 		size >>= 1;
 	brelse(bh);
 	if (!sb_set_blocksize(sb, size)) {
-		printk("hfs_fs: unable to set blocksize to %u\n", size);
+		printk(KERN_ERR "hfs: unable to set blocksize to %u\n", size);
 		goto out;
 	}
 
@@ -161,8 +161,8 @@ int hfs_mdb_get(struct super_block *sb)
 	}
 
 	if (!HFS_SB(sb)->alt_mdb) {
-		hfs_warn("hfs_fs: unable to locate alternate MDB\n");
-		hfs_warn("hfs_fs: continuing without an alternate MDB\n");
+		printk(KERN_WARNING "hfs: unable to locate alternate MDB\n");
+		printk(KERN_WARNING "hfs: continuing without an alternate MDB\n");
 	}
 
 	HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0);
@@ -177,7 +177,7 @@ int hfs_mdb_get(struct super_block *sb)
 	while (size) {
 		bh = sb_bread(sb, off >> sb->s_blocksize_bits);
 		if (!bh) {
-			hfs_warn("hfs_fs: unable to read volume bitmap\n");
+			printk(KERN_ERR "hfs: unable to read volume bitmap\n");
 			goto out;
 		}
 		off2 = off & (sb->s_blocksize - 1);
@@ -191,23 +191,23 @@ int hfs_mdb_get(struct super_block *sb)
 
 	HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp);
 	if (!HFS_SB(sb)->ext_tree) {
-		hfs_warn("hfs_fs: unable to open extent tree\n");
+		printk(KERN_ERR "hfs: unable to open extent tree\n");
 		goto out;
 	}
 	HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp);
 	if (!HFS_SB(sb)->cat_tree) {
-		hfs_warn("hfs_fs: unable to open catalog tree\n");
+		printk(KERN_ERR "hfs: unable to open catalog tree\n");
 		goto out;
 	}
 
 	attrib = mdb->drAtrb;
 	if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
-		hfs_warn("HFS-fs warning: Filesystem was not cleanly unmounted, "
+		printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, "
			 "running fsck.hfs is recommended. mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	}
 	if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) {
-		hfs_warn("HFS-fs: Filesystem is marked locked, mounting read-only.\n");
+		printk(KERN_WARNING "hfs: filesystem is marked locked, mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	}
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -303,7 +303,7 @@ void hfs_mdb_commit(struct super_block *sb)
 		while (size) {
 			bh = sb_bread(sb, block);
 			if (!bh) {
-				hfs_warn("hfs_fs: unable to read volume bitmap\n");
+				printk(KERN_ERR "hfs: unable to read volume bitmap\n");
 				break;
 			}
 			len = min((int)sb->s_blocksize - off, size);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index c5074aeafca..1181d116117 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -101,12 +101,12 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data)
 		return 0;
 	if (!(*flags & MS_RDONLY)) {
 		if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
-			printk("HFS-fs warning: Filesystem was not cleanly unmounted, "
+			printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, "
			       "running fsck.hfs is recommended. leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
 		} else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) {
-			printk("HFS-fs: Filesystem is marked locked, leaving read-only.\n");
+			printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
 		}
@@ -229,21 +229,21 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
 		switch (token) {
 		case opt_uid:
 			if (match_int(&args[0], &tmp)) {
-				printk("HFS: uid requires an argument\n");
+				printk(KERN_ERR "hfs: uid requires an argument\n");
 				return 0;
 			}
 			hsb->s_uid = (uid_t)tmp;
 			break;
 		case opt_gid:
 			if (match_int(&args[0], &tmp)) {
-				printk("HFS: gid requires an argument\n");
+				printk(KERN_ERR "hfs: gid requires an argument\n");
 				return 0;
 			}
 			hsb->s_gid = (gid_t)tmp;
 			break;
 		case opt_umask:
 			if (match_octal(&args[0], &tmp)) {
-				printk("HFS: umask requires a value\n");
+				printk(KERN_ERR "hfs: umask requires a value\n");
 				return 0;
 			}
 			hsb->s_file_umask = (umode_t)tmp;
@@ -251,39 +251,39 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
 			break;
 		case opt_file_umask:
 			if (match_octal(&args[0], &tmp)) {
-				printk("HFS: file_umask requires a value\n");
+				printk(KERN_ERR "hfs: file_umask requires a value\n");
 				return 0;
 			}
 			hsb->s_file_umask = (umode_t)tmp;
 			break;
 		case opt_dir_umask:
 			if (match_octal(&args[0], &tmp)) {
-				printk("HFS: dir_umask requires a value\n");
+				printk(KERN_ERR "hfs: dir_umask requires a value\n");
 				return 0;
 			}
 			hsb->s_dir_umask = (umode_t)tmp;
 			break;
 		case opt_part:
 			if (match_int(&args[0], &hsb->part)) {
-				printk("HFS: part requires an argument\n");
+				printk(KERN_ERR "hfs: part requires an argument\n");
 				return 0;
 			}
 			break;
 		case opt_session:
 			if (match_int(&args[0], &hsb->session)) {
-				printk("HFS: session requires an argument\n");
+				printk(KERN_ERR "hfs: session requires an argument\n");
 				return 0;
 			}
 			break;
 		case opt_type:
 			if (match_fourchar(&args[0], &hsb->s_type)) {
-				printk("HFS+-fs: type requires a 4 character value\n");
+				printk(KERN_ERR "hfs: type requires a 4 character value\n");
 				return 0;
 			}
 			break;
 		case opt_creator:
 			if (match_fourchar(&args[0], &hsb->s_creator)) {
-				printk("HFS+-fs: creator requires a 4 character value\n");
+				printk(KERN_ERR "hfs: creator requires a 4 character value\n");
 				return 0;
 			}
 			break;
@@ -292,13 +292,13 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
 			break;
 		case opt_codepage:
 			if (hsb->nls_disk) {
-				printk("HFS+-fs: unable to change codepage\n");
+				printk(KERN_ERR "hfs: unable to change codepage\n");
 				return 0;
 			}
 			p = match_strdup(&args[0]);
 			hsb->nls_disk = load_nls(p);
 			if (!hsb->nls_disk) {
-				printk("HFS+-fs: unable to load codepage \"%s\"\n", p);
+				printk(KERN_ERR "hfs: unable to load codepage \"%s\"\n", p);
 				kfree(p);
 				return 0;
 			}
@@ -306,13 +306,13 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
 			break;
 		case opt_iocharset:
 			if (hsb->nls_io) {
-				printk("HFS: unable to change iocharset\n");
+				printk(KERN_ERR "hfs: unable to change iocharset\n");
 				return 0;
 			}
 			p = match_strdup(&args[0]);
 			hsb->nls_io = load_nls(p);
 			if (!hsb->nls_io) {
-				printk("HFS: unable to load iocharset \"%s\"\n", p);
+				printk(KERN_ERR "hfs: unable to load iocharset \"%s\"\n", p);
 				kfree(p);
 				return 0;
 			}
@@ -326,7 +326,7 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
 	if (hsb->nls_disk && !hsb->nls_io) {
 		hsb->nls_io = load_nls_default();
 		if (!hsb->nls_io) {
-			printk("HFS: unable to load default iocharset\n");
+			printk(KERN_ERR "hfs: unable to load default iocharset\n");
 			return 0;
 		}
 	}
@@ -364,7 +364,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	res = -EINVAL;
 	if (!parse_options((char *)data, sbi)) {
-		hfs_warn("hfs_fs: unable to parse mount options.\n");
+		printk(KERN_ERR "hfs: unable to parse mount options.\n");
 		goto bail;
 	}
 
@@ -375,7 +375,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	res = hfs_mdb_get(sb);
 	if (res) {
 		if (!silent)
-			hfs_warn("VFS: Can't find a HFS filesystem on dev %s.\n",
+			printk(KERN_WARNING "hfs: can't find a HFS filesystem on dev %s.\n",
 				hfs_mdb_name(sb));
 		res = -EINVAL;
 		goto bail;
@@ -407,7 +407,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
bail_iput:
 	iput(root_inode);
bail_no_root:
-	hfs_warn("hfs_fs: get root inode failed.\n");
+	printk(KERN_ERR "hfs: get root inode failed.\n");
bail:
 	hfs_mdb_put(sb);
 	return res;
@@ -454,7 +454,7 @@ static void __exit exit_hfs_fs(void)
 {
 	unregister_filesystem(&hfs_fs_type);
 	if (kmem_cache_destroy(hfs_inode_cachep))
-		printk(KERN_INFO "hfs_inode_cache: not all structures were freed\n");
+		printk(KERN_ERR "hfs_inode_cache: not all structures were freed\n");
 }
 
 module_init(init_hfs_fs)
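
parse_options() above (and hfsplus_parse_options() further down) are both built on the match_token() helpers from lib/parser.h that appear throughout these hunks: each recognized option string maps to a token plus captured substring arguments. A rough sketch of the shape, with an abbreviated table (the option names here are examples, not the full hfs set):

	#include <linux/parser.h>

	enum { opt_uid, opt_umask, opt_err };

	static match_table_t tokens = {
		{ opt_uid,   "uid=%u" },
		{ opt_umask, "umask=%o" },
		{ opt_err,   NULL }
	};

	/* parse one comma-separated option string "p" */
	static int parse_one(char *p, uid_t *uid, umode_t *umask)
	{
		substring_t args[MAX_OPT_ARGS];
		int tmp;

		switch (match_token(p, tokens, args)) {
		case opt_uid:
			if (match_int(&args[0], &tmp))	/* "uid=" takes a decimal value */
				return 0;
			*uid = (uid_t)tmp;
			break;
		case opt_umask:
			if (match_octal(&args[0], &tmp))	/* "umask=" takes an octal value */
				return 0;
			*umask = (umode_t)tmp;
			break;
		}
		return 1;
	}

match_fourchar(), used for the type= and creator= options, is an hfs-local helper following the same convention.
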
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 257cdde0514..5007a41f1be 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -64,7 +64,6 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 		else
 			e = rec - 1;
 	} while (b <= e);
-	//printk("%d: %d,%d,%d\n", bnode->this, b, e, rec);
 	if (rec != e && e >= 0) {
 		len = hfs_brec_lenoff(bnode, e, &off);
 		keylen = hfs_brec_keylen(bnode, e);
@@ -127,7 +126,7 @@ int hfs_brec_find(struct hfs_find_data *fd)
 		return res;
 
 invalid:
-	printk("HFS+-fs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
+	printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
 		height, bnode->height, bnode->type, nidx, parent);
 	res = -EIO;
release:
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index c7d316455fa..9fb51632303 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -29,7 +29,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		return size;
 
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-	down(&HFSPLUS_SB(sb).alloc_file->i_sem);
+	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
 	page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
			       (filler_t *)mapping->a_ops->readpage, NULL);
@@ -143,7 +143,7 @@ done:
 	sb->s_dirt = 1;
 	dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
out:
-	up(&HFSPLUS_SB(sb).alloc_file->i_sem);
+	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	return start;
 }
 
@@ -164,7 +164,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 	if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
 		return -2;
 
-	down(&HFSPLUS_SB(sb).alloc_file->i_sem);
+	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
 	page = read_cache_page(mapping, pnr, (filler_t *)mapping->a_ops->readpage, NULL);
@@ -215,7 +215,7 @@ out:
 	kunmap(page);
 	HFSPLUS_SB(sb).free_blocks += len;
 	sb->s_dirt = 1;
-	up(&HFSPLUS_SB(sb).alloc_file->i_sem);
+	mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 
 	return 0;
 }
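
The down()/up() to mutex_lock()/mutex_unlock() changes here and in the hfs/hfsplus inode release paths track the 2.6.16-era conversion of inode->i_sem (a counting semaphore) to inode->i_mutex (a dedicated mutex with stricter debugging semantics); the locking scope is unchanged, only the primitive. The pattern in isolation:

	/* old: semaphore used as a sleeping lock */
	down(&inode->i_sem);
	/* ... modify the allocation bitmap ... */
	up(&inode->i_sem);

	/* new: real mutex; owner-release and recursion rules are enforced */
	mutex_lock(&inode->i_mutex);
	/* ... modify the allocation bitmap ... */
	mutex_unlock(&inode->i_mutex);
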
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 930cd9212de..8f07e8fbd03 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -358,7 +358,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 
 	// move down?
 	if (!node->prev && !node->next) {
-		printk("hfs_btree_del_level\n");
+		printk(KERN_DEBUG "hfs_btree_del_level\n");
 	}
 	if (!node->parent) {
 		tree->root = 0;
@@ -379,7 +379,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
 	struct hfs_bnode *node;
 
 	if (cnid >= tree->node_count) {
-		printk("HFS+-fs: request for non-existent node %d in B*Tree\n", cnid);
+		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
 		return NULL;
 	}
 
@@ -402,7 +402,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	loff_t off;
 
 	if (cnid >= tree->node_count) {
-		printk("HFS+-fs: request for non-existent node %d in B*Tree\n", cnid);
+		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
 		return NULL;
 	}
 
@@ -576,8 +576,9 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
 	node = hfs_bnode_findhash(tree, num);
 	spin_unlock(&tree->hash_lock);
 	if (node) {
-		printk("new node %u already hashed?\n", num);
-		BUG();
+		printk(KERN_CRIT "new node %u already hashed?\n", num);
+		WARN_ON(1);
+		return node;
 	}
 	node = __hfs_bnode_create(tree, num);
 	if (!node)
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 0ccef2ab790..c88e5d72a40 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -360,7 +360,7 @@ again:
 	end_off = hfs_bnode_read_u16(parent, end_rec_off);
 	if (end_rec_off - end_off < diff) {
 
-		printk("splitting index node...\n");
+		printk(KERN_DEBUG "hfs: splitting index node...\n");
 		fd->bnode = parent;
 		new_node = hfs_bnode_split(fd);
 		if (IS_ERR(new_node))
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 44326aa2bd3..a67edfa34e9 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -31,17 +31,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 
 	init_MUTEX(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
-	/* Set the correct compare function */
 	tree->sb = sb;
 	tree->cnid = id;
-	if (id == HFSPLUS_EXT_CNID) {
-		tree->keycmp = hfsplus_ext_cmp_key;
-	} else if (id == HFSPLUS_CAT_CNID) {
-		tree->keycmp = hfsplus_cat_cmp_key;
-	} else {
-		printk("HFS+-fs: unknown B*Tree requested\n");
-		goto free_tree;
-	}
 	tree->inode = iget(sb, id);
 	if (!tree->inode)
 		goto free_tree;
@@ -64,6 +55,20 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	tree->max_key_len = be16_to_cpu(head->max_key_len);
 	tree->depth = be16_to_cpu(head->depth);
 
+	/* Set the correct compare function */
+	if (id == HFSPLUS_EXT_CNID) {
+		tree->keycmp = hfsplus_ext_cmp_key;
+	} else if (id == HFSPLUS_CAT_CNID) {
+		if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
+		    (head->key_type == HFSPLUS_KEY_BINARY))
+			tree->keycmp = hfsplus_cat_bin_cmp_key;
+		else
+			tree->keycmp = hfsplus_cat_case_cmp_key;
+	} else {
+		printk(KERN_ERR "hfs: unknown B*Tree requested\n");
+		goto fail_page;
+	}
+
 	size = tree->node_size;
 	if (!size || size & (size - 1))
 		goto fail_page;
@@ -99,7 +104,7 @@ void hfs_btree_close(struct hfs_btree *tree)
 		while ((node = tree->node_hash[i])) {
 			tree->node_hash[i] = node->next_hash;
 			if (atomic_read(&node->refcnt))
-				printk("HFS+: node %d:%d still has %d user(s)!\n",
+				printk(KERN_CRIT "hfs: node %d:%d still has %d user(s)!\n",
 					node->tree->cnid, node->this, atomic_read(&node->refcnt));
 			hfs_bnode_free(node);
 			tree->node_hash_cnt--;
@@ -223,10 +228,6 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 			tree->free_nodes--;
 			mark_inode_dirty(tree->inode);
 			hfs_bnode_put(node);
-			if (!idx) {
-				printk("unexpected idx %u (%u)\n", idx, node->this);
-				BUG();
-			}
 			return hfs_bnode_create(tree, idx);
 		}
 	}
@@ -242,7 +243,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 		kunmap(*pagep);
 		nidx = node->next;
 		if (!nidx) {
-			printk("create new bmap node...\n");
+			printk(KERN_DEBUG "hfs: create new bmap node...\n");
 			next_node = hfs_bmap_new_bmap(node, idx);
 		} else
 			next_node = hfs_bnode_find(tree, nidx);
@@ -284,7 +285,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
 		hfs_bnode_put(node);
 		if (!i) {
 			/* panic */;
-			printk("HFS: unable to free bnode %u. bmap not found!\n", node->this);
+			printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this);
 			return;
 		}
 		node = hfs_bnode_find(tree, i);
@@ -292,7 +293,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
 			return;
 		if (node->type != HFS_NODE_MAP) {
 			/* panic */;
-			printk("HFS: invalid bmap found! (%u,%d)\n", node->this, node->type);
+			printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type);
 			hfs_bnode_put(node);
 			return;
 		}
@@ -305,7 +306,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
 		m = 1 << (~nidx & 7);
 		byte = data[off];
 		if (!(byte & m)) {
-			printk("HFS: trying to free free bnode %u(%d)\n", node->this, node->type);
+			printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type);
 			kunmap(page);
 			hfs_bnode_put(node);
 			return;
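
The hfs_btree_open() change above moves comparator selection from before the header node is read to after, because on an HFSX volume the header's key_type byte decides how catalog names compare: 0xCF (HFSPLUS_KEY_CASEFOLDING) means case-insensitive, 0xBC (HFSPLUS_KEY_BINARY) means raw binary. The decision, reduced to its essentials (using the flags and constants introduced by this patch):

	/* header node must already be mapped so head->key_type is valid */
	if (id == HFSPLUS_EXT_CNID) {
		tree->keycmp = hfsplus_ext_cmp_key;
	} else if (id == HFSPLUS_CAT_CNID) {
		if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
		    head->key_type == HFSPLUS_KEY_BINARY)
			tree->keycmp = hfsplus_cat_bin_cmp_key;		/* case-sensitive */
		else
			tree->keycmp = hfsplus_cat_case_cmp_key;	/* case-folding */
	}

Plain HFS+ volumes always case-fold, so only the HFSX flag plus the binary key_type selects the case-sensitive comparator.
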
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 94712790c8b..f2d7c49ce75 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -13,7 +13,8 @@
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
 
-int hfsplus_cat_cmp_key(hfsplus_btree_key *k1, hfsplus_btree_key *k2)
+int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1,
+			     const hfsplus_btree_key *k2)
 {
 	__be32 k1p, k2p;
 
@@ -22,7 +23,20 @@ int hfsplus_cat_cmp_key(hfsplus_btree_key *k1, hfsplus_btree_key *k2)
 	if (k1p != k2p)
 		return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1;
 
-	return hfsplus_unistrcmp(&k1->cat.name, &k2->cat.name);
+	return hfsplus_strcasecmp(&k1->cat.name, &k2->cat.name);
+}
+
+int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1,
+			    const hfsplus_btree_key *k2)
+{
+	__be32 k1p, k2p;
+
+	k1p = k1->cat.parent;
+	k2p = k2->cat.parent;
+	if (k1p != k2p)
+		return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1;
+
+	return hfsplus_strcmp(&k1->cat.name, &k2->cat.name);
 }
 
 void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
@@ -80,8 +94,11 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 		memset(folder, 0, sizeof(*folder));
 		folder->type = cpu_to_be16(HFSPLUS_FOLDER);
 		folder->id = cpu_to_be32(inode->i_ino);
-		folder->create_date = folder->content_mod_date =
-			folder->attribute_mod_date = folder->access_date = hfsp_now2mt();
+		HFSPLUS_I(inode).create_date =
+			folder->create_date =
+			folder->content_mod_date =
+			folder->attribute_mod_date =
+			folder->access_date = hfsp_now2mt();
 		hfsplus_set_perms(inode, &folder->permissions);
 		if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir)
 			/* invisible and namelocked */
@@ -95,18 +112,27 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 		file->type = cpu_to_be16(HFSPLUS_FILE);
 		file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
 		file->id = cpu_to_be32(cnid);
-		file->create_date = file->content_mod_date =
-			file->attribute_mod_date = file->access_date = hfsp_now2mt();
+		HFSPLUS_I(inode).create_date =
+			file->create_date =
+			file->content_mod_date =
+			file->attribute_mod_date =
+			file->access_date = hfsp_now2mt();
 		if (cnid == inode->i_ino) {
 			hfsplus_set_perms(inode, &file->permissions);
-			file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type);
-			file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator);
+			if (S_ISLNK(inode->i_mode)) {
+				file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
+				file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
+			} else {
+				file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type);
+				file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator);
+			}
 			if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
 				file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
 		} else {
 			file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
 			file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
 			file->user_info.fdFlags = cpu_to_be16(0x100);
+			file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date;
 			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev);
 		}
 		return sizeof(*file);
@@ -139,7 +165,7 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
 
 	type = be16_to_cpu(tmp.type);
 	if (type != HFSPLUS_FOLDER_THREAD && type != HFSPLUS_FILE_THREAD) {
-		printk("HFS+-fs: Found bad thread record in catalog\n");
+		printk(KERN_ERR "hfs: found bad thread record in catalog\n");
 		return -EIO;
 	}
 
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 50c8f44b6c6..01a6fe3a395 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -66,25 +66,32 @@ again:
 	}
 	cnid = be32_to_cpu(entry.file.id);
 	if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
-	    entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR)) {
+	    entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
+	    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date ||
+	     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) &&
+	    HFSPLUS_SB(sb).hidden_dir) {
 		struct qstr str;
 		char name[32];
 
 		if (dentry->d_fsdata) {
-			err = -ENOENT;
-			inode = NULL;
-			goto out;
+			/*
+			 * We found a link pointing to another link,
+			 * so ignore it and treat it as regular file.
+			 */
+			cnid = (unsigned long)dentry->d_fsdata;
+			linkid = 0;
+		} else {
+			dentry->d_fsdata = (void *)(unsigned long)cnid;
+			linkid = be32_to_cpu(entry.file.permissions.dev);
+			str.len = sprintf(name, "iNode%d", linkid);
+			str.name = name;
+			hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str);
+			goto again;
 		}
-		dentry->d_fsdata = (void *)(unsigned long)cnid;
-		linkid = be32_to_cpu(entry.file.permissions.dev);
-		str.len = sprintf(name, "iNode%d", linkid);
-		str.name = name;
-		hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str);
-		goto again;
 	} else if (!dentry->d_fsdata)
 		dentry->d_fsdata = (void *)(unsigned long)cnid;
 	} else {
-		printk("HFS+-fs: Illegal catalog entry type in lookup\n");
+		printk(KERN_ERR "hfs: invalid catalog entry type in lookup\n");
 		err = -EIO;
 		goto fail;
 	}
@@ -132,12 +139,12 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	case 1:
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
 		if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) {
-			printk("HFS+-fs: bad catalog folder thread\n");
+			printk(KERN_ERR "hfs: bad catalog folder thread\n");
 			err = -EIO;
 			goto out;
 		}
 		if (fd.entrylength < HFSPLUS_MIN_THREAD_SZ) {
-			printk("HFS+-fs: truncated catalog thread\n");
+			printk(KERN_ERR "hfs: truncated catalog thread\n");
 			err = -EIO;
 			goto out;
 		}
@@ -156,7 +163,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	for (;;) {
 		if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) {
-			printk("HFS+-fs: walked past end of dir\n");
+			printk(KERN_ERR "hfs: walked past end of dir\n");
 			err = -EIO;
 			goto out;
 		}
@@ -168,7 +175,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			goto out;
 		if (type == HFSPLUS_FOLDER) {
 			if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) {
-				printk("HFS+-fs: small dir entry\n");
+				printk(KERN_ERR "hfs: small dir entry\n");
 				err = -EIO;
 				goto out;
 			}
@@ -180,7 +187,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				break;
 		} else if (type == HFSPLUS_FILE) {
 			if (fd.entrylength < sizeof(struct hfsplus_cat_file)) {
-				printk("HFS+-fs: small file entry\n");
+				printk(KERN_ERR "hfs: small file entry\n");
 				err = -EIO;
 				goto out;
 			}
@@ -188,7 +195,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				be32_to_cpu(entry.file.id), DT_REG))
 				break;
 		} else {
-			printk("HFS+-fs: bad catalog entry type\n");
+			printk(KERN_ERR "hfs: bad catalog entry type\n");
 			err = -EIO;
 			goto out;
 		}
@@ -330,7 +337,8 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 	if (res)
 		return res;
 
-	inode->i_nlink--;
+	if (inode->i_nlink > 0)
+		inode->i_nlink--;
 	hfsplus_delete_inode(inode);
 	if (inode->i_ino != cnid && !inode->i_nlink) {
 		if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
@@ -339,7 +347,8 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 			hfsplus_delete_inode(inode);
 		} else
 			inode->i_flags |= S_DEAD;
-	}
+	} else
+		inode->i_nlink = 0;
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
 
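
The lookup hunk above is how HFS+ hard links resolve: a catalog entry whose Finder type/creator pair is 'hlnk'/'hfs+' (now additionally checked against the hidden directory's or root's create_date) is only an indirect reference, and the real data lives in the hidden metadata directory under a name derived from the link id. The name construction is the part worth seeing in isolation, using the same fields the patch touches:

	char name[32];
	struct qstr str;
	/* the link id is stashed in the permissions.dev field of the entry */
	u32 linkid = be32_to_cpu(entry.file.permissions.dev);

	str.len = sprintf(name, "iNode%d", linkid);	/* e.g. linkid 42 -> "iNode42" */
	str.name = name;
	/* re-run the catalog search, this time inside the hidden directory */
	hfsplus_cat_build_key(sb, fd.search_key,
			      HFSPLUS_SB(sb).hidden_dir->i_ino, &str);

Stashing the original cnid in dentry->d_fsdata before jumping back to the again: label is what lets the second pass detect a link that points at another link and fall back to treating it as a regular file.
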
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index e3ff56a0301..1a7480089e8 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -16,7 +16,8 @@
 #include "hfsplus_raw.h"
 
 /* Compare two extents keys, returns 0 on same, pos/neg for difference */
-int hfsplus_ext_cmp_key(hfsplus_btree_key *k1, hfsplus_btree_key *k2)
+int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1,
+			const hfsplus_btree_key *k2)
 {
 	__be32 k1id, k2id;
 	__be32 k1s, k2s;
@@ -349,10 +350,9 @@ int hfsplus_file_extend(struct inode *inode)
 
 	if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) {
 		// extend alloc file
-		printk("extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8,
+		printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8,
			HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks);
 		return -ENOSPC;
-		//BUG();
 	}
 
 	down(&HFSPLUS_I(inode).extents_lock);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index c60e5635498..7ae393637a0 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -36,7 +36,7 @@
 #define HFSPLUS_TYPE_DATA 0x00
 #define HFSPLUS_TYPE_RSRC 0xFF
 
-typedef int (*btree_keycmp)(hfsplus_btree_key *, hfsplus_btree_key *);
+typedef int (*btree_keycmp)(const hfsplus_btree_key *, const hfsplus_btree_key *);
 
 #define NODE_HASH_SIZE	256
 
@@ -143,14 +143,13 @@ struct hfsplus_sb_info {
 
 	unsigned long flags;
 
-	atomic_t inode_cnt;
-	u32 last_inode_cnt;
-
 	struct hlist_head rsrc_inodes;
 };
 
 #define HFSPLUS_SB_WRITEBACKUP	0x0001
 #define HFSPLUS_SB_NODECOMPOSE	0x0002
+#define HFSPLUS_SB_FORCE	0x0004
+#define HFSPLUS_SB_HFSX		0x0008
 
 
 struct hfsplus_inode_info {
@@ -167,6 +166,7 @@ struct hfsplus_inode_info {
 	struct inode *rsrc_inode;
 	unsigned long flags;
 
+	__be32 create_date;
 	/* Device number in hfsplus_permissions in catalog */
 	u32 dev;
 	/* BSD system and user file flags */
@@ -305,7 +305,8 @@ int hfs_brec_read(struct hfs_find_data *, void *, int);
 int hfs_brec_goto(struct hfs_find_data *, int);
 
 /* catalog.c */
-int hfsplus_cat_cmp_key(hfsplus_btree_key *, hfsplus_btree_key *);
+int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
+int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
 void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *, u32, struct qstr *);
 int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *);
 int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
@@ -314,7 +315,7 @@ int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
 		       struct inode *, struct qstr *);
 
 /* extents.c */
-int hfsplus_ext_cmp_key(hfsplus_btree_key *, hfsplus_btree_key *);
+int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
 void hfsplus_ext_write_extent(struct inode *);
 int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int);
 int hfsplus_free_fork(struct super_block *, u32, struct hfsplus_fork_raw *, int);
@@ -352,7 +353,8 @@ extern u16 hfsplus_decompose_table[];
 extern u16 hfsplus_compose_table[];
 
 /* unicode.c */
-int hfsplus_unistrcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
+int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
+int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
 int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *);
 int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int);
 
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 5bad37cfdb2..49205531a50 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -22,8 +22,10 @@
 #define HFSPLUS_SECTOR_SHIFT	9
 #define HFSPLUS_VOLHEAD_SECTOR	2
 #define HFSPLUS_VOLHEAD_SIG	0x482b
+#define HFSPLUS_VOLHEAD_SIGX	0x4858
 #define HFSPLUS_SUPER_MAGIC	0x482b
-#define HFSPLUS_CURRENT_VERSION	4
+#define HFSPLUS_MIN_VERSION	4
+#define HFSPLUS_CURRENT_VERSION	5
 
 #define HFSP_WRAP_MAGIC		0x4244
 #define HFSP_WRAP_ATTRIB_SLOCK	0x8000
@@ -41,6 +43,9 @@
 #define HFSP_HARDLINK_TYPE	0x686c6e6b	/* 'hlnk' */
 #define HFSP_HFSPLUS_CREATOR	0x6866732b	/* 'hfs+' */
 
+#define HFSP_SYMLINK_TYPE	0x736c6e6b	/* 'slnk' */
+#define HFSP_SYMLINK_CREATOR	0x72686170	/* 'rhap' */
+
 #define HFSP_MOUNT_VERSION	0x482b4c78	/* 'H+Lx' */
 
 /* Structures used on disk */
@@ -123,11 +128,13 @@ struct hfsplus_vh {
 } __packed;
 
 /* HFS+ volume attributes */
 #define HFSPLUS_VOL_UNMNT		(1 << 8)
 #define HFSPLUS_VOL_SPARE_BLK		(1 << 9)
 #define HFSPLUS_VOL_NOCACHE		(1 << 10)
 #define HFSPLUS_VOL_INCNSTNT		(1 << 11)
-#define HFSPLUS_VOL_SOFTLOCK		(1 << 15)
+#define HFSPLUS_VOL_NODEID_REUSED	(1 << 12)
+#define HFSPLUS_VOL_JOURNALED		(1 << 13)
+#define HFSPLUS_VOL_SOFTLOCK		(1 << 15)
 
 /* HFS+ BTree node descriptor */
 struct hfs_bnode_desc {
@@ -159,7 +166,7 @@ struct hfs_btree_header_rec {
 	u16 reserved1;
 	__be32 clump_size;
 	u8 btree_type;
-	u8 reserved2;
+	u8 key_type;
 	__be32 attributes;
 	u32 reserved3[16];
 } __packed;
@@ -184,6 +191,10 @@ struct hfs_btree_header_rec {
 #define HFSPLUS_EXCH_CNID		15	/* ExchangeFiles temp id */
 #define HFSPLUS_FIRSTUSER_CNID		16	/* first available user id */
 
+/* btree key type */
+#define HFSPLUS_KEY_CASEFOLDING		0xCF	/* case-insensitive */
+#define HFSPLUS_KEY_BINARY		0xBC	/* case-sensitive */
+
 /* HFS+ catalog entry key */
 struct hfsplus_cat_key {
 	__be16 key_len;
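
The type/creator constants added here are four-character codes packed big-endian into 32 bits, so each hex value spells its comment character by character: 0x736c6e6b is 's' 'l' 'n' 'k'. A worked check (a standalone illustrative helper, not something in the tree):

	#include <stdio.h>

	/* pack a four-character code the way the HFSP_* constants are written */
	#define FOURCC(a, b, c, d) \
		(((unsigned)(a) << 24) | ((b) << 16) | ((c) << 8) | (d))

	int main(void)
	{
		printf("%#x\n", FOURCC('s', 'l', 'n', 'k'));	/* 0x736c6e6b */
		printf("%#x\n", FOURCC('r', 'h', 'a', 'p'));	/* 0x72686170 */
		return 0;
	}

The same scheme explains the existing 'hlnk'/'hfs+' hard-link constants and the 'H+Lx' mount version just above.
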
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index fc98583cf04..12ed2b7d046 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -18,13 +18,11 @@
 
 static int hfsplus_readpage(struct file *file, struct page *page)
 {
-	//printk("readpage: %lu\n", page->index);
 	return block_read_full_page(page, hfsplus_get_block);
 }
 
 static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
 {
-	//printk("writepage: %lu\n", page->index);
 	return block_write_full_page(page, hfsplus_get_block, wbc);
 }
 
@@ -92,7 +90,6 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
 		} while (--i && nidx < tree->node_count);
 		spin_unlock(&tree->hash_lock);
 	}
-	//printk("releasepage: %lu,%x = %d\n", page->index, mask, res);
 	return res ? try_to_free_buffers(page) : 0;
 }
 
@@ -182,11 +179,6 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	igrab(dir);
 	hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes);
 	mark_inode_dirty(inode);
-	{
-	void hfsplus_inode_check(struct super_block *sb);
-	atomic_inc(&HFSPLUS_SB(sb).inode_cnt);
-	hfsplus_inode_check(sb);
-	}
out:
 	d_add(dentry, inode);
 	return NULL;
@@ -276,13 +268,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 	if (atomic_read(&file->f_count) != 0)
 		return 0;
 	if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
-		down(&inode->i_sem);
+		mutex_lock(&inode->i_mutex);
 		hfsplus_file_truncate(inode);
 		if (inode->i_flags & S_DEAD) {
 			hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
 			hfsplus_delete_inode(inode);
 		}
-		up(&inode->i_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	return 0;
 }
@@ -317,11 +309,6 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 	if (!inode)
 		return NULL;
 
-	{
-	void hfsplus_inode_check(struct super_block *sb);
-	atomic_inc(&HFSPLUS_SB(sb).inode_cnt);
-	hfsplus_inode_check(sb);
-	}
 	inode->i_ino = HFSPLUS_SB(sb).next_cnid++;
 	inode->i_mode = mode;
 	inode->i_uid = current->fsuid;
@@ -444,7 +431,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		inode->i_size = 2 + be32_to_cpu(folder->valence);
 		inode->i_atime = hfsp_mt2ut(folder->access_date);
 		inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
-		inode->i_ctime = inode->i_mtime;
+		inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
+		HFSPLUS_I(inode).create_date = folder->create_date;
 		HFSPLUS_I(inode).fs_blocks = 0;
 		inode->i_op = &hfsplus_dir_inode_operations;
 		inode->i_fop = &hfsplus_dir_operations;
@@ -475,9 +463,10 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		}
 		inode->i_atime = hfsp_mt2ut(file->access_date);
 		inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
-		inode->i_ctime = inode->i_mtime;
+		inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
+		HFSPLUS_I(inode).create_date = file->create_date;
 	} else {
-		printk("HFS+-fs: bad catalog entry used to create inode\n");
+		printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
 		res = -EIO;
 	}
 	return res;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index e07aa096e07..13cf848ac83 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -12,6 +12,7 @@
  * hfsplus ioctls
  */
 
+#include <linux/capability.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/xattr.h>
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index cca0818aa4c..dc64fac0083 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -22,7 +22,7 @@ enum {
 	opt_umask, opt_uid, opt_gid,
 	opt_part, opt_session, opt_nls,
 	opt_nodecompose, opt_decompose,
-	opt_err
+	opt_force, opt_err
 };
 
 static match_table_t tokens = {
@@ -36,6 +36,7 @@ static match_table_t tokens = {
 	{ opt_nls, "nls=%s" },
 	{ opt_decompose, "decompose" },
 	{ opt_nodecompose, "nodecompose" },
+	{ opt_force, "force" },
 	{ opt_err, NULL }
 };
 
@@ -82,58 +83,58 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
 		switch (token) {
 		case opt_creator:
 			if (match_fourchar(&args[0], &sbi->creator)) {
-				printk("HFS+-fs: creator requires a 4 character value\n");
+				printk(KERN_ERR "hfs: creator requires a 4 character value\n");
 				return 0;
 			}
 			break;
 		case opt_type:
 			if (match_fourchar(&args[0], &sbi->type)) {
-				printk("HFS+-fs: type requires a 4 character value\n");
+				printk(KERN_ERR "hfs: type requires a 4 character value\n");
 				return 0;
 			}
 			break;
 		case opt_umask:
 			if (match_octal(&args[0], &tmp)) {
-				printk("HFS+-fs: umask requires a value\n");
+				printk(KERN_ERR "hfs: umask requires a value\n");
 				return 0;
 			}
 			sbi->umask = (umode_t)tmp;
 			break;
 		case opt_uid:
 			if (match_int(&args[0], &tmp)) {
-				printk("HFS+-fs: uid requires an argument\n");
+				printk(KERN_ERR "hfs: uid requires an argument\n");
 				return 0;
 			}
 			sbi->uid = (uid_t)tmp;
 			break;
 		case opt_gid:
 			if (match_int(&args[0], &tmp)) {
-				printk("HFS+-fs: gid requires an argument\n");
+				printk(KERN_ERR "hfs: gid requires an argument\n");
 				return 0;
 			}
 			sbi->gid = (gid_t)tmp;
 			break;
 		case opt_part:
 			if (match_int(&args[0], &sbi->part)) {
-				printk("HFS+-fs: part requires an argument\n");
+				printk(KERN_ERR "hfs: part requires an argument\n");
 				return 0;
 			}
 			break;
 		case opt_session:
 			if (match_int(&args[0], &sbi->session)) {
-				printk("HFS+-fs: session requires an argument\n");
+				printk(KERN_ERR "hfs: session requires an argument\n");
 				return 0;
 			}
 			break;
 		case opt_nls:
 			if (sbi->nls) {
-				printk("HFS+-fs: unable to change nls mapping\n");
+				printk(KERN_ERR "hfs: unable to change nls mapping\n");
 				return 0;
 			}
 			p = match_strdup(&args[0]);
 			sbi->nls = load_nls(p);
 			if (!sbi->nls) {
-				printk("HFS+-fs: unable to load nls mapping \"%s\"\n", p);
+				printk(KERN_ERR "hfs: unable to load nls mapping \"%s\"\n", p);
 				kfree(p);
 				return 0;
 			}
@@ -145,6 +146,9 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
145 case opt_nodecompose: 146 case opt_nodecompose:
146 sbi->flags |= HFSPLUS_SB_NODECOMPOSE; 147 sbi->flags |= HFSPLUS_SB_NODECOMPOSE;
147 break; 148 break;
149 case opt_force:
150 sbi->flags |= HFSPLUS_SB_FORCE;
151 break;
148 default: 152 default:
149 return 0; 153 return 0;
150 } 154 }
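
The option-table change above extends the standard <linux/parser.h> idiom that hfsplus_parse_options is built on. A self-contained sketch of that idiom follows; the token names and the example_parse helper are hypothetical, not part of the patch.

#include <linux/parser.h>
#include <linux/string.h>

enum { opt_uid, opt_force, opt_err };

static match_table_t example_tokens = {
	{ opt_uid,   "uid=%u" },
	{ opt_force, "force" },
	{ opt_err,   NULL }
};

/* Returns 1 on success, 0 on a bad option string, like the parser above. */
static int example_parse(char *options, uid_t *uid, int *force)
{
	char *p;
	substring_t args[MAX_OPT_ARGS];
	int tmp;

	if (!options)
		return 1;
	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;
		switch (match_token(p, example_tokens, args)) {
		case opt_uid:
			if (match_int(&args[0], &tmp))
				return 0;	/* argument missing or garbled */
			*uid = (uid_t)tmp;
			break;
		case opt_force:
			*force = 1;
			break;
		default:
			return 0;		/* unknown option */
		}
	}
	return 1;
}
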
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 0ce1c455ae5..7843f792a4b 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -22,29 +22,12 @@ static void hfsplus_destroy_inode(struct inode *inode);
22 22
23#include "hfsplus_fs.h" 23#include "hfsplus_fs.h"
24 24
25void hfsplus_inode_check(struct super_block *sb)
26{
27#if 0
28 u32 cnt = atomic_read(&HFSPLUS_SB(sb).inode_cnt);
29 u32 last_cnt = HFSPLUS_SB(sb).last_inode_cnt;
30
31 if (cnt <= (last_cnt / 2) ||
32 cnt >= (last_cnt * 2)) {
33 HFSPLUS_SB(sb).last_inode_cnt = cnt;
34 printk("inode_check: %u,%u,%u\n", cnt, last_cnt,
35 HFSPLUS_SB(sb).cat_tree ? HFSPLUS_SB(sb).cat_tree->node_hash_cnt : 0);
36 }
37#endif
38}
39
40static void hfsplus_read_inode(struct inode *inode) 25static void hfsplus_read_inode(struct inode *inode)
41{ 26{
42 struct hfs_find_data fd; 27 struct hfs_find_data fd;
43 struct hfsplus_vh *vhdr; 28 struct hfsplus_vh *vhdr;
44 int err; 29 int err;
45 30
46 atomic_inc(&HFSPLUS_SB(inode->i_sb).inode_cnt);
47 hfsplus_inode_check(inode->i_sb);
48 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 31 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
49 init_MUTEX(&HFSPLUS_I(inode).extents_lock); 32 init_MUTEX(&HFSPLUS_I(inode).extents_lock);
50 HFSPLUS_I(inode).flags = 0; 33 HFSPLUS_I(inode).flags = 0;
@@ -155,12 +138,10 @@ static int hfsplus_write_inode(struct inode *inode, int unused)
155static void hfsplus_clear_inode(struct inode *inode) 138static void hfsplus_clear_inode(struct inode *inode)
156{ 139{
157 dprint(DBG_INODE, "hfsplus_clear_inode: %lu\n", inode->i_ino); 140 dprint(DBG_INODE, "hfsplus_clear_inode: %lu\n", inode->i_ino);
158 atomic_dec(&HFSPLUS_SB(inode->i_sb).inode_cnt);
159 if (HFSPLUS_IS_RSRC(inode)) { 141 if (HFSPLUS_IS_RSRC(inode)) {
160 HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; 142 HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
161 iput(HFSPLUS_I(inode).rsrc_inode); 143 iput(HFSPLUS_I(inode).rsrc_inode);
162 } 144 }
163 hfsplus_inode_check(inode->i_sb);
164} 145}
165 146
166static void hfsplus_write_super(struct super_block *sb) 147static void hfsplus_write_super(struct super_block *sb)
@@ -188,7 +169,7 @@ static void hfsplus_write_super(struct super_block *sb)
188 block = HFSPLUS_SB(sb).blockoffset; 169 block = HFSPLUS_SB(sb).blockoffset;
189 block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9); 170 block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9);
190 offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1); 171 offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1);
191 printk("backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset, 172 printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset,
192 HFSPLUS_SB(sb).sect_count, block, offset); 173 HFSPLUS_SB(sb).sect_count, block, offset);
193 bh = sb_bread(sb, block); 174 bh = sb_bread(sb, block);
194 if (bh) { 175 if (bh) {
@@ -198,7 +179,7 @@ static void hfsplus_write_super(struct super_block *sb)
198 mark_buffer_dirty(bh); 179 mark_buffer_dirty(bh);
199 brelse(bh); 180 brelse(bh);
200 } else 181 } else
201 printk("backup not found!\n"); 182 printk(KERN_WARNING "hfs: backup not found!\n");
202 } 183 }
203 } 184 }
204 HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP; 185 HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
@@ -251,14 +232,26 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
251 return 0; 232 return 0;
252 if (!(*flags & MS_RDONLY)) { 233 if (!(*flags & MS_RDONLY)) {
253 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 234 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
235 struct hfsplus_sb_info sbi;
236
237 memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
238 sbi.nls = HFSPLUS_SB(sb).nls;
239 if (!hfsplus_parse_options(data, &sbi))
240 return -EINVAL;
254 241
255 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { 242 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
256 printk("HFS+-fs warning: Filesystem was not cleanly unmounted, " 243 printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, "
257 "running fsck.hfsplus is recommended. leaving read-only.\n"); 244 "running fsck.hfsplus is recommended. leaving read-only.\n");
258 sb->s_flags |= MS_RDONLY; 245 sb->s_flags |= MS_RDONLY;
259 *flags |= MS_RDONLY; 246 *flags |= MS_RDONLY;
247 } else if (sbi.flags & HFSPLUS_SB_FORCE) {
248 /* nothing */
260 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 249 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
261 printk("HFS+-fs: Filesystem is marked locked, leaving read-only.\n"); 250 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
251 sb->s_flags |= MS_RDONLY;
252 *flags |= MS_RDONLY;
253 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
254 printk(KERN_WARNING "hfs: filesystem is marked journaled, leaving read-only.\n");
262 sb->s_flags |= MS_RDONLY; 255 sb->s_flags |= MS_RDONLY;
263 *flags |= MS_RDONLY; 256 *flags |= MS_RDONLY;
264 } 257 }
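
Worth noting in the remount hunk above: options are parsed into a throwaway hfsplus_sb_info so that a malformed "-o remount" string cannot clobber the live superblock state. A simplified sketch of the pattern (assumes hfsplus_fs.h; error handling trimmed):

static int example_remount_parse(struct super_block *sb, char *data)
{
	struct hfsplus_sb_info scratch;

	memset(&scratch, 0, sizeof(scratch));
	scratch.nls = HFSPLUS_SB(sb).nls;	/* keep the active nls table */
	if (!hfsplus_parse_options(data, &scratch))
		return -EINVAL;			/* live sbi left untouched */
	/* Only now act on scratch.flags, e.g. test HFSPLUS_SB_FORCE. */
	return 0;
}
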
@@ -299,8 +292,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
299 INIT_HLIST_HEAD(&sbi->rsrc_inodes); 292 INIT_HLIST_HEAD(&sbi->rsrc_inodes);
300 hfsplus_fill_defaults(sbi); 293 hfsplus_fill_defaults(sbi);
301 if (!hfsplus_parse_options(data, sbi)) { 294 if (!hfsplus_parse_options(data, sbi)) {
302 if (!silent) 295 printk(KERN_ERR "hfs: unable to parse mount options\n");
303 printk("HFS+-fs: unable to parse mount options\n");
304 err = -EINVAL; 296 err = -EINVAL;
305 goto cleanup; 297 goto cleanup;
306 } 298 }
@@ -308,8 +300,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
308 /* temporarily use utf8 to correctly find the hidden dir below */ 300 /* temporarily use utf8 to correctly find the hidden dir below */
309 nls = sbi->nls; 301 nls = sbi->nls;
310 sbi->nls = load_nls("utf8"); 302 sbi->nls = load_nls("utf8");
311 if (!nls) { 303 if (!sbi->nls) {
312 printk("HFS+: unable to load nls for utf8\n"); 304 printk(KERN_ERR "hfs: unable to load nls for utf8\n");
313 err = -EINVAL; 305 err = -EINVAL;
314 goto cleanup; 306 goto cleanup;
315 } 307 }
@@ -317,17 +309,17 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
317 /* Grab the volume header */ 309 /* Grab the volume header */
318 if (hfsplus_read_wrapper(sb)) { 310 if (hfsplus_read_wrapper(sb)) {
319 if (!silent) 311 if (!silent)
320 printk("HFS+-fs: unable to find HFS+ superblock\n"); 312 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
321 err = -EINVAL; 313 err = -EINVAL;
322 goto cleanup; 314 goto cleanup;
323 } 315 }
324 vhdr = HFSPLUS_SB(sb).s_vhdr; 316 vhdr = HFSPLUS_SB(sb).s_vhdr;
325 317
326 /* Copy parts of the volume header into the superblock */ 318 /* Copy parts of the volume header into the superblock */
327 sb->s_magic = be16_to_cpu(vhdr->signature); 319 sb->s_magic = HFSPLUS_VOLHEAD_SIG;
328 if (be16_to_cpu(vhdr->version) != HFSPLUS_CURRENT_VERSION) { 320 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
329 if (!silent) 321 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
330 printk("HFS+-fs: wrong filesystem version\n"); 322 printk(KERN_ERR "hfs: wrong filesystem version\n");
331 goto cleanup; 323 goto cleanup;
332 } 324 }
333 HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks); 325 HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks);
@@ -348,34 +340,36 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
348 sb->s_maxbytes = MAX_LFS_FILESIZE; 340 sb->s_maxbytes = MAX_LFS_FILESIZE;
349 341
350 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { 342 if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
351 if (!silent) 343 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
352 printk("HFS+-fs warning: Filesystem was not cleanly unmounted, " 344 "running fsck.hfsplus is recommended. mounting read-only.\n");
353 "running fsck.hfsplus is recommended. mounting read-only.\n");
354 sb->s_flags |= MS_RDONLY; 345 sb->s_flags |= MS_RDONLY;
346 } else if (sbi->flags & HFSPLUS_SB_FORCE) {
347 /* nothing */
355 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 348 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
356 if (!silent) 349 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
357 printk("HFS+-fs: Filesystem is marked locked, mounting read-only.\n"); 350 sb->s_flags |= MS_RDONLY;
351 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
352 printk(KERN_WARNING "hfs: write access to a jounaled filesystem is not supported, "
353 "use the force option at your own risk, mounting read-only.\n");
358 sb->s_flags |= MS_RDONLY; 354 sb->s_flags |= MS_RDONLY;
359 } 355 }
356 sbi->flags &= ~HFSPLUS_SB_FORCE;
360 357
361 /* Load metadata objects (B*Trees) */ 358 /* Load metadata objects (B*Trees) */
362 HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 359 HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
363 if (!HFSPLUS_SB(sb).ext_tree) { 360 if (!HFSPLUS_SB(sb).ext_tree) {
364 if (!silent) 361 printk(KERN_ERR "hfs: failed to load extents file\n");
365 printk("HFS+-fs: failed to load extents file\n");
366 goto cleanup; 362 goto cleanup;
367 } 363 }
368 HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 364 HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
369 if (!HFSPLUS_SB(sb).cat_tree) { 365 if (!HFSPLUS_SB(sb).cat_tree) {
370 if (!silent) 366 printk(KERN_ERR "hfs: failed to load catalog file\n");
371 printk("HFS+-fs: failed to load catalog file\n");
372 goto cleanup; 367 goto cleanup;
373 } 368 }
374 369
375 HFSPLUS_SB(sb).alloc_file = iget(sb, HFSPLUS_ALLOC_CNID); 370 HFSPLUS_SB(sb).alloc_file = iget(sb, HFSPLUS_ALLOC_CNID);
376 if (!HFSPLUS_SB(sb).alloc_file) { 371 if (!HFSPLUS_SB(sb).alloc_file) {
377 if (!silent) 372 printk(KERN_ERR "hfs: failed to load allocation file\n");
378 printk("HFS+-fs: failed to load allocation file\n");
379 goto cleanup; 373 goto cleanup;
380 } 374 }
381 375
@@ -383,8 +377,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
383 root = iget(sb, HFSPLUS_ROOT_CNID); 377 root = iget(sb, HFSPLUS_ROOT_CNID);
384 sb->s_root = d_alloc_root(root); 378 sb->s_root = d_alloc_root(root);
385 if (!sb->s_root) { 379 if (!sb->s_root) {
386 if (!silent) 380 printk(KERN_ERR "hfs: failed to load root directory\n");
387 printk("HFS+-fs: failed to load root directory\n");
388 iput(root); 381 iput(root);
389 goto cleanup; 382 goto cleanup;
390 } 383 }
@@ -418,7 +411,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
418 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); 411 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
419 412
420 if (!HFSPLUS_SB(sb).hidden_dir) { 413 if (!HFSPLUS_SB(sb).hidden_dir) {
421 printk("HFS+: create hidden dir...\n"); 414 printk(KERN_DEBUG "hfs: create hidden dir...\n");
422 HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR); 415 HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
423 hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode, 416 hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode,
424 &str, HFSPLUS_SB(sb).hidden_dir); 417 &str, HFSPLUS_SB(sb).hidden_dir);
@@ -498,7 +491,7 @@ static void __exit exit_hfsplus_fs(void)
498{ 491{
499 unregister_filesystem(&hfsplus_fs_type); 492 unregister_filesystem(&hfsplus_fs_type);
500 if (kmem_cache_destroy(hfsplus_inode_cachep)) 493 if (kmem_cache_destroy(hfsplus_inode_cachep))
501 printk(KERN_INFO "hfsplus_inode_cache: not all structures were freed\n"); 494 printk(KERN_ERR "hfsplus_inode_cache: not all structures were freed\n");
502} 495}
503 496
504module_init(init_hfsplus_fs) 497module_init(init_hfsplus_fs)
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 060c69048c3..689c8bd721f 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -28,7 +28,8 @@ static inline u16 case_fold(u16 c)
28} 28}
29 29
30/* Compare unicode strings, return values like normal strcmp */ 30/* Compare unicode strings, return values like normal strcmp */
31int hfsplus_unistrcmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2) 31int hfsplus_strcasecmp(const struct hfsplus_unistr *s1,
32 const struct hfsplus_unistr *s2)
32{ 33{
33 u16 len1, len2, c1, c2; 34 u16 len1, len2, c1, c2;
34 const hfsplus_unichr *p1, *p2; 35 const hfsplus_unichr *p1, *p2;
@@ -59,6 +60,33 @@ int hfsplus_unistrcmp(const struct hfsplus_unistr *s1, const struct hfsplus_unis
59 } 60 }
60} 61}
61 62
63/* Compare names as a sequence of 16-bit unsigned integers */
64int hfsplus_strcmp(const struct hfsplus_unistr *s1,
65 const struct hfsplus_unistr *s2)
66{
67 u16 len1, len2, c1, c2;
68 const hfsplus_unichr *p1, *p2;
69 int len;
70
71 len1 = be16_to_cpu(s1->length);
72 len2 = be16_to_cpu(s2->length);
73 p1 = s1->unicode;
74 p2 = s2->unicode;
75
76 for (len = min(len1, len2); len > 0; len--) {
77 c1 = be16_to_cpu(*p1);
78 c2 = be16_to_cpu(*p2);
79 if (c1 != c2)
80 return c1 < c2 ? -1 : 1;
81 p1++;
82 p2++;
83 }
84
85 return len1 < len2 ? -1 :
86 len1 > len2 ? 1 : 0;
87}
88
89
62#define Hangul_SBase 0xac00 90#define Hangul_SBase 0xac00
63#define Hangul_LBase 0x1100 91#define Hangul_LBase 0x1100
64#define Hangul_VBase 0x1161 92#define Hangul_VBase 0x1161
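
The new hfsplus_strcmp gives the driver a binary, case-sensitive comparator alongside the case-folding hfsplus_strcasecmp, which is what case-sensitive HFSX volumes need for catalog keys. The selector below is an assumption for illustration; the real tree wires the comparator up through the btree key-compare hook rather than a helper like this.

static int example_cat_name_cmp(struct super_block *sb,
				const struct hfsplus_unistr *a,
				const struct hfsplus_unistr *b)
{
	if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX)
		return hfsplus_strcmp(a, b);	/* raw 16-bit compare */
	return hfsplus_strcasecmp(a, b);	/* case-folded compare */
}
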
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 95455e83923..72cab78f050 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -28,8 +28,11 @@ static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
28{ 28{
29 u32 extent; 29 u32 extent;
30 u16 attrib; 30 u16 attrib;
31 __be16 sig;
31 32
32 if (be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_EMBEDSIG)) != HFSPLUS_VOLHEAD_SIG) 33 sig = *(__be16 *)(bufptr + HFSP_WRAPOFF_EMBEDSIG);
34 if (sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIG) &&
35 sig != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
33 return 0; 36 return 0;
34 37
35 attrib = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ATTRIB)); 38 attrib = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ATTRIB));
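
The rewritten check above also shows the preferred endianness idiom: keep the on-disk value in __be16 form and compare it against cpu_to_be16() constants, so the byte swap is folded into the constant at compile time on either host endianness. A minimal sketch (the signature values are the published 'H+'/'HX' magics; the helper is illustrative):

#include <asm/byteorder.h>

static int example_sig_ok(const void *bufptr, unsigned int off)
{
	__be16 sig = *(const __be16 *)(bufptr + off);

	return sig == cpu_to_be16(0x482b) ||	/* 'H+' : HFS+ volume header */
	       sig == cpu_to_be16(0x4858);	/* 'HX' : HFSX volume header */
}
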
@@ -70,7 +73,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
70 *start = (sector_t)te.cdte_addr.lba << 2; 73 *start = (sector_t)te.cdte_addr.lba << 2;
71 return 0; 74 return 0;
72 } 75 }
73 printk(KERN_ERR "HFS: Invalid session number or type of track\n"); 76 printk(KERN_ERR "hfs: invalid session number or type of track\n");
74 return -EINVAL; 77 return -EINVAL;
75 } 78 }
76 ms_info.addr_format = CDROM_LBA; 79 ms_info.addr_format = CDROM_LBA;
@@ -114,6 +117,10 @@ int hfsplus_read_wrapper(struct super_block *sb)
114 } 117 }
115 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) 118 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
116 break; 119 break;
120 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
121 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX;
122 break;
123 }
117 brelse(bh); 124 brelse(bh);
118 125
119 /* check for a partition block 126 /* check for a partition block
@@ -143,7 +150,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
143 blocksize >>= 1; 150 blocksize >>= 1;
144 151
145 if (sb_set_blocksize(sb, blocksize) != blocksize) { 152 if (sb_set_blocksize(sb, blocksize) != blocksize) {
146 printk("HFS+: unable to blocksize to %u!\n", blocksize); 153 printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", blocksize);
147 return -EINVAL; 154 return -EINVAL;
148 } 155 }
149 156
@@ -158,7 +165,9 @@ int hfsplus_read_wrapper(struct super_block *sb)
158 return -EIO; 165 return -EIO;
159 166
160 /* should still be the same... */ 167 /* should still be the same... */
161 if (be16_to_cpu(vhdr->signature) != HFSPLUS_VOLHEAD_SIG) 168 if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ?
169 cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
170 cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
162 goto error; 171 goto error;
163 HFSPLUS_SB(sb).s_vhbh = bh; 172 HFSPLUS_SB(sb).s_vhbh = bh;
164 HFSPLUS_SB(sb).s_vhdr = vhdr; 173 HFSPLUS_SB(sb).s_vhdr = vhdr;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 4684eb7d48c..b3ad0bd0312 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -501,11 +501,16 @@ int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
501 long long start; 501 long long start;
502 int err = 0; 502 int err = 0;
503 503
504 start = (long long) (page->index << PAGE_CACHE_SHIFT) + from; 504 start = (((long long) page->index) << PAGE_CACHE_SHIFT) + from;
505 buffer = kmap(page); 505 buffer = kmap(page);
506 err = write_file(FILE_HOSTFS_I(file)->fd, &start, buffer + from, 506 err = write_file(FILE_HOSTFS_I(file)->fd, &start, buffer + from,
507 to - from); 507 to - from);
508 if(err > 0) err = 0; 508 if(err > 0) err = 0;
509
510 /* Actually, if !err, write_file has added to-from to start, so, despite
511 * the appearance, we are comparing i_size against the _last_ written
512 * location, as we should. */
513
509 if(!err && (start > inode->i_size)) 514 if(!err && (start > inode->i_size))
510 inode->i_size = start; 515 inode->i_size = start;
511 516
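
The one-line fix at the top of this hunk is a classic 32-bit truncation repair: page->index is an unsigned long, so the old expression shifted it in 32-bit arithmetic on 32-bit hosts and lost the high bits before the widening assignment. A tiny illustration, assuming the usual PAGE_CACHE_SHIFT of 12:

static void example_shift_overflow(void)
{
	unsigned long index = 0x140000;			/* an offset past 4 GiB  */
	long long wrong = (long long)(index << 12);	/* shift done in 32 bits */
	long long right = ((long long)index) << 12;	/* shift done in 64 bits */

	/* On a 32-bit host: wrong == 0x40000000, right == 0x140000000. */
	(void)wrong;
	(void)right;
}
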
@@ -910,10 +915,8 @@ static struct inode_operations hostfs_dir_iops = {
910int hostfs_link_readpage(struct file *file, struct page *page) 915int hostfs_link_readpage(struct file *file, struct page *page)
911{ 916{
912 char *buffer, *name; 917 char *buffer, *name;
913 long long start;
914 int err; 918 int err;
915 919
916 start = page->index << PAGE_CACHE_SHIFT;
917 buffer = kmap(page); 920 buffer = kmap(page);
918 name = inode_name(page->mapping->host, 0); 921 name = inode_name(page->mapping->host, 0);
919 if(name == NULL) return(-ENOMEM); 922 if(name == NULL) return(-ENOMEM);
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 0217c3a0444..5591f9623aa 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -32,19 +32,19 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
32 32
33 /*printk("dir lseek\n");*/ 33 /*printk("dir lseek\n");*/
34 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; 34 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
35 down(&i->i_sem); 35 mutex_lock(&i->i_mutex);
36 pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; 36 pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1;
37 while (pos != new_off) { 37 while (pos != new_off) {
38 if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); 38 if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh);
39 else goto fail; 39 else goto fail;
40 if (pos == 12) goto fail; 40 if (pos == 12) goto fail;
41 } 41 }
42 up(&i->i_sem); 42 mutex_unlock(&i->i_mutex);
43ok: 43ok:
44 unlock_kernel(); 44 unlock_kernel();
45 return filp->f_pos = new_off; 45 return filp->f_pos = new_off;
46fail: 46fail:
47 up(&i->i_sem); 47 mutex_unlock(&i->i_mutex);
48 /*printk("illegal lseek: %016llx\n", new_off);*/ 48 /*printk("illegal lseek: %016llx\n", new_off);*/
49 unlock_kernel(); 49 unlock_kernel();
50 return -ESPIPE; 50 return -ESPIPE;
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index 52930915bad..a44dc589739 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -171,12 +171,12 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
171 171
172 err = -ENOMEM; 172 err = -ENOMEM;
173 parent = HPPFS_I(ino)->proc_dentry; 173 parent = HPPFS_I(ino)->proc_dentry;
174 down(&parent->d_inode->i_sem); 174 mutex_lock(&parent->d_inode->i_mutex);
175 proc_dentry = d_lookup(parent, &dentry->d_name); 175 proc_dentry = d_lookup(parent, &dentry->d_name);
176 if(proc_dentry == NULL){ 176 if(proc_dentry == NULL){
177 proc_dentry = d_alloc(parent, &dentry->d_name); 177 proc_dentry = d_alloc(parent, &dentry->d_name);
178 if(proc_dentry == NULL){ 178 if(proc_dentry == NULL){
179 up(&parent->d_inode->i_sem); 179 mutex_unlock(&parent->d_inode->i_mutex);
180 goto out; 180 goto out;
181 } 181 }
182 new = (*parent->d_inode->i_op->lookup)(parent->d_inode, 182 new = (*parent->d_inode->i_op->lookup)(parent->d_inode,
@@ -186,7 +186,7 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
186 proc_dentry = new; 186 proc_dentry = new;
187 } 187 }
188 } 188 }
189 up(&parent->d_inode->i_sem); 189 mutex_unlock(&parent->d_inode->i_mutex);
190 190
191 if(IS_ERR(proc_dentry)) 191 if(IS_ERR(proc_dentry))
192 return(proc_dentry); 192 return(proc_dentry);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 64983ab5558..f568102da1e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/highmem.h> 18#include <linux/highmem.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <linux/string.h> 20#include <linux/string.h>
21#include <linux/capability.h>
21#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
22#include <linux/hugetlb.h> 23#include <linux/hugetlb.h>
23#include <linux/pagevec.h> 24#include <linux/pagevec.h>
@@ -100,9 +101,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
100 loff_t len, vma_len; 101 loff_t len, vma_len;
101 int ret; 102 int ret;
102 103
103 if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
104 return -EINVAL;
105
106 if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1)) 104 if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
107 return -EINVAL; 105 return -EINVAL;
108 106
@@ -121,7 +119,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
121 119
122 vma_len = (loff_t)(vma->vm_end - vma->vm_start); 120 vma_len = (loff_t)(vma->vm_end - vma->vm_start);
123 121
124 down(&inode->i_sem); 122 mutex_lock(&inode->i_mutex);
125 file_accessed(file); 123 file_accessed(file);
126 vma->vm_flags |= VM_HUGETLB | VM_RESERVED; 124 vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
127 vma->vm_ops = &hugetlb_vm_ops; 125 vma->vm_ops = &hugetlb_vm_ops;
@@ -136,7 +134,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
136 if (inode->i_size < len) 134 if (inode->i_size < len)
137 inode->i_size = len; 135 inode->i_size = len;
138out: 136out:
139 up(&inode->i_sem); 137 mutex_unlock(&inode->i_mutex);
140 138
141 return ret; 139 return ret;
142} 140}
@@ -404,7 +402,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
404 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 402 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
405 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 403 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
406 info = HUGETLBFS_I(inode); 404 info = HUGETLBFS_I(inode);
407 mpol_shared_policy_init(&info->policy); 405 mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
408 switch (mode & S_IFMT) { 406 switch (mode & S_IFMT) {
409 default: 407 default:
410 init_special_inode(inode, mode, dev); 408 init_special_inode(inode, mode, dev);
@@ -512,10 +510,14 @@ static int hugetlbfs_statfs(struct super_block *sb, struct kstatfs *buf)
512 buf->f_bsize = HPAGE_SIZE; 510 buf->f_bsize = HPAGE_SIZE;
513 if (sbinfo) { 511 if (sbinfo) {
514 spin_lock(&sbinfo->stat_lock); 512 spin_lock(&sbinfo->stat_lock);
515 buf->f_blocks = sbinfo->max_blocks; 513 /* If no limits set, just report 0 for max/free/used
516 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 514 * blocks, like simple_statfs() */
517 buf->f_files = sbinfo->max_inodes; 515 if (sbinfo->max_blocks >= 0) {
518 buf->f_ffree = sbinfo->free_inodes; 516 buf->f_blocks = sbinfo->max_blocks;
517 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
518 buf->f_files = sbinfo->max_inodes;
519 buf->f_ffree = sbinfo->free_inodes;
520 }
519 spin_unlock(&sbinfo->stat_lock); 521 spin_unlock(&sbinfo->stat_lock);
520 } 522 }
521 buf->f_namelen = NAME_MAX; 523 buf->f_namelen = NAME_MAX;
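
The statfs hunk above adopts the simple_statfs() convention: a filesystem with no configured limits reports zeros rather than inventing totals. A reduced sketch of that convention (assumes <linux/statfs.h>; the helper is illustrative):

static void example_fill_statfs(long max_blocks, long free_blocks,
				struct kstatfs *buf)
{
	if (max_blocks >= 0) {			/* limits were configured */
		buf->f_blocks = max_blocks;
		buf->f_bavail = buf->f_bfree = free_blocks;
	}
	/* else: leave the zeroed fields alone, like simple_statfs() */
}
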
diff --git a/fs/inode.c b/fs/inode.c
index d8d04bd72b5..108138d4e90 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/cdev.h> 22#include <linux/cdev.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/inotify.h> 24#include <linux/inotify.h>
25#include <linux/mount.h>
25 26
26/* 27/*
27 * This is needed for the following functions: 28 * This is needed for the following functions:
@@ -192,7 +193,7 @@ void inode_init_once(struct inode *inode)
192 INIT_HLIST_NODE(&inode->i_hash); 193 INIT_HLIST_NODE(&inode->i_hash);
193 INIT_LIST_HEAD(&inode->i_dentry); 194 INIT_LIST_HEAD(&inode->i_dentry);
194 INIT_LIST_HEAD(&inode->i_devices); 195 INIT_LIST_HEAD(&inode->i_devices);
195 sema_init(&inode->i_sem, 1); 196 mutex_init(&inode->i_mutex);
196 init_rwsem(&inode->i_alloc_sem); 197 init_rwsem(&inode->i_alloc_sem);
197 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); 198 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
198 rwlock_init(&inode->i_data.tree_lock); 199 rwlock_init(&inode->i_data.tree_lock);
@@ -770,7 +771,7 @@ EXPORT_SYMBOL(igrab);
770 * 771 *
771 * Note, @test is called with the inode_lock held, so can't sleep. 772 * Note, @test is called with the inode_lock held, so can't sleep.
772 */ 773 */
773static inline struct inode *ifind(struct super_block *sb, 774static struct inode *ifind(struct super_block *sb,
774 struct hlist_head *head, int (*test)(struct inode *, void *), 775 struct hlist_head *head, int (*test)(struct inode *, void *),
775 void *data, const int wait) 776 void *data, const int wait)
776{ 777{
@@ -804,7 +805,7 @@ static inline struct inode *ifind(struct super_block *sb,
804 * 805 *
805 * Otherwise NULL is returned. 806 * Otherwise NULL is returned.
806 */ 807 */
807static inline struct inode *ifind_fast(struct super_block *sb, 808static struct inode *ifind_fast(struct super_block *sb,
808 struct hlist_head *head, unsigned long ino) 809 struct hlist_head *head, unsigned long ino)
809{ 810{
810 struct inode *inode; 811 struct inode *inode;
@@ -1176,22 +1177,33 @@ sector_t bmap(struct inode * inode, sector_t block)
1176EXPORT_SYMBOL(bmap); 1177EXPORT_SYMBOL(bmap);
1177 1178
1178/** 1179/**
1179 * update_atime - update the access time 1180 * touch_atime - update the access time
1181 * @mnt: mount the inode is accessed on
1180 * @inode: inode accessed 1182 * @inode: inode accessed
1181 * 1183 *
1182 * Update the accessed time on an inode and mark it for writeback. 1184 * Update the accessed time on an inode and mark it for writeback.
1183 * This function automatically handles read only file systems and media, 1185 * This function automatically handles read only file systems and media,
1184 * as well as the "noatime" flag and inode specific "noatime" markers. 1186 * as well as the "noatime" flag and inode specific "noatime" markers.
1185 */ 1187 */
1186void update_atime(struct inode *inode) 1188void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
1187{ 1189{
1190 struct inode *inode = dentry->d_inode;
1188 struct timespec now; 1191 struct timespec now;
1189 1192
1190 if (IS_NOATIME(inode)) 1193 if (IS_RDONLY(inode))
1191 return; 1194 return;
1192 if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) 1195
1196 if ((inode->i_flags & S_NOATIME) ||
1197 (inode->i_sb->s_flags & MS_NOATIME) ||
1198 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
1193 return; 1199 return;
1194 if (IS_RDONLY(inode)) 1200
1201 /*
1202 * We may have a NULL vfsmount when coming from NFSD
1203 */
1204 if (mnt &&
1205 ((mnt->mnt_flags & MNT_NOATIME) ||
1206 ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))))
1195 return; 1207 return;
1196 1208
1197 now = current_fs_time(inode->i_sb); 1209 now = current_fs_time(inode->i_sb);
@@ -1201,19 +1213,23 @@ void update_atime(struct inode *inode)
1201 } 1213 }
1202} 1214}
1203 1215
1204EXPORT_SYMBOL(update_atime); 1216EXPORT_SYMBOL(touch_atime);
1205 1217
1206/** 1218/**
1207 * inode_update_time - update mtime and ctime time 1219 * file_update_time - update mtime and ctime time
1208 * @inode: inode accessed 1220 * @file: file accessed
1209 * @ctime_too: update ctime too
1210 * 1221 *
1211 * Update the mtime time on an inode and mark it for writeback. 1222 * Update the mtime and ctime members of an inode and mark the inode
1212 * When ctime_too is specified update the ctime too. 1223 * for writeback. Note that this function is meant exclusively for
1224 * usage in the file write path of filesystems, and filesystems may
 1225 * choose to explicitly ignore updates via this function with the
 1226 * S_NOCMTIME inode flag, e.g. for network filesystems where these
1227 * timestamps are handled by the server.
1213 */ 1228 */
1214 1229
1215void inode_update_time(struct inode *inode, int ctime_too) 1230void file_update_time(struct file *file)
1216{ 1231{
1232 struct inode *inode = file->f_dentry->d_inode;
1217 struct timespec now; 1233 struct timespec now;
1218 int sync_it = 0; 1234 int sync_it = 0;
1219 1235
@@ -1227,16 +1243,15 @@ void inode_update_time(struct inode *inode, int ctime_too)
1227 sync_it = 1; 1243 sync_it = 1;
1228 inode->i_mtime = now; 1244 inode->i_mtime = now;
1229 1245
1230 if (ctime_too) { 1246 if (!timespec_equal(&inode->i_ctime, &now))
1231 if (!timespec_equal(&inode->i_ctime, &now)) 1247 sync_it = 1;
1232 sync_it = 1; 1248 inode->i_ctime = now;
1233 inode->i_ctime = now; 1249
1234 }
1235 if (sync_it) 1250 if (sync_it)
1236 mark_inode_dirty_sync(inode); 1251 mark_inode_dirty_sync(inode);
1237} 1252}
1238 1253
1239EXPORT_SYMBOL(inode_update_time); 1254EXPORT_SYMBOL(file_update_time);
1240 1255
1241int inode_needs_sync(struct inode *inode) 1256int inode_needs_sync(struct inode *inode)
1242{ 1257{
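
To summarize the API change in this file: readers now call touch_atime(mnt, dentry) so the per-mount MNT_NOATIME/MNT_NODIRATIME flags are honoured, and write paths call file_update_time(file) in place of inode_update_time(inode, 1). A hedged sketch of a caller on the write side (the function is illustrative, not from the patch):

static ssize_t example_write(struct file *file, const char __user *buf,
			     size_t len, loff_t *ppos)
{
	file_update_time(file);		/* stamps mtime and ctime together */
	/* ... copy the data and advance *ppos ... */
	return len;
}
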
diff --git a/fs/inotify.c b/fs/inotify.c
index bf7ce1d2412..878ccca6121 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -33,6 +33,7 @@
33#include <linux/list.h> 33#include <linux/list.h>
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <linux/inotify.h> 35#include <linux/inotify.h>
36#include <linux/syscalls.h>
36 37
37#include <asm/ioctls.h> 38#include <asm/ioctls.h>
38 39
@@ -364,11 +365,12 @@ static int inotify_dev_get_wd(struct inotify_device *dev,
364/* 365/*
365 * find_inode - resolve a user-given path to a specific inode and return a nd 366 * find_inode - resolve a user-given path to a specific inode and return a nd
366 */ 367 */
367static int find_inode(const char __user *dirname, struct nameidata *nd) 368static int find_inode(const char __user *dirname, struct nameidata *nd,
369 unsigned flags)
368{ 370{
369 int error; 371 int error;
370 372
371 error = __user_walk(dirname, LOOKUP_FOLLOW, nd); 373 error = __user_walk(dirname, flags, nd);
372 if (error) 374 if (error)
373 return error; 375 return error;
374 /* you can only watch an inode if you have read permissions on it */ 376 /* you can only watch an inode if you have read permissions on it */
@@ -933,6 +935,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
933 struct file *filp; 935 struct file *filp;
934 int ret, fput_needed; 936 int ret, fput_needed;
935 int mask_add = 0; 937 int mask_add = 0;
938 unsigned flags = 0;
936 939
937 filp = fget_light(fd, &fput_needed); 940 filp = fget_light(fd, &fput_needed);
938 if (unlikely(!filp)) 941 if (unlikely(!filp))
@@ -944,7 +947,12 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
944 goto fput_and_out; 947 goto fput_and_out;
945 } 948 }
946 949
947 ret = find_inode(path, &nd); 950 if (!(mask & IN_DONT_FOLLOW))
951 flags |= LOOKUP_FOLLOW;
952 if (mask & IN_ONLYDIR)
953 flags |= LOOKUP_DIRECTORY;
954
955 ret = find_inode(path, &nd, flags);
948 if (unlikely(ret)) 956 if (unlikely(ret))
949 goto fput_and_out; 957 goto fput_and_out;
950 958
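
Seen from user space, the two lookup flags wired up above map to new mask bits: IN_ONLYDIR makes the watch fail unless the path is a directory, and IN_DONT_FOLLOW stops symlink traversal on the final component. A small illustration, assuming a libc that already ships the inotify wrappers:

#include <stdio.h>
#include <sys/inotify.h>

int main(void)
{
	int fd = inotify_init();
	int wd;

	if (fd < 0) {
		perror("inotify_init");
		return 1;
	}
	wd = inotify_add_watch(fd, "/tmp",
			       IN_CREATE | IN_ONLYDIR | IN_DONT_FOLLOW);
	if (wd < 0)
		perror("inotify_add_watch");
	return 0;
}
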
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 56920918142..f8aeec3ca10 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -8,6 +8,7 @@
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/capability.h>
11#include <linux/file.h> 12#include <linux/file.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13#include <linux/security.h> 14#include <linux/security.h>
diff --git a/fs/ioprio.c b/fs/ioprio.c
index d1c1f2b2c9d..ca77008146c 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -22,6 +22,8 @@
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/ioprio.h> 23#include <linux/ioprio.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/capability.h>
26#include <linux/syscalls.h>
25 27
26static int set_task_ioprio(struct task_struct *task, int ioprio) 28static int set_task_ioprio(struct task_struct *task, int ioprio)
27{ 29{
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index e37e82b7cbf..e7ba0c30e07 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -185,8 +185,5 @@ struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry, struct n
185 } 185 }
186 } 186 }
187 unlock_kernel(); 187 unlock_kernel();
188 if (inode) 188 return d_splice_alias(inode, dentry);
189 return d_splice_alias(inode, dentry);
190 d_add(dentry, inode);
191 return NULL;
192} 189}
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 014a51fd00d..e6265a0b56b 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,29 +24,75 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25 25
26/* 26/*
27 * Unlink a buffer from a transaction. 27 * Unlink a buffer from a transaction checkpoint list.
28 * 28 *
29 * Called with j_list_lock held. 29 * Called with j_list_lock held.
30 */ 30 */
31 31
32static inline void __buffer_unlink(struct journal_head *jh) 32static void __buffer_unlink_first(struct journal_head *jh)
33{ 33{
34 transaction_t *transaction; 34 transaction_t *transaction;
35 35
36 transaction = jh->b_cp_transaction; 36 transaction = jh->b_cp_transaction;
37 jh->b_cp_transaction = NULL;
38 37
39 jh->b_cpnext->b_cpprev = jh->b_cpprev; 38 jh->b_cpnext->b_cpprev = jh->b_cpprev;
40 jh->b_cpprev->b_cpnext = jh->b_cpnext; 39 jh->b_cpprev->b_cpnext = jh->b_cpnext;
41 if (transaction->t_checkpoint_list == jh) 40 if (transaction->t_checkpoint_list == jh) {
42 transaction->t_checkpoint_list = jh->b_cpnext; 41 transaction->t_checkpoint_list = jh->b_cpnext;
43 if (transaction->t_checkpoint_list == jh) 42 if (transaction->t_checkpoint_list == jh)
44 transaction->t_checkpoint_list = NULL; 43 transaction->t_checkpoint_list = NULL;
44 }
45}
46
47/*
48 * Unlink a buffer from a transaction checkpoint(io) list.
49 *
50 * Called with j_list_lock held.
51 */
52
53static inline void __buffer_unlink(struct journal_head *jh)
54{
55 transaction_t *transaction;
56
57 transaction = jh->b_cp_transaction;
58
59 __buffer_unlink_first(jh);
60 if (transaction->t_checkpoint_io_list == jh) {
61 transaction->t_checkpoint_io_list = jh->b_cpnext;
62 if (transaction->t_checkpoint_io_list == jh)
63 transaction->t_checkpoint_io_list = NULL;
64 }
65}
66
67/*
68 * Move a buffer from the checkpoint list to the checkpoint io list
69 *
70 * Called with j_list_lock held
71 */
72
73static inline void __buffer_relink_io(struct journal_head *jh)
74{
75 transaction_t *transaction;
76
77 transaction = jh->b_cp_transaction;
78 __buffer_unlink_first(jh);
79
80 if (!transaction->t_checkpoint_io_list) {
81 jh->b_cpnext = jh->b_cpprev = jh;
82 } else {
83 jh->b_cpnext = transaction->t_checkpoint_io_list;
84 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
85 jh->b_cpprev->b_cpnext = jh;
86 jh->b_cpnext->b_cpprev = jh;
87 }
88 transaction->t_checkpoint_io_list = jh;
45} 89}
46 90
47/* 91/*
48 * Try to release a checkpointed buffer from its transaction. 92 * Try to release a checkpointed buffer from its transaction.
49 * Returns 1 if we released it. 93 * Returns 1 if we released it and 2 if we also released the
94 * whole transaction.
95 *
50 * Requires j_list_lock 96 * Requires j_list_lock
51 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 97 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
52 */ 98 */
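
All three helpers above manipulate the same circular doubly-linked list shape: an empty list is a NULL head, a single element links to itself, and insertion puts the new node at the head. A generic sketch of that shape (names are illustrative):

struct example_node {
	struct example_node *next, *prev;
};

static void example_push_head(struct example_node **head,
			      struct example_node *n)
{
	if (!*head) {
		n->next = n->prev = n;		/* first element: self-linked */
	} else {
		n->next = *head;
		n->prev = (*head)->prev;
		n->prev->next = n;
		n->next->prev = n;
	}
	*head = n;				/* new node becomes the head */
}
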
@@ -57,12 +103,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
57 103
58 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 104 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
59 JBUFFER_TRACE(jh, "remove from checkpoint list"); 105 JBUFFER_TRACE(jh, "remove from checkpoint list");
60 __journal_remove_checkpoint(jh); 106 ret = __journal_remove_checkpoint(jh) + 1;
61 jbd_unlock_bh_state(bh); 107 jbd_unlock_bh_state(bh);
62 journal_remove_journal_head(bh); 108 journal_remove_journal_head(bh);
63 BUFFER_TRACE(bh, "release"); 109 BUFFER_TRACE(bh, "release");
64 __brelse(bh); 110 __brelse(bh);
65 ret = 1;
66 } else { 111 } else {
67 jbd_unlock_bh_state(bh); 112 jbd_unlock_bh_state(bh);
68 } 113 }
@@ -117,83 +162,53 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
117} 162}
118 163
119/* 164/*
120 * Clean up a transaction's checkpoint list. 165 * Clean up transaction's list of buffers submitted for io.
121 * 166 * We wait for any pending IO to complete and remove any clean
122 * We wait for any pending IO to complete and make sure any clean 167 * buffers. Note that we take the buffers in the opposite ordering
123 * buffers are removed from the transaction. 168 * from the one in which they were submitted for IO.
124 *
125 * Return 1 if we performed any actions which might have destroyed the
126 * checkpoint. (journal_remove_checkpoint() deletes the transaction when
127 * the last checkpoint buffer is cleansed)
128 * 169 *
129 * Called with j_list_lock held. 170 * Called with j_list_lock held.
130 */ 171 */
131static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) 172
173static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
132{ 174{
133 struct journal_head *jh, *next_jh, *last_jh; 175 struct journal_head *jh;
134 struct buffer_head *bh; 176 struct buffer_head *bh;
135 int ret = 0; 177 tid_t this_tid;
136 178 int released = 0;
137 assert_spin_locked(&journal->j_list_lock); 179
138 jh = transaction->t_checkpoint_list; 180 this_tid = transaction->t_tid;
139 if (!jh) 181restart:
 140 return 0; 182 /* Did somebody clean up the transaction in the meanwhile? */
141 183 if (journal->j_checkpoint_transactions != transaction ||
142 last_jh = jh->b_cpprev; 184 transaction->t_tid != this_tid)
143 next_jh = jh; 185 return;
144 do { 186 while (!released && transaction->t_checkpoint_io_list) {
145 jh = next_jh; 187 jh = transaction->t_checkpoint_io_list;
146 bh = jh2bh(jh); 188 bh = jh2bh(jh);
189 if (!jbd_trylock_bh_state(bh)) {
190 jbd_sync_bh(journal, bh);
191 spin_lock(&journal->j_list_lock);
192 goto restart;
193 }
147 if (buffer_locked(bh)) { 194 if (buffer_locked(bh)) {
148 atomic_inc(&bh->b_count); 195 atomic_inc(&bh->b_count);
149 spin_unlock(&journal->j_list_lock); 196 spin_unlock(&journal->j_list_lock);
197 jbd_unlock_bh_state(bh);
150 wait_on_buffer(bh); 198 wait_on_buffer(bh);
151 /* the journal_head may have gone by now */ 199 /* the journal_head may have gone by now */
152 BUFFER_TRACE(bh, "brelse"); 200 BUFFER_TRACE(bh, "brelse");
153 __brelse(bh); 201 __brelse(bh);
154 goto out_return_1; 202 spin_lock(&journal->j_list_lock);
155 } 203 goto restart;
156
157 /*
158 * This is foul
159 */
160 if (!jbd_trylock_bh_state(bh)) {
161 jbd_sync_bh(journal, bh);
162 goto out_return_1;
163 } 204 }
164
165 if (jh->b_transaction != NULL) {
166 transaction_t *t = jh->b_transaction;
167 tid_t tid = t->t_tid;
168
169 spin_unlock(&journal->j_list_lock);
170 jbd_unlock_bh_state(bh);
171 log_start_commit(journal, tid);
172 log_wait_commit(journal, tid);
173 goto out_return_1;
174 }
175
176 /* 205 /*
177 * AKPM: I think the buffer_jbddirty test is redundant - it 206 * Now in whatever state the buffer currently is, we know that
178 * shouldn't have NULL b_transaction? 207 * it has been written out and so we can drop it from the list
179 */ 208 */
180 next_jh = jh->b_cpnext; 209 released = __journal_remove_checkpoint(jh);
181 if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { 210 jbd_unlock_bh_state(bh);
182 BUFFER_TRACE(bh, "remove from checkpoint"); 211 }
183 __journal_remove_checkpoint(jh);
184 jbd_unlock_bh_state(bh);
185 journal_remove_journal_head(bh);
186 __brelse(bh);
187 ret = 1;
188 } else {
189 jbd_unlock_bh_state(bh);
190 }
191 } while (jh != last_jh);
192
193 return ret;
194out_return_1:
195 spin_lock(&journal->j_list_lock);
196 return 1;
197} 212}
198 213
199#define NR_BATCH 64 214#define NR_BATCH 64
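
The shape of __wait_cp_io above is worth calling out: any time the walk has to sleep it drops j_list_lock, and on waking it revalidates the transaction and restarts from the list head rather than trusting a stale cursor. A simplified sketch of that pattern, which assumes the caller holds j_list_lock and omits the jbd_lock_bh_state handling the real code needs:

static void example_drain_io_list(journal_t *journal, transaction_t *t)
{
	tid_t tid = t->t_tid;
restart:
	/* The transaction may have been freed (and reused) while we slept. */
	if (journal->j_checkpoint_transactions != t || t->t_tid != tid)
		return;
	while (t->t_checkpoint_io_list) {
		struct journal_head *jh = t->t_checkpoint_io_list;
		struct buffer_head *bh = jh2bh(jh);

		if (buffer_locked(bh)) {
			get_bh(bh);
			spin_unlock(&journal->j_list_lock);
			wait_on_buffer(bh);	/* sleeps; world may change */
			put_bh(bh);
			spin_lock(&journal->j_list_lock);
			goto restart;
		}
		if (__journal_remove_checkpoint(jh))
			return;			/* transaction was freed */
	}
}
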
@@ -203,9 +218,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
203{ 218{
204 int i; 219 int i;
205 220
206 spin_unlock(&journal->j_list_lock);
207 ll_rw_block(SWRITE, *batch_count, bhs); 221 ll_rw_block(SWRITE, *batch_count, bhs);
208 spin_lock(&journal->j_list_lock);
209 for (i = 0; i < *batch_count; i++) { 222 for (i = 0; i < *batch_count; i++) {
210 struct buffer_head *bh = bhs[i]; 223 struct buffer_head *bh = bhs[i];
211 clear_buffer_jwrite(bh); 224 clear_buffer_jwrite(bh);
@@ -221,19 +234,46 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
221 * Return 1 if something happened which requires us to abort the current 234 * Return 1 if something happened which requires us to abort the current
222 * scan of the checkpoint list. 235 * scan of the checkpoint list.
223 * 236 *
224 * Called with j_list_lock held. 237 * Called with j_list_lock held and drops it if 1 is returned
225 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 238 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
226 */ 239 */
227static int __flush_buffer(journal_t *journal, struct journal_head *jh, 240static int __process_buffer(journal_t *journal, struct journal_head *jh,
228 struct buffer_head **bhs, int *batch_count, 241 struct buffer_head **bhs, int *batch_count)
229 int *drop_count)
230{ 242{
231 struct buffer_head *bh = jh2bh(jh); 243 struct buffer_head *bh = jh2bh(jh);
232 int ret = 0; 244 int ret = 0;
233 245
234 if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { 246 if (buffer_locked(bh)) {
235 J_ASSERT_JH(jh, jh->b_transaction == NULL); 247 get_bh(bh);
248 spin_unlock(&journal->j_list_lock);
249 jbd_unlock_bh_state(bh);
250 wait_on_buffer(bh);
251 /* the journal_head may have gone by now */
252 BUFFER_TRACE(bh, "brelse");
253 put_bh(bh);
254 ret = 1;
255 }
256 else if (jh->b_transaction != NULL) {
257 transaction_t *t = jh->b_transaction;
258 tid_t tid = t->t_tid;
236 259
260 spin_unlock(&journal->j_list_lock);
261 jbd_unlock_bh_state(bh);
262 log_start_commit(journal, tid);
263 log_wait_commit(journal, tid);
264 ret = 1;
265 }
266 else if (!buffer_dirty(bh)) {
267 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
268 BUFFER_TRACE(bh, "remove from checkpoint");
269 __journal_remove_checkpoint(jh);
270 spin_unlock(&journal->j_list_lock);
271 jbd_unlock_bh_state(bh);
272 journal_remove_journal_head(bh);
273 put_bh(bh);
274 ret = 1;
275 }
276 else {
237 /* 277 /*
238 * Important: we are about to write the buffer, and 278 * Important: we are about to write the buffer, and
239 * possibly block, while still holding the journal lock. 279 * possibly block, while still holding the journal lock.
@@ -246,45 +286,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
246 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 286 J_ASSERT_BH(bh, !buffer_jwrite(bh));
247 set_buffer_jwrite(bh); 287 set_buffer_jwrite(bh);
248 bhs[*batch_count] = bh; 288 bhs[*batch_count] = bh;
289 __buffer_relink_io(jh);
249 jbd_unlock_bh_state(bh); 290 jbd_unlock_bh_state(bh);
250 (*batch_count)++; 291 (*batch_count)++;
251 if (*batch_count == NR_BATCH) { 292 if (*batch_count == NR_BATCH) {
293 spin_unlock(&journal->j_list_lock);
252 __flush_batch(journal, bhs, batch_count); 294 __flush_batch(journal, bhs, batch_count);
253 ret = 1; 295 ret = 1;
254 } 296 }
255 } else {
256 int last_buffer = 0;
257 if (jh->b_cpnext == jh) {
258 /* We may be about to drop the transaction. Tell the
259 * caller that the lists have changed.
260 */
261 last_buffer = 1;
262 }
263 if (__try_to_free_cp_buf(jh)) {
264 (*drop_count)++;
265 ret = last_buffer;
266 }
267 } 297 }
268 return ret; 298 return ret;
269} 299}
270 300
271/* 301/*
272 * Perform an actual checkpoint. We don't write out only enough to 302 * Perform an actual checkpoint. We take the first transaction on the
273 * satisfy the current blocked requests: rather we submit a reasonably 303 * list of transactions to be checkpointed and send all its buffers
274 * sized chunk of the outstanding data to disk at once for 304 * to disk. We submit larger chunks of data at once.
275 * efficiency. __log_wait_for_space() will retry if we didn't free enough.
276 * 305 *
277 * However, we _do_ take into account the amount requested so that once
278 * the IO has been queued, we can return as soon as enough of it has
279 * completed to disk.
280 *
281 * The journal should be locked before calling this function. 306 * The journal should be locked before calling this function.
282 */ 307 */
283int log_do_checkpoint(journal_t *journal) 308int log_do_checkpoint(journal_t *journal)
284{ 309{
310 transaction_t *transaction;
311 tid_t this_tid;
285 int result; 312 int result;
286 int batch_count = 0;
287 struct buffer_head *bhs[NR_BATCH];
288 313
289 jbd_debug(1, "Start checkpoint\n"); 314 jbd_debug(1, "Start checkpoint\n");
290 315
@@ -299,79 +324,70 @@ int log_do_checkpoint(journal_t *journal)
299 return result; 324 return result;
300 325
301 /* 326 /*
302 * OK, we need to start writing disk blocks. Try to free up a 327 * OK, we need to start writing disk blocks. Take one transaction
303 * quarter of the log in a single checkpoint if we can. 328 * and write it.
304 */ 329 */
330 spin_lock(&journal->j_list_lock);
331 if (!journal->j_checkpoint_transactions)
332 goto out;
333 transaction = journal->j_checkpoint_transactions;
334 this_tid = transaction->t_tid;
335restart:
305 /* 336 /*
306 * AKPM: check this code. I had a feeling a while back that it 337 * If someone cleaned up this transaction while we slept, we're
307 * degenerates into a busy loop at unmount time. 338 * done (maybe it's a new transaction, but it fell at the same
339 * address).
308 */ 340 */
309 spin_lock(&journal->j_list_lock); 341 if (journal->j_checkpoint_transactions == transaction &&
310 while (journal->j_checkpoint_transactions) { 342 transaction->t_tid == this_tid) {
311 transaction_t *transaction; 343 int batch_count = 0;
312 struct journal_head *jh, *last_jh, *next_jh; 344 struct buffer_head *bhs[NR_BATCH];
313 int drop_count = 0; 345 struct journal_head *jh;
314 int cleanup_ret, retry = 0; 346 int retry = 0;
315 tid_t this_tid; 347
316 348 while (!retry && transaction->t_checkpoint_list) {
317 transaction = journal->j_checkpoint_transactions;
318 this_tid = transaction->t_tid;
319 jh = transaction->t_checkpoint_list;
320 last_jh = jh->b_cpprev;
321 next_jh = jh;
322 do {
323 struct buffer_head *bh; 349 struct buffer_head *bh;
324 350
325 jh = next_jh; 351 jh = transaction->t_checkpoint_list;
326 next_jh = jh->b_cpnext;
327 bh = jh2bh(jh); 352 bh = jh2bh(jh);
328 if (!jbd_trylock_bh_state(bh)) { 353 if (!jbd_trylock_bh_state(bh)) {
329 jbd_sync_bh(journal, bh); 354 jbd_sync_bh(journal, bh);
330 spin_lock(&journal->j_list_lock);
331 retry = 1; 355 retry = 1;
332 break; 356 break;
333 } 357 }
334 retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); 358 retry = __process_buffer(journal, jh, bhs,
335 if (cond_resched_lock(&journal->j_list_lock)) { 359 &batch_count);
360 if (!retry &&
361 lock_need_resched(&journal->j_list_lock)) {
362 spin_unlock(&journal->j_list_lock);
336 retry = 1; 363 retry = 1;
337 break; 364 break;
338 } 365 }
339 } while (jh != last_jh && !retry); 366 }
340 367
341 if (batch_count) { 368 if (batch_count) {
369 if (!retry) {
370 spin_unlock(&journal->j_list_lock);
371 retry = 1;
372 }
342 __flush_batch(journal, bhs, &batch_count); 373 __flush_batch(journal, bhs, &batch_count);
343 retry = 1;
344 } 374 }
345 375
376 if (retry) {
377 spin_lock(&journal->j_list_lock);
378 goto restart;
379 }
346 /* 380 /*
347 * If someone cleaned up this transaction while we slept, we're 381 * Now we have cleaned up the first transaction's checkpoint
348 * done 382 * list. Let's clean up the second one.
349 */
350 if (journal->j_checkpoint_transactions != transaction)
351 break;
352 if (retry)
353 continue;
354 /*
355 * Maybe it's a new transaction, but it fell at the same
356 * address
357 */
358 if (transaction->t_tid != this_tid)
359 continue;
360 /*
361 * We have walked the whole transaction list without
362 * finding anything to write to disk. We had better be
363 * able to make some progress or we are in trouble.
364 */ 383 */
365 cleanup_ret = __cleanup_transaction(journal, transaction); 384 __wait_cp_io(journal, transaction);
366 J_ASSERT(drop_count != 0 || cleanup_ret != 0);
367 if (journal->j_checkpoint_transactions != transaction)
368 break;
369 } 385 }
386out:
370 spin_unlock(&journal->j_list_lock); 387 spin_unlock(&journal->j_list_lock);
371 result = cleanup_journal_tail(journal); 388 result = cleanup_journal_tail(journal);
372 if (result < 0) 389 if (result < 0)
373 return result; 390 return result;
374
375 return 0; 391 return 0;
376} 392}
377 393
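
The batching in the rewritten log_do_checkpoint mirrors __flush_batch: dirty checkpoint buffers are collected into a small on-stack array and submitted with a single ll_rw_block() call, amortizing submission overhead, with j_list_lock dropped around the IO. A reduced sketch of the flush step (refcounting and jwrite bookkeeping simplified):

static void example_flush_batch(struct buffer_head **bhs, int *count)
{
	int i;

	ll_rw_block(SWRITE, *count, bhs);	/* submit the whole batch */
	for (i = 0; i < *count; i++)
		__brelse(bhs[i]);		/* drop refs taken at queue time */
	*count = 0;
}
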
@@ -456,52 +472,91 @@ int cleanup_journal_tail(journal_t *journal)
456/* Checkpoint list management */ 472/* Checkpoint list management */
457 473
458/* 474/*
475 * journal_clean_one_cp_list
476 *
477 * Find all the written-back checkpoint buffers in the given list and release them.
478 *
479 * Called with the journal locked.
480 * Called with j_list_lock held.
 481 * Returns number of buffers reaped (for debug)
482 */
483
484static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
485{
486 struct journal_head *last_jh;
487 struct journal_head *next_jh = jh;
488 int ret, freed = 0;
489
490 *released = 0;
491 if (!jh)
492 return 0;
493
494 last_jh = jh->b_cpprev;
495 do {
496 jh = next_jh;
497 next_jh = jh->b_cpnext;
498 /* Use trylock because of the ranking */
499 if (jbd_trylock_bh_state(jh2bh(jh))) {
500 ret = __try_to_free_cp_buf(jh);
501 if (ret) {
502 freed++;
503 if (ret == 2) {
504 *released = 1;
505 return freed;
506 }
507 }
508 }
509 /*
510 * This function only frees up some memory if possible so we
 511 * don't have an obligation to finish processing. Bail out if
512 * preemption requested:
513 */
514 if (need_resched())
515 return freed;
516 } while (jh != last_jh);
517
518 return freed;
519}
520
521/*
459 * journal_clean_checkpoint_list 522 * journal_clean_checkpoint_list
460 * 523 *
461 * Find all the written-back checkpoint buffers in the journal and release them. 524 * Find all the written-back checkpoint buffers in the journal and release them.
462 * 525 *
463 * Called with the journal locked. 526 * Called with the journal locked.
464 * Called with j_list_lock held. 527 * Called with j_list_lock held.
465 * Returns number of bufers reaped (for debug) 528 * Returns number of buffers reaped (for debug)
466 */ 529 */
467 530
468int __journal_clean_checkpoint_list(journal_t *journal) 531int __journal_clean_checkpoint_list(journal_t *journal)
469{ 532{
470 transaction_t *transaction, *last_transaction, *next_transaction; 533 transaction_t *transaction, *last_transaction, *next_transaction;
471 int ret = 0; 534 int ret = 0, released;
472 535
473 transaction = journal->j_checkpoint_transactions; 536 transaction = journal->j_checkpoint_transactions;
474 if (transaction == 0) 537 if (!transaction)
475 goto out; 538 goto out;
476 539
477 last_transaction = transaction->t_cpprev; 540 last_transaction = transaction->t_cpprev;
478 next_transaction = transaction; 541 next_transaction = transaction;
479 do { 542 do {
480 struct journal_head *jh;
481
482 transaction = next_transaction; 543 transaction = next_transaction;
483 next_transaction = transaction->t_cpnext; 544 next_transaction = transaction->t_cpnext;
484 jh = transaction->t_checkpoint_list; 545 ret += journal_clean_one_cp_list(transaction->
485 if (jh) { 546 t_checkpoint_list, &released);
486 struct journal_head *last_jh = jh->b_cpprev; 547 if (need_resched())
487 struct journal_head *next_jh = jh; 548 goto out;
488 549 if (released)
489 do { 550 continue;
490 jh = next_jh; 551 /*
491 next_jh = jh->b_cpnext; 552 * It is essential that we are as careful as in the case of
492 /* Use trylock because of the ranknig */ 553 * t_checkpoint_list with removing the buffer from the list as
493 if (jbd_trylock_bh_state(jh2bh(jh))) 554 * we can possibly see not yet submitted buffers on io_list
494 ret += __try_to_free_cp_buf(jh); 555 */
495 /* 556 ret += journal_clean_one_cp_list(transaction->
496 * This function only frees up some memory 557 t_checkpoint_io_list, &released);
497 * if possible so we dont have an obligation 558 if (need_resched())
498 * to finish processing. Bail out if preemption 559 goto out;
499 * requested:
500 */
501 if (need_resched())
502 goto out;
503 } while (jh != last_jh);
504 }
505 } while (transaction != last_transaction); 560 } while (transaction != last_transaction);
506out: 561out:
507 return ret; 562 return ret;
@@ -516,18 +571,22 @@ out:
516 * buffer updates committed in that transaction have safely been stored 571 * buffer updates committed in that transaction have safely been stored
517 * elsewhere on disk. To achieve this, all of the buffers in a 572 * elsewhere on disk. To achieve this, all of the buffers in a
518 * transaction need to be maintained on the transaction's checkpoint 573 * transaction need to be maintained on the transaction's checkpoint
519 * list until they have been rewritten, at which point this function is 574 * lists until they have been rewritten, at which point this function is
520 * called to remove the buffer from the existing transaction's 575 * called to remove the buffer from the existing transaction's
521 * checkpoint list. 576 * checkpoint lists.
577 *
578 * The function returns 1 if it frees the transaction, 0 otherwise.
522 * 579 *
523 * This function is called with the journal locked. 580 * This function is called with the journal locked.
524 * This function is called with j_list_lock held. 581 * This function is called with j_list_lock held.
 582 * This function is called with jbd_lock_bh_state(jh2bh(jh)) held
525 */ 583 */
526 584
527void __journal_remove_checkpoint(struct journal_head *jh) 585int __journal_remove_checkpoint(struct journal_head *jh)
528{ 586{
529 transaction_t *transaction; 587 transaction_t *transaction;
530 journal_t *journal; 588 journal_t *journal;
589 int ret = 0;
531 590
532 JBUFFER_TRACE(jh, "entry"); 591 JBUFFER_TRACE(jh, "entry");
533 592
@@ -538,8 +597,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
538 journal = transaction->t_journal; 597 journal = transaction->t_journal;
539 598
540 __buffer_unlink(jh); 599 __buffer_unlink(jh);
600 jh->b_cp_transaction = NULL;
541 601
542 if (transaction->t_checkpoint_list != NULL) 602 if (transaction->t_checkpoint_list != NULL ||
603 transaction->t_checkpoint_io_list != NULL)
543 goto out; 604 goto out;
544 JBUFFER_TRACE(jh, "transaction has no more buffers"); 605 JBUFFER_TRACE(jh, "transaction has no more buffers");
545 606
@@ -565,8 +626,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
565 /* Just in case anybody was waiting for more transactions to be 626 /* Just in case anybody was waiting for more transactions to be
566 checkpointed... */ 627 checkpointed... */
567 wake_up(&journal->j_wait_logspace); 628 wake_up(&journal->j_wait_logspace);
629 ret = 1;
568out: 630out:
569 JBUFFER_TRACE(jh, "exit"); 631 JBUFFER_TRACE(jh, "exit");
632 return ret;
570} 633}
571 634
572/* 635/*
@@ -628,6 +691,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
628 J_ASSERT(transaction->t_shadow_list == NULL); 691 J_ASSERT(transaction->t_shadow_list == NULL);
629 J_ASSERT(transaction->t_log_list == NULL); 692 J_ASSERT(transaction->t_log_list == NULL);
630 J_ASSERT(transaction->t_checkpoint_list == NULL); 693 J_ASSERT(transaction->t_checkpoint_list == NULL);
694 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
631 J_ASSERT(transaction->t_updates == 0); 695 J_ASSERT(transaction->t_updates == 0);
632 J_ASSERT(journal->j_committing_transaction != transaction); 696 J_ASSERT(journal->j_committing_transaction != transaction);
633 J_ASSERT(journal->j_running_transaction != transaction); 697 J_ASSERT(journal->j_running_transaction != transaction);
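
The new journal_clean_one_cp_list() above is built around a pattern worth noting: it walks a circular doubly-linked list (b_cpnext/b_cpprev) from which the current node may be freed mid-walk, so it snapshots the tail into last_jh before starting and always fetches next_jh before the current node can disappear. The kernel version additionally takes jbd_trylock_bh_state() per buffer and bails out when need_resched() is set; both are omitted below for brevity. A standalone C sketch of the traversal (node layout and the "written" flag are illustrative stand-ins, not jbd structures):

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        struct node *next, *prev;   /* circular list, like b_cpnext/b_cpprev */
        int written;                /* stands in for "buffer already written back" */
    };

    /*
     * Free every written-back node in one pass. Mirrors the shape of
     * journal_clean_one_cp_list(): snapshot the tail first, grab ->next
     * before the current node can vanish, and report through *released
     * when the list has been emptied so the caller can stop early.
     */
    static int clean_one_list(struct node **head, int *released)
    {
        struct node *cur, *next, *last;
        int freed = 0;

        *released = 0;
        if (!*head)
            return 0;

        last = (*head)->prev;           /* snapshot before nodes vanish */
        next = *head;
        for (;;) {
            cur = next;
            next = cur->next;
            if (cur->written) {
                if (cur->next == cur) { /* sole node: list is now empty */
                    free(cur);
                    *head = NULL;
                    *released = 1;
                    return freed + 1;
                }
                cur->prev->next = cur->next;
                cur->next->prev = cur->prev;
                if (*head == cur)
                    *head = cur->next;
                if (cur == last) {      /* just unlinked the snapshot tail */
                    free(cur);
                    return freed + 1;
                }
                free(cur);
                freed++;
            } else if (cur == last) {
                return freed;           /* one full lap done */
            }
        }
    }

    int main(void)
    {
        struct node *head = NULL;
        int released, freed, i;

        /* Build a 3-node circular list; nodes 0 and 2 are written back. */
        for (i = 0; i < 3; i++) {
            struct node *n = malloc(sizeof(*n));
            n->written = (i != 1);
            if (!head) {
                head = n;
                n->next = n->prev = n;
            } else {
                n->next = head;
                n->prev = head->prev;
                head->prev->next = n;
                head->prev = n;
            }
        }

        freed = clean_one_list(&head, &released);
        printf("freed %d, released %d\n", freed, released); /* freed 2, released 0 */
        /* The one dirty node intentionally remains on the list. */
        return 0;
    }
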
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 002ad2bbc76..29e62d98bae 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -829,7 +829,8 @@ restart_loop:
829 journal->j_committing_transaction = NULL; 829 journal->j_committing_transaction = NULL;
830 spin_unlock(&journal->j_state_lock); 830 spin_unlock(&journal->j_state_lock);
831 831
832 if (commit_transaction->t_checkpoint_list == NULL) { 832 if (commit_transaction->t_checkpoint_list == NULL &&
833 commit_transaction->t_checkpoint_io_list == NULL) {
833 __journal_drop_transaction(journal, commit_transaction); 834 __journal_drop_transaction(journal, commit_transaction);
834 } else { 835 } else {
835 if (journal->j_checkpoint_transactions == NULL) { 836 if (journal->j_checkpoint_transactions == NULL) {
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 3dcc6d2162c..fc3855a1aef 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -757,7 +757,7 @@ jffs_do_readpage_nolock(struct file *file, struct page *page)
757 757
758 read_len = 0; 758 read_len = 0;
759 result = 0; 759 result = 0;
760 offset = page->index << PAGE_CACHE_SHIFT; 760 offset = page_offset(page);
761 761
762 kmap(page); 762 kmap(page);
763 buf = page_address(page); 763 buf = page_address(page);
@@ -1415,7 +1415,7 @@ jffs_file_write(struct file *filp, const char *buf, size_t count,
1415 * This will never trigger with sane page sizes. leave it in 1415 * This will never trigger with sane page sizes. leave it in
1416 * anyway, since I'm thinking about how to merge larger writes 1416 * anyway, since I'm thinking about how to merge larger writes
1417 * (the current idea is to poke a thread that does the actual 1417 * (the current idea is to poke a thread that does the actual
1418 * I/O and starts by doing a down(&inode->i_sem). then we 1418 * I/O and starts by doing a mutex_lock(&inode->i_mutex). then we
1419 * would need to get the page cache pages and have a list of 1419 * would need to get the page cache pages and have a list of
1420 * I/O requests and do write-merging here. 1420 * I/O requests and do write-merging here.
1421 * -- prumpf 1421 * -- prumpf
@@ -1545,7 +1545,7 @@ jffs_commit_write(struct file *filp, struct page *page,
1545{ 1545{
1546 void *addr = page_address(page) + from; 1546 void *addr = page_address(page) + from;
1547 /* XXX: PAGE_CACHE_SHIFT or PAGE_SHIFT */ 1547 /* XXX: PAGE_CACHE_SHIFT or PAGE_SHIFT */
1548 loff_t pos = (page->index<<PAGE_CACHE_SHIFT) + from; 1548 loff_t pos = page_offset(page) + from;
1549 1549
1550 return jffs_file_write(filp, addr, to-from, &pos); 1550 return jffs_file_write(filp, addr, to-from, &pos);
1551} /* jffs_commit_write() */ 1551} /* jffs_commit_write() */
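
Both jffs hunks above replace an open-coded `page->index << PAGE_CACHE_SHIFT` with page_offset(). The helper is not just tidier: page->index is an unsigned long, so on a 32-bit machine the raw shift wraps once the offset reaches 4 GiB, whereas page_offset() widens to loff_t before shifting. A standalone demonstration of the wrap (types chosen to model a 32-bit build; a 12-bit page shift assumed):

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        uint32_t index = 1310720;                       /* the page 5 GiB into a file */

        uint32_t wrong = index << PAGE_SHIFT;           /* wraps modulo 2^32 */
        uint64_t right = (uint64_t)index << PAGE_SHIFT; /* what page_offset() does */

        printf("truncated: %" PRIu32 "\n", wrong);      /* 1073741824 (1 GiB) */
        printf("widened:   %" PRIu64 "\n", right);      /* 5368709120 (5 GiB) */
        return 0;
    }
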
diff --git a/fs/jffs/jffs_fm.c b/fs/jffs/jffs_fm.c
index 053e3a98a27..6da13b309bd 100644
--- a/fs/jffs/jffs_fm.c
+++ b/fs/jffs/jffs_fm.c
@@ -20,6 +20,7 @@
20#include <linux/blkdev.h> 20#include <linux/blkdev.h>
21#include <linux/jffs.h> 21#include <linux/jffs.h>
22#include "jffs_fm.h" 22#include "jffs_fm.h"
23#include "intrep.h"
23 24
24#if defined(JFFS_MARK_OBSOLETE) && JFFS_MARK_OBSOLETE 25#if defined(JFFS_MARK_OBSOLETE) && JFFS_MARK_OBSOLETE
25static int jffs_mark_obsolete(struct jffs_fmcontrol *fmc, __u32 fm_offset); 26static int jffs_mark_obsolete(struct jffs_fmcontrol *fmc, __u32 fm_offset);
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index fff108bb118..70f7a896c04 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -47,7 +47,7 @@ next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c)
47 ic = next_inode(&i, ic, (c))) 47 ic = next_inode(&i, ic, (c)))
48 48
49 49
50static inline void jffs2_build_inode_pass1(struct jffs2_sb_info *c, 50static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
51 struct jffs2_inode_cache *ic) 51 struct jffs2_inode_cache *ic)
52{ 52{
53 struct jffs2_full_dirent *fd; 53 struct jffs2_full_dirent *fd;
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index f193d43a8a5..162af6dfe29 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -82,28 +82,28 @@
82 do { \ 82 do { \
83 printk(JFFS2_ERR_MSG_PREFIX \ 83 printk(JFFS2_ERR_MSG_PREFIX \
84 " (%d) %s: " fmt, current->pid, \ 84 " (%d) %s: " fmt, current->pid, \
85 __FUNCTION__, ##__VA_ARGS__); \ 85 __FUNCTION__ , ##__VA_ARGS__); \
86 } while(0) 86 } while(0)
87 87
88#define JFFS2_WARNING(fmt, ...) \ 88#define JFFS2_WARNING(fmt, ...) \
89 do { \ 89 do { \
90 printk(JFFS2_WARN_MSG_PREFIX \ 90 printk(JFFS2_WARN_MSG_PREFIX \
91 " (%d) %s: " fmt, current->pid, \ 91 " (%d) %s: " fmt, current->pid, \
92 __FUNCTION__, ##__VA_ARGS__); \ 92 __FUNCTION__ , ##__VA_ARGS__); \
93 } while(0) 93 } while(0)
94 94
95#define JFFS2_NOTICE(fmt, ...) \ 95#define JFFS2_NOTICE(fmt, ...) \
96 do { \ 96 do { \
97 printk(JFFS2_NOTICE_MSG_PREFIX \ 97 printk(JFFS2_NOTICE_MSG_PREFIX \
98 " (%d) %s: " fmt, current->pid, \ 98 " (%d) %s: " fmt, current->pid, \
99 __FUNCTION__, ##__VA_ARGS__); \ 99 __FUNCTION__ , ##__VA_ARGS__); \
100 } while(0) 100 } while(0)
101 101
102#define JFFS2_DEBUG(fmt, ...) \ 102#define JFFS2_DEBUG(fmt, ...) \
103 do { \ 103 do { \
104 printk(JFFS2_DBG_MSG_PREFIX \ 104 printk(JFFS2_DBG_MSG_PREFIX \
105 " (%d) %s: " fmt, current->pid, \ 105 " (%d) %s: " fmt, current->pid, \
106 __FUNCTION__, ##__VA_ARGS__); \ 106 __FUNCTION__ , ##__VA_ARGS__); \
107 } while(0) 107 } while(0)
108 108
109/* 109/*
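
The only change in these four macros is the space inserted before the comma in `__FUNCTION__ , ##__VA_ARGS__`, by all appearances a compatibility workaround for older gcc's handling of the paste rather than a functional change (hedged: the motivation is not stated in the diff). The `, ##__VA_ARGS__` idiom itself is the interesting part: this GNU extension deletes the preceding comma when no variadic arguments are supplied, so one macro serves both cases. A self-contained example, using standard __func__ where the kernel macros use __FUNCTION__, and a constant standing in for current->pid:

    #include <stdio.h>

    /* GNU extension: "##" before __VA_ARGS__ swallows the preceding comma
     * when the variadic list is empty. The 42 stands in for current->pid. */
    #define DBG(fmt, ...) \
            printf("(%d) %s: " fmt "\n", 42, __func__ , ##__VA_ARGS__)

    int main(void)
    {
        DBG("no extra arguments");     /* trailing comma removed by ## */
        DBG("value = %d", 7);          /* normal expansion */
        return 0;
    }
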
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 543420665c5..09e5d10b884 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -11,6 +11,7 @@
11 * 11 *
12 */ 12 */
13 13
14#include <linux/capability.h>
14#include <linux/config.h> 15#include <linux/config.h>
15#include <linux/kernel.h> 16#include <linux/kernel.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
@@ -234,6 +235,7 @@ void jffs2_read_inode (struct inode *inode)
234 c = JFFS2_SB_INFO(inode->i_sb); 235 c = JFFS2_SB_INFO(inode->i_sb);
235 236
236 jffs2_init_inode_info(f); 237 jffs2_init_inode_info(f);
238 down(&f->sem);
237 239
238 ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node); 240 ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node);
239 241
@@ -400,6 +402,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
400 402
401 f = JFFS2_INODE_INFO(inode); 403 f = JFFS2_INODE_INFO(inode);
402 jffs2_init_inode_info(f); 404 jffs2_init_inode_info(f);
405 down(&f->sem);
403 406
404 memset(ri, 0, sizeof(*ri)); 407 memset(ri, 0, sizeof(*ri));
405 /* Set OS-specific defaults for new inodes */ 408 /* Set OS-specific defaults for new inodes */
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index c79eebb8ab3..b635e167a3f 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -134,7 +134,7 @@ static void jffs2_fragtree_insert(struct jffs2_node_frag *newfrag, struct jffs2_
134/* 134/*
135 * Allocate and initializes a new fragment. 135 * Allocate and initializes a new fragment.
136 */ 136 */
137static inline struct jffs2_node_frag * new_fragment(struct jffs2_full_dnode *fn, uint32_t ofs, uint32_t size) 137static struct jffs2_node_frag * new_fragment(struct jffs2_full_dnode *fn, uint32_t ofs, uint32_t size)
138{ 138{
139 struct jffs2_node_frag *newfrag; 139 struct jffs2_node_frag *newfrag;
140 140
@@ -513,7 +513,7 @@ free_out:
513 * 513 *
514 * Checks the node if we are in the checking stage. 514 * Checks the node if we are in the checking stage.
515 */ 515 */
516static inline int check_node(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn) 516static int check_node(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn)
517{ 517{
518 int ret; 518 int ret;
519 519
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 0e7456ec99f..3e51dd1da8a 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -284,9 +284,6 @@ int jffs2_fill_scan_buf (struct jffs2_sb_info *c, void *buf,
284 D1(printk(KERN_WARNING "Read at 0x%x gave only 0x%zx bytes\n", ofs, retlen)); 284 D1(printk(KERN_WARNING "Read at 0x%x gave only 0x%zx bytes\n", ofs, retlen));
285 return -EIO; 285 return -EIO;
286 } 286 }
287 D2(printk(KERN_DEBUG "Read 0x%x bytes from 0x%08x into buf\n", len, ofs));
288 D2(printk(KERN_DEBUG "000: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n",
289 buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9], buf[10], buf[11], buf[12], buf[13], buf[14], buf[15]));
290 return 0; 287 return 0;
291} 288}
292 289
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9e0b5458d9c..93883817cbd 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -51,7 +51,7 @@ static void jffs2_i_init_once(void * foo, kmem_cache_t * cachep, unsigned long f
51 51
52 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 52 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
53 SLAB_CTOR_CONSTRUCTOR) { 53 SLAB_CTOR_CONSTRUCTOR) {
54 init_MUTEX_LOCKED(&ei->sem); 54 init_MUTEX(&ei->sem);
55 inode_init_once(&ei->vfs_inode); 55 inode_init_once(&ei->vfs_inode);
56 } 56 }
57} 57}
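
Read together with the two fs.c hunks above (the added down(&f->sem) calls in jffs2_read_inode and jffs2_new_inode), this changes the lifetime of f->sem: the slab constructor now creates the semaphore unlocked (init_MUTEX, count 1) rather than pre-locked (init_MUTEX_LOCKED, count 0), and each path that populates the in-core inode acquires it explicitly. A condensed sketch of the resulting pattern in the 2.6-era semaphore API; the matching release points are outside these hunks and assumed to sit where inode setup completes:

    /* Slab constructor: one-time init, leave the semaphore free. */
    init_MUTEX(&ei->sem);              /* was init_MUTEX_LOCKED(&ei->sem) */
    inode_init_once(&ei->vfs_inode);

    /* Inode setup paths now take it themselves before touching f: */
    jffs2_init_inode_info(f);
    down(&f->sem);                     /* hold across jffs2_do_read_inode() */
    ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node);
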
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index adb9f05093b..038d8b76d11 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -302,8 +302,7 @@ int dbSync(struct inode *ipbmap)
302 /* 302 /*
303 * write out dirty pages of bmap 303 * write out dirty pages of bmap
304 */ 304 */
305 filemap_fdatawrite(ipbmap->i_mapping); 305 filemap_write_and_wait(ipbmap->i_mapping);
306 filemap_fdatawait(ipbmap->i_mapping);
307 306
308 diWriteSpecial(ipbmap, 0); 307 diWriteSpecial(ipbmap, 0);
309 308
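
This is the first of several hunks (jfs_imap.c, jfs_umount.c, resize.c, jfs/super.c, and a commented-out example in jfs_txnmgr.c below) that collapse the back-to-back writeout-then-wait pair into the filemap_write_and_wait() helper. The substitution is one-for-one, and the helper also returns the first error from either phase, which the open-coded sequence simply dropped. Schematically, with mapping standing for any address_space and the error check shown only to make the contract visible (the jfs call sites ignore it):

    /* Before: start writeback, then block until the I/O completes. */
    filemap_fdatawrite(mapping);
    filemap_fdatawait(mapping);

    /* After: one call, and an error from either phase is reported. */
    err = filemap_write_and_wait(mapping);
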
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 28201b194f5..31b4aa13dd4 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -265,8 +265,7 @@ int diSync(struct inode *ipimap)
265 /* 265 /*
266 * write out dirty pages of imap 266 * write out dirty pages of imap
267 */ 267 */
268 filemap_fdatawrite(ipimap->i_mapping); 268 filemap_write_and_wait(ipimap->i_mapping);
269 filemap_fdatawait(ipimap->i_mapping);
270 269
271 diWriteSpecial(ipimap, 0); 270 diWriteSpecial(ipimap, 0);
272 271
@@ -565,8 +564,7 @@ void diFreeSpecial(struct inode *ip)
565 jfs_err("diFreeSpecial called with NULL ip!"); 564 jfs_err("diFreeSpecial called with NULL ip!");
566 return; 565 return;
567 } 566 }
568 filemap_fdatawrite(ip->i_mapping); 567 filemap_write_and_wait(ip->i_mapping);
569 filemap_fdatawait(ip->i_mapping);
570 truncate_inode_pages(ip->i_mapping, 0); 568 truncate_inode_pages(ip->i_mapping, 0);
571 iput(ip); 569 iput(ip);
572} 570}
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index c0fd7b3eadc..dc21a5bd54d 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -58,7 +58,7 @@ struct jfs_inode_info {
58 /* 58 /*
59 * rdwrlock serializes xtree between reads & writes and synchronizes 59 * rdwrlock serializes xtree between reads & writes and synchronizes
60 * changes to special inodes. It's use would be redundant on 60 * changes to special inodes. It's use would be redundant on
61 * directories since the i_sem taken in the VFS is sufficient. 61 * directories since the i_mutex taken in the VFS is sufficient.
62 */ 62 */
63 struct rw_semaphore rdwrlock; 63 struct rw_semaphore rdwrlock;
64 /* 64 /*
@@ -68,7 +68,7 @@ struct jfs_inode_info {
68 * inode is blocked in txBegin or TxBeginAnon 68 * inode is blocked in txBegin or TxBeginAnon
69 */ 69 */
70 struct semaphore commit_sem; 70 struct semaphore commit_sem;
71 /* xattr_sem allows us to access the xattrs without taking i_sem */ 71 /* xattr_sem allows us to access the xattrs without taking i_mutex */
72 struct rw_semaphore xattr_sem; 72 struct rw_semaphore xattr_sem;
73 lid_t xtlid; /* lid of xtree lock on directory */ 73 lid_t xtlid; /* lid of xtree lock on directory */
74#ifdef CONFIG_JFS_POSIX_ACL 74#ifdef CONFIG_JFS_POSIX_ACL
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index b660c93c92d..2ddb6b892bc 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1231,10 +1231,8 @@ int txCommit(tid_t tid, /* transaction identifier */
1231 * when we don't need to worry about it at all. 1231 * when we don't need to worry about it at all.
1232 * 1232 *
1233 * if ((!S_ISDIR(ip->i_mode)) 1233 * if ((!S_ISDIR(ip->i_mode))
1234 * && (tblk->flag & COMMIT_DELETE) == 0) { 1234 * && (tblk->flag & COMMIT_DELETE) == 0)
1235 * filemap_fdatawrite(ip->i_mapping); 1235 * filemap_write_and_wait(ip->i_mapping);
1236 * filemap_fdatawait(ip->i_mapping);
1237 * }
1238 */ 1236 */
1239 1237
1240 /* 1238 /*
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index 5cf91785b54..21eaf7ac0fc 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -108,8 +108,7 @@ int jfs_umount(struct super_block *sb)
108 * Make sure all metadata makes it to disk before we mark 108 * Make sure all metadata makes it to disk before we mark
109 * the superblock as clean 109 * the superblock as clean
110 */ 110 */
111 filemap_fdatawrite(sbi->direct_inode->i_mapping); 111 filemap_write_and_wait(sbi->direct_inode->i_mapping);
112 filemap_fdatawait(sbi->direct_inode->i_mapping);
113 112
114 /* 113 /*
115 * ensure all file system file pages are propagated to their 114 * ensure all file system file pages are propagated to their
@@ -161,8 +160,7 @@ int jfs_umount_rw(struct super_block *sb)
161 * mark the superblock clean before everything is flushed to 160 * mark the superblock clean before everything is flushed to
162 * disk. 161 * disk.
163 */ 162 */
164 filemap_fdatawrite(sbi->direct_inode->i_mapping); 163 filemap_write_and_wait(sbi->direct_inode->i_mapping);
165 filemap_fdatawait(sbi->direct_inode->i_mapping);
166 164
167 updateSuper(sb, FM_CLEAN); 165 updateSuper(sb, FM_CLEAN);
168 166
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index c6dc254d325..45180361871 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -376,8 +376,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
376 * by txCommit(); 376 * by txCommit();
377 */ 377 */
378 filemap_fdatawait(ipbmap->i_mapping); 378 filemap_fdatawait(ipbmap->i_mapping);
379 filemap_fdatawrite(ipbmap->i_mapping); 379 filemap_write_and_wait(ipbmap->i_mapping);
380 filemap_fdatawait(ipbmap->i_mapping);
381 diWriteSpecial(ipbmap, 0); 380 diWriteSpecial(ipbmap, 0);
382 381
383 newPage = nPages; /* first new page number */ 382 newPage = nPages; /* first new page number */
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 4226af3ea91..8d31f133643 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -502,8 +502,7 @@ out_no_rw:
502 jfs_err("jfs_umount failed with return code %d", rc); 502 jfs_err("jfs_umount failed with return code %d", rc);
503 } 503 }
504out_mount_failed: 504out_mount_failed:
505 filemap_fdatawrite(sbi->direct_inode->i_mapping); 505 filemap_write_and_wait(sbi->direct_inode->i_mapping);
506 filemap_fdatawait(sbi->direct_inode->i_mapping);
507 truncate_inode_pages(sbi->direct_inode->i_mapping, 0); 506 truncate_inode_pages(sbi->direct_inode->i_mapping, 0);
508 make_bad_inode(sbi->direct_inode); 507 make_bad_inode(sbi->direct_inode);
509 iput(sbi->direct_inode); 508 iput(sbi->direct_inode);
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 23aa5066b5a..f23048f9471 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -17,6 +17,7 @@
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 19
20#include <linux/capability.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/xattr.h> 22#include <linux/xattr.h>
22#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
@@ -83,21 +84,6 @@ struct ea_buffer {
83#define EA_NEW 0x0004 84#define EA_NEW 0x0004
84#define EA_MALLOC 0x0008 85#define EA_MALLOC 0x0008
85 86
86/* Namespaces */
87#define XATTR_SYSTEM_PREFIX "system."
88#define XATTR_SYSTEM_PREFIX_LEN (sizeof (XATTR_SYSTEM_PREFIX) - 1)
89
90#define XATTR_USER_PREFIX "user."
91#define XATTR_USER_PREFIX_LEN (sizeof (XATTR_USER_PREFIX) - 1)
92
93#define XATTR_OS2_PREFIX "os2."
94#define XATTR_OS2_PREFIX_LEN (sizeof (XATTR_OS2_PREFIX) - 1)
95
96/* XATTR_SECURITY_PREFIX is defined in include/linux/xattr.h */
97#define XATTR_SECURITY_PREFIX_LEN (sizeof (XATTR_SECURITY_PREFIX) - 1)
98
99#define XATTR_TRUSTED_PREFIX "trusted."
100#define XATTR_TRUSTED_PREFIX_LEN (sizeof (XATTR_TRUSTED_PREFIX) - 1)
101 87
102/* 88/*
103 * These three routines are used to recognize on-disk extended attributes 89 * These three routines are used to recognize on-disk extended attributes
@@ -773,36 +759,23 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
773static int can_set_xattr(struct inode *inode, const char *name, 759static int can_set_xattr(struct inode *inode, const char *name,
774 const void *value, size_t value_len) 760 const void *value, size_t value_len)
775{ 761{
776 if (IS_RDONLY(inode)) 762 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
777 return -EROFS;
778
779 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
780 return -EPERM;
781
782 if(strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0)
783 /*
784 * "system.*"
785 */
786 return can_set_system_xattr(inode, name, value, value_len); 763 return can_set_system_xattr(inode, name, value, value_len);
787 764
788 if(strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) 765 /*
789 return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM); 766 * Don't allow setting an attribute in an unknown namespace.
790 767 */
791#ifdef CONFIG_JFS_SECURITY 768 if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
792 if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) 769 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
793 == 0) 770 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
794 return 0; /* Leave it to the security module */ 771 strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))
795#endif
796
797 if((strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) != 0) &&
798 (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) != 0))
799 return -EOPNOTSUPP; 772 return -EOPNOTSUPP;
800 773
801 if (!S_ISREG(inode->i_mode) && 774 if (!S_ISREG(inode->i_mode) &&
802 (!S_ISDIR(inode->i_mode) || inode->i_mode &S_ISVTX)) 775 (!S_ISDIR(inode->i_mode) || inode->i_mode &S_ISVTX))
803 return -EPERM; 776 return -EPERM;
804 777
805 return permission(inode, MAY_WRITE, NULL); 778 return 0;
806} 779}
807 780
808int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, 781int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
@@ -972,22 +945,6 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
972 return rc; 945 return rc;
973} 946}
974 947
975static int can_get_xattr(struct inode *inode, const char *name)
976{
977#ifdef CONFIG_JFS_SECURITY
978 if(strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0)
979 return 0;
980#endif
981
982 if(strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0)
983 return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM);
984
985 if(strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0)
986 return 0;
987
988 return permission(inode, MAY_READ, NULL);
989}
990
991ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, 948ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
992 size_t buf_size) 949 size_t buf_size)
993{ 950{
@@ -998,12 +955,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
998 ssize_t size; 955 ssize_t size;
999 int namelen = strlen(name); 956 int namelen = strlen(name);
1000 char *os2name = NULL; 957 char *os2name = NULL;
1001 int rc;
1002 char *value; 958 char *value;
1003 959
1004 if ((rc = can_get_xattr(inode, name)))
1005 return rc;
1006
1007 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { 960 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
1008 os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, 961 os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
1009 GFP_KERNEL); 962 GFP_KERNEL);
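
The rewritten can_set_xattr() keeps only what is jfs-specific: which attribute namespaces the filesystem stores, plus the sticky/regular-file restriction. The EROFS, immutable/append, and permission() checks disappear, presumably because the generic VFS xattr path now performs them before the filesystem hook runs, and the XATTR_*_PREFIX constants deleted at the top of the file are evidently provided by a shared header from here on. The namespace test itself is a prefix whitelist; a standalone rendering with the prefix macros inlined for illustration:

    #include <stdio.h>
    #include <string.h>

    #define XATTR_SECURITY_PREFIX "security."
    #define XATTR_TRUSTED_PREFIX  "trusted."
    #define XATTR_USER_PREFIX     "user."
    #define XATTR_OS2_PREFIX      "os2."

    /* Nonzero if the attribute name is in a namespace jfs can store. */
    static int known_namespace(const char *name)
    {
        return !strncmp(name, XATTR_SECURITY_PREFIX, strlen(XATTR_SECURITY_PREFIX)) ||
               !strncmp(name, XATTR_TRUSTED_PREFIX,  strlen(XATTR_TRUSTED_PREFIX))  ||
               !strncmp(name, XATTR_USER_PREFIX,     strlen(XATTR_USER_PREFIX))     ||
               !strncmp(name, XATTR_OS2_PREFIX,      strlen(XATTR_OS2_PREFIX));
    }

    int main(void)
    {
        printf("%d\n", known_namespace("user.origin")); /* 1 */
        printf("%d\n", known_namespace("foo.bar"));     /* 0 -> -EOPNOTSUPP */
        return 0;
    }
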
diff --git a/fs/libfs.c b/fs/libfs.c
index 58101dff2c6..63c020e6589 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -74,7 +74,7 @@ int dcache_dir_close(struct inode *inode, struct file *file)
74 74
75loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) 75loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
76{ 76{
77 down(&file->f_dentry->d_inode->i_sem); 77 mutex_lock(&file->f_dentry->d_inode->i_mutex);
78 switch (origin) { 78 switch (origin) {
79 case 1: 79 case 1:
80 offset += file->f_pos; 80 offset += file->f_pos;
@@ -82,7 +82,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
82 if (offset >= 0) 82 if (offset >= 0)
83 break; 83 break;
84 default: 84 default:
85 up(&file->f_dentry->d_inode->i_sem); 85 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
86 return -EINVAL; 86 return -EINVAL;
87 } 87 }
88 if (offset != file->f_pos) { 88 if (offset != file->f_pos) {
@@ -93,20 +93,20 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
93 loff_t n = file->f_pos - 2; 93 loff_t n = file->f_pos - 2;
94 94
95 spin_lock(&dcache_lock); 95 spin_lock(&dcache_lock);
96 list_del(&cursor->d_child); 96 list_del(&cursor->d_u.d_child);
97 p = file->f_dentry->d_subdirs.next; 97 p = file->f_dentry->d_subdirs.next;
98 while (n && p != &file->f_dentry->d_subdirs) { 98 while (n && p != &file->f_dentry->d_subdirs) {
99 struct dentry *next; 99 struct dentry *next;
100 next = list_entry(p, struct dentry, d_child); 100 next = list_entry(p, struct dentry, d_u.d_child);
101 if (!d_unhashed(next) && next->d_inode) 101 if (!d_unhashed(next) && next->d_inode)
102 n--; 102 n--;
103 p = p->next; 103 p = p->next;
104 } 104 }
105 list_add_tail(&cursor->d_child, p); 105 list_add_tail(&cursor->d_u.d_child, p);
106 spin_unlock(&dcache_lock); 106 spin_unlock(&dcache_lock);
107 } 107 }
108 } 108 }
109 up(&file->f_dentry->d_inode->i_sem); 109 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
110 return offset; 110 return offset;
111} 111}
112 112
@@ -126,7 +126,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
126{ 126{
127 struct dentry *dentry = filp->f_dentry; 127 struct dentry *dentry = filp->f_dentry;
128 struct dentry *cursor = filp->private_data; 128 struct dentry *cursor = filp->private_data;
129 struct list_head *p, *q = &cursor->d_child; 129 struct list_head *p, *q = &cursor->d_u.d_child;
130 ino_t ino; 130 ino_t ino;
131 int i = filp->f_pos; 131 int i = filp->f_pos;
132 132
@@ -153,7 +153,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
153 } 153 }
154 for (p=q->next; p != &dentry->d_subdirs; p=p->next) { 154 for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
155 struct dentry *next; 155 struct dentry *next;
156 next = list_entry(p, struct dentry, d_child); 156 next = list_entry(p, struct dentry, d_u.d_child);
157 if (d_unhashed(next) || !next->d_inode) 157 if (d_unhashed(next) || !next->d_inode)
158 continue; 158 continue;
159 159
@@ -261,7 +261,7 @@ int simple_empty(struct dentry *dentry)
261 int ret = 0; 261 int ret = 0;
262 262
263 spin_lock(&dcache_lock); 263 spin_lock(&dcache_lock);
264 list_for_each_entry(child, &dentry->d_subdirs, d_child) 264 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
265 if (simple_positive(child)) 265 if (simple_positive(child))
266 goto out; 266 goto out;
267 ret = 1; 267 ret = 1;
@@ -356,7 +356,7 @@ int simple_commit_write(struct file *file, struct page *page,
356 356
357 /* 357 /*
358 * No need to use i_size_read() here, the i_size 358 * No need to use i_size_read() here, the i_size
359 * cannot change under us because we hold the i_sem. 359 * cannot change under us because we hold the i_mutex.
360 */ 360 */
361 if (pos > inode->i_size) 361 if (pos > inode->i_size)
362 i_size_write(inode, pos); 362 i_size_write(inode, pos);
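
The locking changes in libfs.c belong to the tree-wide i_sem-to-i_mutex conversion that runs through this whole diff (see namei.c below and the comment updates in jfs_incore.h): the counting semaphore formerly guarding inode state becomes a dedicated mutex. Wherever the semaphore was used as a plain lock, the translation is mechanical:

    /* Before: counting semaphore pressed into service as a lock */
    down(&dir->i_sem);
    /* ... operate on the directory ... */
    up(&dir->i_sem);

    /* After: a real mutex */
    mutex_lock(&dir->i_mutex);
    /* ... operate on the directory ... */
    mutex_unlock(&dir->i_mutex);

The d_child to d_u.d_child renames in the same libfs hunks are an independent change folded into this merge: those dentry list fields were moved into a union (d_u), so traversals now spell out the union member.
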
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 006bb9e1457..3eaf6e70108 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -157,6 +157,8 @@ void nlmclnt_mark_reclaim(struct nlm_host *host)
157 inode = fl->fl_file->f_dentry->d_inode; 157 inode = fl->fl_file->f_dentry->d_inode;
158 if (inode->i_sb->s_magic != NFS_SUPER_MAGIC) 158 if (inode->i_sb->s_magic != NFS_SUPER_MAGIC)
159 continue; 159 continue;
160 if (fl->fl_u.nfs_fl.owner == NULL)
161 continue;
160 if (fl->fl_u.nfs_fl.owner->host != host) 162 if (fl->fl_u.nfs_fl.owner->host != host)
161 continue; 163 continue;
162 if (!(fl->fl_u.nfs_fl.flags & NFS_LCK_GRANTED)) 164 if (!(fl->fl_u.nfs_fl.flags & NFS_LCK_GRANTED))
@@ -226,6 +228,8 @@ restart:
226 inode = fl->fl_file->f_dentry->d_inode; 228 inode = fl->fl_file->f_dentry->d_inode;
227 if (inode->i_sb->s_magic != NFS_SUPER_MAGIC) 229 if (inode->i_sb->s_magic != NFS_SUPER_MAGIC)
228 continue; 230 continue;
231 if (fl->fl_u.nfs_fl.owner == NULL)
232 continue;
229 if (fl->fl_u.nfs_fl.owner->host != host) 233 if (fl->fl_u.nfs_fl.owner->host != host)
230 continue; 234 continue;
231 if (!(fl->fl_u.nfs_fl.flags & NFS_LCK_RECLAIM)) 235 if (!(fl->fl_u.nfs_fl.flags & NFS_LCK_RECLAIM))
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c5a33648e9f..14552403957 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -26,11 +26,12 @@
26static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); 26static int nlmclnt_test(struct nlm_rqst *, struct file_lock *);
27static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); 27static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *);
28static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *); 28static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *);
29static void nlmclnt_unlock_callback(struct rpc_task *);
30static void nlmclnt_cancel_callback(struct rpc_task *);
31static int nlm_stat_to_errno(u32 stat); 29static int nlm_stat_to_errno(u32 stat);
32static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host); 30static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host);
33 31
32static const struct rpc_call_ops nlmclnt_unlock_ops;
33static const struct rpc_call_ops nlmclnt_cancel_ops;
34
34/* 35/*
35 * Cookie counter for NLM requests 36 * Cookie counter for NLM requests
36 */ 37 */
@@ -221,8 +222,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
221 goto done; 222 goto done;
222 } 223 }
223 clnt->cl_softrtry = nfssrv->client->cl_softrtry; 224 clnt->cl_softrtry = nfssrv->client->cl_softrtry;
224 clnt->cl_intr = nfssrv->client->cl_intr; 225 clnt->cl_intr = nfssrv->client->cl_intr;
225 clnt->cl_chatty = nfssrv->client->cl_chatty;
226 } 226 }
227 227
228 /* Keep the old signal mask */ 228 /* Keep the old signal mask */
@@ -399,8 +399,7 @@ in_grace_period:
399/* 399/*
400 * Generic NLM call, async version. 400 * Generic NLM call, async version.
401 */ 401 */
402int 402int nlmsvc_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
403nlmsvc_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback)
404{ 403{
405 struct nlm_host *host = req->a_host; 404 struct nlm_host *host = req->a_host;
406 struct rpc_clnt *clnt; 405 struct rpc_clnt *clnt;
@@ -419,13 +418,12 @@ nlmsvc_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback)
419 msg.rpc_proc = &clnt->cl_procinfo[proc]; 418 msg.rpc_proc = &clnt->cl_procinfo[proc];
420 419
421 /* bootstrap and kick off the async RPC call */ 420 /* bootstrap and kick off the async RPC call */
422 status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, callback, req); 421 status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req);
423 422
424 return status; 423 return status;
425} 424}
426 425
427static int 426static int nlmclnt_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
428nlmclnt_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback)
429{ 427{
430 struct nlm_host *host = req->a_host; 428 struct nlm_host *host = req->a_host;
431 struct rpc_clnt *clnt; 429 struct rpc_clnt *clnt;
@@ -448,7 +446,7 @@ nlmclnt_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback)
448 /* Increment host refcount */ 446 /* Increment host refcount */
449 nlm_get_host(host); 447 nlm_get_host(host);
450 /* bootstrap and kick off the async RPC call */ 448 /* bootstrap and kick off the async RPC call */
451 status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, callback, req); 449 status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req);
452 if (status < 0) 450 if (status < 0)
453 nlm_release_host(host); 451 nlm_release_host(host);
454 return status; 452 return status;
@@ -664,7 +662,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
664 662
665 if (req->a_flags & RPC_TASK_ASYNC) { 663 if (req->a_flags & RPC_TASK_ASYNC) {
666 status = nlmclnt_async_call(req, NLMPROC_UNLOCK, 664 status = nlmclnt_async_call(req, NLMPROC_UNLOCK,
667 nlmclnt_unlock_callback); 665 &nlmclnt_unlock_ops);
668 /* Hrmf... Do the unlock early since locks_remove_posix() 666 /* Hrmf... Do the unlock early since locks_remove_posix()
669 * really expects us to free the lock synchronously */ 667 * really expects us to free the lock synchronously */
670 do_vfs_lock(fl); 668 do_vfs_lock(fl);
@@ -692,10 +690,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
692 return -ENOLCK; 690 return -ENOLCK;
693} 691}
694 692
695static void 693static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
696nlmclnt_unlock_callback(struct rpc_task *task)
697{ 694{
698 struct nlm_rqst *req = (struct nlm_rqst *) task->tk_calldata; 695 struct nlm_rqst *req = data;
699 int status = req->a_res.status; 696 int status = req->a_res.status;
700 697
701 if (RPC_ASSASSINATED(task)) 698 if (RPC_ASSASSINATED(task))
@@ -722,6 +719,10 @@ die:
722 rpc_restart_call(task); 719 rpc_restart_call(task);
723} 720}
724 721
722static const struct rpc_call_ops nlmclnt_unlock_ops = {
723 .rpc_call_done = nlmclnt_unlock_callback,
724};
725
725/* 726/*
726 * Cancel a blocked lock request. 727 * Cancel a blocked lock request.
727 * We always use an async RPC call for this in order not to hang a 728 * We always use an async RPC call for this in order not to hang a
@@ -750,8 +751,7 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl)
750 751
751 nlmclnt_setlockargs(req, fl); 752 nlmclnt_setlockargs(req, fl);
752 753
753 status = nlmclnt_async_call(req, NLMPROC_CANCEL, 754 status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
754 nlmclnt_cancel_callback);
755 if (status < 0) { 755 if (status < 0) {
756 nlmclnt_release_lockargs(req); 756 nlmclnt_release_lockargs(req);
757 kfree(req); 757 kfree(req);
@@ -765,10 +765,9 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl)
765 return status; 765 return status;
766} 766}
767 767
768static void 768static void nlmclnt_cancel_callback(struct rpc_task *task, void *data)
769nlmclnt_cancel_callback(struct rpc_task *task)
770{ 769{
771 struct nlm_rqst *req = (struct nlm_rqst *) task->tk_calldata; 770 struct nlm_rqst *req = data;
772 771
773 if (RPC_ASSASSINATED(task)) 772 if (RPC_ASSASSINATED(task))
774 goto die; 773 goto die;
@@ -807,6 +806,10 @@ retry_cancel:
807 rpc_delay(task, 30 * HZ); 806 rpc_delay(task, 30 * HZ);
808} 807}
809 808
809static const struct rpc_call_ops nlmclnt_cancel_ops = {
810 .rpc_call_done = nlmclnt_cancel_callback,
811};
812
810/* 813/*
811 * Convert an NLM status code to a generic kernel errno 814 * Convert an NLM status code to a generic kernel errno
812 */ 815 */
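
Every async-callback site in lockd changes the same way in this patch: instead of passing a bare rpc_action function pointer that digs its argument out of task->tk_calldata, the caller hands rpc_call_async() a const struct rpc_call_ops whose rpc_call_done hook receives the calldata as a typed-void second parameter. A condensed before/after sketch using the shapes visible in these hunks (names other than the ops field are illustrative):

    /* Before: the callback signature carries only the task */
    static void my_callback(struct rpc_task *task)
    {
        struct nlm_rqst *req = (struct nlm_rqst *) task->tk_calldata;
        /* ... */
    }
    status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, my_callback, req);

    /* After: a const ops table; calldata arrives as an argument */
    static void my_callback(struct rpc_task *task, void *data)
    {
        struct nlm_rqst *req = data;
        /* ... */
    }
    static const struct rpc_call_ops my_ops = {
        .rpc_call_done = my_callback,
    };
    status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, &my_ops, req);

The same conversion repeats below in svc4proc.c, svclock.c, and svcproc.c.
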
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index c4c8601096e..82f7a0b1d8a 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -177,7 +177,7 @@ nlm_bind_host(struct nlm_host *host)
177 if ((clnt = host->h_rpcclnt) != NULL) { 177 if ((clnt = host->h_rpcclnt) != NULL) {
178 xprt = clnt->cl_xprt; 178 xprt = clnt->cl_xprt;
179 if (time_after_eq(jiffies, host->h_nextrebind)) { 179 if (time_after_eq(jiffies, host->h_nextrebind)) {
180 clnt->cl_port = 0; 180 rpc_force_rebind(clnt);
181 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 181 host->h_nextrebind = jiffies + NLM_HOST_REBIND;
182 dprintk("lockd: next rebind in %ld jiffies\n", 182 dprintk("lockd: next rebind in %ld jiffies\n",
183 host->h_nextrebind - jiffies); 183 host->h_nextrebind - jiffies);
@@ -217,7 +217,7 @@ nlm_rebind_host(struct nlm_host *host)
217{ 217{
218 dprintk("lockd: rebind host %s\n", host->h_name); 218 dprintk("lockd: rebind host %s\n", host->h_name);
219 if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) { 219 if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) {
220 host->h_rpcclnt->cl_port = 0; 220 rpc_force_rebind(host->h_rpcclnt);
221 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 221 host->h_nextrebind = jiffies + NLM_HOST_REBIND;
222 } 222 }
223} 223}
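
Both rebind paths in host.c stop clearing clnt->cl_port by hand and call rpc_force_rebind() instead, so "forget the cached remote port and rebind on the next call" lives behind one RPC-layer entry point rather than being open-coded in clients. Schematically:

    /* Before: reach into the rpc_clnt and zap the cached port */
    host->h_rpcclnt->cl_port = 0;

    /* After: let the RPC layer invalidate its own binding */
    rpc_force_rebind(host->h_rpcclnt);

Keeping the invalidation inside the RPC layer means lockd no longer depends on how the client caches its bound port.
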
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 2d144abe84a..0edc03e6796 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -123,7 +123,6 @@ nsm_create(void)
123 if (IS_ERR(clnt)) 123 if (IS_ERR(clnt))
124 goto out_err; 124 goto out_err;
125 clnt->cl_softrtry = 1; 125 clnt->cl_softrtry = 1;
126 clnt->cl_chatty = 1;
127 clnt->cl_oneshot = 1; 126 clnt->cl_oneshot = 1;
128 return clnt; 127 return clnt;
129 128
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 12a857c29e2..71a30b416d1 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -178,6 +178,8 @@ lockd(struct svc_rqst *rqstp)
178 178
179 } 179 }
180 180
181 flush_signals(current);
182
181 /* 183 /*
182 * Check whether there's a new lockd process before 184 * Check whether there's a new lockd process before
183 * shutting down the hosts and clearing the slot. 185 * shutting down the hosts and clearing the slot.
@@ -192,8 +194,6 @@ lockd(struct svc_rqst *rqstp)
192 "lockd: new process, skipping host shutdown\n"); 194 "lockd: new process, skipping host shutdown\n");
193 wake_up(&lockd_exit); 195 wake_up(&lockd_exit);
194 196
195 flush_signals(current);
196
197 /* Exit the RPC thread */ 197 /* Exit the RPC thread */
198 svc_exit_thread(rqstp); 198 svc_exit_thread(rqstp);
199 199
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 489670e2176..4063095d849 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -22,7 +22,8 @@
22#define NLMDBG_FACILITY NLMDBG_CLIENT 22#define NLMDBG_FACILITY NLMDBG_CLIENT
23 23
24static u32 nlm4svc_callback(struct svc_rqst *, u32, struct nlm_res *); 24static u32 nlm4svc_callback(struct svc_rqst *, u32, struct nlm_res *);
25static void nlm4svc_callback_exit(struct rpc_task *); 25
26static const struct rpc_call_ops nlm4svc_callback_ops;
26 27
27/* 28/*
28 * Obtain client and file from arguments 29 * Obtain client and file from arguments
@@ -470,7 +471,6 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
470} 471}
471 472
472 473
473
474/* 474/*
475 * This is the generic lockd callback for async RPC calls 475 * This is the generic lockd callback for async RPC calls
476 */ 476 */
@@ -494,7 +494,7 @@ nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
494 call->a_host = host; 494 call->a_host = host;
495 memcpy(&call->a_args, resp, sizeof(*resp)); 495 memcpy(&call->a_args, resp, sizeof(*resp));
496 496
497 if (nlmsvc_async_call(call, proc, nlm4svc_callback_exit) < 0) 497 if (nlmsvc_async_call(call, proc, &nlm4svc_callback_ops) < 0)
498 goto error; 498 goto error;
499 499
500 return rpc_success; 500 return rpc_success;
@@ -504,10 +504,9 @@ nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
504 return rpc_system_err; 504 return rpc_system_err;
505} 505}
506 506
507static void 507static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
508nlm4svc_callback_exit(struct rpc_task *task)
509{ 508{
510 struct nlm_rqst *call = (struct nlm_rqst *) task->tk_calldata; 509 struct nlm_rqst *call = data;
511 510
512 if (task->tk_status < 0) { 511 if (task->tk_status < 0) {
513 dprintk("lockd: %4d callback failed (errno = %d)\n", 512 dprintk("lockd: %4d callback failed (errno = %d)\n",
@@ -517,6 +516,10 @@ nlm4svc_callback_exit(struct rpc_task *task)
517 kfree(call); 516 kfree(call);
518} 517}
519 518
519static const struct rpc_call_ops nlm4svc_callback_ops = {
520 .rpc_call_done = nlm4svc_callback_exit,
521};
522
520/* 523/*
521 * NLM Server procedures. 524 * NLM Server procedures.
522 */ 525 */
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 49f959796b6..9cfced65d4a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -41,7 +41,8 @@
41 41
42static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); 42static void nlmsvc_insert_block(struct nlm_block *block, unsigned long);
43static int nlmsvc_remove_block(struct nlm_block *block); 43static int nlmsvc_remove_block(struct nlm_block *block);
44static void nlmsvc_grant_callback(struct rpc_task *task); 44
45static const struct rpc_call_ops nlmsvc_grant_ops;
45 46
46/* 47/*
47 * The list of blocked locks to retry 48 * The list of blocked locks to retry
@@ -226,31 +227,27 @@ failed:
226 * It is the caller's responsibility to check whether the file 227 * It is the caller's responsibility to check whether the file
227 * can be closed hereafter. 228 * can be closed hereafter.
228 */ 229 */
229static void 230static int
230nlmsvc_delete_block(struct nlm_block *block, int unlock) 231nlmsvc_delete_block(struct nlm_block *block, int unlock)
231{ 232{
232 struct file_lock *fl = &block->b_call.a_args.lock.fl; 233 struct file_lock *fl = &block->b_call.a_args.lock.fl;
233 struct nlm_file *file = block->b_file; 234 struct nlm_file *file = block->b_file;
234 struct nlm_block **bp; 235 struct nlm_block **bp;
236 int status = 0;
235 237
236 dprintk("lockd: deleting block %p...\n", block); 238 dprintk("lockd: deleting block %p...\n", block);
237 239
238 /* Remove block from list */ 240 /* Remove block from list */
239 nlmsvc_remove_block(block); 241 nlmsvc_remove_block(block);
240 if (fl->fl_next) 242 if (unlock)
241 posix_unblock_lock(file->f_file, fl); 243 status = posix_unblock_lock(file->f_file, fl);
242 if (unlock) {
243 fl->fl_type = F_UNLCK;
244 posix_lock_file(file->f_file, fl);
245 block->b_granted = 0;
246 }
247 244
248 /* If the block is in the middle of a GRANT callback, 245 /* If the block is in the middle of a GRANT callback,
249 * don't kill it yet. */ 246 * don't kill it yet. */
250 if (block->b_incall) { 247 if (block->b_incall) {
251 nlmsvc_insert_block(block, NLM_NEVER); 248 nlmsvc_insert_block(block, NLM_NEVER);
252 block->b_done = 1; 249 block->b_done = 1;
253 return; 250 return status;
254 } 251 }
255 252
256 /* Remove block from file's list of blocks */ 253 /* Remove block from file's list of blocks */
@@ -265,6 +262,7 @@ nlmsvc_delete_block(struct nlm_block *block, int unlock)
265 nlm_release_host(block->b_host); 262 nlm_release_host(block->b_host);
266 nlmclnt_freegrantargs(&block->b_call); 263 nlmclnt_freegrantargs(&block->b_call);
267 kfree(block); 264 kfree(block);
265 return status;
268} 266}
269 267
270/* 268/*
@@ -275,6 +273,7 @@ int
275nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action) 273nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action)
276{ 274{
277 struct nlm_block *block, *next; 275 struct nlm_block *block, *next;
276 /* XXX: Will everything get cleaned up if we don't unlock here? */
278 277
279 down(&file->f_sema); 278 down(&file->f_sema);
280 for (block = file->f_blocks; block; block = next) { 279 for (block = file->f_blocks; block; block = next) {
@@ -444,6 +443,7 @@ u32
444nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) 443nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
445{ 444{
446 struct nlm_block *block; 445 struct nlm_block *block;
446 int status = 0;
447 447
448 dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", 448 dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n",
449 file->f_file->f_dentry->d_inode->i_sb->s_id, 449 file->f_file->f_dentry->d_inode->i_sb->s_id,
@@ -454,9 +454,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
454 454
455 down(&file->f_sema); 455 down(&file->f_sema);
456 if ((block = nlmsvc_lookup_block(file, lock, 1)) != NULL) 456 if ((block = nlmsvc_lookup_block(file, lock, 1)) != NULL)
457 nlmsvc_delete_block(block, 1); 457 status = nlmsvc_delete_block(block, 1);
458 up(&file->f_sema); 458 up(&file->f_sema);
459 return nlm_granted; 459 return status ? nlm_lck_denied : nlm_granted;
460} 460}
461 461
462/* 462/*
@@ -562,7 +562,7 @@ callback:
562 /* Call the client */ 562 /* Call the client */
563 nlm_get_host(block->b_call.a_host); 563 nlm_get_host(block->b_call.a_host);
564 if (nlmsvc_async_call(&block->b_call, NLMPROC_GRANTED_MSG, 564 if (nlmsvc_async_call(&block->b_call, NLMPROC_GRANTED_MSG,
565 nlmsvc_grant_callback) < 0) 565 &nlmsvc_grant_ops) < 0)
566 nlm_release_host(block->b_call.a_host); 566 nlm_release_host(block->b_call.a_host);
567 up(&file->f_sema); 567 up(&file->f_sema);
568} 568}
@@ -575,10 +575,9 @@ callback:
575 * chain once more in order to have it removed by lockd itself (which can 575 * chain once more in order to have it removed by lockd itself (which can
576 * then sleep on the file semaphore without disrupting e.g. the nfs client). 576 * then sleep on the file semaphore without disrupting e.g. the nfs client).
577 */ 577 */
578static void 578static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
579nlmsvc_grant_callback(struct rpc_task *task)
580{ 579{
581 struct nlm_rqst *call = (struct nlm_rqst *) task->tk_calldata; 580 struct nlm_rqst *call = data;
582 struct nlm_block *block; 581 struct nlm_block *block;
583 unsigned long timeout; 582 unsigned long timeout;
584 struct sockaddr_in *peer_addr = RPC_PEERADDR(task->tk_client); 583 struct sockaddr_in *peer_addr = RPC_PEERADDR(task->tk_client);
@@ -614,6 +613,10 @@ nlmsvc_grant_callback(struct rpc_task *task)
614 nlm_release_host(call->a_host); 613 nlm_release_host(call->a_host);
615} 614}
616 615
616static const struct rpc_call_ops nlmsvc_grant_ops = {
617 .rpc_call_done = nlmsvc_grant_callback,
618};
619
617/* 620/*
618 * We received a GRANT_RES callback. Try to find the corresponding 621 * We received a GRANT_RES callback. Try to find the corresponding
619 * block. 622 * block.
@@ -633,11 +636,12 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status
633 636
634 file->f_count++; 637 file->f_count++;
635 down(&file->f_sema); 638 down(&file->f_sema);
636 if ((block = nlmsvc_find_block(cookie,&rqstp->rq_addr)) != NULL) { 639 block = nlmsvc_find_block(cookie, &rqstp->rq_addr);
640 if (block) {
637 if (status == NLM_LCK_DENIED_GRACE_PERIOD) { 641 if (status == NLM_LCK_DENIED_GRACE_PERIOD) {
638 /* Try again in a couple of seconds */ 642 /* Try again in a couple of seconds */
639 nlmsvc_insert_block(block, 10 * HZ); 643 nlmsvc_insert_block(block, 10 * HZ);
640 block = NULL; 644 up(&file->f_sema);
641 } else { 645 } else {
642 /* Lock is now held by client, or has been rejected. 646 /* Lock is now held by client, or has been rejected.
643 * In both cases, the block should be removed. */ 647 * In both cases, the block should be removed. */
@@ -648,8 +652,6 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status
648 nlmsvc_delete_block(block, 1); 652 nlmsvc_delete_block(block, 1);
649 } 653 }
650 } 654 }
651 if (!block)
652 up(&file->f_sema);
653 nlm_release_file(file); 655 nlm_release_file(file);
654} 656}
655 657
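
The svclock.c changes and the locks.c changes further down form one chain: posix_unblock_lock() stops quietly re-locking with F_UNLCK when the waiter was already granted and instead reports -ENOENT; nlmsvc_delete_block() forwards that status, and nlmsvc_cancel_blocked() above translates it into the protocol answer. The propagation, condensed from the three hunks:

    /* fs/locks.c: report a lost cancel race instead of papering over it */
    status = posix_unblock_lock(filp, waiter);        /* 0 or -ENOENT */

    /* fs/lockd/svclock.c: the delete path forwards it ... */
    status = nlmsvc_delete_block(block, 1);

    /* ... and the cancel handler turns it into the NLM reply */
    return status ? nlm_lck_denied : nlm_granted;
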
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 757e344cf20..3bc437e0cf5 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -23,7 +23,8 @@
23#define NLMDBG_FACILITY NLMDBG_CLIENT 23#define NLMDBG_FACILITY NLMDBG_CLIENT
24 24
25static u32 nlmsvc_callback(struct svc_rqst *, u32, struct nlm_res *); 25static u32 nlmsvc_callback(struct svc_rqst *, u32, struct nlm_res *);
26static void nlmsvc_callback_exit(struct rpc_task *); 26
27static const struct rpc_call_ops nlmsvc_callback_ops;
27 28
28#ifdef CONFIG_LOCKD_V4 29#ifdef CONFIG_LOCKD_V4
29static u32 30static u32
@@ -518,7 +519,7 @@ nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
518 call->a_host = host; 519 call->a_host = host;
519 memcpy(&call->a_args, resp, sizeof(*resp)); 520 memcpy(&call->a_args, resp, sizeof(*resp));
520 521
521 if (nlmsvc_async_call(call, proc, nlmsvc_callback_exit) < 0) 522 if (nlmsvc_async_call(call, proc, &nlmsvc_callback_ops) < 0)
522 goto error; 523 goto error;
523 524
524 return rpc_success; 525 return rpc_success;
@@ -528,10 +529,9 @@ nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
528 return rpc_system_err; 529 return rpc_system_err;
529} 530}
530 531
531static void 532static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
532nlmsvc_callback_exit(struct rpc_task *task)
533{ 533{
534 struct nlm_rqst *call = (struct nlm_rqst *) task->tk_calldata; 534 struct nlm_rqst *call = data;
535 535
536 if (task->tk_status < 0) { 536 if (task->tk_status < 0) {
537 dprintk("lockd: %4d callback failed (errno = %d)\n", 537 dprintk("lockd: %4d callback failed (errno = %d)\n",
@@ -541,6 +541,10 @@ nlmsvc_callback_exit(struct rpc_task *task)
541 kfree(call); 541 kfree(call);
542} 542}
543 543
544static const struct rpc_call_ops nlmsvc_callback_ops = {
545 .rpc_call_done = nlmsvc_callback_exit,
546};
547
544/* 548/*
545 * NLM Server procedures. 549 * NLM Server procedures.
546 */ 550 */
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index f01e9c0d267..200fbda2c6d 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -44,7 +44,7 @@ loff_t_to_s32(loff_t offset)
44/* 44/*
45 * XDR functions for basic NLM types 45 * XDR functions for basic NLM types
46 */ 46 */
47static inline u32 *nlm_decode_cookie(u32 *p, struct nlm_cookie *c) 47static u32 *nlm_decode_cookie(u32 *p, struct nlm_cookie *c)
48{ 48{
49 unsigned int len; 49 unsigned int len;
50 50
@@ -79,7 +79,7 @@ nlm_encode_cookie(u32 *p, struct nlm_cookie *c)
79 return p; 79 return p;
80} 80}
81 81
82static inline u32 * 82static u32 *
83nlm_decode_fh(u32 *p, struct nfs_fh *f) 83nlm_decode_fh(u32 *p, struct nfs_fh *f)
84{ 84{
85 unsigned int len; 85 unsigned int len;
@@ -119,7 +119,7 @@ nlm_encode_oh(u32 *p, struct xdr_netobj *oh)
119 return xdr_encode_netobj(p, oh); 119 return xdr_encode_netobj(p, oh);
120} 120}
121 121
122static inline u32 * 122static u32 *
123nlm_decode_lock(u32 *p, struct nlm_lock *lock) 123nlm_decode_lock(u32 *p, struct nlm_lock *lock)
124{ 124{
125 struct file_lock *fl = &lock->fl; 125 struct file_lock *fl = &lock->fl;
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index ae4d6b426c6..fdcf105a530 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -354,7 +354,9 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
354 return 0; 354 return 0;
355 argp->state = ntohl(*p++); 355 argp->state = ntohl(*p++);
356 /* Preserve the address in network byte order */ 356 /* Preserve the address in network byte order */
357 argp->addr = *p++; 357 argp->addr = *p++;
358 argp->vers = *p++;
359 argp->proto = *p++;
358 return xdr_argsize_check(rqstp, p); 360 return xdr_argsize_check(rqstp, p);
359} 361}
360 362
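
nlm4svc_decode_reboot() now pulls two additional 32-bit words out of the reboot notification. Like the address, they are stored exactly as received, in network byte order; only the state counter, which is used arithmetically, goes through ntohl(). Presumably vers and proto let lockd match the notifying peer against its host records more precisely, though the consumers are outside this diff. After the hunk the decode reads:

    argp->state = ntohl(*p++);   /* host order: compared as a number */
    argp->addr  = *p++;          /* network order, preserved as received */
    argp->vers  = *p++;          /* new in this patch */
    argp->proto = *p++;          /* new in this patch */
    return xdr_argsize_check(rqstp, p);
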
diff --git a/fs/locks.c b/fs/locks.c
index a1e8b224801..909eab8fb1d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -154,7 +154,7 @@ static struct file_lock *locks_alloc_lock(void)
154} 154}
155 155
156/* Free a lock which is not in use. */ 156/* Free a lock which is not in use. */
157static inline void locks_free_lock(struct file_lock *fl) 157static void locks_free_lock(struct file_lock *fl)
158{ 158{
159 if (fl == NULL) { 159 if (fl == NULL) {
160 BUG(); 160 BUG();
@@ -475,8 +475,7 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
475/* 475/*
476 * Check whether two locks have the same owner. 476 * Check whether two locks have the same owner.
477 */ 477 */
478static inline int 478static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
479posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
480{ 479{
481 if (fl1->fl_lmops && fl1->fl_lmops->fl_compare_owner) 480 if (fl1->fl_lmops && fl1->fl_lmops->fl_compare_owner)
482 return fl2->fl_lmops == fl1->fl_lmops && 481 return fl2->fl_lmops == fl1->fl_lmops &&
@@ -487,7 +486,7 @@ posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
487/* Remove waiter from blocker's block list. 486/* Remove waiter from blocker's block list.
488 * When blocker ends up pointing to itself then the list is empty. 487 * When blocker ends up pointing to itself then the list is empty.
489 */ 488 */
490static inline void __locks_delete_block(struct file_lock *waiter) 489static void __locks_delete_block(struct file_lock *waiter)
491{ 490{
492 list_del_init(&waiter->fl_block); 491 list_del_init(&waiter->fl_block);
493 list_del_init(&waiter->fl_link); 492 list_del_init(&waiter->fl_link);
@@ -1105,7 +1104,6 @@ static void time_out_leases(struct inode *inode)
1105 before = &fl->fl_next; 1104 before = &fl->fl_next;
1106 continue; 1105 continue;
1107 } 1106 }
1108 printk(KERN_INFO "lease broken - owner pid = %d\n", fl->fl_pid);
1109 lease_modify(before, fl->fl_type & ~F_INPROGRESS); 1107 lease_modify(before, fl->fl_type & ~F_INPROGRESS);
1110 if (fl == *before) /* lease_modify may have freed fl */ 1108 if (fl == *before) /* lease_modify may have freed fl */
1111 before = &fl->fl_next; 1109 before = &fl->fl_next;
@@ -1430,7 +1428,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1430 lock_kernel(); 1428 lock_kernel();
1431 1429
1432 error = __setlease(filp, arg, &flp); 1430 error = __setlease(filp, arg, &flp);
1433 if (error) 1431 if (error || arg == F_UNLCK)
1434 goto out_unlock; 1432 goto out_unlock;
1435 1433
1436 error = fasync_helper(fd, filp, 1, &flp->fl_fasync); 1434 error = fasync_helper(fd, filp, 1, &flp->fl_fasync);
@@ -1959,22 +1957,18 @@ EXPORT_SYMBOL(posix_block_lock);
1959 * 1957 *
1960 * lockd needs to block waiting for locks. 1958 * lockd needs to block waiting for locks.
1961 */ 1959 */
1962void 1960int
1963posix_unblock_lock(struct file *filp, struct file_lock *waiter) 1961posix_unblock_lock(struct file *filp, struct file_lock *waiter)
1964{ 1962{
1965 /* 1963 int status = 0;
1966 * A remote machine may cancel the lock request after it's been 1964
1967 * granted locally. If that happens, we need to delete the lock.
1968 */
1969 lock_kernel(); 1965 lock_kernel();
1970 if (waiter->fl_next) { 1966 if (waiter->fl_next)
1971 __locks_delete_block(waiter); 1967 __locks_delete_block(waiter);
1972 unlock_kernel(); 1968 else
1973 } else { 1969 status = -ENOENT;
1974 unlock_kernel(); 1970 unlock_kernel();
1975 waiter->fl_type = F_UNLCK; 1971 return status;
1976 posix_lock_file(filp, waiter);
1977 }
1978} 1972}
1979 1973
1980EXPORT_SYMBOL(posix_unblock_lock); 1974EXPORT_SYMBOL(posix_unblock_lock);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 0f1e4530670..f5bbe4c97c5 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -126,7 +126,7 @@ __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
126} 126}
127 127
128 128
129static inline void 129static void
130__mb_cache_entry_unhash(struct mb_cache_entry *ce) 130__mb_cache_entry_unhash(struct mb_cache_entry *ce)
131{ 131{
132 int n; 132 int n;
@@ -139,7 +139,7 @@ __mb_cache_entry_unhash(struct mb_cache_entry *ce)
139} 139}
140 140
141 141
142static inline void 142static void
143__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) 143__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
144{ 144{
145 struct mb_cache *cache = ce->e_cache; 145 struct mb_cache *cache = ce->e_cache;
@@ -158,7 +158,7 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
158} 158}
159 159
160 160
161static inline void 161static void
162__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) 162__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
163{ 163{
164 /* Wake up all processes queuing for this cache entry. */ 164 /* Wake up all processes queuing for this cache entry. */
diff --git a/fs/mpage.c b/fs/mpage.c
index c5adcdddf3c..e431cb3878d 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -184,7 +184,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
     if (page_has_buffers(page))
         goto confused;
 
-    block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
+    block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
     last_block = (i_size_read(inode) + blocksize - 1) >> blkbits;
 
     bh.b_page = page;
@@ -466,7 +466,7 @@ __mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
      * The page has no buffers: map it to disk
      */
     BUG_ON(!PageUptodate(page));
-    block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
+    block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
     last_block = (i_size - 1) >> blkbits;
     map_bh.b_page = page;
     for (page_block = 0; page_block < blocks_per_page; ) {
@@ -721,7 +721,7 @@ retry:
                     &last_block_in_bio, &ret, wbc,
                     page->mapping->a_ops->writepage);
         }
-        if (unlikely(ret == WRITEPAGE_ACTIVATE))
+        if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
             unlock_page(page);
         if (ret || (--(wbc->nr_to_write) <= 0))
             done = 1;
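[Editor's note] Both mpage hunks widen page->index before the shift: on 32-bit kernels pgoff_t is a 32-bit unsigned long, so `page->index << (PAGE_CACHE_SHIFT - blkbits)` overflows for pages far enough into a large file and maps the wrong disk block, while casting to sector_t (64-bit with large-block-device support) first keeps the full block number. A standalone demo of the overflow; types and values are illustrative, not kernel code:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t index = 0x40000000;  /* page index ~4 TiB into a sparse file */
        unsigned shift = 3;           /* e.g. PAGE_CACHE_SHIFT 12 - blkbits 9 */

        uint32_t truncated = index << shift;           /* old expression */
        uint64_t widened = (uint64_t)index << shift;   /* patched expression */

        printf("truncated=0x%08x widened=0x%llx\n",
               truncated, (unsigned long long)widened);
        /* prints: truncated=0x00000000 widened=0x200000000 */
        return 0;
    }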
diff --git a/fs/namei.c b/fs/namei.c
index 6dbbd42d8b9..4acdac043b6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -28,7 +28,10 @@
 #include <linux/syscalls.h>
 #include <linux/mount.h>
 #include <linux/audit.h>
+#include <linux/capability.h>
 #include <linux/file.h>
+#include <linux/fcntl.h>
+#include <linux/namei.h>
 #include <asm/namei.h>
 #include <asm/uaccess.h>
 
@@ -112,7 +115,7 @@
  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
  * PATH_MAX includes the nul terminator --RR.
  */
-static inline int do_getname(const char __user *filename, char *page)
+static int do_getname(const char __user *filename, char *page)
 {
     int retval;
     unsigned long len = PATH_MAX;
@@ -395,7 +398,7 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
  * short-cut DAC fails, then call permission() to do more
  * complete permission check.
  */
-static inline int exec_permission_lite(struct inode *inode,
+static int exec_permission_lite(struct inode *inode,
                        struct nameidata *nd)
 {
     umode_t mode = inode->i_mode;
@@ -438,7 +441,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
     struct dentry * result;
     struct inode *dir = parent->d_inode;
 
-    down(&dir->i_sem);
+    mutex_lock(&dir->i_mutex);
     /*
      * First re-do the cached lookup just in case it was created
      * while we waited for the directory semaphore..
@@ -464,7 +467,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
         else
             result = dentry;
     }
-    up(&dir->i_sem);
+    mutex_unlock(&dir->i_mutex);
     return result;
 }
 
@@ -472,7 +475,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
      * Uhhuh! Nasty case: the cache was re-populated while
      * we waited on the semaphore. Need to revalidate.
      */
-    up(&dir->i_sem);
+    mutex_unlock(&dir->i_mutex);
     if (result->d_op && result->d_op->d_revalidate) {
         if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
             dput(result);
@@ -485,7 +488,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
 static int __emul_lookup_dentry(const char *, struct nameidata *);
 
 /* SMP-safe */
-static inline int
+static __always_inline int
 walk_init_root(const char *name, struct nameidata *nd)
 {
     read_lock(&current->fs->lock);
@@ -503,7 +506,7 @@ walk_init_root(const char *name, struct nameidata *nd)
     return 1;
 }
 
-static inline int __vfs_follow_link(struct nameidata *nd, const char *link)
+static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
     int res = 0;
     char *name;
@@ -543,7 +546,7 @@ struct path {
     struct dentry *dentry;
 };
 
-static inline int __do_follow_link(struct path *path, struct nameidata *nd)
+static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
 {
     int error;
     void *cookie;
@@ -689,7 +692,7 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
     return 0;
 }
 
-static inline void follow_dotdot(struct nameidata *nd)
+static __always_inline void follow_dotdot(struct nameidata *nd)
 {
     while(1) {
         struct vfsmount *parent;
@@ -1062,7 +1065,8 @@ set_it:
 }
 
 /* Returns 0 and nd will be valid on success; Returns error, otherwise. */
-int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
+static int fastcall do_path_lookup(int dfd, const char *name,
+                   unsigned int flags, struct nameidata *nd)
 {
     int retval = 0;
 
@@ -1082,9 +1086,38 @@ int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata
         }
         nd->mnt = mntget(current->fs->rootmnt);
         nd->dentry = dget(current->fs->root);
-    } else {
+    } else if (dfd == AT_FDCWD) {
         nd->mnt = mntget(current->fs->pwdmnt);
         nd->dentry = dget(current->fs->pwd);
+    } else {
+        struct file *file;
+        int fput_needed;
+        struct dentry *dentry;
+
+        file = fget_light(dfd, &fput_needed);
+        if (!file) {
+            retval = -EBADF;
+            goto out_fail;
+        }
+
+        dentry = file->f_dentry;
+
+        if (!S_ISDIR(dentry->d_inode->i_mode)) {
+            retval = -ENOTDIR;
+            fput_light(file, fput_needed);
+            goto out_fail;
+        }
+
+        retval = file_permission(file, MAY_EXEC);
+        if (retval) {
+            fput_light(file, fput_needed);
+            goto out_fail;
+        }
+
+        nd->mnt = mntget(file->f_vfsmnt);
+        nd->dentry = dget(dentry);
+
+        fput_light(file, fput_needed);
     }
     read_unlock(&current->fs->lock);
     current->total_link_count = 0;
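[Editor's note] The new dfd argument is the kernel half of the *at() syscall family: AT_FDCWD keeps the old cwd-relative behaviour, while any other descriptor is validated (it must refer to a directory the caller may search) and used as the lookup starting point. A userspace sketch of the behaviour this enables; the interfaces are the standard *at() calls, but the paths and names here are illustrative:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* The fd pins the directory, so lookups through it are immune
         * to concurrent renames of its path. */
        int dirfd = open("/tmp", O_RDONLY | O_DIRECTORY);
        if (dirfd < 0) { perror("open"); return 1; }

        /* Resolved relative to dirfd, not the current directory. */
        int fd = openat(dirfd, "scratch.txt", O_CREAT | O_WRONLY, 0644);
        if (fd < 0) { perror("openat"); close(dirfd); return 1; }
        close(fd);

        if (unlinkat(dirfd, "scratch.txt", 0) < 0)
            perror("unlinkat");
        close(dirfd);
        return 0;
    }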
@@ -1093,11 +1126,19 @@ out:
     if (unlikely(current->audit_context
             && nd && nd->dentry && nd->dentry->d_inode))
         audit_inode(name, nd->dentry->d_inode, flags);
+out_fail:
     return retval;
 }
 
-static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags,
-        struct nameidata *nd, int open_flags, int create_mode)
+int fastcall path_lookup(const char *name, unsigned int flags,
+            struct nameidata *nd)
+{
+    return do_path_lookup(AT_FDCWD, name, flags, nd);
+}
+
+static int __path_lookup_intent_open(int dfd, const char *name,
+        unsigned int lookup_flags, struct nameidata *nd,
+        int open_flags, int create_mode)
 {
     struct file *filp = get_empty_filp();
     int err;
@@ -1107,7 +1148,7 @@ static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags
     nd->intent.open.file = filp;
     nd->intent.open.flags = open_flags;
     nd->intent.open.create_mode = create_mode;
-    err = path_lookup(name, lookup_flags|LOOKUP_OPEN, nd);
+    err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
     if (IS_ERR(nd->intent.open.file)) {
         if (err == 0) {
             err = PTR_ERR(nd->intent.open.file);
@@ -1125,10 +1166,10 @@ static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags
  * @nd: pointer to nameidata
  * @open_flags: open intent flags
  */
-int path_lookup_open(const char *name, unsigned int lookup_flags,
+int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
         struct nameidata *nd, int open_flags)
 {
-    return __path_lookup_intent_open(name, lookup_flags, nd,
+    return __path_lookup_intent_open(dfd, name, lookup_flags, nd,
             open_flags, 0);
 }
 
@@ -1140,12 +1181,12 @@ int path_lookup_open(const char *name, unsigned int lookup_flags,
  * @open_flags: open intent flags
  * @create_mode: create intent flags
  */
-static int path_lookup_create(const char *name, unsigned int lookup_flags,
-        struct nameidata *nd, int open_flags,
-        int create_mode)
+static int path_lookup_create(int dfd, const char *name,
+        unsigned int lookup_flags, struct nameidata *nd,
+        int open_flags, int create_mode)
 {
-    return __path_lookup_intent_open(name, lookup_flags|LOOKUP_CREATE, nd,
-            open_flags, create_mode);
+    return __path_lookup_intent_open(dfd, name, lookup_flags|LOOKUP_CREATE,
+            nd, open_flags, create_mode);
 }
 
 int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
@@ -1155,7 +1196,7 @@ int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
     int err = PTR_ERR(tmp);
 
     if (!IS_ERR(tmp)) {
-        err = __path_lookup_intent_open(tmp, lookup_flags, nd, open_flags, 0);
+        err = __path_lookup_intent_open(AT_FDCWD, tmp, lookup_flags, nd, open_flags, 0);
         putname(tmp);
     }
     return err;
@@ -1247,18 +1288,24 @@ access:
  * that namei follows links, while lnamei does not.
  * SMP-safe
  */
-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
+int fastcall __user_walk_fd(int dfd, const char __user *name, unsigned flags,
+            struct nameidata *nd)
 {
     char *tmp = getname(name);
     int err = PTR_ERR(tmp);
 
     if (!IS_ERR(tmp)) {
-        err = path_lookup(tmp, flags, nd);
+        err = do_path_lookup(dfd, tmp, flags, nd);
         putname(tmp);
     }
     return err;
 }
 
+int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
+{
+    return __user_walk_fd(AT_FDCWD, name, flags, nd);
+}
+
 /*
  * It's inline, so penalty for filesystems that don't use sticky bit is
  * minimal.
@@ -1293,7 +1340,7 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
  *     nfs_async_unlink().
  */
-static inline int may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 {
     int error;
 
@@ -1366,7 +1413,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
     struct dentry *p;
 
     if (p1 == p2) {
-        down(&p1->d_inode->i_sem);
+        mutex_lock(&p1->d_inode->i_mutex);
         return NULL;
     }
 
@@ -1374,30 +1421,30 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 
     for (p = p1; p->d_parent != p; p = p->d_parent) {
         if (p->d_parent == p2) {
-            down(&p2->d_inode->i_sem);
-            down(&p1->d_inode->i_sem);
+            mutex_lock(&p2->d_inode->i_mutex);
+            mutex_lock(&p1->d_inode->i_mutex);
             return p;
         }
     }
 
     for (p = p2; p->d_parent != p; p = p->d_parent) {
         if (p->d_parent == p1) {
-            down(&p1->d_inode->i_sem);
-            down(&p2->d_inode->i_sem);
+            mutex_lock(&p1->d_inode->i_mutex);
+            mutex_lock(&p2->d_inode->i_mutex);
             return p;
         }
     }
 
-    down(&p1->d_inode->i_sem);
-    down(&p2->d_inode->i_sem);
+    mutex_lock(&p1->d_inode->i_mutex);
+    mutex_lock(&p2->d_inode->i_mutex);
     return NULL;
 }
 
 void unlock_rename(struct dentry *p1, struct dentry *p2)
 {
-    up(&p1->d_inode->i_sem);
+    mutex_unlock(&p1->d_inode->i_mutex);
     if (p1 != p2) {
-        up(&p2->d_inode->i_sem);
+        mutex_unlock(&p2->d_inode->i_mutex);
         up(&p1->d_inode->i_sb->s_vfs_rename_sem);
     }
 }
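[Editor's note] The i_sem to i_mutex switch leaves the lock_rename() protocol intact: under s_vfs_rename_sem it takes the ancestor's i_mutex first when one parent contains the other, otherwise a fixed p1-then-p2 order, and returns the "trap" dentry a caller must refuse to rename onto. A sketch of the pairing a caller uses, modelled on do_rename() later in this file (error handling trimmed, names illustrative):

    struct dentry *trap;

    trap = lock_rename(new_parent, old_parent);
    /* look up old_dentry and new_dentry here; if either equals
     * 'trap', fail with -EINVAL rather than rename a directory
     * into its own subtree */
    unlock_rename(new_parent, old_parent);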
@@ -1491,7 +1538,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
     if (!error) {
         DQUOT_INIT(inode);
 
-        error = do_truncate(dentry, 0, NULL);
+        error = do_truncate(dentry, 0, ATTR_MTIME|ATTR_CTIME, NULL);
     }
     put_write_access(inode);
     if (error)
@@ -1517,7 +1564,8 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
  * for symlinks (where the permissions are checked later).
  * SMP-safe
  */
-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
+int open_namei(int dfd, const char *pathname, int flag,
+        int mode, struct nameidata *nd)
 {
     int acc_mode, error;
     struct path path;
@@ -1539,7 +1587,8 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
      * The simplest case - just a plain lookup.
      */
     if (!(flag & O_CREAT)) {
-        error = path_lookup_open(pathname, lookup_flags(flag), nd, flag);
+        error = path_lookup_open(dfd, pathname, lookup_flags(flag),
+                    nd, flag);
         if (error)
             return error;
         goto ok;
@@ -1548,7 +1597,7 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
     /*
      * Create - we need to know the parent.
      */
-    error = path_lookup_create(pathname, LOOKUP_PARENT, nd, flag, mode);
+    error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
     if (error)
         return error;
 
@@ -1563,14 +1612,14 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
 
     dir = nd->dentry;
     nd->flags &= ~LOOKUP_PARENT;
-    down(&dir->d_inode->i_sem);
+    mutex_lock(&dir->d_inode->i_mutex);
     path.dentry = lookup_hash(nd);
     path.mnt = nd->mnt;
 
 do_last:
     error = PTR_ERR(path.dentry);
     if (IS_ERR(path.dentry)) {
-        up(&dir->d_inode->i_sem);
+        mutex_unlock(&dir->d_inode->i_mutex);
         goto exit;
     }
 
@@ -1579,7 +1628,7 @@ do_last:
     if (!IS_POSIXACL(dir->d_inode))
         mode &= ~current->fs->umask;
     error = vfs_create(dir->d_inode, path.dentry, mode, nd);
-    up(&dir->d_inode->i_sem);
+    mutex_unlock(&dir->d_inode->i_mutex);
     dput(nd->dentry);
     nd->dentry = path.dentry;
     if (error)
@@ -1593,7 +1642,7 @@ do_last:
     /*
      * It already exists.
      */
-    up(&dir->d_inode->i_sem);
+    mutex_unlock(&dir->d_inode->i_mutex);
 
     error = -EEXIST;
     if (flag & O_EXCL)
@@ -1665,7 +1714,7 @@ do_link:
         goto exit;
     }
     dir = nd->dentry;
-    down(&dir->d_inode->i_sem);
+    mutex_lock(&dir->d_inode->i_mutex);
     path.dentry = lookup_hash(nd);
     path.mnt = nd->mnt;
     __putname(nd->last.name);
@@ -1680,13 +1729,13 @@ do_link:
  * Simple function to lookup and return a dentry and create it
  * if it doesn't exist. Is SMP-safe.
  *
- * Returns with nd->dentry->d_inode->i_sem locked.
+ * Returns with nd->dentry->d_inode->i_mutex locked.
  */
 struct dentry *lookup_create(struct nameidata *nd, int is_dir)
 {
     struct dentry *dentry = ERR_PTR(-EEXIST);
 
-    down(&nd->dentry->d_inode->i_sem);
+    mutex_lock(&nd->dentry->d_inode->i_mutex);
     /*
      * Yucky last component or no last component at all?
      * (foo/., foo/.., /////)
@@ -1743,7 +1792,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
     return error;
 }
 
-asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev)
+asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
+                unsigned dev)
 {
     int error = 0;
     char * tmp;
@@ -1756,7 +1806,7 @@ asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev)
     if (IS_ERR(tmp))
         return PTR_ERR(tmp);
 
-    error = path_lookup(tmp, LOOKUP_PARENT, &nd);
+    error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
     if (error)
         goto out;
     dentry = lookup_create(&nd, 0);
@@ -1784,7 +1834,7 @@ asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev)
         }
         dput(dentry);
     }
-    up(&nd.dentry->d_inode->i_sem);
+    mutex_unlock(&nd.dentry->d_inode->i_mutex);
     path_release(&nd);
 out:
     putname(tmp);
@@ -1792,6 +1842,11 @@ out:
     return error;
 }
 
+asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
+{
+    return sys_mknodat(AT_FDCWD, filename, mode, dev);
+}
+
 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
     int error = may_create(dir, dentry, NULL);
@@ -1814,7 +1869,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
     return error;
 }
 
-asmlinkage long sys_mkdir(const char __user * pathname, int mode)
+asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
 {
     int error = 0;
     char * tmp;
@@ -1825,7 +1880,7 @@ asmlinkage long sys_mkdir(const char __user * pathname, int mode)
     struct dentry *dentry;
     struct nameidata nd;
 
-    error = path_lookup(tmp, LOOKUP_PARENT, &nd);
+    error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
     if (error)
         goto out;
     dentry = lookup_create(&nd, 1);
@@ -1836,7 +1891,7 @@ asmlinkage long sys_mkdir(const char __user * pathname, int mode)
         error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
         dput(dentry);
     }
-    up(&nd.dentry->d_inode->i_sem);
+    mutex_unlock(&nd.dentry->d_inode->i_mutex);
     path_release(&nd);
 out:
     putname(tmp);
@@ -1845,6 +1900,11 @@ out:
     return error;
 }
 
+asmlinkage long sys_mkdir(const char __user *pathname, int mode)
+{
+    return sys_mkdirat(AT_FDCWD, pathname, mode);
+}
+
 /*
  * We try to drop the dentry early: we should have
  * a usage count of 2 if we're the only user of this
@@ -1885,7 +1945,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 
     DQUOT_INIT(dir);
 
-    down(&dentry->d_inode->i_sem);
+    mutex_lock(&dentry->d_inode->i_mutex);
     dentry_unhash(dentry);
     if (d_mountpoint(dentry))
         error = -EBUSY;
@@ -1897,7 +1957,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
             dentry->d_inode->i_flags |= S_DEAD;
         }
     }
-    up(&dentry->d_inode->i_sem);
+    mutex_unlock(&dentry->d_inode->i_mutex);
     if (!error) {
         d_delete(dentry);
     }
@@ -1906,7 +1966,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
     return error;
 }
 
-asmlinkage long sys_rmdir(const char __user * pathname)
+static long do_rmdir(int dfd, const char __user *pathname)
 {
     int error = 0;
     char * name;
@@ -1917,7 +1977,7 @@ asmlinkage long sys_rmdir(const char __user * pathname)
     if(IS_ERR(name))
         return PTR_ERR(name);
 
-    error = path_lookup(name, LOOKUP_PARENT, &nd);
+    error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
     if (error)
         goto exit;
 
@@ -1932,14 +1992,14 @@ asmlinkage long sys_rmdir(const char __user * pathname)
         error = -EBUSY;
         goto exit1;
     }
-    down(&nd.dentry->d_inode->i_sem);
+    mutex_lock(&nd.dentry->d_inode->i_mutex);
     dentry = lookup_hash(&nd);
     error = PTR_ERR(dentry);
     if (!IS_ERR(dentry)) {
         error = vfs_rmdir(nd.dentry->d_inode, dentry);
         dput(dentry);
     }
-    up(&nd.dentry->d_inode->i_sem);
+    mutex_unlock(&nd.dentry->d_inode->i_mutex);
 exit1:
     path_release(&nd);
 exit:
@@ -1947,6 +2007,11 @@ exit:
     return error;
 }
 
+asmlinkage long sys_rmdir(const char __user *pathname)
+{
+    return do_rmdir(AT_FDCWD, pathname);
+}
+
 int vfs_unlink(struct inode *dir, struct dentry *dentry)
 {
     int error = may_delete(dir, dentry, 0);
@@ -1959,7 +2024,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 
     DQUOT_INIT(dir);
 
-    down(&dentry->d_inode->i_sem);
+    mutex_lock(&dentry->d_inode->i_mutex);
     if (d_mountpoint(dentry))
         error = -EBUSY;
     else {
@@ -1967,7 +2032,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
         if (!error)
             error = dir->i_op->unlink(dir, dentry);
     }
-    up(&dentry->d_inode->i_sem);
+    mutex_unlock(&dentry->d_inode->i_mutex);
 
     /* We don't d_delete() NFS sillyrenamed files--they still exist. */
     if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
@@ -1979,11 +2044,11 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 
 /*
  * Make sure that the actual truncation of the file will occur outside its
- * directory's i_sem.  Truncate can take a long time if there is a lot of
+ * directory's i_mutex.  Truncate can take a long time if there is a lot of
  * writeout happening, and we don't want to prevent access to the directory
  * while waiting on the I/O.
  */
-asmlinkage long sys_unlink(const char __user * pathname)
+static long do_unlinkat(int dfd, const char __user *pathname)
 {
     int error = 0;
     char * name;
@@ -1995,13 +2060,13 @@ asmlinkage long sys_unlink(const char __user * pathname)
     if(IS_ERR(name))
         return PTR_ERR(name);
 
-    error = path_lookup(name, LOOKUP_PARENT, &nd);
+    error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
     if (error)
         goto exit;
     error = -EISDIR;
     if (nd.last_type != LAST_NORM)
         goto exit1;
-    down(&nd.dentry->d_inode->i_sem);
+    mutex_lock(&nd.dentry->d_inode->i_mutex);
     dentry = lookup_hash(&nd);
     error = PTR_ERR(dentry);
     if (!IS_ERR(dentry)) {
@@ -2015,7 +2080,7 @@ asmlinkage long sys_unlink(const char __user * pathname)
 exit2:
         dput(dentry);
     }
-    up(&nd.dentry->d_inode->i_sem);
+    mutex_unlock(&nd.dentry->d_inode->i_mutex);
     if (inode)
         iput(inode);    /* truncate the inode here */
 exit1:
@@ -2030,6 +2095,22 @@ slashes:
     goto exit2;
 }
 
+asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
+{
+    if ((flag & ~AT_REMOVEDIR) != 0)
+        return -EINVAL;
+
+    if (flag & AT_REMOVEDIR)
+        return do_rmdir(dfd, pathname);
+
+    return do_unlinkat(dfd, pathname);
+}
+
+asmlinkage long sys_unlink(const char __user *pathname)
+{
+    return do_unlinkat(AT_FDCWD, pathname);
+}
+
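[Editor's note] sys_unlinkat() multiplexes both removal operations behind one entry point: AT_REMOVEDIR dispatches to do_rmdir(), any other flag bit is rejected with -EINVAL, and the bare call unlinks a file. From userspace the same split looks like this (real interface; wrapper name illustrative):

    #include <fcntl.h>
    #include <unistd.h>

    int remove_at(int dirfd, const char *name, int is_dir)
    {
        /* AT_REMOVEDIR makes unlinkat behave like rmdir */
        return unlinkat(dirfd, name, is_dir ? AT_REMOVEDIR : 0);
    }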
 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode)
 {
     int error = may_create(dir, dentry, NULL);
@@ -2051,7 +2132,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i
     return error;
 }
 
-asmlinkage long sys_symlink(const char __user * oldname, const char __user * newname)
+asmlinkage long sys_symlinkat(const char __user *oldname,
+                int newdfd, const char __user *newname)
 {
     int error = 0;
     char * from;
@@ -2066,7 +2148,7 @@ asmlinkage long sys_symlink(const char __user * oldname, const char __user * new
     struct dentry *dentry;
     struct nameidata nd;
 
-    error = path_lookup(to, LOOKUP_PARENT, &nd);
+    error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
     if (error)
         goto out;
     dentry = lookup_create(&nd, 0);
@@ -2075,7 +2157,7 @@ asmlinkage long sys_symlink(const char __user * oldname, const char __user * new
         error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO);
         dput(dentry);
     }
-    up(&nd.dentry->d_inode->i_sem);
+    mutex_unlock(&nd.dentry->d_inode->i_mutex);
     path_release(&nd);
 out:
     putname(to);
@@ -2084,6 +2166,11 @@ out:
     return error;
 }
 
+asmlinkage long sys_symlink(const char __user *oldname, const char __user *newname)
+{
+    return sys_symlinkat(oldname, AT_FDCWD, newname);
+}
+
 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
 {
     struct inode *inode = old_dentry->d_inode;
@@ -2113,10 +2200,10 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
     if (error)
         return error;
 
-    down(&old_dentry->d_inode->i_sem);
+    mutex_lock(&old_dentry->d_inode->i_mutex);
     DQUOT_INIT(dir);
     error = dir->i_op->link(old_dentry, dir, new_dentry);
-    up(&old_dentry->d_inode->i_sem);
+    mutex_unlock(&old_dentry->d_inode->i_mutex);
     if (!error)
         fsnotify_create(dir, new_dentry->d_name.name);
     return error;
@@ -2131,7 +2218,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
  * with linux 2.0, and to avoid hard-linking to directories
  * and other special files. --ADM
  */
-asmlinkage long sys_link(const char __user * oldname, const char __user * newname)
+asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
+               int newdfd, const char __user *newname)
 {
     struct dentry *new_dentry;
     struct nameidata nd, old_nd;
@@ -2142,10 +2230,10 @@ asmlinkage long sys_link(const char __user * oldname, const char __user * newnam
     if (IS_ERR(to))
         return PTR_ERR(to);
 
-    error = __user_walk(oldname, 0, &old_nd);
+    error = __user_walk_fd(olddfd, oldname, 0, &old_nd);
     if (error)
         goto exit;
-    error = path_lookup(to, LOOKUP_PARENT, &nd);
+    error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
     if (error)
         goto out;
     error = -EXDEV;
@@ -2157,7 +2245,7 @@ asmlinkage long sys_link(const char __user * oldname, const char __user * newnam
         error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
         dput(new_dentry);
     }
-    up(&nd.dentry->d_inode->i_sem);
+    mutex_unlock(&nd.dentry->d_inode->i_mutex);
 out_release:
     path_release(&nd);
 out:
@@ -2168,6 +2256,11 @@ exit:
     return error;
 }
 
+asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
+{
+    return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname);
+}
+
 /*
  * The worst of all namespace operations - renaming directory. "Perverted"
  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
@@ -2178,7 +2271,7 @@ exit:
  *    sb->s_vfs_rename_sem. We might be more accurate, but that's another
  *    story.
  * c) we have to lock _three_ objects - parents and victim (if it exists).
- *    And that - after we got ->i_sem on parents (until then we don't know
+ *    And that - after we got ->i_mutex on parents (until then we don't know
  *    whether the target exists).  Solution: try to be smart with locking
  *    order for inodes.  We rely on the fact that tree topology may change
  *    only under ->s_vfs_rename_sem _and_ that parent of the object we
@@ -2195,9 +2288,9 @@ exit:
  *    stuff into VFS), but the former is not going away. Solution: the same
  *    trick as in rmdir().
  * e) conversion from fhandle to dentry may come in the wrong moment - when
- *    we are removing the target. Solution: we will have to grab ->i_sem
+ *    we are removing the target. Solution: we will have to grab ->i_mutex
  *    in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
- *    ->i_sem on parents, which works but leads to some truly excessive
+ *    ->i_mutex on parents, which works but leads to some truly excessive
  *    locking].
  */
 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
@@ -2222,7 +2315,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 
     target = new_dentry->d_inode;
     if (target) {
-        down(&target->i_sem);
+        mutex_lock(&target->i_mutex);
         dentry_unhash(new_dentry);
     }
     if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
@@ -2232,7 +2325,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
     if (target) {
         if (!error)
             target->i_flags |= S_DEAD;
-        up(&target->i_sem);
+        mutex_unlock(&target->i_mutex);
         if (d_unhashed(new_dentry))
             d_rehash(new_dentry);
         dput(new_dentry);
@@ -2255,7 +2348,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
     dget(new_dentry);
     target = new_dentry->d_inode;
     if (target)
-        down(&target->i_sem);
+        mutex_lock(&target->i_mutex);
     if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
         error = -EBUSY;
     else
@@ -2266,7 +2359,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
         d_move(old_dentry, new_dentry);
     }
     if (target)
-        up(&target->i_sem);
+        mutex_unlock(&target->i_mutex);
     dput(new_dentry);
     return error;
 }
@@ -2314,7 +2407,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
     return error;
 }
 
-static inline int do_rename(const char * oldname, const char * newname)
+static int do_rename(int olddfd, const char *oldname,
+            int newdfd, const char *newname)
 {
     int error = 0;
     struct dentry * old_dir, * new_dir;
@@ -2322,11 +2416,11 @@ static inline int do_rename(const char * oldname, const char * newname)
     struct dentry * trap;
     struct nameidata oldnd, newnd;
 
-    error = path_lookup(oldname, LOOKUP_PARENT, &oldnd);
+    error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd);
     if (error)
         goto exit;
 
-    error = path_lookup(newname, LOOKUP_PARENT, &newnd);
+    error = do_path_lookup(newdfd, newname, LOOKUP_PARENT, &newnd);
     if (error)
         goto exit1;
 
@@ -2390,7 +2484,8 @@ exit:
     return error;
 }
 
-asmlinkage long sys_rename(const char __user * oldname, const char __user * newname)
+asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
+                 int newdfd, const char __user *newname)
 {
     int error;
     char * from;
@@ -2402,13 +2497,18 @@ asmlinkage long sys_rename(const char __user * oldname, const char __user * newn
     to = getname(newname);
     error = PTR_ERR(to);
     if (!IS_ERR(to)) {
-        error = do_rename(from,to);
+        error = do_rename(olddfd, from, newdfd, to);
         putname(to);
     }
     putname(from);
     return error;
 }
 
+asmlinkage long sys_rename(const char __user *oldname, const char __user *newname)
+{
+    return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
+}
+
 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
 {
     int len;
@@ -2552,6 +2652,7 @@ struct inode_operations page_symlink_inode_operations = {
 };
 
 EXPORT_SYMBOL(__user_walk);
+EXPORT_SYMBOL(__user_walk_fd);
 EXPORT_SYMBOL(follow_down);
 EXPORT_SYMBOL(follow_up);
 EXPORT_SYMBOL(get_write_access);    /* binfmt_aout */
diff --git a/fs/namespace.c b/fs/namespace.c
index 2019899f2ab..ce97becff46 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/quotaops.h>
 #include <linux/acct.h>
+#include <linux/capability.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/namespace.h>
@@ -47,6 +48,10 @@ static int hash_mask __read_mostly, hash_bits __read_mostly;
 static kmem_cache_t *mnt_cache;
 static struct rw_semaphore namespace_sem;
 
+/* /sys/fs */
+decl_subsys(fs, NULL, NULL);
+EXPORT_SYMBOL_GPL(fs_subsys);
+
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
     unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -355,14 +360,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
         { MS_SYNCHRONOUS, ",sync" },
         { MS_DIRSYNC, ",dirsync" },
         { MS_MANDLOCK, ",mand" },
-        { MS_NOATIME, ",noatime" },
-        { MS_NODIRATIME, ",nodiratime" },
         { 0, NULL }
     };
     static struct proc_fs_info mnt_info[] = {
         { MNT_NOSUID, ",nosuid" },
         { MNT_NODEV, ",nodev" },
         { MNT_NOEXEC, ",noexec" },
+        { MNT_NOATIME, ",noatime" },
+        { MNT_NODIRATIME, ",nodiratime" },
         { 0, NULL }
     };
     struct proc_fs_info *fs_infop;
@@ -451,7 +456,7 @@ EXPORT_SYMBOL(may_umount);
 void release_mounts(struct list_head *head)
 {
     struct vfsmount *mnt;
-    while(!list_empty(head)) {
+    while (!list_empty(head)) {
         mnt = list_entry(head->next, struct vfsmount, mnt_hash);
         list_del_init(&mnt->mnt_hash);
         if (mnt->mnt_parent != mnt) {
@@ -814,7 +819,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
         return -ENOTDIR;
 
     err = -ENOENT;
-    down(&nd->dentry->d_inode->i_sem);
+    mutex_lock(&nd->dentry->d_inode->i_mutex);
     if (IS_DEADDIR(nd->dentry->d_inode))
         goto out_unlock;
 
@@ -826,7 +831,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
     if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
         err = attach_recursive_mnt(mnt, nd, NULL);
 out_unlock:
-    up(&nd->dentry->d_inode->i_sem);
+    mutex_unlock(&nd->dentry->d_inode->i_mutex);
     if (!err)
         security_sb_post_addmount(mnt, nd);
     return err;
@@ -962,7 +967,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
         goto out;
 
     err = -ENOENT;
-    down(&nd->dentry->d_inode->i_sem);
+    mutex_lock(&nd->dentry->d_inode->i_mutex);
     if (IS_DEADDIR(nd->dentry->d_inode))
         goto out1;
 
@@ -1004,7 +1009,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
     list_del_init(&old_nd.mnt->mnt_expire);
     spin_unlock(&vfsmount_lock);
 out1:
-    up(&nd->dentry->d_inode->i_sem);
+    mutex_unlock(&nd->dentry->d_inode->i_mutex);
 out:
     up_write(&namespace_sem);
     if (!err)
@@ -1286,7 +1291,13 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
         mnt_flags |= MNT_NODEV;
     if (flags & MS_NOEXEC)
         mnt_flags |= MNT_NOEXEC;
-    flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE);
+    if (flags & MS_NOATIME)
+        mnt_flags |= MNT_NOATIME;
+    if (flags & MS_NODIRATIME)
+        mnt_flags |= MNT_NODIRATIME;
+
+    flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
+           MS_NOATIME | MS_NODIRATIME);
 
     /* ... and get the mountpoint */
     retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
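[Editor's note] noatime and nodiratime become per-mount properties here: do_mount() translates the MS_* request bits into MNT_* flags on the vfsmount, which is also why show_vfsmnt() now prints them from mnt_info rather than the superblock list. From userspace nothing changes in how they are requested; a sketch with mount(2), where the device and mountpoint paths are illustrative:

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
        /* Request atime suppression for this mount only. */
        if (mount("/dev/sdb1", "/mnt/data", "ext2",
                  MS_NOATIME | MS_NODIRATIME, NULL) < 0) {
            perror("mount");
            return 1;
        }
        return 0;
    }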
@@ -1526,6 +1537,10 @@ static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
  * pointed to by put_old must yield the same directory as new_root. No other
  * file system may be mounted on put_old. After all, new_root is a mountpoint.
  *
+ * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
+ * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
+ * in this situation.
+ *
  * Notes:
  *  - we don't move root/cwd if they are not at the root (reason: if something
  *    cared enough to change them, it's probably wrong to force them elsewhere)
@@ -1569,7 +1584,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
     user_nd.dentry = dget(current->fs->root);
     read_unlock(&current->fs->lock);
     down_write(&namespace_sem);
-    down(&old_nd.dentry->d_inode->i_sem);
+    mutex_lock(&old_nd.dentry->d_inode->i_mutex);
     error = -EINVAL;
     if (IS_MNT_SHARED(old_nd.mnt) ||
         IS_MNT_SHARED(new_nd.mnt->mnt_parent) ||
@@ -1622,7 +1637,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
     path_release(&root_parent);
     path_release(&parent_nd);
 out2:
-    up(&old_nd.dentry->d_inode->i_sem);
+    mutex_unlock(&old_nd.dentry->d_inode->i_mutex);
     up_write(&namespace_sem);
     path_release(&user_nd);
     path_release(&old_nd);
@@ -1714,6 +1729,7 @@ void __init mnt_init(unsigned long mempages)
         i--;
     } while (i);
     sysfs_init();
+    subsystem_register(&fs_subsys);
     init_rootfs();
     init_mount_tree();
 }
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index a9f7a8ab1d5..cfd76f431dc 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -365,7 +365,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
     spin_lock(&dcache_lock);
     next = parent->d_subdirs.next;
     while (next != &parent->d_subdirs) {
-        dent = list_entry(next, struct dentry, d_child);
+        dent = list_entry(next, struct dentry, d_u.d_child);
         if ((unsigned long)dent->d_fsdata == fpos) {
             if (dent->d_inode)
                 dget_locked(dent);
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 4947d9b11fc..973b444d691 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -262,7 +262,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
     }
     vfree(bouncebuffer);
 
-    inode_update_time(inode, 1);
+    file_update_time(file);
 
     *ppos = pos;
 
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 8c8839203cd..d277a58bd12 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -716,10 +716,8 @@ static void ncp_put_super(struct super_block *sb)
     fput(server->ncp_filp);
     kill_proc(server->m.wdog_pid, SIGTERM, 1);
 
-    if (server->priv.data)
-        ncp_kfree_s(server->priv.data, server->priv.len);
-    if (server->auth.object_name)
-        ncp_kfree_s(server->auth.object_name, server->auth.object_name_len);
+    kfree(server->priv.data);
+    kfree(server->auth.object_name);
     vfree(server->packet);
     sb->s_fs_info = NULL;
     kfree(server);
@@ -958,11 +956,6 @@ out:
     return result;
 }
 
-#ifdef DEBUG_NCP_MALLOC
-int ncp_malloced;
-int ncp_current_malloced;
-#endif
-
 static struct super_block *ncp_get_sb(struct file_system_type *fs_type,
     int flags, const char *dev_name, void *data)
 {
@@ -981,10 +974,6 @@ static int __init init_ncp_fs(void)
     int err;
     DPRINTK("ncpfs: init_module called\n");
 
-#ifdef DEBUG_NCP_MALLOC
-    ncp_malloced = 0;
-    ncp_current_malloced = 0;
-#endif
     err = init_inodecache();
     if (err)
         goto out1;
@@ -1003,10 +992,6 @@ static void __exit exit_ncp_fs(void)
     DPRINTK("ncpfs: cleanup_module called\n");
     unregister_filesystem(&ncp_fs_type);
     destroy_inodecache();
-#ifdef DEBUG_NCP_MALLOC
-    PRINTK("ncp_malloced: %d\n", ncp_malloced);
-    PRINTK("ncp_current_malloced: %d\n", ncp_current_malloced);
-#endif
 }
 
 module_init(init_ncp_fs)
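[Editor's note] The ncpfs conversion leans on two facts: kfree() is documented to ignore a NULL pointer, so the `if (p)` guards add nothing, and with the DEBUG_NCP_MALLOC accounting gone the size-tracking ncp_kfree_s()/ncp_kmalloc() wrappers reduce to plain kfree()/kmalloc(). An illustrative kernel-context snippet of the idiom:

    char *p = NULL;

    if (p)          /* redundant guard ... */
        kfree(p);
    kfree(p);       /* ... because this is equally safe */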
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index fd3efdca5ae..eb3813ad136 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -10,6 +10,7 @@
 #include <linux/config.h>
 
 #include <asm/uaccess.h>
+#include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/ioctl.h>
@@ -517,10 +518,11 @@ outrel:
         if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN)
             return -ENOMEM;
         if (user.object_name_len) {
-            newname = ncp_kmalloc(user.object_name_len, GFP_USER);
-            if (!newname) return -ENOMEM;
+            newname = kmalloc(user.object_name_len, GFP_USER);
+            if (!newname)
+                return -ENOMEM;
             if (copy_from_user(newname, user.object_name, user.object_name_len)) {
-                ncp_kfree_s(newname, user.object_name_len);
+                kfree(newname);
                 return -EFAULT;
             }
         } else {
@@ -539,8 +541,8 @@ outrel:
         server->priv.len = 0;
         server->priv.data = NULL;
         /* leave critical section */
-        if (oldprivate) ncp_kfree_s(oldprivate, oldprivatelen);
-        if (oldname) ncp_kfree_s(oldname, oldnamelen);
+        kfree(oldprivate);
+        kfree(oldname);
         return 0;
     }
     case NCP_IOC_GETPRIVATEDATA:
@@ -580,10 +582,11 @@ outrel:
         if (user.len > NCP_PRIVATE_DATA_MAX_LEN)
             return -ENOMEM;
         if (user.len) {
-            new = ncp_kmalloc(user.len, GFP_USER);
-            if (!new) return -ENOMEM;
+            new = kmalloc(user.len, GFP_USER);
+            if (!new)
+                return -ENOMEM;
             if (copy_from_user(new, user.data, user.len)) {
-                ncp_kfree_s(new, user.len);
+                kfree(new);
                 return -EFAULT;
             }
         } else {
@@ -595,7 +598,7 @@ outrel:
         server->priv.len = user.len;
         server->priv.data = new;
         /* leave critical section */
-        if (old) ncp_kfree_s(old, oldlen);
+        kfree(old);
         return 0;
     }
 
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 9e4dc30c243..799e5c2bec5 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -196,7 +196,7 @@ ncp_renew_dentries(struct dentry *parent)
     spin_lock(&dcache_lock);
     next = parent->d_subdirs.next;
     while (next != &parent->d_subdirs) {
-        dentry = list_entry(next, struct dentry, d_child);
+        dentry = list_entry(next, struct dentry, d_u.d_child);
 
         if (dentry->d_fsdata == NULL)
             ncp_age_dentry(server, dentry);
@@ -218,7 +218,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
     spin_lock(&dcache_lock);
     next = parent->d_subdirs.next;
     while (next != &parent->d_subdirs) {
-        dentry = list_entry(next, struct dentry, d_child);
+        dentry = list_entry(next, struct dentry, d_u.d_child);
         dentry->d_fsdata = NULL;
         ncp_age_dentry(server, dentry);
         next = next->next;
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 8b3bb715d17..ec61fd56a1a 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -13,4 +13,5 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
         delegation.o idmap.o \
         callback.o callback_xdr.o callback_proc.o
 nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
+nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-objs := $(nfs-y)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f2ca782aba3..fcd97406a77 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -14,6 +14,9 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
+
+#include <net/inet_sock.h>
+
 #include "nfs4_fs.h"
 #include "callback.h"
 
@@ -31,6 +34,7 @@ static struct nfs_callback_data nfs_callback_info;
 static DECLARE_MUTEX(nfs_callback_sema);
 static struct svc_program nfs4_callback_program;
 
+unsigned int nfs_callback_set_tcpport;
 unsigned short nfs_callback_tcpport;
 
 /*
@@ -95,7 +99,7 @@ int nfs_callback_up(void)
     if (!serv)
         goto out_err;
     /* FIXME: We don't want to register this socket with the portmapper */
-    ret = svc_makesock(serv, IPPROTO_TCP, 0);
+    ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport);
     if (ret < 0)
         goto out_destroy;
     if (!list_empty(&serv->sv_permsocks)) {
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index a0db2d4f941..b252e7fe53a 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -65,6 +65,7 @@ extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
 extern int nfs_callback_up(void);
 extern int nfs_callback_down(void);
 
+extern unsigned int nfs_callback_set_tcpport;
 extern unsigned short nfs_callback_tcpport;
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 65f1e19e4d1..462cfceb50c 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -35,7 +35,9 @@ unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres
 	if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
 		goto out_iput;
 	res->size = i_size_read(inode);
-	res->change_attr = NFS_CHANGE_ATTR(inode);
+	res->change_attr = delegation->change_attr;
+	if (nfsi->npages != 0)
+		res->change_attr++;
 	res->ctime = inode->i_ctime;
 	res->mtime = inode->i_mtime;
 	res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 618a327027b..c6f07c1c71e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -8,6 +8,7 @@
  */
 #include <linux/config.h>
 #include <linux/completion.h>
+#include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
@@ -130,6 +131,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 			sizeof(delegation->stateid.data));
 	delegation->type = res->delegation_type;
 	delegation->maxsize = res->maxsize;
+	delegation->change_attr = nfsi->change_attr;
 	delegation->cred = get_rpccred(cred);
 	delegation->inode = inode;
 
@@ -157,8 +159,6 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
 {
 	int res = 0;
 
-	__nfs_revalidate_inode(NFS_SERVER(inode), inode);
-
 	res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid);
 	nfs_free_delegation(delegation);
 	return res;
@@ -231,6 +231,49 @@ restart:
 	spin_unlock(&clp->cl_lock);
 }
 
+int nfs_do_expire_all_delegations(void *ptr)
+{
+	struct nfs4_client *clp = ptr;
+	struct nfs_delegation *delegation;
+	struct inode *inode;
+
+	allow_signal(SIGKILL);
+restart:
+	spin_lock(&clp->cl_lock);
+	if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0)
+		goto out;
+	if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0)
+		goto out;
+	list_for_each_entry(delegation, &clp->cl_delegations, super_list) {
+		inode = igrab(delegation->inode);
+		if (inode == NULL)
+			continue;
+		spin_unlock(&clp->cl_lock);
+		nfs_inode_return_delegation(inode);
+		iput(inode);
+		goto restart;
+	}
+out:
+	spin_unlock(&clp->cl_lock);
+	nfs4_put_client(clp);
+	module_put_and_exit(0);
+}
+
+void nfs_expire_all_delegations(struct nfs4_client *clp)
+{
+	struct task_struct *task;
+
+	__module_get(THIS_MODULE);
+	atomic_inc(&clp->cl_count);
+	task = kthread_run(nfs_do_expire_all_delegations, clp,
+			"%u.%u.%u.%u-delegreturn",
+			NIPQUAD(clp->cl_addr));
+	if (!IS_ERR(task))
+		return;
+	nfs4_put_client(clp);
+	module_put(THIS_MODULE);
+}
+
 /*
  * Return all delegations following an NFS4ERR_CB_PATH_DOWN error.
  */
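The new nfs_expire_all_delegations() shows the usual pattern for fire-and-forget kernel threads: take a reference on the module and on the shared object before kthread_run(), and let the thread itself drop both (here via nfs4_put_client() and module_put_and_exit()); if the spawn fails, the launcher unwinds the references. A stripped-down sketch of that hand-off, with hypothetical ctx/get_ctx/put_ctx/do_work names:

    static int worker(void *arg)
    {
            struct ctx *c = arg;

            do_work(c);              /* runs long after the launcher returned */
            put_ctx(c);              /* drop the reference taken by the launcher */
            module_put_and_exit(0);  /* drop THIS_MODULE and terminate the thread */
    }

    static void launch(struct ctx *c)
    {
            struct task_struct *task;

            __module_get(THIS_MODULE);  /* keep the module loaded for the thread */
            get_ctx(c);
            task = kthread_run(worker, c, "worker");
            if (IS_ERR(task)) {         /* spawn failed: undo both references */
                    put_ctx(c);
                    module_put(THIS_MODULE);
            }
    }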
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 2fcc30de924..7a0b2bfce77 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -21,6 +21,7 @@ struct nfs_delegation {
 #define NFS_DELEGATION_NEED_RECLAIM 1
 	long flags;
 	loff_t maxsize;
+	__u64 change_attr;
 };
 
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
@@ -30,6 +31,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
 
 struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle);
 void nfs_return_all_delegations(struct super_block *sb);
+void nfs_expire_all_delegations(struct nfs4_client *clp);
 void nfs_handle_cb_pathdown(struct nfs4_client *clp);
 
 void nfs_delegation_mark_reclaim(struct nfs4_client *clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7370583b61e..a1554bead69 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -194,7 +194,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 	spin_unlock(&inode->i_lock);
 	/* Ensure consistent page alignment of the data.
 	 * Note: assumes we have exclusive access to this mapping either
-	 * through inode->i_sem or some other mechanism.
+	 * through inode->i_mutex or some other mechanism.
 	 */
 	if (page->index == 0)
 		invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1);
@@ -573,7 +573,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
 {
-	down(&filp->f_dentry->d_inode->i_sem);
+	mutex_lock(&filp->f_dentry->d_inode->i_mutex);
 	switch (origin) {
 		case 1:
 			offset += filp->f_pos;
@@ -589,7 +589,7 @@ loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
 			((struct nfs_open_context *)filp->private_data)->dir_cookie = 0;
 	}
 out:
-	up(&filp->f_dentry->d_inode->i_sem);
+	mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
 	return offset;
 }
 
@@ -1001,7 +1001,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	openflags &= ~(O_CREAT|O_TRUNC);
 
 	/*
-	 * Note: we're not holding inode->i_sem and so may be racing with
+	 * Note: we're not holding inode->i_mutex and so may be racing with
 	 * operations that change the directory. We therefore save the
 	 * change attribute *before* we do the RPC call.
 	 */
@@ -1051,7 +1051,7 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
 		return dentry;
 	if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR))
 		return NULL;
-	/* Note: caller is already holding the dir->i_sem! */
+	/* Note: caller is already holding the dir->i_mutex! */
 	dentry = d_alloc(parent, &name);
 	if (dentry == NULL)
 		return NULL;
@@ -1287,6 +1287,7 @@ dentry->d_parent->d_name.name, dentry->d_name.name);
 		nfs_begin_data_update(dentry->d_inode);
 		error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
 				dir, &qsilly);
+		nfs_mark_for_revalidate(dentry->d_inode);
 		nfs_end_data_update(dentry->d_inode);
 	} else
 		error = NFS_PROTO(dir)->rename(dir, &dentry->d_name,
@@ -1334,6 +1335,7 @@ static int nfs_safe_remove(struct dentry *dentry)
 		/* The VFS may want to delete this inode */
 		if (error == 0)
 			inode->i_nlink--;
+		nfs_mark_for_revalidate(inode);
 		nfs_end_data_update(inode);
 	} else
 		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
@@ -1548,14 +1550,17 @@ go_ahead:
 	}
 	nfs_inode_return_delegation(old_inode);
 
-	if (new_inode)
+	if (new_inode != NULL) {
+		nfs_inode_return_delegation(new_inode);
 		d_delete(new_dentry);
+	}
 
 	nfs_begin_data_update(old_dir);
 	nfs_begin_data_update(new_dir);
 	nfs_begin_data_update(old_inode);
 	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
 					   new_dir, &new_dentry->d_name);
+	nfs_mark_for_revalidate(old_inode);
 	nfs_end_data_update(old_inode);
 	nfs_end_data_update(new_dir);
 	nfs_end_data_update(old_dir);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b497c71384e..10ae377e68f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -122,9 +122,10 @@ nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
 {
 	int i;
 	for (i = 0; i < npages; i++) {
-		if (do_dirty)
-			set_page_dirty_lock(pages[i]);
-		page_cache_release(pages[i]);
+		struct page *page = pages[i];
+		if (do_dirty && !PageCompound(page))
+			set_page_dirty_lock(page);
+		page_cache_release(page);
 	}
 	kfree(pages);
 }
@@ -154,6 +155,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int
 	struct list_head *list;
 	struct nfs_direct_req *dreq;
 	unsigned int reads = 0;
+	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
 	if (!dreq)
@@ -167,7 +169,7 @@
 
 	list = &dreq->list;
 	for(;;) {
-		struct nfs_read_data *data = nfs_readdata_alloc();
+		struct nfs_read_data *data = nfs_readdata_alloc(rpages);
 
 		if (unlikely(!data)) {
 			while (!list_empty(list)) {
@@ -268,8 +270,6 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq,
 		NFS_PROTO(inode)->read_setup(data);
 
 		data->task.tk_cookie = (unsigned long) inode;
-		data->task.tk_calldata = data;
-		data->task.tk_release = nfs_readdata_release;
 		data->complete = nfs_direct_read_result;
 
 		lock_kernel();
@@ -433,7 +433,7 @@ static ssize_t nfs_direct_write_seg(struct inode *inode,
 	struct nfs_writeverf first_verf;
 	struct nfs_write_data *wdata;
 
-	wdata = nfs_writedata_alloc();
+	wdata = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
 	if (!wdata)
 		return -ENOMEM;
 
@@ -662,10 +662,10 @@ nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t
 		.iov_len = count,
 	};
 
-	dprintk("nfs: direct read(%s/%s, %lu@%lu)\n",
+	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
 		file->f_dentry->d_parent->d_name.name,
 		file->f_dentry->d_name.name,
-		(unsigned long) count, (unsigned long) pos);
+		(unsigned long) count, (long long) pos);
 
 	if (!is_sync_kiocb(iocb))
 		goto out;
@@ -678,15 +678,9 @@
 	if (!count)
 		goto out;
 
-	if (mapping->nrpages) {
-		retval = filemap_fdatawrite(mapping);
-		if (retval == 0)
-			retval = nfs_wb_all(inode);
-		if (retval == 0)
-			retval = filemap_fdatawait(mapping);
-		if (retval)
-			goto out;
-	}
+	retval = nfs_sync_mapping(mapping);
+	if (retval)
+		goto out;
 
 	retval = nfs_direct_read(inode, ctx, &iov, pos, 1);
 	if (retval > 0)
@@ -724,9 +718,7 @@ out:
 ssize_t
 nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
 {
-	ssize_t retval = -EINVAL;
-	loff_t *ppos = &iocb->ki_pos;
-	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	ssize_t retval;
 	struct file *file = iocb->ki_filp;
 	struct nfs_open_context *ctx =
 		(struct nfs_open_context *) file->private_data;
@@ -734,51 +726,42 @@ nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count,
 	struct inode *inode = mapping->host;
 	struct iovec iov = {
 		.iov_base = (char __user *)buf,
-		.iov_len = count,
 	};
 
-	dfprintk(VFS, "nfs: direct write(%s/%s(%ld), %lu@%lu)\n",
+	dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n",
 		file->f_dentry->d_parent->d_name.name,
-		file->f_dentry->d_name.name, inode->i_ino,
-		(unsigned long) count, (unsigned long) pos);
+		file->f_dentry->d_name.name,
+		(unsigned long) count, (long long) pos);
 
+	retval = -EINVAL;
 	if (!is_sync_kiocb(iocb))
 		goto out;
-	if (count < 0)
-		goto out;
-	if (pos < 0)
+
+	retval = generic_write_checks(file, &pos, &count, 0);
+	if (retval)
 		goto out;
-	retval = -EFAULT;
-	if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len))
+
+	retval = -EINVAL;
+	if ((ssize_t) count < 0)
 		goto out;
-	retval = -EFBIG;
-	if (limit != RLIM_INFINITY) {
-		if (pos >= limit) {
-			send_sig(SIGXFSZ, current, 0);
-			goto out;
-		}
-		if (count > limit - (unsigned long) pos)
-			count = limit - (unsigned long) pos;
-	}
 	retval = 0;
 	if (!count)
 		goto out;
+	iov.iov_len = count,
 
-	if (mapping->nrpages) {
-		retval = filemap_fdatawrite(mapping);
-		if (retval == 0)
-			retval = nfs_wb_all(inode);
-		if (retval == 0)
-			retval = filemap_fdatawait(mapping);
-		if (retval)
-			goto out;
-	}
+	retval = -EFAULT;
+	if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len))
+		goto out;
+
+	retval = nfs_sync_mapping(mapping);
+	if (retval)
+		goto out;
 
 	retval = nfs_direct_write(inode, ctx, &iov, pos, 1);
 	if (mapping->nrpages)
 		invalidate_inode_pages2(mapping);
 	if (retval > 0)
-		*ppos = pos + retval;
+		iocb->ki_pos = pos + retval;
 
 out:
 	return retval;
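Most of the deleted lines in nfs_file_direct_write() were a hand-rolled copy of the RLIMIT_FSIZE, sign and overflow checks that generic_write_checks() already performs; the rewrite lets the generic helper clamp count and validate pos. A sketch of the 2.6-era calling convention (do_the_io is a hypothetical stand-in for the real I/O path):

    ssize_t err;

    err = generic_write_checks(file, &pos, &count, 0);  /* 0 = not a blockdev */
    if (err)
            return err;     /* rlimit/overflow policy handled centrally */
    if (!count)
            return 0;       /* count may have been clamped to zero */
    return do_the_io(file, buf, count, pos);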
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 57d3e77d97e..7a79fbe9f53 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -433,11 +433,7 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
 	 * Flush all pending writes before doing anything
 	 * with locks..
 	 */
-	filemap_fdatawrite(filp->f_mapping);
-	down(&inode->i_sem);
-	nfs_wb_all(inode);
-	up(&inode->i_sem);
-	filemap_fdatawait(filp->f_mapping);
+	nfs_sync_mapping(filp->f_mapping);
 
 	/* NOTE: special case
 	 * 	If we're signalled while cleaning up locks on process exit, we
@@ -465,15 +461,8 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
 	 * Flush all pending writes before doing anything
 	 * with locks..
 	 */
-	status = filemap_fdatawrite(filp->f_mapping);
-	if (status == 0) {
-		down(&inode->i_sem);
-		status = nfs_wb_all(inode);
-		up(&inode->i_sem);
-		if (status == 0)
-			status = filemap_fdatawait(filp->f_mapping);
-	}
-	if (status < 0)
+	status = nfs_sync_mapping(filp->f_mapping);
+	if (status != 0)
 		goto out;
 
 	lock_kernel();
@@ -497,11 +486,7 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
 	 * Make sure we clear the cache whenever we try to get the lock.
 	 * This makes locking act as a cache coherency point.
 	 */
-	filemap_fdatawrite(filp->f_mapping);
-	down(&inode->i_sem);
-	nfs_wb_all(inode);	/* we may have slept */
-	up(&inode->i_sem);
-	filemap_fdatawait(filp->f_mapping);
+	nfs_sync_mapping(filp->f_mapping);
 	nfs_zap_caches(inode);
 out:
 	rpc_clnt_sigunmask(NFS_CLIENT(inode), &oldset);
@@ -524,7 +509,8 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 		return -EINVAL;
 
 	/* No mandatory locks over NFS */
-	if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+	if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
+	    fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if (IS_GETLK(cmd))
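All three locking paths above open-coded the same flush sequence; the patch folds it into nfs_sync_mapping() (defined in the fs/nfs/inode.c hunk further down), so each call site collapses from five lines to one. In effect:

    /* before, repeated at every call site */
    filemap_fdatawrite(filp->f_mapping);
    down(&inode->i_sem);
    nfs_wb_all(inode);
    up(&inode->i_sem);
    filemap_fdatawait(filp->f_mapping);

    /* after */
    nfs_sync_mapping(filp->f_mapping);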
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index ffb8df91dc3..821edd30333 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -54,7 +54,11 @@
 
 #define IDMAP_HASH_SZ          128
 
+/* Default cache timeout is 10 minutes */
+unsigned int nfs_idmap_cache_timeout = 600 * HZ;
+
 struct idmap_hashent {
+	unsigned long ih_expires;
 	__u32 ih_id;
 	int ih_namelen;
 	char ih_name[IDMAP_NAMESZ];
@@ -149,6 +153,8 @@ idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len)
 
 	if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0)
 		return NULL;
+	if (time_after(jiffies, he->ih_expires))
+		return NULL;
 	return he;
 }
 
@@ -164,6 +170,8 @@ idmap_lookup_id(struct idmap_hashtable *h, __u32 id)
 	struct idmap_hashent *he = idmap_id_hash(h, id);
 	if (he->ih_id != id || he->ih_namelen == 0)
 		return NULL;
+	if (time_after(jiffies, he->ih_expires))
+		return NULL;
 	return he;
 }
 
@@ -192,6 +200,7 @@ idmap_update_entry(struct idmap_hashent *he, const char *name,
 	memcpy(he->ih_name, name, namelen);
 	he->ih_name[namelen] = '\0';
 	he->ih_namelen = namelen;
+	he->ih_expires = jiffies + nfs_idmap_cache_timeout;
 }
 
 /*
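The idmap change is a plain jiffies-based expiry: stamp each hash entry when it is updated, and refuse it on lookup once the stamp is in the past, using time_after() so jiffies wrap-around is handled correctly. The pattern in isolation (a sketch, not the patch itself):

    struct cache_ent {
            unsigned long expires;  /* absolute jiffies deadline */
            /* ... payload ... */
    };

    static void ent_touch(struct cache_ent *e, unsigned long timeout)
    {
            e->expires = jiffies + timeout;  /* timeout already in jiffies */
    }

    static int ent_valid(const struct cache_ent *e)
    {
            return !time_after(jiffies, e->expires);  /* wrap-safe compare */
    }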
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6391d896421..a77ee95b7ef 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -40,6 +40,7 @@
 #include <asm/uaccess.h>
 
 #include "nfs4_fs.h"
+#include "callback.h"
 #include "delegation.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
@@ -54,7 +55,7 @@
 #define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
 
 static void nfs_invalidate_inode(struct inode *);
-static int nfs_update_inode(struct inode *, struct nfs_fattr *, unsigned long);
+static int nfs_update_inode(struct inode *, struct nfs_fattr *);
 
 static struct inode *nfs_alloc_inode(struct super_block *sb);
 static void nfs_destroy_inode(struct inode *);
@@ -221,10 +222,10 @@ nfs_calc_block_size(u64 tsize)
 static inline unsigned long
 nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
 {
-	if (bsize < 1024)
-		bsize = NFS_DEF_FILE_IO_BUFFER_SIZE;
-	else if (bsize >= NFS_MAX_FILE_IO_BUFFER_SIZE)
-		bsize = NFS_MAX_FILE_IO_BUFFER_SIZE;
+	if (bsize < NFS_MIN_FILE_IO_SIZE)
+		bsize = NFS_DEF_FILE_IO_SIZE;
+	else if (bsize >= NFS_MAX_FILE_IO_SIZE)
+		bsize = NFS_MAX_FILE_IO_SIZE;
 
 	return nfs_block_bits(bsize, nrbitsp);
 }
@@ -307,20 +308,15 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
 	max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
 	if (server->rsize > max_rpc_payload)
 		server->rsize = max_rpc_payload;
-	if (server->wsize > max_rpc_payload)
-		server->wsize = max_rpc_payload;
-
+	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
+		server->rsize = NFS_MAX_FILE_IO_SIZE;
 	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (server->rpages > NFS_READ_MAXIOV) {
-		server->rpages = NFS_READ_MAXIOV;
-		server->rsize = server->rpages << PAGE_CACHE_SHIFT;
-	}
 
+	if (server->wsize > max_rpc_payload)
+		server->wsize = max_rpc_payload;
+	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
+		server->wsize = NFS_MAX_FILE_IO_SIZE;
 	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (server->wpages > NFS_WRITE_MAXIOV) {
-		server->wpages = NFS_WRITE_MAXIOV;
-		server->wsize = server->wpages << PAGE_CACHE_SHIFT;
-	}
 
 	if (sb->s_blocksize == 0)
 		sb->s_blocksize = nfs_block_bits(server->wsize,
@@ -417,7 +413,6 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
 
 	clnt->cl_intr     = 1;
 	clnt->cl_softrtry = 1;
-	clnt->cl_chatty   = 1;
 
 	return clnt;
 
@@ -575,11 +570,10 @@ nfs_statfs(struct super_block *sb, struct kstatfs *buf)
 	buf->f_namelen = server->namelen;
  out:
 	unlock_kernel();
-
 	return 0;
 
  out_err:
-	printk(KERN_WARNING "nfs_statfs: statfs error = %d\n", -error);
+	dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
 	buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
 	goto out;
 
@@ -640,17 +634,32 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
+/**
+ * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
+ */
+int nfs_sync_mapping(struct address_space *mapping)
+{
+	int ret;
+
+	if (mapping->nrpages == 0)
+		return 0;
+	unmap_mapping_range(mapping, 0, 0, 0);
+	ret = filemap_write_and_wait(mapping);
+	if (ret != 0)
+		goto out;
+	ret = nfs_wb_all(mapping->host);
+out:
+	return ret;
+}
+
 /*
  * Invalidate the local caches
  */
-void
-nfs_zap_caches(struct inode *inode)
+static void nfs_zap_caches_locked(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int mode = inode->i_mode;
 
-	spin_lock(&inode->i_lock);
-
 	NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
 	NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
 
@@ -659,7 +668,12 @@ nfs_zap_caches(struct inode *inode)
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
 	else
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
+}
 
+void nfs_zap_caches(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	nfs_zap_caches_locked(inode);
 	spin_unlock(&inode->i_lock);
 }
 
@@ -676,16 +690,13 @@ static void nfs_zap_acl_cache(struct inode *inode)
 }
 
 /*
- * Invalidate, but do not unhash, the inode
+ * Invalidate, but do not unhash, the inode.
+ * NB: must be called with inode->i_lock held!
  */
-static void
-nfs_invalidate_inode(struct inode *inode)
+static void nfs_invalidate_inode(struct inode *inode)
 {
-	umode_t save_mode = inode->i_mode;
-
-	make_bad_inode(inode);
-	inode->i_mode = save_mode;
-	nfs_zap_caches(inode);
+	set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
+	nfs_zap_caches_locked(inode);
 }
 
 struct nfs_find_desc {
@@ -850,8 +861,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 	nfs_begin_data_update(inode);
 	/* Write all dirty data if we're changing file permissions or size */
 	if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE)) != 0) {
-		if (filemap_fdatawrite(inode->i_mapping) == 0)
-			filemap_fdatawait(inode->i_mapping);
+		filemap_write_and_wait(inode->i_mapping);
 		nfs_wb_all(inode);
 	}
 	/*
@@ -938,11 +948,22 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
 	int err;
 
-	if (__IS_FLG(inode, MS_NOATIME))
-		need_atime = 0;
-	else if (__IS_FLG(inode, MS_NODIRATIME) && S_ISDIR(inode->i_mode))
+	/* Flush out writes to the server in order to update c/mtime */
+	nfs_sync_inode(inode, 0, 0, FLUSH_WAIT|FLUSH_NOCOMMIT);
+
+	/*
+	 * We may force a getattr if the user cares about atime.
+	 *
+	 * Note that we only have to check the vfsmount flags here:
+	 *  - NFS always sets S_NOATIME by so checking it would give a
+	 *    bogus result
+	 *  - NFS never sets MS_NOATIME or MS_NODIRATIME so there is
+	 *    no point in checking those.
+	 */
+	if ((mnt->mnt_flags & MNT_NOATIME) ||
+	    ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
 		need_atime = 0;
-	/* We may force a getattr if the user cares about atime */
+
 	if (need_atime)
 		err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	else
@@ -1081,8 +1102,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	int		 status = -ESTALE;
 	struct nfs_fattr fattr;
 	struct nfs_inode *nfsi = NFS_I(inode);
-	unsigned long verifier;
-	unsigned long cache_validity;
 
 	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
 		inode->i_sb->s_id, (long long)NFS_FILEID(inode));
@@ -1107,8 +1126,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		}
 	}
 
-	/* Protect against RPC races by saving the change attribute */
-	verifier = nfs_save_change_attribute(inode);
 	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
 	if (status != 0) {
 		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
@@ -1123,7 +1140,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	}
 
 	spin_lock(&inode->i_lock);
-	status = nfs_update_inode(inode, &fattr, verifier);
+	status = nfs_update_inode(inode, &fattr);
 	if (status) {
 		spin_unlock(&inode->i_lock);
 		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
@@ -1131,20 +1148,11 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 			(long long)NFS_FILEID(inode), status);
 		goto out;
 	}
-	cache_validity = nfsi->cache_validity;
-	nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
-
-	/*
-	 * We may need to keep the attributes marked as invalid if
-	 * we raced with nfs_end_attr_update().
-	 */
-	if (time_after_eq(verifier, nfsi->cache_change_attribute))
-		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
 	spin_unlock(&inode->i_lock);
 
 	nfs_revalidate_mapping(inode, inode->i_mapping);
 
-	if (cache_validity & NFS_INO_INVALID_ACL)
+	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
 		nfs_zap_acl_cache(inode);
 
 	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n",
@@ -1193,11 +1201,8 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
-		if (S_ISREG(inode->i_mode)) {
-			if (filemap_fdatawrite(mapping) == 0)
-				filemap_fdatawait(mapping);
-			nfs_wb_all(inode);
-		}
+		if (S_ISREG(inode->i_mode))
+			nfs_sync_mapping(mapping);
 		invalidate_inode_pages2(mapping);
 
 		spin_lock(&inode->i_lock);
@@ -1248,6 +1253,33 @@ void nfs_end_data_update(struct inode *inode)
 	atomic_dec(&nfsi->data_updates);
 }
 
+static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0
+			&& nfsi->change_attr == fattr->pre_change_attr) {
+		nfsi->change_attr = fattr->change_attr;
+		nfsi->cache_change_attribute = jiffies;
+	}
+
+	/* If we have atomic WCC data, we may update some attributes */
+	if ((fattr->valid & NFS_ATTR_WCC) != 0) {
+		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
+			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+			nfsi->cache_change_attribute = jiffies;
+		}
+		if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
+			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+			nfsi->cache_change_attribute = jiffies;
+		}
+		if (inode->i_size == fattr->pre_size && nfsi->npages == 0) {
+			inode->i_size = fattr->size;
+			nfsi->cache_change_attribute = jiffies;
+		}
+	}
+}
+
 /**
  * nfs_check_inode_attributes - verify consistency of the inode attribute cache
  * @inode - pointer to inode
@@ -1264,22 +1296,20 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	int data_unstable;
 
 
+	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+		return 0;
+
 	/* Are we in the process of updating data on the server? */
 	data_unstable = nfs_caches_unstable(inode);
 
-	if (fattr->valid & NFS_ATTR_FATTR_V4) {
-		if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0
-				&& nfsi->change_attr == fattr->pre_change_attr)
-			nfsi->change_attr = fattr->change_attr;
-		if (nfsi->change_attr != fattr->change_attr) {
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
-			if (!data_unstable)
-				nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
-		}
-	}
+	/* Do atomic weak cache consistency updates */
+	nfs_wcc_update_inode(inode, fattr);
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0) {
-		return 0;
+	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+			nfsi->change_attr != fattr->change_attr) {
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+		if (!data_unstable)
+			nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
 	}
 
 	/* Has the inode gone and changed behind our back? */
@@ -1291,14 +1321,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	cur_size = i_size_read(inode);
 	new_isize = nfs_size_to_loff_t(fattr->size);
 
-	/* If we have atomic WCC data, we may update some attributes */
-	if ((fattr->valid & NFS_ATTR_WCC) != 0) {
-		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
-			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-		if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime))
-			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-	}
-
 	/* Verify a few of the more important attributes */
 	if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
@@ -1347,10 +1369,8 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 		return 0;
 	spin_lock(&inode->i_lock);
 	nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
-	if (nfs_verify_change_attribute(inode, fattr->time_start))
-		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
 	if (time_after(fattr->time_start, nfsi->last_updated))
-		status = nfs_update_inode(inode, fattr, fattr->time_start);
+		status = nfs_update_inode(inode, fattr);
 	else
 		status = nfs_check_inode_attributes(inode, fattr);
 
@@ -1376,10 +1396,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
 		goto out;
 	}
-	status = nfs_update_inode(inode, fattr, fattr->time_start);
-	if (time_after_eq(fattr->time_start, nfsi->cache_change_attribute))
-		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE);
-	nfsi->cache_change_attribute = jiffies;
+	status = nfs_update_inode(inode, fattr);
 out:
 	spin_unlock(&inode->i_lock);
 	return status;
@@ -1397,12 +1414,12 @@ out:
  *
  * A very similar scenario holds for the dir cache.
  */
-static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsigned long verifier)
+static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t cur_isize, new_isize;
 	unsigned int invalid = 0;
-	int data_unstable;
+	int data_stable;
 
 	dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
 			__FUNCTION__, inode->i_sb->s_id, inode->i_ino,
@@ -1411,14 +1428,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
 		return 0;
 
-	if (nfsi->fileid != fattr->fileid) {
-		printk(KERN_ERR "%s: inode number mismatch\n"
-		       "expected (%s/0x%Lx), got (%s/0x%Lx)\n",
-		       __FUNCTION__,
-		       inode->i_sb->s_id, (long long)nfsi->fileid,
-		       inode->i_sb->s_id, (long long)fattr->fileid);
-		goto out_err;
-	}
+	if (nfsi->fileid != fattr->fileid)
+		goto out_fileid;
 
 	/*
 	 * Make sure the inode's type hasn't changed.
@@ -1433,8 +1444,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 	nfsi->last_updated = jiffies;
 
 	/* Are we racing with known updates of the metadata on the server? */
-	data_unstable = ! (nfs_verify_change_attribute(inode, verifier) ||
-		(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE));
+	data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
+	if (data_stable)
+		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
+
+	/* Do atomic weak cache consistency updates */
+	nfs_wcc_update_inode(inode, fattr);
 
 	/* Check if our cached file size is stale */
 	new_isize = nfs_size_to_loff_t(fattr->size);
@@ -1443,7 +1458,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 		/* Do we perhaps have any outstanding writes? */
 		if (nfsi->npages == 0) {
 			/* No, but did we race with nfs_end_data_update()? */
-			if (time_after_eq(verifier, nfsi->cache_change_attribute)) {
+			if (data_stable) {
 				inode->i_size = new_isize;
 				invalid |= NFS_INO_INVALID_DATA;
 			}
@@ -1452,6 +1467,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 			inode->i_size = new_isize;
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 		}
+		nfsi->cache_change_attribute = jiffies;
 		dprintk("NFS: isize change on server for file %s/%ld\n",
 				inode->i_sb->s_id, inode->i_ino);
 	}
@@ -1461,8 +1477,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 		dprintk("NFS: mtime change on server for file %s/%ld\n",
 				inode->i_sb->s_id, inode->i_ino);
-		if (!data_unstable)
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+		nfsi->cache_change_attribute = jiffies;
 	}
 
 	if ((fattr->valid & NFS_ATTR_FATTR_V4)
@@ -1470,15 +1486,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 		dprintk("NFS: change_attr change on server for file %s/%ld\n",
 				inode->i_sb->s_id, inode->i_ino);
 		nfsi->change_attr = fattr->change_attr;
-		if (!data_unstable)
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		nfsi->cache_change_attribute = jiffies;
 	}
 
 	/* If ctime has changed we should definitely clear access+acl caches */
 	if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
-		if (!data_unstable)
-			invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+		nfsi->cache_change_attribute = jiffies;
 	}
 	memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
 
@@ -1516,6 +1532,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
 			|| S_ISLNK(inode->i_mode)))
 		invalid &= ~NFS_INO_INVALID_DATA;
+	if (data_stable)
+		invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE);
 	if (!nfs_have_delegation(inode, FMODE_READ))
 		nfsi->cache_validity |= invalid;
 
@@ -1528,15 +1546,21 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 		printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n",
 				__FUNCTION__, inode->i_ino, inode->i_mode, fattr->mode);
 #endif
+ out_err:
 	/*
 	 * No need to worry about unhashing the dentry, as the
 	 * lookup validation will know that the inode is bad.
 	 * (But we fall through to invalidate the caches.)
 	 */
 	nfs_invalidate_inode(inode);
- out_err:
-	set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
 	return -ESTALE;
+
+ out_fileid:
+	printk(KERN_ERR "NFS: server %s error: fileid changed\n"
+		"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
+		NFS_SERVER(inode)->hostname, inode->i_sb->s_id,
+		(long long)nfsi->fileid, (long long)fattr->fileid);
+	goto out_err;
 }
 
 /*
@@ -1818,25 +1842,10 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
 		}
 		clnt->cl_intr = 1;
 		clnt->cl_softrtry = 1;
-		clnt->cl_chatty = 1;
 		clp->cl_rpcclient = clnt;
-		clp->cl_cred = rpcauth_lookupcred(clnt->cl_auth, 0);
-		if (IS_ERR(clp->cl_cred)) {
-			up_write(&clp->cl_sem);
-			err = PTR_ERR(clp->cl_cred);
-			clp->cl_cred = NULL;
-			goto out_fail;
-		}
 		memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
 		nfs_idmap_new(clp);
 	}
-	if (list_empty(&clp->cl_superblocks)) {
-		err = nfs4_init_client(clp);
-		if (err != 0) {
-			up_write(&clp->cl_sem);
-			goto out_fail;
-		}
-	}
 	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
 	clnt = rpc_clone_client(clp->cl_rpcclient);
 	if (!IS_ERR(clnt))
@@ -2031,6 +2040,35 @@ static struct file_system_type nfs4_fs_type = {
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
+static const int nfs_set_port_min = 0;
+static const int nfs_set_port_max = 65535;
+static int param_set_port(const char *val, struct kernel_param *kp)
+{
+	char *endp;
+	int num = simple_strtol(val, &endp, 0);
+	if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
+		return -EINVAL;
+	*((int *)kp->arg) = num;
+	return 0;
+}
+
+module_param_call(callback_tcpport, param_set_port, param_get_int,
+		 &nfs_callback_set_tcpport, 0644);
+
+static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
+{
+	char *endp;
+	int num = simple_strtol(val, &endp, 0);
+	int jif = num * HZ;
+	if (endp == val || *endp || num < 0 || jif < num)
+		return -EINVAL;
+	*((int *)kp->arg) = jif;
+	return 0;
+}
+
+module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
+		 &nfs_idmap_cache_timeout, 0644);
+
 #define nfs4_init_once(nfsi) \
 	do { \
 		INIT_LIST_HEAD(&(nfsi)->open_states); \
@@ -2038,8 +2076,25 @@ static struct file_system_type nfs4_fs_type = {
 		nfsi->delegation_state = 0; \
 		init_rwsem(&nfsi->rwsem); \
 	} while(0)
-#define register_nfs4fs() register_filesystem(&nfs4_fs_type)
-#define unregister_nfs4fs() unregister_filesystem(&nfs4_fs_type)
+
+static inline int register_nfs4fs(void)
+{
+	int ret;
+
+	ret = nfs_register_sysctl();
+	if (ret != 0)
+		return ret;
+	ret = register_filesystem(&nfs4_fs_type);
+	if (ret != 0)
+		nfs_unregister_sysctl();
+	return ret;
+}
+
+static inline void unregister_nfs4fs(void)
+{
+	unregister_filesystem(&nfs4_fs_type);
+	nfs_unregister_sysctl();
+}
 #else
 #define nfs4_init_once(nfsi) \
 	do { } while (0)
@@ -2068,6 +2123,7 @@ static struct inode *nfs_alloc_inode(struct super_block *sb)
 		return NULL;
 	nfsi->flags = 0UL;
 	nfsi->cache_validity = 0UL;
+	nfsi->cache_change_attribute = jiffies;
 #ifdef CONFIG_NFS_V3_ACL
 	nfsi->acl_access = ERR_PTR(-EAGAIN);
 	nfsi->acl_default = ERR_PTR(-EAGAIN);
@@ -2163,11 +2219,11 @@ out:
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
-	nfs_destroy_writepagecache();
 #ifdef CONFIG_NFS_DIRECTIO
-out0:
 	nfs_destroy_directcache();
+out0:
 #endif
+	nfs_destroy_writepagecache();
out1:
 	nfs_destroy_readpagecache();
 out2:
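The two module_param_call() hooks added above make the callback port and the idmap cache timeout tunable at module load time; assuming the parameter names shipped in this patch, something like `modprobe nfs callback_tcpport=4048 idmap_cache_timeout=300` (values hypothetical) would pin the NFSv4 callback listener to a fixed port and shorten the idmap cache to five minutes, with the custom setters rejecting out-of-range input before it reaches the variables.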
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 0e82617f2de..db99b8f678f 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -82,7 +82,6 @@ mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version,
 			RPC_AUTH_UNIX);
 	if (!IS_ERR(clnt)) {
 		clnt->cl_softrtry = 1;
-		clnt->cl_chatty   = 1;
 		clnt->cl_oneshot  = 1;
 		clnt->cl_intr = 1;
 	}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 59049e864ca..7fc0560c89c 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -146,23 +146,23 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	return p;
 }
 
-#define SATTR(p, attr, flag, field) \
-	*p++ = (attr->ia_valid & flag) ? htonl(attr->field) : ~(u32) 0
 static inline u32 *
 xdr_encode_sattr(u32 *p, struct iattr *attr)
 {
-	SATTR(p, attr, ATTR_MODE, ia_mode);
-	SATTR(p, attr, ATTR_UID, ia_uid);
-	SATTR(p, attr, ATTR_GID, ia_gid);
-	SATTR(p, attr, ATTR_SIZE, ia_size);
+	const u32 not_set = __constant_htonl(0xFFFFFFFF);
+
+	*p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set;
+	*p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set;
+	*p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set;
+	*p++ = (attr->ia_valid & ATTR_SIZE) ? htonl(attr->ia_size) : not_set;
 
 	if (attr->ia_valid & ATTR_ATIME_SET) {
 		p = xdr_encode_time(p, &attr->ia_atime);
 	} else if (attr->ia_valid & ATTR_ATIME) {
 		p = xdr_encode_current_server_time(p, &attr->ia_atime);
 	} else {
-		*p++ = ~(u32) 0;
-		*p++ = ~(u32) 0;
+		*p++ = not_set;
+		*p++ = not_set;
 	}
 
 	if (attr->ia_valid & ATTR_MTIME_SET) {
@@ -170,12 +170,11 @@ xdr_encode_sattr(u32 *p, struct iattr *attr)
 	} else if (attr->ia_valid & ATTR_MTIME) {
 		p = xdr_encode_current_server_time(p, &attr->ia_mtime);
 	} else {
-		*p++ = ~(u32) 0;
-		*p++ = ~(u32) 0;
+		*p++ = not_set;
+		*p++ = not_set;
 	}
 	return p;
 }
-#undef SATTR
 
 /*
  * NFS encode functions
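As I read the hunk, an all-ones word in the NFSv2 SETATTR wire format means "leave this attribute unchanged"; the old ~(u32)0 and the new not_set constant encode the same sentinel, the rewrite simply names it once instead of hiding it inside an unhygienic macro:

    const u32 not_set = __constant_htonl(0xFFFFFFFF);
    /* per field: the new value if requested, otherwise the "no change" sentinel */
    *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set;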
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 92c870d19cc..ed67567f055 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -68,27 +68,39 @@ nfs3_async_handle_jukebox(struct rpc_task *task)
68 return 1; 68 return 1;
69} 69}
70 70
71/*
72 * Bare-bones access to getattr: this is for nfs_read_super.
73 */
74static int 71static int
75nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, 72do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,
76 struct nfs_fsinfo *info) 73 struct nfs_fsinfo *info)
77{ 74{
78 int status; 75 int status;
79 76
80 dprintk("%s: call fsinfo\n", __FUNCTION__); 77 dprintk("%s: call fsinfo\n", __FUNCTION__);
81 nfs_fattr_init(info->fattr); 78 nfs_fattr_init(info->fattr);
82 status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); 79 status = rpc_call(client, NFS3PROC_FSINFO, fhandle, info, 0);
83 dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status); 80 dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status);
84 if (!(info->fattr->valid & NFS_ATTR_FATTR)) { 81 if (!(info->fattr->valid & NFS_ATTR_FATTR)) {
85 status = rpc_call(server->client_sys, NFS3PROC_GETATTR, fhandle, info->fattr, 0); 82 status = rpc_call(client, NFS3PROC_GETATTR, fhandle, info->fattr, 0);
86 dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); 83 dprintk("%s: reply getattr: %d\n", __FUNCTION__, status);
87 } 84 }
88 return status; 85 return status;
89} 86}
90 87
91/* 88/*
89 * Bare-bones access to getattr: this is for nfs_read_super.
90 */
91static int
92nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
93 struct nfs_fsinfo *info)
94{
95 int status;
96
97 status = do_proc_get_root(server->client, fhandle, info);
98 if (status && server->client_sys != server->client)
99 status = do_proc_get_root(server->client_sys, fhandle, info);
100 return status;
101}
102
103/*
92 * One function for each procedure in the NFS protocol. 104 * One function for each procedure in the NFS protocol.
93 */ 105 */
94static int 106static int
@@ -732,19 +744,23 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
732 744
733extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int); 745extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
734 746
735static void 747static void nfs3_read_done(struct rpc_task *task, void *calldata)
736nfs3_read_done(struct rpc_task *task)
737{ 748{
738 struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; 749 struct nfs_read_data *data = calldata;
739 750
740 if (nfs3_async_handle_jukebox(task)) 751 if (nfs3_async_handle_jukebox(task))
741 return; 752 return;
742 /* Call back common NFS readpage processing */ 753 /* Call back common NFS readpage processing */
743 if (task->tk_status >= 0) 754 if (task->tk_status >= 0)
744 nfs_refresh_inode(data->inode, &data->fattr); 755 nfs_refresh_inode(data->inode, &data->fattr);
745 nfs_readpage_result(task); 756 nfs_readpage_result(task, calldata);
746} 757}
747 758
759static const struct rpc_call_ops nfs3_read_ops = {
760 .rpc_call_done = nfs3_read_done,
761 .rpc_release = nfs_readdata_release,
762};
763
748static void 764static void
749nfs3_proc_read_setup(struct nfs_read_data *data) 765nfs3_proc_read_setup(struct nfs_read_data *data)
750{ 766{
@@ -762,23 +778,26 @@ nfs3_proc_read_setup(struct nfs_read_data *data)
762 flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); 778 flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
763 779
764 /* Finalize the task. */ 780 /* Finalize the task. */
765 rpc_init_task(task, NFS_CLIENT(inode), nfs3_read_done, flags); 781 rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_read_ops, data);
766 rpc_call_setup(task, &msg, 0); 782 rpc_call_setup(task, &msg, 0);
767} 783}
768 784
769static void 785static void nfs3_write_done(struct rpc_task *task, void *calldata)
770nfs3_write_done(struct rpc_task *task)
771{ 786{
772 struct nfs_write_data *data; 787 struct nfs_write_data *data = calldata;
773 788
774 if (nfs3_async_handle_jukebox(task)) 789 if (nfs3_async_handle_jukebox(task))
775 return; 790 return;
776 data = (struct nfs_write_data *)task->tk_calldata;
777 if (task->tk_status >= 0) 791 if (task->tk_status >= 0)
778 nfs_post_op_update_inode(data->inode, data->res.fattr); 792 nfs_post_op_update_inode(data->inode, data->res.fattr);
779 nfs_writeback_done(task); 793 nfs_writeback_done(task, calldata);
780} 794}
781 795
796static const struct rpc_call_ops nfs3_write_ops = {
797 .rpc_call_done = nfs3_write_done,
798 .rpc_release = nfs_writedata_release,
799};
800
782static void 801static void
783nfs3_proc_write_setup(struct nfs_write_data *data, int how) 802nfs3_proc_write_setup(struct nfs_write_data *data, int how)
784{ 803{
@@ -806,23 +825,26 @@ nfs3_proc_write_setup(struct nfs_write_data *data, int how)
806 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; 825 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
807 826
808 /* Finalize the task. */ 827 /* Finalize the task. */
809 rpc_init_task(task, NFS_CLIENT(inode), nfs3_write_done, flags); 828 rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_write_ops, data);
810 rpc_call_setup(task, &msg, 0); 829 rpc_call_setup(task, &msg, 0);
811} 830}
812 831
813static void 832static void nfs3_commit_done(struct rpc_task *task, void *calldata)
814nfs3_commit_done(struct rpc_task *task)
815{ 833{
816 struct nfs_write_data *data; 834 struct nfs_write_data *data = calldata;
817 835
818 if (nfs3_async_handle_jukebox(task)) 836 if (nfs3_async_handle_jukebox(task))
819 return; 837 return;
820 data = (struct nfs_write_data *)task->tk_calldata;
821 if (task->tk_status >= 0) 838 if (task->tk_status >= 0)
822 nfs_post_op_update_inode(data->inode, data->res.fattr); 839 nfs_post_op_update_inode(data->inode, data->res.fattr);
823 nfs_commit_done(task); 840 nfs_commit_done(task, calldata);
824} 841}
825 842
843static const struct rpc_call_ops nfs3_commit_ops = {
844 .rpc_call_done = nfs3_commit_done,
845 .rpc_release = nfs_commit_release,
846};
847
826static void 848static void
827nfs3_proc_commit_setup(struct nfs_write_data *data, int how) 849nfs3_proc_commit_setup(struct nfs_write_data *data, int how)
828{ 850{
@@ -840,7 +862,7 @@ nfs3_proc_commit_setup(struct nfs_write_data *data, int how)
840 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; 862 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
841 863
842 /* Finalize the task. */ 864 /* Finalize the task. */
843 rpc_init_task(task, NFS_CLIENT(inode), nfs3_commit_done, flags); 865 rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_commit_ops, data);
844 rpc_call_setup(task, &msg, 0); 866 rpc_call_setup(task, &msg, 0);
845} 867}
846 868
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 0498bd36602..b6c0b5012bc 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -182,7 +182,7 @@ xdr_encode_sattr(u32 *p, struct iattr *attr)
182{ 182{
183 if (attr->ia_valid & ATTR_MODE) { 183 if (attr->ia_valid & ATTR_MODE) {
184 *p++ = xdr_one; 184 *p++ = xdr_one;
185 *p++ = htonl(attr->ia_mode); 185 *p++ = htonl(attr->ia_mode & S_IALLUGO);
186 } else { 186 } else {
187 *p++ = xdr_zero; 187 *p++ = xdr_zero;
188 } 188 }
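The one-word nfs3xdr.c change masks ia_mode with S_IALLUGO before encoding, so only the permission, setuid/setgid and sticky bits go over the wire and any file-type bits that leaked into ia_mode are dropped. A quick user-space check of what the mask keeps (S_IALLUGO is assumed to be 07777, as in the kernel's stat headers):

#include <stdio.h>
#include <sys/stat.h>

/* Kernel-internal macro, assumed to be 07777 as in include/linux/stat.h. */
#ifndef S_IALLUGO
#define S_IALLUGO 07777
#endif

int main(void)
{
    mode_t ia_mode = S_IFREG | 04755;   /* type bits accidentally present */
    printf("raw:    %07o\n", (unsigned)ia_mode);
    printf("masked: %07o\n", (unsigned)(ia_mode & S_IALLUGO));
    return 0;
}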
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index b7f262dcb6e..0f5e4e7cdde 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -38,7 +38,8 @@ struct idmap;
38 ((err) != NFSERR_NOFILEHANDLE)) 38 ((err) != NFSERR_NOFILEHANDLE))
39 39
40enum nfs4_client_state { 40enum nfs4_client_state {
41 NFS4CLNT_OK = 0, 41 NFS4CLNT_STATE_RECOVER = 0,
42 NFS4CLNT_LEASE_EXPIRED,
42}; 43};
43 44
44/* 45/*
@@ -67,7 +68,6 @@ struct nfs4_client {
67 atomic_t cl_count; 68 atomic_t cl_count;
68 69
69 struct rpc_clnt * cl_rpcclient; 70 struct rpc_clnt * cl_rpcclient;
70 struct rpc_cred * cl_cred;
71 71
72 struct list_head cl_superblocks; /* List of nfs_server structs */ 72 struct list_head cl_superblocks; /* List of nfs_server structs */
73 73
@@ -76,7 +76,6 @@ struct nfs4_client {
76 struct work_struct cl_renewd; 76 struct work_struct cl_renewd;
77 struct work_struct cl_recoverd; 77 struct work_struct cl_recoverd;
78 78
79 wait_queue_head_t cl_waitq;
80 struct rpc_wait_queue cl_rpcwaitq; 79 struct rpc_wait_queue cl_rpcwaitq;
81 80
82 /* used for the setclientid verifier */ 81 /* used for the setclientid verifier */
@@ -182,8 +181,9 @@ struct nfs4_state {
182 181
183 nfs4_stateid stateid; 182 nfs4_stateid stateid;
184 183
185 unsigned int nreaders; 184 unsigned int n_rdonly;
186 unsigned int nwriters; 185 unsigned int n_wronly;
186 unsigned int n_rdwr;
187 int state; /* State on the server (R,W, or RW) */ 187 int state; /* State on the server (R,W, or RW) */
188 atomic_t count; 188 atomic_t count;
189}; 189};
@@ -210,10 +210,10 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
210 210
211/* nfs4proc.c */ 211/* nfs4proc.c */
212extern int nfs4_map_errors(int err); 212extern int nfs4_map_errors(int err);
213extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short); 213extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short, struct rpc_cred *);
214extern int nfs4_proc_setclientid_confirm(struct nfs4_client *); 214extern int nfs4_proc_setclientid_confirm(struct nfs4_client *, struct rpc_cred *);
215extern int nfs4_proc_async_renew(struct nfs4_client *); 215extern int nfs4_proc_async_renew(struct nfs4_client *, struct rpc_cred *);
216extern int nfs4_proc_renew(struct nfs4_client *); 216extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *);
217extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); 217extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
218extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 218extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
219extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 219extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
@@ -237,8 +237,8 @@ extern void init_nfsv4_state(struct nfs_server *);
237extern void destroy_nfsv4_state(struct nfs_server *); 237extern void destroy_nfsv4_state(struct nfs_server *);
238extern struct nfs4_client *nfs4_get_client(struct in_addr *); 238extern struct nfs4_client *nfs4_get_client(struct in_addr *);
239extern void nfs4_put_client(struct nfs4_client *clp); 239extern void nfs4_put_client(struct nfs4_client *clp);
240extern int nfs4_init_client(struct nfs4_client *clp);
241extern struct nfs4_client *nfs4_find_client(struct in_addr *); 240extern struct nfs4_client *nfs4_find_client(struct in_addr *);
241struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp);
242extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *); 242extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *);
243 243
244extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 244extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
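The header changes set up the rest of the series: the single NFS4CLNT_OK state becomes two recovery flags, the per-client cl_cred (and cl_waitq) disappear in favor of passing an rpc_cred explicitly into the setclientid/renew calls with nfs4_get_renew_cred() to choose one, and the nreaders/nwriters pair on nfs4_state becomes three counters, n_rdonly/n_wronly/n_rdwr, so that a single O_RDWR open is no longer indistinguishable from one reader plus one writer. A tiny model of why the third counter matters when the close path later recomputes the share mode (compare the nfs4_close_prepare hunk further down):

#include <stdio.h>

#define FMODE_READ  1
#define FMODE_WRITE 2

struct state { unsigned n_rdonly, n_wronly, n_rdwr; };

/* With only nreaders/nwriters, one O_RDWR opener looked like a separate
 * reader and writer; the third counter lets it pin both bits by itself. */
static int share_mode(const struct state *s)
{
    int mode = FMODE_READ | FMODE_WRITE;
    if (s->n_rdwr == 0) {
        if (s->n_rdonly == 0)
            mode &= ~FMODE_READ;
        if (s->n_wronly == 0)
            mode &= ~FMODE_WRITE;
    }
    return mode;
}

int main(void)
{
    struct state s = { .n_rdwr = 1 };
    printf("mode = %d (3 == READ|WRITE)\n", share_mode(&s));
    return 0;
}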
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 21482b2518f..984ca3454d0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -57,11 +57,13 @@
57#define NFS4_POLL_RETRY_MIN (1*HZ) 57#define NFS4_POLL_RETRY_MIN (1*HZ)
58#define NFS4_POLL_RETRY_MAX (15*HZ) 58#define NFS4_POLL_RETRY_MAX (15*HZ)
59 59
60static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid); 60struct nfs4_opendata;
61static int _nfs4_proc_open(struct nfs4_opendata *data);
61static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 62static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
62static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); 63static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
63static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); 64static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
64static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); 65static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
66static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp);
65extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); 67extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
66extern struct rpc_procinfo nfs4_procedures[]; 68extern struct rpc_procinfo nfs4_procedures[];
67 69
@@ -173,8 +175,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
173 kunmap_atomic(start, KM_USER0); 175 kunmap_atomic(start, KM_USER0);
174} 176}
175 177
176static void 178static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
177renew_lease(struct nfs_server *server, unsigned long timestamp)
178{ 179{
179 struct nfs4_client *clp = server->nfs4_state; 180 struct nfs4_client *clp = server->nfs4_state;
180 spin_lock(&clp->cl_lock); 181 spin_lock(&clp->cl_lock);
@@ -194,21 +195,123 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf
194 spin_unlock(&inode->i_lock); 195 spin_unlock(&inode->i_lock);
195} 196}
196 197
198struct nfs4_opendata {
199 atomic_t count;
200 struct nfs_openargs o_arg;
201 struct nfs_openres o_res;
202 struct nfs_open_confirmargs c_arg;
203 struct nfs_open_confirmres c_res;
204 struct nfs_fattr f_attr;
205 struct nfs_fattr dir_attr;
206 struct dentry *dentry;
207 struct dentry *dir;
208 struct nfs4_state_owner *owner;
209 struct iattr attrs;
210 unsigned long timestamp;
211 int rpc_status;
212 int cancelled;
213};
214
215static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
216 struct nfs4_state_owner *sp, int flags,
217 const struct iattr *attrs)
218{
219 struct dentry *parent = dget_parent(dentry);
220 struct inode *dir = parent->d_inode;
221 struct nfs_server *server = NFS_SERVER(dir);
222 struct nfs4_opendata *p;
223
224 p = kzalloc(sizeof(*p), GFP_KERNEL);
225 if (p == NULL)
226 goto err;
227 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
228 if (p->o_arg.seqid == NULL)
229 goto err_free;
230 atomic_set(&p->count, 1);
231 p->dentry = dget(dentry);
232 p->dir = parent;
233 p->owner = sp;
234 atomic_inc(&sp->so_count);
235 p->o_arg.fh = NFS_FH(dir);
236 p->o_arg.open_flags = flags;

237 p->o_arg.clientid = server->nfs4_state->cl_clientid;
238 p->o_arg.id = sp->so_id;
239 p->o_arg.name = &dentry->d_name;
240 p->o_arg.server = server;
241 p->o_arg.bitmask = server->attr_bitmask;
242 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
243 p->o_res.f_attr = &p->f_attr;
244 p->o_res.dir_attr = &p->dir_attr;
245 p->o_res.server = server;
246 nfs_fattr_init(&p->f_attr);
247 nfs_fattr_init(&p->dir_attr);
248 if (flags & O_EXCL) {
249 u32 *s = (u32 *) p->o_arg.u.verifier.data;
250 s[0] = jiffies;
251 s[1] = current->pid;
252 } else if (flags & O_CREAT) {
253 p->o_arg.u.attrs = &p->attrs;
254 memcpy(&p->attrs, attrs, sizeof(p->attrs));
255 }
256 p->c_arg.fh = &p->o_res.fh;
257 p->c_arg.stateid = &p->o_res.stateid;
258 p->c_arg.seqid = p->o_arg.seqid;
259 return p;
260err_free:
261 kfree(p);
262err:
263 dput(parent);
264 return NULL;
265}
266
267static void nfs4_opendata_free(struct nfs4_opendata *p)
268{
269 if (p != NULL && atomic_dec_and_test(&p->count)) {
270 nfs_free_seqid(p->o_arg.seqid);
271 nfs4_put_state_owner(p->owner);
272 dput(p->dir);
273 dput(p->dentry);
274 kfree(p);
275 }
276}
277
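struct nfs4_opendata above gathers every argument and result of an OPEN (and its CONFIRM) into a single reference-counted allocation, so the object can be shared between a synchronous caller and an asynchronous RPC task that may outlive it; nfs4_opendata_free() only drops the dentries, state owner and seqid once the last reference goes away. A user-space model of that lifecycle with C11 atomics:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct opendata {
    atomic_int count;
    /* ... OPEN arguments and results would live here ... */
};

static struct opendata *opendata_alloc(void)
{
    struct opendata *p = calloc(1, sizeof(*p));
    if (p != NULL)
        atomic_store(&p->count, 1);
    return p;
}

static void opendata_get(struct opendata *p)
{
    atomic_fetch_add(&p->count, 1);     /* taken before starting the task */
}

static void opendata_free(struct opendata *p)
{
    if (p != NULL && atomic_fetch_sub(&p->count, 1) == 1) {
        /* last reference: the real code drops dentries, seqid, owner here */
        free(p);
    }
}

int main(void)
{
    struct opendata *p = opendata_alloc();
    opendata_get(p);      /* reference owned by the RPC task */
    opendata_free(p);     /* task's release callback fires */
    opendata_free(p);     /* caller's reference; actually frees */
    puts("freed exactly once");
    return 0;
}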
197/* Helper for asynchronous RPC calls */ 278/* Helper for asynchronous RPC calls */
198static int nfs4_call_async(struct rpc_clnt *clnt, rpc_action tk_begin, 279static int nfs4_call_async(struct rpc_clnt *clnt,
199 rpc_action tk_exit, void *calldata) 280 const struct rpc_call_ops *tk_ops, void *calldata)
200{ 281{
201 struct rpc_task *task; 282 struct rpc_task *task;
202 283
203 if (!(task = rpc_new_task(clnt, tk_exit, RPC_TASK_ASYNC))) 284 if (!(task = rpc_new_task(clnt, RPC_TASK_ASYNC, tk_ops, calldata)))
204 return -ENOMEM; 285 return -ENOMEM;
205
206 task->tk_calldata = calldata;
207 task->tk_action = tk_begin;
208 rpc_execute(task); 286 rpc_execute(task);
209 return 0; 287 return 0;
210} 288}
211 289
290static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
291{
292 sigset_t oldset;
293 int ret;
294
295 rpc_clnt_sigmask(task->tk_client, &oldset);
296 ret = rpc_wait_for_completion_task(task);
297 rpc_clnt_sigunmask(task->tk_client, &oldset);
298 return ret;
299}
300
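nfs4_wait_for_completion_rpc_task() brackets the generic wait with the RPC client's signal mask, so only the signals the client treats as fatal can interrupt it, matching what the synchronous rpc_call_sync() path already does. A user-space analogue of the mask/wait/restore bracket (the waited-on event is faked):

#include <signal.h>
#include <stdio.h>

/* Stand-in for rpc_wait_for_completion_task(). */
static int wait_for_completion(void)
{
    return 0;
}

/* Block `signo` around the wait and restore the old mask afterwards,
 * the same bracket rpc_clnt_sigmask()/rpc_clnt_sigunmask() provides. */
static int wait_with_mask(int signo)
{
    sigset_t mask, oldset;
    int ret;

    sigemptyset(&mask);
    sigaddset(&mask, signo);
    sigprocmask(SIG_BLOCK, &mask, &oldset);
    ret = wait_for_completion();
    sigprocmask(SIG_SETMASK, &oldset, NULL);
    return ret;
}

int main(void)
{
    printf("wait -> %d\n", wait_with_mask(SIGUSR1));
    return 0;
}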
301static inline void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
302{
303 switch (open_flags) {
304 case FMODE_WRITE:
305 state->n_wronly++;
306 break;
307 case FMODE_READ:
308 state->n_rdonly++;
309 break;
310 case FMODE_READ|FMODE_WRITE:
311 state->n_rdwr++;
312 }
313}
314
212static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) 315static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
213{ 316{
214 struct inode *inode = state->inode; 317 struct inode *inode = state->inode;
@@ -218,41 +321,134 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid,
218 spin_lock(&state->owner->so_lock); 321 spin_lock(&state->owner->so_lock);
219 spin_lock(&inode->i_lock); 322 spin_lock(&inode->i_lock);
220 memcpy(&state->stateid, stateid, sizeof(state->stateid)); 323 memcpy(&state->stateid, stateid, sizeof(state->stateid));
221 if ((open_flags & FMODE_WRITE)) 324 update_open_stateflags(state, open_flags);
222 state->nwriters++;
223 if (open_flags & FMODE_READ)
224 state->nreaders++;
225 nfs4_state_set_mode_locked(state, state->state | open_flags); 325 nfs4_state_set_mode_locked(state, state->state | open_flags);
226 spin_unlock(&inode->i_lock); 326 spin_unlock(&inode->i_lock);
227 spin_unlock(&state->owner->so_lock); 327 spin_unlock(&state->owner->so_lock);
228} 328}
229 329
330static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
331{
332 struct inode *inode;
333 struct nfs4_state *state = NULL;
334
335 if (!(data->f_attr.valid & NFS_ATTR_FATTR))
336 goto out;
337 inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr);
338 if (inode == NULL)
339 goto out;
340 state = nfs4_get_open_state(inode, data->owner);
341 if (state == NULL)
342 goto put_inode;
343 update_open_stateid(state, &data->o_res.stateid, data->o_arg.open_flags);
344put_inode:
345 iput(inode);
346out:
347 return state;
348}
349
350static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
351{
352 struct nfs_inode *nfsi = NFS_I(state->inode);
353 struct nfs_open_context *ctx;
354
355 spin_lock(&state->inode->i_lock);
356 list_for_each_entry(ctx, &nfsi->open_files, list) {
357 if (ctx->state != state)
358 continue;
359 get_nfs_open_context(ctx);
360 spin_unlock(&state->inode->i_lock);
361 return ctx;
362 }
363 spin_unlock(&state->inode->i_lock);
364 return ERR_PTR(-ENOENT);
365}
366
367static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, nfs4_stateid *stateid)
368{
369 int ret;
370
371 opendata->o_arg.open_flags = openflags;
372 ret = _nfs4_proc_open(opendata);
373 if (ret != 0)
374 return ret;
375 memcpy(stateid->data, opendata->o_res.stateid.data,
376 sizeof(stateid->data));
377 return 0;
378}
379
380static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state)
381{
382 nfs4_stateid stateid;
383 struct nfs4_state *newstate;
384 int mode = 0;
385 int delegation = 0;
386 int ret;
387
388 /* memory barrier prior to reading state->n_* */
389 smp_rmb();
390 if (state->n_rdwr != 0) {
391 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &stateid);
392 if (ret != 0)
393 return ret;
394 mode |= FMODE_READ|FMODE_WRITE;
395 if (opendata->o_res.delegation_type != 0)
396 delegation = opendata->o_res.delegation_type;
397 smp_rmb();
398 }
399 if (state->n_wronly != 0) {
400 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &stateid);
401 if (ret != 0)
402 return ret;
403 mode |= FMODE_WRITE;
404 if (opendata->o_res.delegation_type != 0)
405 delegation = opendata->o_res.delegation_type;
406 smp_rmb();
407 }
408 if (state->n_rdonly != 0) {
409 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &stateid);
410 if (ret != 0)
411 return ret;
412 mode |= FMODE_READ;
413 }
414 clear_bit(NFS_DELEGATED_STATE, &state->flags);
415 if (mode == 0)
416 return 0;
417 if (opendata->o_res.delegation_type == 0)
418 opendata->o_res.delegation_type = delegation;
419 opendata->o_arg.open_flags |= mode;
420 newstate = nfs4_opendata_to_nfs4_state(opendata);
421 if (newstate != NULL) {
422 if (opendata->o_res.delegation_type != 0) {
423 struct nfs_inode *nfsi = NFS_I(newstate->inode);
424 int delegation_flags = 0;
425 if (nfsi->delegation)
426 delegation_flags = nfsi->delegation->flags;
427 if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM))
428 nfs_inode_set_delegation(newstate->inode,
429 opendata->owner->so_cred,
430 &opendata->o_res);
431 else
432 nfs_inode_reclaim_delegation(newstate->inode,
433 opendata->owner->so_cred,
434 &opendata->o_res);
435 }
436 nfs4_close_state(newstate, opendata->o_arg.open_flags);
437 }
438 if (newstate != state)
439 return -ESTALE;
440 return 0;
441}
442
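nfs4_open_recover() above replays one OPEN per share mode still in use, strongest first, using the new counters, then folds the recovered modes together and rebuilds the state from the last reply (reclaiming or re-establishing a delegation if the server handed one back); the smp_rmb() calls keep the counter reads ordered against concurrent openers. A simplified model of the per-mode replay loop:

#include <stdio.h>

#define FMODE_READ  1
#define FMODE_WRITE 2

struct state { unsigned n_rdonly, n_wronly, n_rdwr; };

static int reopen(int mode)
{
    printf("replaying OPEN with mode %d\n", mode);
    return 0;                            /* stand-in for the OPEN RPC */
}

/* One OPEN per share mode still held, strongest first, as in
 * nfs4_open_recover(); the union of recovered modes is returned. */
static int recover(const struct state *s)
{
    int mode = 0;

    if (s->n_rdwr) {
        if (reopen(FMODE_READ | FMODE_WRITE))
            return -1;
        mode |= FMODE_READ | FMODE_WRITE;
    }
    if (s->n_wronly) {
        if (reopen(FMODE_WRITE))
            return -1;
        mode |= FMODE_WRITE;
    }
    if (s->n_rdonly) {
        if (reopen(FMODE_READ))
            return -1;
        mode |= FMODE_READ;
    }
    return mode;
}

int main(void)
{
    struct state s = { .n_rdonly = 2, .n_rdwr = 1 };
    printf("recovered mode = %d\n", recover(&s));
    return 0;
}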
230/* 443/*
231 * OPEN_RECLAIM: 444 * OPEN_RECLAIM:
232 * reclaim state on the server after a reboot. 445 * reclaim state on the server after a reboot.
233 */ 446 */
234static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) 447static int _nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry)
235{ 448{
236 struct inode *inode = state->inode; 449 struct nfs_delegation *delegation = NFS_I(state->inode)->delegation;
237 struct nfs_server *server = NFS_SERVER(inode); 450 struct nfs4_opendata *opendata;
238 struct nfs_delegation *delegation = NFS_I(inode)->delegation; 451 int delegation_type = 0;
239 struct nfs_openargs o_arg = {
240 .fh = NFS_FH(inode),
241 .id = sp->so_id,
242 .open_flags = state->state,
243 .clientid = server->nfs4_state->cl_clientid,
244 .claim = NFS4_OPEN_CLAIM_PREVIOUS,
245 .bitmask = server->attr_bitmask,
246 };
247 struct nfs_openres o_res = {
248 .server = server, /* Grrr */
249 };
250 struct rpc_message msg = {
251 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR],
252 .rpc_argp = &o_arg,
253 .rpc_resp = &o_res,
254 .rpc_cred = sp->so_cred,
255 };
256 int status; 452 int status;
257 453
258 if (delegation != NULL) { 454 if (delegation != NULL) {
@@ -262,38 +458,27 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st
262 set_bit(NFS_DELEGATED_STATE, &state->flags); 458 set_bit(NFS_DELEGATED_STATE, &state->flags);
263 return 0; 459 return 0;
264 } 460 }
265 o_arg.u.delegation_type = delegation->type; 461 delegation_type = delegation->type;
266 } 462 }
267 o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 463 opendata = nfs4_opendata_alloc(dentry, sp, 0, NULL);
268 if (o_arg.seqid == NULL) 464 if (opendata == NULL)
269 return -ENOMEM; 465 return -ENOMEM;
270 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); 466 opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS;
271 /* Confirm the sequence as being established */ 467 opendata->o_arg.fh = NFS_FH(state->inode);
272 nfs_confirm_seqid(&sp->so_seqid, status); 468 nfs_copy_fh(&opendata->o_res.fh, opendata->o_arg.fh);
273 nfs_increment_open_seqid(status, o_arg.seqid); 469 opendata->o_arg.u.delegation_type = delegation_type;
274 if (status == 0) { 470 status = nfs4_open_recover(opendata, state);
275 memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); 471 nfs4_opendata_free(opendata);
276 if (o_res.delegation_type != 0) {
277 nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res);
278 /* Did the server issue an immediate delegation recall? */
279 if (o_res.do_recall)
280 nfs_async_inode_return_delegation(inode, &o_res.stateid);
281 }
282 }
283 nfs_free_seqid(o_arg.seqid);
284 clear_bit(NFS_DELEGATED_STATE, &state->flags);
285 /* Ensure we update the inode attributes */
286 NFS_CACHEINV(inode);
287 return status; 472 return status;
288} 473}
289 474
290static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) 475static int nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry)
291{ 476{
292 struct nfs_server *server = NFS_SERVER(state->inode); 477 struct nfs_server *server = NFS_SERVER(state->inode);
293 struct nfs4_exception exception = { }; 478 struct nfs4_exception exception = { };
294 int err; 479 int err;
295 do { 480 do {
296 err = _nfs4_open_reclaim(sp, state); 481 err = _nfs4_do_open_reclaim(sp, state, dentry);
297 if (err != -NFS4ERR_DELAY) 482 if (err != -NFS4ERR_DELAY)
298 break; 483 break;
299 nfs4_handle_exception(server, err, &exception); 484 nfs4_handle_exception(server, err, &exception);
@@ -301,63 +486,36 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta
301 return err; 486 return err;
302} 487}
303 488
489static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
490{
491 struct nfs_open_context *ctx;
492 int ret;
493
494 ctx = nfs4_state_find_open_context(state);
495 if (IS_ERR(ctx))
496 return PTR_ERR(ctx);
497 ret = nfs4_do_open_reclaim(sp, state, ctx->dentry);
498 put_nfs_open_context(ctx);
499 return ret;
500}
501
304static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) 502static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
305{ 503{
306 struct nfs4_state_owner *sp = state->owner; 504 struct nfs4_state_owner *sp = state->owner;
307 struct inode *inode = dentry->d_inode; 505 struct nfs4_opendata *opendata;
308 struct nfs_server *server = NFS_SERVER(inode); 506 int ret;
309 struct dentry *parent = dget_parent(dentry);
310 struct nfs_openargs arg = {
311 .fh = NFS_FH(parent->d_inode),
312 .clientid = server->nfs4_state->cl_clientid,
313 .name = &dentry->d_name,
314 .id = sp->so_id,
315 .server = server,
316 .bitmask = server->attr_bitmask,
317 .claim = NFS4_OPEN_CLAIM_DELEGATE_CUR,
318 };
319 struct nfs_openres res = {
320 .server = server,
321 };
322 struct rpc_message msg = {
323 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR],
324 .rpc_argp = &arg,
325 .rpc_resp = &res,
326 .rpc_cred = sp->so_cred,
327 };
328 int status = 0;
329 507
330 if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) 508 if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
331 goto out; 509 return 0;
332 if (state->state == 0) 510 opendata = nfs4_opendata_alloc(dentry, sp, 0, NULL);
333 goto out; 511 if (opendata == NULL)
334 arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 512 return -ENOMEM;
335 status = -ENOMEM; 513 opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR;
336 if (arg.seqid == NULL) 514 memcpy(opendata->o_arg.u.delegation.data, state->stateid.data,
337 goto out; 515 sizeof(opendata->o_arg.u.delegation.data));
338 arg.open_flags = state->state; 516 ret = nfs4_open_recover(opendata, state);
339 memcpy(arg.u.delegation.data, state->stateid.data, sizeof(arg.u.delegation.data)); 517 nfs4_opendata_free(opendata);
340 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); 518 return ret;
341 nfs_increment_open_seqid(status, arg.seqid);
342 if (status != 0)
343 goto out_free;
344 if(res.rflags & NFS4_OPEN_RESULT_CONFIRM) {
345 status = _nfs4_proc_open_confirm(server->client, NFS_FH(inode),
346 sp, &res.stateid, arg.seqid);
347 if (status != 0)
348 goto out_free;
349 }
350 nfs_confirm_seqid(&sp->so_seqid, 0);
351 if (status >= 0) {
352 memcpy(state->stateid.data, res.stateid.data,
353 sizeof(state->stateid.data));
354 clear_bit(NFS_DELEGATED_STATE, &state->flags);
355 }
356out_free:
357 nfs_free_seqid(arg.seqid);
358out:
359 dput(parent);
360 return status;
361} 519}
362 520
363int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) 521int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
@@ -382,82 +540,202 @@ int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
382 return err; 540 return err;
383} 541}
384 542
385static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid) 543static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
386{ 544{
387 struct nfs_open_confirmargs arg = { 545 struct nfs4_opendata *data = calldata;
388 .fh = fh, 546 struct rpc_message msg = {
389 .seqid = seqid, 547 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
390 .stateid = *stateid, 548 .rpc_argp = &data->c_arg,
391 }; 549 .rpc_resp = &data->c_res,
392 struct nfs_open_confirmres res; 550 .rpc_cred = data->owner->so_cred,
393 struct rpc_message msg = {
394 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
395 .rpc_argp = &arg,
396 .rpc_resp = &res,
397 .rpc_cred = sp->so_cred,
398 }; 551 };
552 data->timestamp = jiffies;
553 rpc_call_setup(task, &msg, 0);
554}
555
556static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
557{
558 struct nfs4_opendata *data = calldata;
559
560 data->rpc_status = task->tk_status;
561 if (RPC_ASSASSINATED(task))
562 return;
563 if (data->rpc_status == 0) {
564 memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
565 sizeof(data->o_res.stateid.data));
566 renew_lease(data->o_res.server, data->timestamp);
567 }
568 nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid);
569 nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status);
570}
571
572static void nfs4_open_confirm_release(void *calldata)
573{
574 struct nfs4_opendata *data = calldata;
575 struct nfs4_state *state = NULL;
576
577 /* If this request hasn't been cancelled, do nothing */
578 if (data->cancelled == 0)
579 goto out_free;
580 /* In case of error, no cleanup! */
581 if (data->rpc_status != 0)
582 goto out_free;
583 nfs_confirm_seqid(&data->owner->so_seqid, 0);
584 state = nfs4_opendata_to_nfs4_state(data);
585 if (state != NULL)
586 nfs4_close_state(state, data->o_arg.open_flags);
587out_free:
588 nfs4_opendata_free(data);
589}
590
591static const struct rpc_call_ops nfs4_open_confirm_ops = {
592 .rpc_call_prepare = nfs4_open_confirm_prepare,
593 .rpc_call_done = nfs4_open_confirm_done,
594 .rpc_release = nfs4_open_confirm_release,
595};
596
597/*
598 * Note: On error, nfs4_proc_open_confirm will free the struct nfs4_opendata
599 */
600static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
601{
602 struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
603 struct rpc_task *task;
399 int status; 604 int status;
400 605
401 status = rpc_call_sync(clnt, &msg, RPC_TASK_NOINTR); 606 atomic_inc(&data->count);
402 /* Confirm the sequence as being established */ 607 task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_confirm_ops, data);
403 nfs_confirm_seqid(&sp->so_seqid, status); 608 if (IS_ERR(task)) {
404 nfs_increment_open_seqid(status, seqid); 609 nfs4_opendata_free(data);
405 if (status >= 0) 610 return PTR_ERR(task);
406 memcpy(stateid, &res.stateid, sizeof(*stateid)); 611 }
612 status = nfs4_wait_for_completion_rpc_task(task);
613 if (status != 0) {
614 data->cancelled = 1;
615 smp_wmb();
616 } else
617 status = data->rpc_status;
618 rpc_release_task(task);
407 return status; 619 return status;
408} 620}
409 621
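_nfs4_proc_open_confirm() (and _nfs4_proc_open() below) share one shape: take an extra reference on the opendata, launch the RPC as an asynchronous task, and wait for it. If the wait is interrupted, the caller sets cancelled (the smp_wmb() publishes the flag before the reference is dropped) and returns; the task's release callback then checks cancelled and rpc_status and, if the OPEN actually succeeded on the server, closes the orphaned state so nothing leaks. A condensed single-threaded model of that cleanup contract:

#include <stdio.h>
#include <stdlib.h>

struct opendata {
    int refs;
    int cancelled;
    int rpc_status;
};

static void opendata_put(struct opendata *d)
{
    if (--d->refs == 0)
        free(d);
}

/* The task's rpc_release hook: only cleans up if the caller gave up. */
static void open_release(struct opendata *d)
{
    if (d->cancelled && d->rpc_status == 0)
        puts("caller gone, OPEN succeeded: close the orphaned state");
    opendata_put(d);
}

static int run_open(struct opendata *d, int interrupted)
{
    d->refs++;                 /* extra reference owned by the task */
    d->rpc_status = 0;         /* pretend the server granted the OPEN */
    if (interrupted)
        d->cancelled = 1;      /* in-kernel: smp_wmb() before the put */
    open_release(d);           /* task completes and drops its ref */
    return interrupted ? -1 : d->rpc_status;
}

int main(void)
{
    struct opendata *d = calloc(1, sizeof(*d));
    d->refs = 1;
    printf("run_open -> %d\n", run_open(d, 1));
    opendata_put(d);
    return 0;
}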
410static int _nfs4_proc_open(struct inode *dir, struct nfs4_state_owner *sp, struct nfs_openargs *o_arg, struct nfs_openres *o_res) 622static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
411{ 623{
412 struct nfs_server *server = NFS_SERVER(dir); 624 struct nfs4_opendata *data = calldata;
625 struct nfs4_state_owner *sp = data->owner;
413 struct rpc_message msg = { 626 struct rpc_message msg = {
414 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN], 627 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
415 .rpc_argp = o_arg, 628 .rpc_argp = &data->o_arg,
416 .rpc_resp = o_res, 629 .rpc_resp = &data->o_res,
417 .rpc_cred = sp->so_cred, 630 .rpc_cred = sp->so_cred,
418 }; 631 };
419 int status; 632
633 if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
634 return;
635 /* Update sequence id. */
636 data->o_arg.id = sp->so_id;
637 data->o_arg.clientid = sp->so_client->cl_clientid;
638 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
639 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
640 data->timestamp = jiffies;
641 rpc_call_setup(task, &msg, 0);
642}
420 643
421 /* Update sequence id. The caller must serialize! */ 644static void nfs4_open_done(struct rpc_task *task, void *calldata)
422 o_arg->id = sp->so_id; 645{
423 o_arg->clientid = sp->so_client->cl_clientid; 646 struct nfs4_opendata *data = calldata;
424 647
425 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); 648 data->rpc_status = task->tk_status;
426 if (status == 0) { 649 if (RPC_ASSASSINATED(task))
427 /* OPEN on anything except a regular file is disallowed in NFSv4 */ 650 return;
428 switch (o_res->f_attr->mode & S_IFMT) { 651 if (task->tk_status == 0) {
652 switch (data->o_res.f_attr->mode & S_IFMT) {
429 case S_IFREG: 653 case S_IFREG:
430 break; 654 break;
431 case S_IFLNK: 655 case S_IFLNK:
432 status = -ELOOP; 656 data->rpc_status = -ELOOP;
433 break; 657 break;
434 case S_IFDIR: 658 case S_IFDIR:
435 status = -EISDIR; 659 data->rpc_status = -EISDIR;
436 break; 660 break;
437 default: 661 default:
438 status = -ENOTDIR; 662 data->rpc_status = -ENOTDIR;
439 } 663 }
664 renew_lease(data->o_res.server, data->timestamp);
440 } 665 }
666 nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid);
667}
668
669static void nfs4_open_release(void *calldata)
670{
671 struct nfs4_opendata *data = calldata;
672 struct nfs4_state *state = NULL;
673
674 /* If this request hasn't been cancelled, do nothing */
675 if (data->cancelled == 0)
676 goto out_free;
677 /* In case of error, no cleanup! */
678 if (data->rpc_status != 0)
679 goto out_free;
680 /* In case we need an open_confirm, no cleanup! */
681 if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)
682 goto out_free;
683 nfs_confirm_seqid(&data->owner->so_seqid, 0);
684 state = nfs4_opendata_to_nfs4_state(data);
685 if (state != NULL)
686 nfs4_close_state(state, data->o_arg.open_flags);
687out_free:
688 nfs4_opendata_free(data);
689}
690
691static const struct rpc_call_ops nfs4_open_ops = {
692 .rpc_call_prepare = nfs4_open_prepare,
693 .rpc_call_done = nfs4_open_done,
694 .rpc_release = nfs4_open_release,
695};
441 696
442 nfs_increment_open_seqid(status, o_arg->seqid); 697/*
698 * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
699 */
700static int _nfs4_proc_open(struct nfs4_opendata *data)
701{
702 struct inode *dir = data->dir->d_inode;
703 struct nfs_server *server = NFS_SERVER(dir);
704 struct nfs_openargs *o_arg = &data->o_arg;
705 struct nfs_openres *o_res = &data->o_res;
706 struct rpc_task *task;
707 int status;
708
709 atomic_inc(&data->count);
710 task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_ops, data);
711 if (IS_ERR(task)) {
712 nfs4_opendata_free(data);
713 return PTR_ERR(task);
714 }
715 status = nfs4_wait_for_completion_rpc_task(task);
716 if (status != 0) {
717 data->cancelled = 1;
718 smp_wmb();
719 } else
720 status = data->rpc_status;
721 rpc_release_task(task);
443 if (status != 0) 722 if (status != 0)
444 goto out; 723 return status;
724
445 if (o_arg->open_flags & O_CREAT) { 725 if (o_arg->open_flags & O_CREAT) {
446 update_changeattr(dir, &o_res->cinfo); 726 update_changeattr(dir, &o_res->cinfo);
447 nfs_post_op_update_inode(dir, o_res->dir_attr); 727 nfs_post_op_update_inode(dir, o_res->dir_attr);
448 } else 728 } else
449 nfs_refresh_inode(dir, o_res->dir_attr); 729 nfs_refresh_inode(dir, o_res->dir_attr);
450 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 730 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
451 status = _nfs4_proc_open_confirm(server->client, &o_res->fh, 731 status = _nfs4_proc_open_confirm(data);
452 sp, &o_res->stateid, o_arg->seqid);
453 if (status != 0) 732 if (status != 0)
454 goto out; 733 return status;
455 } 734 }
456 nfs_confirm_seqid(&sp->so_seqid, 0); 735 nfs_confirm_seqid(&data->owner->so_seqid, 0);
457 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) 736 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
458 status = server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); 737 return server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr);
459out: 738 return 0;
460 return status;
461} 739}
462 740
463static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags) 741static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags)
@@ -488,6 +766,15 @@ out:
488 return -EACCES; 766 return -EACCES;
489} 767}
490 768
769int nfs4_recover_expired_lease(struct nfs_server *server)
770{
771 struct nfs4_client *clp = server->nfs4_state;
772
773 if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
774 nfs4_schedule_state_recovery(clp);
775 return nfs4_wait_clnt_recover(server->client, clp);
776}
777
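nfs4_recover_expired_lease() is the consumer of the NFS4CLNT_LEASE_EXPIRED flag introduced in nfs4_fs.h: atomically test-and-clear the bit, kick the state-recovery machinery if it was set, then block until recovery finishes. Every caller waits, whether or not it was the one that noticed the expiry. A user-space model of the test-and-clear-then-wait shape:

#include <stdbool.h>
#include <stdio.h>

enum { LEASE_EXPIRED };
static unsigned long cl_state = 1UL << LEASE_EXPIRED;

/* Models test_and_clear_bit(): returns the previous value of the bit. */
static bool test_and_clear(int bit, unsigned long *word)
{
    bool old = (*word >> bit) & 1;
    *word &= ~(1UL << bit);
    return old;
}

static void schedule_state_recovery(void) { puts("recovery thread kicked"); }
static int  wait_clnt_recover(void)       { return 0; }

static int recover_expired_lease(void)
{
    if (test_and_clear(LEASE_EXPIRED, &cl_state))
        schedule_state_recovery();
    return wait_clnt_recover();  /* every caller waits, kicker or not */
}

int main(void)
{
    printf("recover -> %d\n", recover_expired_lease());
    return 0;
}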
491/* 778/*
492 * OPEN_EXPIRED: 779 * OPEN_EXPIRED:
493 * reclaim state on the server after a network partition. 780 * reclaim state on the server after a network partition.
@@ -495,77 +782,31 @@ out:
495 */ 782 */
496static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) 783static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry)
497{ 784{
498 struct dentry *parent = dget_parent(dentry);
499 struct inode *dir = parent->d_inode;
500 struct inode *inode = state->inode; 785 struct inode *inode = state->inode;
501 struct nfs_server *server = NFS_SERVER(dir);
502 struct nfs_delegation *delegation = NFS_I(inode)->delegation; 786 struct nfs_delegation *delegation = NFS_I(inode)->delegation;
503 struct nfs_fattr f_attr, dir_attr; 787 struct nfs4_opendata *opendata;
504 struct nfs_openargs o_arg = { 788 int openflags = state->state & (FMODE_READ|FMODE_WRITE);
505 .fh = NFS_FH(dir), 789 int ret;
506 .open_flags = state->state,
507 .name = &dentry->d_name,
508 .bitmask = server->attr_bitmask,
509 .claim = NFS4_OPEN_CLAIM_NULL,
510 };
511 struct nfs_openres o_res = {
512 .f_attr = &f_attr,
513 .dir_attr = &dir_attr,
514 .server = server,
515 };
516 int status = 0;
517 790
518 if (delegation != NULL && !(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) { 791 if (delegation != NULL && !(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) {
519 status = _nfs4_do_access(inode, sp->so_cred, state->state); 792 ret = _nfs4_do_access(inode, sp->so_cred, openflags);
520 if (status < 0) 793 if (ret < 0)
521 goto out; 794 return ret;
522 memcpy(&state->stateid, &delegation->stateid, sizeof(state->stateid)); 795 memcpy(&state->stateid, &delegation->stateid, sizeof(state->stateid));
523 set_bit(NFS_DELEGATED_STATE, &state->flags); 796 set_bit(NFS_DELEGATED_STATE, &state->flags);
524 goto out; 797 return 0;
525 } 798 }
526 o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 799 opendata = nfs4_opendata_alloc(dentry, sp, openflags, NULL);
527 status = -ENOMEM; 800 if (opendata == NULL)
528 if (o_arg.seqid == NULL) 801 return -ENOMEM;
529 goto out; 802 ret = nfs4_open_recover(opendata, state);
530 nfs_fattr_init(&f_attr); 803 if (ret == -ESTALE) {
531 nfs_fattr_init(&dir_attr); 804 /* Invalidate the state owner so we don't ever use it again */
532 status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); 805 nfs4_drop_state_owner(sp);
533 if (status != 0) 806 d_drop(dentry);
534 goto out_nodeleg;
535 /* Check if files differ */
536 if ((f_attr.mode & S_IFMT) != (inode->i_mode & S_IFMT))
537 goto out_stale;
538 /* Has the file handle changed? */
539 if (nfs_compare_fh(&o_res.fh, NFS_FH(inode)) != 0) {
540 /* Verify if the change attributes are the same */
541 if (f_attr.change_attr != NFS_I(inode)->change_attr)
542 goto out_stale;
543 if (nfs_size_to_loff_t(f_attr.size) != inode->i_size)
544 goto out_stale;
545 /* Lets just pretend that this is the same file */
546 nfs_copy_fh(NFS_FH(inode), &o_res.fh);
547 NFS_I(inode)->fileid = f_attr.fileid;
548 }
549 memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid));
550 if (o_res.delegation_type != 0) {
551 if (!(delegation->flags & NFS_DELEGATION_NEED_RECLAIM))
552 nfs_inode_set_delegation(inode, sp->so_cred, &o_res);
553 else
554 nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res);
555 } 807 }
556out_nodeleg: 808 nfs4_opendata_free(opendata);
557 nfs_free_seqid(o_arg.seqid); 809 return ret;
558 clear_bit(NFS_DELEGATED_STATE, &state->flags);
559out:
560 dput(parent);
561 return status;
562out_stale:
563 status = -ESTALE;
564 /* Invalidate the state owner so we don't ever use it again */
565 nfs4_drop_state_owner(sp);
566 d_drop(dentry);
567 /* Should we be trying to close that stateid? */
568 goto out_nodeleg;
569} 810}
570 811
571static inline int nfs4_do_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) 812static inline int nfs4_do_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry)
@@ -584,26 +825,19 @@ static inline int nfs4_do_open_expired(struct nfs4_state_owner *sp, struct nfs4_
584 825
585static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) 826static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
586{ 827{
587 struct nfs_inode *nfsi = NFS_I(state->inode);
588 struct nfs_open_context *ctx; 828 struct nfs_open_context *ctx;
589 int status; 829 int ret;
590 830
591 spin_lock(&state->inode->i_lock); 831 ctx = nfs4_state_find_open_context(state);
592 list_for_each_entry(ctx, &nfsi->open_files, list) { 832 if (IS_ERR(ctx))
593 if (ctx->state != state) 833 return PTR_ERR(ctx);
594 continue; 834 ret = nfs4_do_open_expired(sp, state, ctx->dentry);
595 get_nfs_open_context(ctx); 835 put_nfs_open_context(ctx);
596 spin_unlock(&state->inode->i_lock); 836 return ret;
597 status = nfs4_do_open_expired(sp, state, ctx->dentry);
598 put_nfs_open_context(ctx);
599 return status;
600 }
601 spin_unlock(&state->inode->i_lock);
602 return -ENOENT;
603} 837}
604 838
605/* 839/*
606 * Returns an nfs4_state + an extra reference to the inode 840 * Returns a referenced nfs4_state if there is an open delegation on the file
607 */ 841 */
608static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred, struct nfs4_state **res) 842static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred, struct nfs4_state **res)
609{ 843{
@@ -616,6 +850,14 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred
616 int open_flags = flags & (FMODE_READ|FMODE_WRITE); 850 int open_flags = flags & (FMODE_READ|FMODE_WRITE);
617 int err; 851 int err;
618 852
853 err = -ENOMEM;
854 if (!(sp = nfs4_get_state_owner(server, cred))) {
855 dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__);
856 return err;
857 }
858 err = nfs4_recover_expired_lease(server);
859 if (err != 0)
860 goto out_put_state_owner;
619 /* Protect against reboot recovery - NOTE ORDER! */ 861 /* Protect against reboot recovery - NOTE ORDER! */
620 down_read(&clp->cl_sem); 862 down_read(&clp->cl_sem);
621 /* Protect against delegation recall */ 863 /* Protect against delegation recall */
@@ -625,10 +867,6 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred
625 if (delegation == NULL || (delegation->type & open_flags) != open_flags) 867 if (delegation == NULL || (delegation->type & open_flags) != open_flags)
626 goto out_err; 868 goto out_err;
627 err = -ENOMEM; 869 err = -ENOMEM;
628 if (!(sp = nfs4_get_state_owner(server, cred))) {
629 dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__);
630 goto out_err;
631 }
632 state = nfs4_get_open_state(inode, sp); 870 state = nfs4_get_open_state(inode, sp);
633 if (state == NULL) 871 if (state == NULL)
634 goto out_err; 872 goto out_err;
@@ -636,39 +874,34 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred
636 err = -ENOENT; 874 err = -ENOENT;
637 if ((state->state & open_flags) == open_flags) { 875 if ((state->state & open_flags) == open_flags) {
638 spin_lock(&inode->i_lock); 876 spin_lock(&inode->i_lock);
639 if (open_flags & FMODE_READ) 877 update_open_stateflags(state, open_flags);
640 state->nreaders++;
641 if (open_flags & FMODE_WRITE)
642 state->nwriters++;
643 spin_unlock(&inode->i_lock); 878 spin_unlock(&inode->i_lock);
644 goto out_ok; 879 goto out_ok;
645 } else if (state->state != 0) 880 } else if (state->state != 0)
646 goto out_err; 881 goto out_put_open_state;
647 882
648 lock_kernel(); 883 lock_kernel();
649 err = _nfs4_do_access(inode, cred, open_flags); 884 err = _nfs4_do_access(inode, cred, open_flags);
650 unlock_kernel(); 885 unlock_kernel();
651 if (err != 0) 886 if (err != 0)
652 goto out_err; 887 goto out_put_open_state;
653 set_bit(NFS_DELEGATED_STATE, &state->flags); 888 set_bit(NFS_DELEGATED_STATE, &state->flags);
654 update_open_stateid(state, &delegation->stateid, open_flags); 889 update_open_stateid(state, &delegation->stateid, open_flags);
655out_ok: 890out_ok:
656 nfs4_put_state_owner(sp); 891 nfs4_put_state_owner(sp);
657 up_read(&nfsi->rwsem); 892 up_read(&nfsi->rwsem);
658 up_read(&clp->cl_sem); 893 up_read(&clp->cl_sem);
659 igrab(inode);
660 *res = state; 894 *res = state;
661 return 0; 895 return 0;
896out_put_open_state:
897 nfs4_put_open_state(state);
662out_err: 898out_err:
663 if (sp != NULL) {
664 if (state != NULL)
665 nfs4_put_open_state(state);
666 nfs4_put_state_owner(sp);
667 }
668 up_read(&nfsi->rwsem); 899 up_read(&nfsi->rwsem);
669 up_read(&clp->cl_sem); 900 up_read(&clp->cl_sem);
670 if (err != -EACCES) 901 if (err != -EACCES)
671 nfs_inode_return_delegation(inode); 902 nfs_inode_return_delegation(inode);
903out_put_state_owner:
904 nfs4_put_state_owner(sp);
672 return err; 905 return err;
673} 906}
674 907
@@ -689,7 +922,7 @@ static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, st
689} 922}
690 923
691/* 924/*
692 * Returns an nfs4_state + an referenced inode 925 * Returns a referenced nfs4_state
693 */ 926 */
694static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) 927static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
695{ 928{
@@ -697,73 +930,46 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
697 struct nfs4_state *state = NULL; 930 struct nfs4_state *state = NULL;
698 struct nfs_server *server = NFS_SERVER(dir); 931 struct nfs_server *server = NFS_SERVER(dir);
699 struct nfs4_client *clp = server->nfs4_state; 932 struct nfs4_client *clp = server->nfs4_state;
700 struct inode *inode = NULL; 933 struct nfs4_opendata *opendata;
701 int status; 934 int status;
702 struct nfs_fattr f_attr, dir_attr;
703 struct nfs_openargs o_arg = {
704 .fh = NFS_FH(dir),
705 .open_flags = flags,
706 .name = &dentry->d_name,
707 .server = server,
708 .bitmask = server->attr_bitmask,
709 .claim = NFS4_OPEN_CLAIM_NULL,
710 };
711 struct nfs_openres o_res = {
712 .f_attr = &f_attr,
713 .dir_attr = &dir_attr,
714 .server = server,
715 };
716 935
717 /* Protect against reboot recovery conflicts */ 936 /* Protect against reboot recovery conflicts */
718 down_read(&clp->cl_sem);
719 status = -ENOMEM; 937 status = -ENOMEM;
720 if (!(sp = nfs4_get_state_owner(server, cred))) { 938 if (!(sp = nfs4_get_state_owner(server, cred))) {
721 dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); 939 dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
722 goto out_err; 940 goto out_err;
723 } 941 }
724 if (flags & O_EXCL) { 942 status = nfs4_recover_expired_lease(server);
725 u32 *p = (u32 *) o_arg.u.verifier.data; 943 if (status != 0)
726 p[0] = jiffies; 944 goto err_put_state_owner;
727 p[1] = current->pid; 945 down_read(&clp->cl_sem);
728 } else 946 status = -ENOMEM;
729 o_arg.u.attrs = sattr; 947 opendata = nfs4_opendata_alloc(dentry, sp, flags, sattr);
730 /* Serialization for the sequence id */ 948 if (opendata == NULL)
949 goto err_put_state_owner;
731 950
732 o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 951 status = _nfs4_proc_open(opendata);
733 if (o_arg.seqid == NULL)
734 return -ENOMEM;
735 nfs_fattr_init(&f_attr);
736 nfs_fattr_init(&dir_attr);
737 status = _nfs4_proc_open(dir, sp, &o_arg, &o_res);
738 if (status != 0) 952 if (status != 0)
739 goto out_err; 953 goto err_opendata_free;
740 954
741 status = -ENOMEM; 955 status = -ENOMEM;
742 inode = nfs_fhget(dir->i_sb, &o_res.fh, &f_attr); 956 state = nfs4_opendata_to_nfs4_state(opendata);
743 if (!inode) 957 if (state == NULL)
744 goto out_err; 958 goto err_opendata_free;
745 state = nfs4_get_open_state(inode, sp); 959 if (opendata->o_res.delegation_type != 0)
746 if (!state) 960 nfs_inode_set_delegation(state->inode, cred, &opendata->o_res);
747 goto out_err; 961 nfs4_opendata_free(opendata);
748 update_open_stateid(state, &o_res.stateid, flags);
749 if (o_res.delegation_type != 0)
750 nfs_inode_set_delegation(inode, cred, &o_res);
751 nfs_free_seqid(o_arg.seqid);
752 nfs4_put_state_owner(sp); 962 nfs4_put_state_owner(sp);
753 up_read(&clp->cl_sem); 963 up_read(&clp->cl_sem);
754 *res = state; 964 *res = state;
755 return 0; 965 return 0;
966err_opendata_free:
967 nfs4_opendata_free(opendata);
968err_put_state_owner:
969 nfs4_put_state_owner(sp);
756out_err: 970out_err:
757 if (sp != NULL) {
758 if (state != NULL)
759 nfs4_put_open_state(state);
760 nfs_free_seqid(o_arg.seqid);
761 nfs4_put_state_owner(sp);
762 }
763 /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */ 971 /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */
764 up_read(&clp->cl_sem); 972 up_read(&clp->cl_sem);
765 if (inode != NULL)
766 iput(inode);
767 *res = NULL; 973 *res = NULL;
768 return status; 974 return status;
769} 975}
@@ -830,6 +1036,7 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
830 .rpc_argp = &arg, 1036 .rpc_argp = &arg,
831 .rpc_resp = &res, 1037 .rpc_resp = &res,
832 }; 1038 };
1039 unsigned long timestamp = jiffies;
833 int status; 1040 int status;
834 1041
835 nfs_fattr_init(fattr); 1042 nfs_fattr_init(fattr);
@@ -841,6 +1048,8 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
841 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1048 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
842 1049
843 status = rpc_call_sync(server->client, &msg, 0); 1050 status = rpc_call_sync(server->client, &msg, 0);
1051 if (status == 0 && state != NULL)
1052 renew_lease(server, timestamp);
844 return status; 1053 return status;
845} 1054}
846 1055
@@ -865,12 +1074,13 @@ struct nfs4_closedata {
865 struct nfs_closeargs arg; 1074 struct nfs_closeargs arg;
866 struct nfs_closeres res; 1075 struct nfs_closeres res;
867 struct nfs_fattr fattr; 1076 struct nfs_fattr fattr;
1077 unsigned long timestamp;
868}; 1078};
869 1079
870static void nfs4_free_closedata(struct nfs4_closedata *calldata) 1080static void nfs4_free_closedata(void *data)
871{ 1081{
872 struct nfs4_state *state = calldata->state; 1082 struct nfs4_closedata *calldata = data;
873 struct nfs4_state_owner *sp = state->owner; 1083 struct nfs4_state_owner *sp = calldata->state->owner;
874 1084
875 nfs4_put_open_state(calldata->state); 1085 nfs4_put_open_state(calldata->state);
876 nfs_free_seqid(calldata->arg.seqid); 1086 nfs_free_seqid(calldata->arg.seqid);
@@ -878,12 +1088,14 @@ static void nfs4_free_closedata(struct nfs4_closedata *calldata)
878 kfree(calldata); 1088 kfree(calldata);
879} 1089}
880 1090
881static void nfs4_close_done(struct rpc_task *task) 1091static void nfs4_close_done(struct rpc_task *task, void *data)
882{ 1092{
883 struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; 1093 struct nfs4_closedata *calldata = data;
884 struct nfs4_state *state = calldata->state; 1094 struct nfs4_state *state = calldata->state;
885 struct nfs_server *server = NFS_SERVER(calldata->inode); 1095 struct nfs_server *server = NFS_SERVER(calldata->inode);
886 1096
1097 if (RPC_ASSASSINATED(task))
1098 return;
887 /* We are done with the inode, and in the process of freeing 1099
888 * the state_owner; keep this around to process errors. 1100
889 */ 1101
@@ -892,6 +1104,7 @@ static void nfs4_close_done(struct rpc_task *task)
892 case 0: 1104 case 0:
893 memcpy(&state->stateid, &calldata->res.stateid, 1105 memcpy(&state->stateid, &calldata->res.stateid,
894 sizeof(state->stateid)); 1106 sizeof(state->stateid));
1107 renew_lease(server, calldata->timestamp);
895 break; 1108 break;
896 case -NFS4ERR_STALE_STATEID: 1109 case -NFS4ERR_STALE_STATEID:
897 case -NFS4ERR_EXPIRED: 1110 case -NFS4ERR_EXPIRED:
@@ -904,12 +1117,11 @@ static void nfs4_close_done(struct rpc_task *task)
904 } 1117 }
905 } 1118 }
906 nfs_refresh_inode(calldata->inode, calldata->res.fattr); 1119 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
907 nfs4_free_closedata(calldata);
908} 1120}
909 1121
910static void nfs4_close_begin(struct rpc_task *task) 1122static void nfs4_close_prepare(struct rpc_task *task, void *data)
911{ 1123{
912 struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; 1124 struct nfs4_closedata *calldata = data;
913 struct nfs4_state *state = calldata->state; 1125 struct nfs4_state *state = calldata->state;
914 struct rpc_message msg = { 1126 struct rpc_message msg = {
915 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], 1127 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
@@ -918,10 +1130,8 @@ static void nfs4_close_begin(struct rpc_task *task)
918 .rpc_cred = state->owner->so_cred, 1130 .rpc_cred = state->owner->so_cred,
919 }; 1131 };
920 int mode = 0, old_mode; 1132 int mode = 0, old_mode;
921 int status;
922 1133
923 status = nfs_wait_on_sequence(calldata->arg.seqid, task); 1134 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
924 if (status != 0)
925 return; 1135 return;
926 /* Recalculate the new open mode in case someone reopened the file 1136 /* Recalculate the new open mode in case someone reopened the file
927 * while we were waiting in line to be scheduled. 1137 * while we were waiting in line to be scheduled.
@@ -929,26 +1139,34 @@ static void nfs4_close_begin(struct rpc_task *task)
929 spin_lock(&state->owner->so_lock); 1139 spin_lock(&state->owner->so_lock);
930 spin_lock(&calldata->inode->i_lock); 1140 spin_lock(&calldata->inode->i_lock);
931 mode = old_mode = state->state; 1141 mode = old_mode = state->state;
932 if (state->nreaders == 0) 1142 if (state->n_rdwr == 0) {
933 mode &= ~FMODE_READ; 1143 if (state->n_rdonly == 0)
934 if (state->nwriters == 0) 1144 mode &= ~FMODE_READ;
935 mode &= ~FMODE_WRITE; 1145 if (state->n_wronly == 0)
1146 mode &= ~FMODE_WRITE;
1147 }
936 nfs4_state_set_mode_locked(state, mode); 1148 nfs4_state_set_mode_locked(state, mode);
937 spin_unlock(&calldata->inode->i_lock); 1149 spin_unlock(&calldata->inode->i_lock);
938 spin_unlock(&state->owner->so_lock); 1150 spin_unlock(&state->owner->so_lock);
939 if (mode == old_mode || test_bit(NFS_DELEGATED_STATE, &state->flags)) { 1151 if (mode == old_mode || test_bit(NFS_DELEGATED_STATE, &state->flags)) {
940 nfs4_free_closedata(calldata); 1152 /* Note: exit _without_ calling nfs4_close_done */
941 task->tk_exit = NULL; 1153 task->tk_action = NULL;
942 rpc_exit(task, 0);
943 return; 1154 return;
944 } 1155 }
945 nfs_fattr_init(calldata->res.fattr); 1156 nfs_fattr_init(calldata->res.fattr);
946 if (mode != 0) 1157 if (mode != 0)
947 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; 1158 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
948 calldata->arg.open_flags = mode; 1159 calldata->arg.open_flags = mode;
1160 calldata->timestamp = jiffies;
949 rpc_call_setup(task, &msg, 0); 1161 rpc_call_setup(task, &msg, 0);
950} 1162}
951 1163
1164static const struct rpc_call_ops nfs4_close_ops = {
1165 .rpc_call_prepare = nfs4_close_prepare,
1166 .rpc_call_done = nfs4_close_done,
1167 .rpc_release = nfs4_free_closedata,
1168};
1169
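The close path now uses the full prepare/done/release trio. nfs4_close_prepare() recomputes the share mode from the three counters and, if nothing needs to be sent, simply clears task->tk_action so the task finishes without running the done callback, while nfs4_free_closedata() still runs as the .rpc_release hook; freeing therefore happens exactly once on every path, including an assassinated task. A small model of that veto-in-prepare shape:

#include <stdio.h>
#include <stdlib.h>

struct task {
    void (*action)(void *);   /* NULL: finish without issuing the RPC */
    void *calldata;
};

static void send_close(void *data)
{
    (void)data;
    puts("CLOSE/OPEN_DOWNGRADE sent");
}

/* prepare: if another opener raced in and the mode did not change,
 * clear ->action so the done callback never runs; release still does. */
static void close_prepare(struct task *t, int mode_unchanged)
{
    t->action = mode_unchanged ? NULL : send_close;
}

static void close_release(void *data)
{
    free(data);               /* the single point that frees closedata */
}

int main(void)
{
    struct task t = { .calldata = malloc(16) };
    close_prepare(&t, 1);
    if (t.action != NULL)
        t.action(t.calldata);
    close_release(t.calldata);
    puts("closedata freed exactly once");
    return 0;
}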
952/* 1170/*
953 * It is possible for data to be read/written from a mem-mapped file 1171 * It is possible for data to be read/written from a mem-mapped file
954 * after the sys_close call (which hits the vfs layer as a flush). 1172 * after the sys_close call (which hits the vfs layer as a flush).
@@ -981,8 +1199,7 @@ int nfs4_do_close(struct inode *inode, struct nfs4_state *state)
981 calldata->res.fattr = &calldata->fattr; 1199 calldata->res.fattr = &calldata->fattr;
982 calldata->res.server = server; 1200 calldata->res.server = server;
983 1201
984 status = nfs4_call_async(server->client, nfs4_close_begin, 1202 status = nfs4_call_async(server->client, &nfs4_close_ops, calldata);
985 nfs4_close_done, calldata);
986 if (status == 0) 1203 if (status == 0)
987 goto out; 1204 goto out;
988 1205
@@ -1034,7 +1251,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1034 d_add(dentry, NULL); 1251 d_add(dentry, NULL);
1035 return (struct dentry *)state; 1252 return (struct dentry *)state;
1036 } 1253 }
1037 res = d_add_unique(dentry, state->inode); 1254 res = d_add_unique(dentry, igrab(state->inode));
1038 if (res != NULL) 1255 if (res != NULL)
1039 dentry = res; 1256 dentry = res;
1040 nfs4_intent_set_file(nd, dentry, state); 1257 nfs4_intent_set_file(nd, dentry, state);
@@ -1046,7 +1263,6 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
1046{ 1263{
1047 struct rpc_cred *cred; 1264 struct rpc_cred *cred;
1048 struct nfs4_state *state; 1265 struct nfs4_state *state;
1049 struct inode *inode;
1050 1266
1051 cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); 1267 cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
1052 if (IS_ERR(cred)) 1268 if (IS_ERR(cred))
@@ -1070,9 +1286,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
1070 } 1286 }
1071 goto out_drop; 1287 goto out_drop;
1072 } 1288 }
1073 inode = state->inode; 1289 if (state->inode == dentry->d_inode) {
1074 iput(inode);
1075 if (inode == dentry->d_inode) {
1076 nfs4_intent_set_file(nd, dentry, state); 1290 nfs4_intent_set_file(nd, dentry, state);
1077 return 1; 1291 return 1;
1078 } 1292 }
@@ -1506,10 +1720,17 @@ static int _nfs4_proc_write(struct nfs_write_data *wdata)
1506 dprintk("NFS call write %d @ %Ld\n", wdata->args.count, 1720 dprintk("NFS call write %d @ %Ld\n", wdata->args.count,
1507 (long long) wdata->args.offset); 1721 (long long) wdata->args.offset);
1508 1722
1723 wdata->args.bitmask = server->attr_bitmask;
1724 wdata->res.server = server;
1725 wdata->timestamp = jiffies;
1509 nfs_fattr_init(fattr); 1726 nfs_fattr_init(fattr);
1510 status = rpc_call_sync(server->client, &msg, rpcflags); 1727 status = rpc_call_sync(server->client, &msg, rpcflags);
1511 dprintk("NFS reply write: %d\n", status); 1728 dprintk("NFS reply write: %d\n", status);
1512 return status; 1729 if (status < 0)
1730 return status;
1731 renew_lease(server, wdata->timestamp);
1732 nfs_post_op_update_inode(inode, fattr);
1733 return wdata->res.count;
1513} 1734}
1514 1735
1515static int nfs4_proc_write(struct nfs_write_data *wdata) 1736static int nfs4_proc_write(struct nfs_write_data *wdata)
@@ -1540,9 +1761,16 @@ static int _nfs4_proc_commit(struct nfs_write_data *cdata)
1540 dprintk("NFS call commit %d @ %Ld\n", cdata->args.count, 1761 dprintk("NFS call commit %d @ %Ld\n", cdata->args.count,
1541 (long long) cdata->args.offset); 1762 (long long) cdata->args.offset);
1542 1763
1764 cdata->args.bitmask = server->attr_bitmask;
1765 cdata->res.server = server;
1766 cdata->timestamp = jiffies;
1543 nfs_fattr_init(fattr); 1767 nfs_fattr_init(fattr);
1544 status = rpc_call_sync(server->client, &msg, 0); 1768 status = rpc_call_sync(server->client, &msg, 0);
1769 if (status >= 0)
1770 renew_lease(server, cdata->timestamp);
1545 dprintk("NFS reply commit: %d\n", status); 1771 dprintk("NFS reply commit: %d\n", status);
1772 if (status >= 0)
1773 nfs_post_op_update_inode(inode, fattr);
1546 return status; 1774 return status;
1547} 1775}
1548 1776
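
Both the write and commit paths now stamp the request with jiffies before issuing it and, on success, credit the lease from that pre-call instant rather than from completion time, so a slow RPC can never extend the lease past the moment the request was actually sent. A compilable sketch of that ordering, with time() standing in for jiffies and a hypothetical renew_lease():

#include <stdio.h>
#include <time.h>

static time_t last_renewal;

static void renew_lease(time_t timestamp)
{
        /* never move the renewal time backwards */
        if (timestamp > last_renewal)
                last_renewal = timestamp;
}

static int do_write(void)
{
        time_t stamp = time(NULL);      /* ~ wdata->timestamp = jiffies */
        int status = 0;                 /* pretend rpc_call_sync() succeeded */

        if (status < 0)
                return status;
        renew_lease(stamp);             /* ~ renew_lease(server, wdata->timestamp) */
        return status;
}

int main(void)
{
        do_write();
        printf("lease last renewed at %ld\n", (long)last_renewal);
        return 0;
}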
@@ -1592,7 +1820,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1592 status = PTR_ERR(state); 1820 status = PTR_ERR(state);
1593 goto out; 1821 goto out;
1594 } 1822 }
1595 d_instantiate(dentry, state->inode); 1823 d_instantiate(dentry, igrab(state->inode));
1596 if (flags & O_EXCL) { 1824 if (flags & O_EXCL) {
1597 struct nfs_fattr fattr; 1825 struct nfs_fattr fattr;
1598 status = nfs4_do_setattr(NFS_SERVER(dir), &fattr, 1826 status = nfs4_do_setattr(NFS_SERVER(dir), &fattr,
@@ -2116,10 +2344,9 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
2116 return err; 2344 return err;
2117} 2345}
2118 2346
2119static void 2347static void nfs4_read_done(struct rpc_task *task, void *calldata)
2120nfs4_read_done(struct rpc_task *task)
2121{ 2348{
2122 struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; 2349 struct nfs_read_data *data = calldata;
2123 struct inode *inode = data->inode; 2350 struct inode *inode = data->inode;
2124 2351
2125 if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { 2352 if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
@@ -2129,9 +2356,14 @@ nfs4_read_done(struct rpc_task *task)
2129 if (task->tk_status > 0) 2356 if (task->tk_status > 0)
2130 renew_lease(NFS_SERVER(inode), data->timestamp); 2357 renew_lease(NFS_SERVER(inode), data->timestamp);
2131 /* Call back common NFS readpage processing */ 2358 /* Call back common NFS readpage processing */
2132 nfs_readpage_result(task); 2359 nfs_readpage_result(task, calldata);
2133} 2360}
2134 2361
2362static const struct rpc_call_ops nfs4_read_ops = {
2363 .rpc_call_done = nfs4_read_done,
2364 .rpc_release = nfs_readdata_release,
2365};
2366
2135static void 2367static void
2136nfs4_proc_read_setup(struct nfs_read_data *data) 2368nfs4_proc_read_setup(struct nfs_read_data *data)
2137{ 2369{
@@ -2151,14 +2383,13 @@ nfs4_proc_read_setup(struct nfs_read_data *data)
2151 flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); 2383 flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
2152 2384
2153 /* Finalize the task. */ 2385 /* Finalize the task. */
2154 rpc_init_task(task, NFS_CLIENT(inode), nfs4_read_done, flags); 2386 rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_read_ops, data);
2155 rpc_call_setup(task, &msg, 0); 2387 rpc_call_setup(task, &msg, 0);
2156} 2388}
2157 2389
2158static void 2390static void nfs4_write_done(struct rpc_task *task, void *calldata)
2159nfs4_write_done(struct rpc_task *task)
2160{ 2391{
2161 struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; 2392 struct nfs_write_data *data = calldata;
2162 struct inode *inode = data->inode; 2393 struct inode *inode = data->inode;
2163 2394
2164 if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { 2395 if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
@@ -2170,9 +2401,14 @@ nfs4_write_done(struct rpc_task *task)
2170 nfs_post_op_update_inode(inode, data->res.fattr); 2401 nfs_post_op_update_inode(inode, data->res.fattr);
2171 } 2402 }
2172 /* Call back common NFS writeback processing */ 2403 /* Call back common NFS writeback processing */
2173 nfs_writeback_done(task); 2404 nfs_writeback_done(task, calldata);
2174} 2405}
2175 2406
2407static const struct rpc_call_ops nfs4_write_ops = {
2408 .rpc_call_done = nfs4_write_done,
2409 .rpc_release = nfs_writedata_release,
2410};
2411
2176static void 2412static void
2177nfs4_proc_write_setup(struct nfs_write_data *data, int how) 2413nfs4_proc_write_setup(struct nfs_write_data *data, int how)
2178{ 2414{
@@ -2205,14 +2441,13 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how)
2205 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; 2441 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
2206 2442
2207 /* Finalize the task. */ 2443 /* Finalize the task. */
2208 rpc_init_task(task, NFS_CLIENT(inode), nfs4_write_done, flags); 2444 rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_write_ops, data);
2209 rpc_call_setup(task, &msg, 0); 2445 rpc_call_setup(task, &msg, 0);
2210} 2446}
2211 2447
2212static void 2448static void nfs4_commit_done(struct rpc_task *task, void *calldata)
2213nfs4_commit_done(struct rpc_task *task)
2214{ 2449{
2215 struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; 2450 struct nfs_write_data *data = calldata;
2216 struct inode *inode = data->inode; 2451 struct inode *inode = data->inode;
2217 2452
2218 if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { 2453 if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
@@ -2222,9 +2457,14 @@ nfs4_commit_done(struct rpc_task *task)
2222 if (task->tk_status >= 0) 2457 if (task->tk_status >= 0)
2223 nfs_post_op_update_inode(inode, data->res.fattr); 2458 nfs_post_op_update_inode(inode, data->res.fattr);
2224 /* Call back common NFS writeback processing */ 2459 /* Call back common NFS writeback processing */
2225 nfs_commit_done(task); 2460 nfs_commit_done(task, calldata);
2226} 2461}
2227 2462
2463static const struct rpc_call_ops nfs4_commit_ops = {
2464 .rpc_call_done = nfs4_commit_done,
2465 .rpc_release = nfs_commit_release,
2466};
2467
2228static void 2468static void
2229nfs4_proc_commit_setup(struct nfs_write_data *data, int how) 2469nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
2230{ 2470{
@@ -2246,7 +2486,7 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
2246 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; 2486 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
2247 2487
2248 /* Finalize the task. */ 2488 /* Finalize the task. */
2249 rpc_init_task(task, NFS_CLIENT(inode), nfs4_commit_done, flags); 2489 rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_commit_ops, data);
2250 rpc_call_setup(task, &msg, 0); 2490 rpc_call_setup(task, &msg, 0);
2251} 2491}
2252 2492
@@ -2254,11 +2494,10 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
2254 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special 2494 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
2255 * standalone procedure for queueing an asynchronous RENEW. 2495 * standalone procedure for queueing an asynchronous RENEW.
2256 */ 2496 */
2257static void 2497static void nfs4_renew_done(struct rpc_task *task, void *data)
2258renew_done(struct rpc_task *task)
2259{ 2498{
2260 struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; 2499 struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
2261 unsigned long timestamp = (unsigned long)task->tk_calldata; 2500 unsigned long timestamp = (unsigned long)data;
2262 2501
2263 if (task->tk_status < 0) { 2502 if (task->tk_status < 0) {
2264 switch (task->tk_status) { 2503 switch (task->tk_status) {
@@ -2275,26 +2514,28 @@ renew_done(struct rpc_task *task)
2275 spin_unlock(&clp->cl_lock); 2514 spin_unlock(&clp->cl_lock);
2276} 2515}
2277 2516
2278int 2517static const struct rpc_call_ops nfs4_renew_ops = {
2279nfs4_proc_async_renew(struct nfs4_client *clp) 2518 .rpc_call_done = nfs4_renew_done,
2519};
2520
2521int nfs4_proc_async_renew(struct nfs4_client *clp, struct rpc_cred *cred)
2280{ 2522{
2281 struct rpc_message msg = { 2523 struct rpc_message msg = {
2282 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], 2524 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
2283 .rpc_argp = clp, 2525 .rpc_argp = clp,
2284 .rpc_cred = clp->cl_cred, 2526 .rpc_cred = cred,
2285 }; 2527 };
2286 2528
2287 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, 2529 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
2288 renew_done, (void *)jiffies); 2530 &nfs4_renew_ops, (void *)jiffies);
2289} 2531}
2290 2532
2291int 2533int nfs4_proc_renew(struct nfs4_client *clp, struct rpc_cred *cred)
2292nfs4_proc_renew(struct nfs4_client *clp)
2293{ 2534{
2294 struct rpc_message msg = { 2535 struct rpc_message msg = {
2295 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], 2536 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
2296 .rpc_argp = clp, 2537 .rpc_argp = clp,
2297 .rpc_cred = clp->cl_cred, 2538 .rpc_cred = cred,
2298 }; 2539 };
2299 unsigned long now = jiffies; 2540 unsigned long now = jiffies;
2300 int status; 2541 int status;
@@ -2510,7 +2751,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
2510 case -NFS4ERR_EXPIRED: 2751 case -NFS4ERR_EXPIRED:
2511 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL); 2752 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL);
2512 nfs4_schedule_state_recovery(clp); 2753 nfs4_schedule_state_recovery(clp);
2513 if (test_bit(NFS4CLNT_OK, &clp->cl_state)) 2754 if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0)
2514 rpc_wake_up_task(task); 2755 rpc_wake_up_task(task);
2515 task->tk_status = 0; 2756 task->tk_status = 0;
2516 return -EAGAIN; 2757 return -EAGAIN;
@@ -2527,25 +2768,25 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
2527 return 0; 2768 return 0;
2528} 2769}
2529 2770
2771static int nfs4_wait_bit_interruptible(void *word)
2772{
2773 if (signal_pending(current))
2774 return -ERESTARTSYS;
2775 schedule();
2776 return 0;
2777}
2778
2530static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp) 2779static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp)
2531{ 2780{
2532 DEFINE_WAIT(wait);
2533 sigset_t oldset; 2781 sigset_t oldset;
2534 int interruptible, res = 0; 2782 int res;
2535 2783
2536 might_sleep(); 2784 might_sleep();
2537 2785
2538 rpc_clnt_sigmask(clnt, &oldset); 2786 rpc_clnt_sigmask(clnt, &oldset);
2539 interruptible = TASK_UNINTERRUPTIBLE; 2787 res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER,
2540 if (clnt->cl_intr) 2788 nfs4_wait_bit_interruptible,
2541 interruptible = TASK_INTERRUPTIBLE; 2789 TASK_INTERRUPTIBLE);
2542 prepare_to_wait(&clp->cl_waitq, &wait, interruptible);
2543 nfs4_schedule_state_recovery(clp);
2544 if (clnt->cl_intr && signalled())
2545 res = -ERESTARTSYS;
2546 else if (!test_bit(NFS4CLNT_OK, &clp->cl_state))
2547 schedule();
2548 finish_wait(&clp->cl_waitq, &wait);
2549 rpc_clnt_sigunmask(clnt, &oldset); 2790 rpc_clnt_sigunmask(clnt, &oldset);
2550 return res; 2791 return res;
2551} 2792}
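
The open-coded waitqueue is replaced by sleeping on the NFS4CLNT_STATE_RECOVER bit, with an action callback that aborts on a pending signal; the waker side (nfs4_clear_recover_bit(), later in this patch) clears the bit under memory barriers and calls wake_up_bit(). A userspace pthreads model of the same wait/wake contract — the kernel version additionally returns -ERESTARTSYS when a signal arrives:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int recovering = 1;

static void *waiter(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (recovering)              /* ~ wait_on_bit(..., TASK_INTERRUPTIBLE) */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        puts("recovery finished, resuming");
        return NULL;
}

int main(void)
{
        pthread_t t;

        if (pthread_create(&t, NULL, waiter, NULL) != 0)
                return 1;
        pthread_mutex_lock(&lock);
        recovering = 0;                 /* ~ clear_bit() + barriers */
        pthread_cond_broadcast(&cond);  /* ~ wake_up_bit() + rpc_wake_up() */
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        return 0;
}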
@@ -2588,6 +2829,7 @@ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct
2588 case -NFS4ERR_STALE_CLIENTID: 2829 case -NFS4ERR_STALE_CLIENTID:
2589 case -NFS4ERR_STALE_STATEID: 2830 case -NFS4ERR_STALE_STATEID:
2590 case -NFS4ERR_EXPIRED: 2831 case -NFS4ERR_EXPIRED:
2832 nfs4_schedule_state_recovery(clp);
2591 ret = nfs4_wait_clnt_recover(server->client, clp); 2833 ret = nfs4_wait_clnt_recover(server->client, clp);
2592 if (ret == 0) 2834 if (ret == 0)
2593 exception->retry = 1; 2835 exception->retry = 1;
@@ -2604,7 +2846,7 @@ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct
2604 return nfs4_map_errors(ret); 2846 return nfs4_map_errors(ret);
2605} 2847}
2606 2848
2607int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port) 2849int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
2608{ 2850{
2609 nfs4_verifier sc_verifier; 2851 nfs4_verifier sc_verifier;
2610 struct nfs4_setclientid setclientid = { 2852 struct nfs4_setclientid setclientid = {
@@ -2615,7 +2857,7 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p
2615 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], 2857 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
2616 .rpc_argp = &setclientid, 2858 .rpc_argp = &setclientid,
2617 .rpc_resp = clp, 2859 .rpc_resp = clp,
2618 .rpc_cred = clp->cl_cred, 2860 .rpc_cred = cred,
2619 }; 2861 };
2620 u32 *p; 2862 u32 *p;
2621 int loop = 0; 2863 int loop = 0;
@@ -2629,7 +2871,7 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p
2629 setclientid.sc_name_len = scnprintf(setclientid.sc_name, 2871 setclientid.sc_name_len = scnprintf(setclientid.sc_name,
2630 sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u", 2872 sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u",
2631 clp->cl_ipaddr, NIPQUAD(clp->cl_addr.s_addr), 2873 clp->cl_ipaddr, NIPQUAD(clp->cl_addr.s_addr),
2632 clp->cl_cred->cr_ops->cr_name, 2874 cred->cr_ops->cr_name,
2633 clp->cl_id_uniquifier); 2875 clp->cl_id_uniquifier);
2634 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, 2876 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
2635 sizeof(setclientid.sc_netid), "tcp"); 2877 sizeof(setclientid.sc_netid), "tcp");
@@ -2652,14 +2894,14 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p
2652} 2894}
2653 2895
2654int 2896int
2655nfs4_proc_setclientid_confirm(struct nfs4_client *clp) 2897nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cred *cred)
2656{ 2898{
2657 struct nfs_fsinfo fsinfo; 2899 struct nfs_fsinfo fsinfo;
2658 struct rpc_message msg = { 2900 struct rpc_message msg = {
2659 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], 2901 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
2660 .rpc_argp = clp, 2902 .rpc_argp = clp,
2661 .rpc_resp = &fsinfo, 2903 .rpc_resp = &fsinfo,
2662 .rpc_cred = clp->cl_cred, 2904 .rpc_cred = cred,
2663 }; 2905 };
2664 unsigned long now; 2906 unsigned long now;
2665 int status; 2907 int status;
@@ -2670,24 +2912,92 @@ nfs4_proc_setclientid_confirm(struct nfs4_client *clp)
2670 spin_lock(&clp->cl_lock); 2912 spin_lock(&clp->cl_lock);
2671 clp->cl_lease_time = fsinfo.lease_time * HZ; 2913 clp->cl_lease_time = fsinfo.lease_time * HZ;
2672 clp->cl_last_renewal = now; 2914 clp->cl_last_renewal = now;
2915 clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
2673 spin_unlock(&clp->cl_lock); 2916 spin_unlock(&clp->cl_lock);
2674 } 2917 }
2675 return status; 2918 return status;
2676} 2919}
2677 2920
2678static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) 2921struct nfs4_delegreturndata {
2922 struct nfs4_delegreturnargs args;
2923 struct nfs4_delegreturnres res;
2924 struct nfs_fh fh;
2925 nfs4_stateid stateid;
2926 struct rpc_cred *cred;
2927 unsigned long timestamp;
2928 struct nfs_fattr fattr;
2929 int rpc_status;
2930};
2931
2932static void nfs4_delegreturn_prepare(struct rpc_task *task, void *calldata)
2679{ 2933{
2680 struct nfs4_delegreturnargs args = { 2934 struct nfs4_delegreturndata *data = calldata;
2681 .fhandle = NFS_FH(inode),
2682 .stateid = stateid,
2683 };
2684 struct rpc_message msg = { 2935 struct rpc_message msg = {
2685 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN], 2936 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
2686 .rpc_argp = &args, 2937 .rpc_argp = &data->args,
2687 .rpc_cred = cred, 2938 .rpc_resp = &data->res,
2939 .rpc_cred = data->cred,
2688 }; 2940 };
2941 nfs_fattr_init(data->res.fattr);
2942 rpc_call_setup(task, &msg, 0);
2943}
2689 2944
2690 return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 2945static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
2946{
2947 struct nfs4_delegreturndata *data = calldata;
2948 data->rpc_status = task->tk_status;
2949 if (data->rpc_status == 0)
2950 renew_lease(data->res.server, data->timestamp);
2951}
2952
2953static void nfs4_delegreturn_release(void *calldata)
2954{
2955 struct nfs4_delegreturndata *data = calldata;
2956
2957 put_rpccred(data->cred);
2958 kfree(calldata);
2959}
2960
 2961 static const struct rpc_call_ops nfs4_delegreturn_ops = {
2962 .rpc_call_prepare = nfs4_delegreturn_prepare,
2963 .rpc_call_done = nfs4_delegreturn_done,
2964 .rpc_release = nfs4_delegreturn_release,
2965};
2966
2967static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
2968{
2969 struct nfs4_delegreturndata *data;
2970 struct nfs_server *server = NFS_SERVER(inode);
2971 struct rpc_task *task;
2972 int status;
2973
2974 data = kmalloc(sizeof(*data), GFP_KERNEL);
2975 if (data == NULL)
2976 return -ENOMEM;
2977 data->args.fhandle = &data->fh;
2978 data->args.stateid = &data->stateid;
2979 data->args.bitmask = server->attr_bitmask;
2980 nfs_copy_fh(&data->fh, NFS_FH(inode));
2981 memcpy(&data->stateid, stateid, sizeof(data->stateid));
2982 data->res.fattr = &data->fattr;
2983 data->res.server = server;
2984 data->cred = get_rpccred(cred);
2985 data->timestamp = jiffies;
2986 data->rpc_status = 0;
2987
2988 task = rpc_run_task(NFS_CLIENT(inode), RPC_TASK_ASYNC, &nfs4_delegreturn_ops, data);
2989 if (IS_ERR(task)) {
2990 nfs4_delegreturn_release(data);
2991 return PTR_ERR(task);
2992 }
2993 status = nfs4_wait_for_completion_rpc_task(task);
2994 if (status == 0) {
2995 status = data->rpc_status;
2996 if (status == 0)
2997 nfs_post_op_update_inode(inode, &data->fattr);
2998 }
2999 rpc_release_task(task);
3000 return status;
2691} 3001}
2692 3002
2693int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) 3003int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
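
DELEGRETURN is now issued as an async task that the caller waits on: the done callback records task->tk_status in per-call data, the caller trusts that value only when the wait itself returned 0, and rpc_release frees the data however the wait ended. The contract reduced to plain C, names illustrative:

#include <stdio.h>
#include <stdlib.h>

struct calldata {
        int rpc_status;         /* filled in by the "done" callback */
};

static void op_done(struct calldata *d)    { d->rpc_status = 0; }
static void op_release(struct calldata *d) { free(d); }

static int do_op_sync(void)
{
        struct calldata *d = calloc(1, sizeof(*d));
        int status;

        if (!d)
                return -1;
        op_done(d);                     /* ~ the task ran asynchronously */
        status = 0;                     /* ~ nfs4_wait_for_completion_rpc_task() */
        if (status == 0)
                status = d->rpc_status; /* only valid once the wait succeeded */
        op_release(d);                  /* ~ rpc_release_task() -> ->rpc_release */
        return status;
}

int main(void)
{
        printf("delegreturn-style op: %d\n", do_op_sync());
        return 0;
}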
@@ -2725,43 +3035,17 @@ nfs4_set_lock_task_retry(unsigned long timeout)
2725 return timeout; 3035 return timeout;
2726} 3036}
2727 3037
2728static inline int
2729nfs4_lck_type(int cmd, struct file_lock *request)
2730{
2731 /* set lock type */
2732 switch (request->fl_type) {
2733 case F_RDLCK:
2734 return IS_SETLKW(cmd) ? NFS4_READW_LT : NFS4_READ_LT;
2735 case F_WRLCK:
2736 return IS_SETLKW(cmd) ? NFS4_WRITEW_LT : NFS4_WRITE_LT;
2737 case F_UNLCK:
2738 return NFS4_WRITE_LT;
2739 }
2740 BUG();
2741 return 0;
2742}
2743
2744static inline uint64_t
2745nfs4_lck_length(struct file_lock *request)
2746{
2747 if (request->fl_end == OFFSET_MAX)
2748 return ~(uint64_t)0;
2749 return request->fl_end - request->fl_start + 1;
2750}
2751
2752static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) 3038static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
2753{ 3039{
2754 struct inode *inode = state->inode; 3040 struct inode *inode = state->inode;
2755 struct nfs_server *server = NFS_SERVER(inode); 3041 struct nfs_server *server = NFS_SERVER(inode);
2756 struct nfs4_client *clp = server->nfs4_state; 3042 struct nfs4_client *clp = server->nfs4_state;
2757 struct nfs_lockargs arg = { 3043 struct nfs_lockt_args arg = {
2758 .fh = NFS_FH(inode), 3044 .fh = NFS_FH(inode),
2759 .type = nfs4_lck_type(cmd, request), 3045 .fl = request,
2760 .offset = request->fl_start,
2761 .length = nfs4_lck_length(request),
2762 }; 3046 };
2763 struct nfs_lockres res = { 3047 struct nfs_lockt_res res = {
2764 .server = server, 3048 .denied = request,
2765 }; 3049 };
2766 struct rpc_message msg = { 3050 struct rpc_message msg = {
2767 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKT], 3051 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKT],
@@ -2769,36 +3053,23 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
2769 .rpc_resp = &res, 3053 .rpc_resp = &res,
2770 .rpc_cred = state->owner->so_cred, 3054 .rpc_cred = state->owner->so_cred,
2771 }; 3055 };
2772 struct nfs_lowner nlo;
2773 struct nfs4_lock_state *lsp; 3056 struct nfs4_lock_state *lsp;
2774 int status; 3057 int status;
2775 3058
2776 down_read(&clp->cl_sem); 3059 down_read(&clp->cl_sem);
2777 nlo.clientid = clp->cl_clientid; 3060 arg.lock_owner.clientid = clp->cl_clientid;
2778 status = nfs4_set_lock_state(state, request); 3061 status = nfs4_set_lock_state(state, request);
2779 if (status != 0) 3062 if (status != 0)
2780 goto out; 3063 goto out;
2781 lsp = request->fl_u.nfs4_fl.owner; 3064 lsp = request->fl_u.nfs4_fl.owner;
2782 nlo.id = lsp->ls_id; 3065 arg.lock_owner.id = lsp->ls_id;
2783 arg.u.lockt = &nlo;
2784 status = rpc_call_sync(server->client, &msg, 0); 3066 status = rpc_call_sync(server->client, &msg, 0);
2785 if (!status) { 3067 switch (status) {
2786 request->fl_type = F_UNLCK; 3068 case 0:
2787 } else if (status == -NFS4ERR_DENIED) { 3069 request->fl_type = F_UNLCK;
2788 int64_t len, start, end; 3070 break;
2789 start = res.u.denied.offset; 3071 case -NFS4ERR_DENIED:
2790 len = res.u.denied.length; 3072 status = 0;
2791 end = start + len - 1;
2792 if (end < 0 || len == 0)
2793 request->fl_end = OFFSET_MAX;
2794 else
2795 request->fl_end = (loff_t)end;
2796 request->fl_start = (loff_t)start;
2797 request->fl_type = F_WRLCK;
2798 if (res.u.denied.type & 1)
2799 request->fl_type = F_RDLCK;
2800 request->fl_pid = 0;
2801 status = 0;
2802 } 3073 }
2803out: 3074out:
2804 up_read(&clp->cl_sem); 3075 up_read(&clp->cl_sem);
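
The open-coded conversion of a LOCKT denied range into a struct file_lock moves out of this function and into the XDR decode path (res.denied now points straight at the caller's lock request). The arithmetic being moved, visible in the removed lines, stand-alone: an NFSv4 {offset, length} pair maps to POSIX {fl_start, fl_end}, with length 0 or an overflowing end meaning a whole-file lock:

#include <stdio.h>
#include <stdint.h>

#define OFFSET_MAX INT64_MAX

static void denied_to_posix(uint64_t offset, uint64_t length,
                            int64_t *fl_start, int64_t *fl_end)
{
        int64_t start = (int64_t)offset;
        int64_t end = start + (int64_t)length - 1;

        *fl_start = start;
        if (end < 0 || length == 0)
                *fl_end = OFFSET_MAX;   /* whole-file lock */
        else
                *fl_end = end;
}

int main(void)
{
        int64_t s, e;

        denied_to_posix(100, 50, &s, &e);
        printf("[%lld, %lld]\n", (long long)s, (long long)e); /* [100, 149] */
        denied_to_posix(0, ~(uint64_t)0, &s, &e);
        printf("[%lld, %lld]\n", (long long)s, (long long)e); /* [0, OFFSET_MAX] */
        return 0;
}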
@@ -2838,196 +3109,314 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
2838} 3109}
2839 3110
2840struct nfs4_unlockdata { 3111struct nfs4_unlockdata {
2841 struct nfs_lockargs arg; 3112 struct nfs_locku_args arg;
2842 struct nfs_locku_opargs luargs; 3113 struct nfs_locku_res res;
2843 struct nfs_lockres res;
2844 struct nfs4_lock_state *lsp; 3114 struct nfs4_lock_state *lsp;
2845 struct nfs_open_context *ctx; 3115 struct nfs_open_context *ctx;
2846 atomic_t refcount; 3116 struct file_lock fl;
2847 struct completion completion; 3117 const struct nfs_server *server;
3118 unsigned long timestamp;
2848}; 3119};
2849 3120
2850static void nfs4_locku_release_calldata(struct nfs4_unlockdata *calldata) 3121static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
2851{ 3122 struct nfs_open_context *ctx,
2852 if (atomic_dec_and_test(&calldata->refcount)) { 3123 struct nfs4_lock_state *lsp,
2853 nfs_free_seqid(calldata->luargs.seqid); 3124 struct nfs_seqid *seqid)
2854 nfs4_put_lock_state(calldata->lsp); 3125{
2855 put_nfs_open_context(calldata->ctx); 3126 struct nfs4_unlockdata *p;
2856 kfree(calldata); 3127 struct inode *inode = lsp->ls_state->inode;
2857 } 3128
3129 p = kmalloc(sizeof(*p), GFP_KERNEL);
3130 if (p == NULL)
3131 return NULL;
3132 p->arg.fh = NFS_FH(inode);
3133 p->arg.fl = &p->fl;
3134 p->arg.seqid = seqid;
3135 p->arg.stateid = &lsp->ls_stateid;
3136 p->lsp = lsp;
3137 atomic_inc(&lsp->ls_count);
3138 /* Ensure we don't close file until we're done freeing locks! */
3139 p->ctx = get_nfs_open_context(ctx);
3140 memcpy(&p->fl, fl, sizeof(p->fl));
3141 p->server = NFS_SERVER(inode);
3142 return p;
2858} 3143}
2859 3144
2860static void nfs4_locku_complete(struct nfs4_unlockdata *calldata) 3145static void nfs4_locku_release_calldata(void *data)
2861{ 3146{
2862 complete(&calldata->completion); 3147 struct nfs4_unlockdata *calldata = data;
2863 nfs4_locku_release_calldata(calldata); 3148 nfs_free_seqid(calldata->arg.seqid);
3149 nfs4_put_lock_state(calldata->lsp);
3150 put_nfs_open_context(calldata->ctx);
3151 kfree(calldata);
2864} 3152}
2865 3153
2866static void nfs4_locku_done(struct rpc_task *task) 3154static void nfs4_locku_done(struct rpc_task *task, void *data)
2867{ 3155{
2868 struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata; 3156 struct nfs4_unlockdata *calldata = data;
2869 3157
2870 nfs_increment_lock_seqid(task->tk_status, calldata->luargs.seqid); 3158 if (RPC_ASSASSINATED(task))
3159 return;
3160 nfs_increment_lock_seqid(task->tk_status, calldata->arg.seqid);
2871 switch (task->tk_status) { 3161 switch (task->tk_status) {
2872 case 0: 3162 case 0:
2873 memcpy(calldata->lsp->ls_stateid.data, 3163 memcpy(calldata->lsp->ls_stateid.data,
2874 calldata->res.u.stateid.data, 3164 calldata->res.stateid.data,
2875 sizeof(calldata->lsp->ls_stateid.data)); 3165 sizeof(calldata->lsp->ls_stateid.data));
3166 renew_lease(calldata->server, calldata->timestamp);
2876 break; 3167 break;
2877 case -NFS4ERR_STALE_STATEID: 3168 case -NFS4ERR_STALE_STATEID:
2878 case -NFS4ERR_EXPIRED: 3169 case -NFS4ERR_EXPIRED:
2879 nfs4_schedule_state_recovery(calldata->res.server->nfs4_state); 3170 nfs4_schedule_state_recovery(calldata->server->nfs4_state);
2880 break; 3171 break;
2881 default: 3172 default:
2882 if (nfs4_async_handle_error(task, calldata->res.server) == -EAGAIN) { 3173 if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) {
2883 rpc_restart_call(task); 3174 rpc_restart_call(task);
2884 return;
2885 } 3175 }
2886 } 3176 }
2887 nfs4_locku_complete(calldata);
2888} 3177}
2889 3178
2890static void nfs4_locku_begin(struct rpc_task *task) 3179static void nfs4_locku_prepare(struct rpc_task *task, void *data)
2891{ 3180{
2892 struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata; 3181 struct nfs4_unlockdata *calldata = data;
2893 struct rpc_message msg = { 3182 struct rpc_message msg = {
2894 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU], 3183 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
2895 .rpc_argp = &calldata->arg, 3184 .rpc_argp = &calldata->arg,
2896 .rpc_resp = &calldata->res, 3185 .rpc_resp = &calldata->res,
2897 .rpc_cred = calldata->lsp->ls_state->owner->so_cred, 3186 .rpc_cred = calldata->lsp->ls_state->owner->so_cred,
2898 }; 3187 };
2899 int status;
2900 3188
2901 status = nfs_wait_on_sequence(calldata->luargs.seqid, task); 3189 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
2902 if (status != 0)
2903 return; 3190 return;
2904 if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { 3191 if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) {
2905 nfs4_locku_complete(calldata); 3192 /* Note: exit _without_ running nfs4_locku_done */
2906 task->tk_exit = NULL; 3193 task->tk_action = NULL;
2907 rpc_exit(task, 0);
2908 return; 3194 return;
2909 } 3195 }
3196 calldata->timestamp = jiffies;
2910 rpc_call_setup(task, &msg, 0); 3197 rpc_call_setup(task, &msg, 0);
2911} 3198}
2912 3199
3200static const struct rpc_call_ops nfs4_locku_ops = {
3201 .rpc_call_prepare = nfs4_locku_prepare,
3202 .rpc_call_done = nfs4_locku_done,
3203 .rpc_release = nfs4_locku_release_calldata,
3204};
3205
3206static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3207 struct nfs_open_context *ctx,
3208 struct nfs4_lock_state *lsp,
3209 struct nfs_seqid *seqid)
3210{
3211 struct nfs4_unlockdata *data;
3212 struct rpc_task *task;
3213
3214 data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid);
3215 if (data == NULL) {
3216 nfs_free_seqid(seqid);
3217 return ERR_PTR(-ENOMEM);
3218 }
3219
3220 /* Unlock _before_ we do the RPC call */
3221 do_vfs_lock(fl->fl_file, fl);
3222 task = rpc_run_task(NFS_CLIENT(lsp->ls_state->inode), RPC_TASK_ASYNC, &nfs4_locku_ops, data);
3223 if (IS_ERR(task))
3224 nfs4_locku_release_calldata(data);
3225 return task;
3226}
3227
2913static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) 3228static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
2914{ 3229{
2915 struct nfs4_unlockdata *calldata; 3230 struct nfs_seqid *seqid;
2916 struct inode *inode = state->inode;
2917 struct nfs_server *server = NFS_SERVER(inode);
2918 struct nfs4_lock_state *lsp; 3231 struct nfs4_lock_state *lsp;
2919 int status; 3232 struct rpc_task *task;
3233 int status = 0;
2920 3234
2921 /* Is this a delegated lock? */ 3235 /* Is this a delegated lock? */
2922 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) 3236 if (test_bit(NFS_DELEGATED_STATE, &state->flags))
2923 return do_vfs_lock(request->fl_file, request); 3237 goto out_unlock;
3238 /* Is this open_owner holding any locks on the server? */
3239 if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
3240 goto out_unlock;
2924 3241
2925 status = nfs4_set_lock_state(state, request); 3242 status = nfs4_set_lock_state(state, request);
2926 if (status != 0) 3243 if (status != 0)
2927 return status; 3244 goto out_unlock;
2928 lsp = request->fl_u.nfs4_fl.owner; 3245 lsp = request->fl_u.nfs4_fl.owner;
2929 /* We might have lost the locks! */ 3246 status = -ENOMEM;
2930 if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) 3247 seqid = nfs_alloc_seqid(&lsp->ls_seqid);
2931 return 0; 3248 if (seqid == NULL)
2932 calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); 3249 goto out_unlock;
2933 if (calldata == NULL) 3250 task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid);
2934 return -ENOMEM; 3251 status = PTR_ERR(task);
2935 calldata->luargs.seqid = nfs_alloc_seqid(&lsp->ls_seqid); 3252 if (IS_ERR(task))
2936 if (calldata->luargs.seqid == NULL) { 3253 goto out_unlock;
2937 kfree(calldata); 3254 status = nfs4_wait_for_completion_rpc_task(task);
2938 return -ENOMEM; 3255 rpc_release_task(task);
2939 } 3256 return status;
2940 calldata->luargs.stateid = &lsp->ls_stateid; 3257out_unlock:
2941 calldata->arg.fh = NFS_FH(inode);
2942 calldata->arg.type = nfs4_lck_type(cmd, request);
2943 calldata->arg.offset = request->fl_start;
2944 calldata->arg.length = nfs4_lck_length(request);
2945 calldata->arg.u.locku = &calldata->luargs;
2946 calldata->res.server = server;
2947 calldata->lsp = lsp;
2948 atomic_inc(&lsp->ls_count);
2949
2950 /* Ensure we don't close file until we're done freeing locks! */
2951 calldata->ctx = get_nfs_open_context((struct nfs_open_context*)request->fl_file->private_data);
2952
2953 atomic_set(&calldata->refcount, 2);
2954 init_completion(&calldata->completion);
2955
2956 status = nfs4_call_async(NFS_SERVER(inode)->client, nfs4_locku_begin,
2957 nfs4_locku_done, calldata);
2958 if (status == 0)
2959 wait_for_completion_interruptible(&calldata->completion);
2960 do_vfs_lock(request->fl_file, request); 3258 do_vfs_lock(request->fl_file, request);
2961 nfs4_locku_release_calldata(calldata);
2962 return status; 3259 return status;
2963} 3260}
2964 3261
2965static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *request, int reclaim) 3262struct nfs4_lockdata {
3263 struct nfs_lock_args arg;
3264 struct nfs_lock_res res;
3265 struct nfs4_lock_state *lsp;
3266 struct nfs_open_context *ctx;
3267 struct file_lock fl;
3268 unsigned long timestamp;
3269 int rpc_status;
3270 int cancelled;
3271};
3272
3273static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3274 struct nfs_open_context *ctx, struct nfs4_lock_state *lsp)
2966{ 3275{
2967 struct inode *inode = state->inode; 3276 struct nfs4_lockdata *p;
3277 struct inode *inode = lsp->ls_state->inode;
2968 struct nfs_server *server = NFS_SERVER(inode); 3278 struct nfs_server *server = NFS_SERVER(inode);
2969 struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; 3279
2970 struct nfs_lock_opargs largs = { 3280 p = kzalloc(sizeof(*p), GFP_KERNEL);
2971 .lock_stateid = &lsp->ls_stateid, 3281 if (p == NULL)
2972 .open_stateid = &state->stateid, 3282 return NULL;
2973 .lock_owner = { 3283
2974 .clientid = server->nfs4_state->cl_clientid, 3284 p->arg.fh = NFS_FH(inode);
2975 .id = lsp->ls_id, 3285 p->arg.fl = &p->fl;
2976 }, 3286 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
2977 .reclaim = reclaim, 3287 if (p->arg.lock_seqid == NULL)
2978 }; 3288 goto out_free;
2979 struct nfs_lockargs arg = { 3289 p->arg.lock_stateid = &lsp->ls_stateid;
2980 .fh = NFS_FH(inode), 3290 p->arg.lock_owner.clientid = server->nfs4_state->cl_clientid;
2981 .type = nfs4_lck_type(cmd, request), 3291 p->arg.lock_owner.id = lsp->ls_id;
2982 .offset = request->fl_start, 3292 p->lsp = lsp;
2983 .length = nfs4_lck_length(request), 3293 atomic_inc(&lsp->ls_count);
2984 .u = { 3294 p->ctx = get_nfs_open_context(ctx);
2985 .lock = &largs, 3295 memcpy(&p->fl, fl, sizeof(p->fl));
2986 }, 3296 return p;
2987 }; 3297out_free:
2988 struct nfs_lockres res = { 3298 kfree(p);
2989 .server = server, 3299 return NULL;
2990 }; 3300}
3301
3302static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
3303{
3304 struct nfs4_lockdata *data = calldata;
3305 struct nfs4_state *state = data->lsp->ls_state;
3306 struct nfs4_state_owner *sp = state->owner;
2991 struct rpc_message msg = { 3307 struct rpc_message msg = {
2992 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK], 3308 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
2993 .rpc_argp = &arg, 3309 .rpc_argp = &data->arg,
2994 .rpc_resp = &res, 3310 .rpc_resp = &data->res,
2995 .rpc_cred = state->owner->so_cred, 3311 .rpc_cred = sp->so_cred,
2996 }; 3312 };
2997 int status = -ENOMEM;
2998 3313
2999 largs.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); 3314 if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
3000 if (largs.lock_seqid == NULL) 3315 return;
3001 return -ENOMEM; 3316 dprintk("%s: begin!\n", __FUNCTION__);
3002 if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) { 3317 /* Do we need to do an open_to_lock_owner? */
3003 struct nfs4_state_owner *owner = state->owner; 3318 if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
3004 3319 data->arg.open_seqid = nfs_alloc_seqid(&sp->so_seqid);
3005 largs.open_seqid = nfs_alloc_seqid(&owner->so_seqid); 3320 if (data->arg.open_seqid == NULL) {
3006 if (largs.open_seqid == NULL) 3321 data->rpc_status = -ENOMEM;
3322 task->tk_action = NULL;
3007 goto out; 3323 goto out;
3008 largs.new_lock_owner = 1;
3009 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
3010 /* increment open seqid on success, and seqid mutating errors */
3011 if (largs.new_lock_owner != 0) {
3012 nfs_increment_open_seqid(status, largs.open_seqid);
3013 if (status == 0)
3014 nfs_confirm_seqid(&lsp->ls_seqid, 0);
3015 } 3324 }
3016 nfs_free_seqid(largs.open_seqid); 3325 data->arg.open_stateid = &state->stateid;
3017 } else 3326 data->arg.new_lock_owner = 1;
3018 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); 3327 }
3019 /* increment lock seqid on success, and seqid mutating errors*/ 3328 data->timestamp = jiffies;
3020 nfs_increment_lock_seqid(status, largs.lock_seqid); 3329 rpc_call_setup(task, &msg, 0);
3021 /* save the returned stateid. */
3022 if (status == 0) {
3023 memcpy(lsp->ls_stateid.data, res.u.stateid.data,
3024 sizeof(lsp->ls_stateid.data));
3025 lsp->ls_flags |= NFS_LOCK_INITIALIZED;
3026 } else if (status == -NFS4ERR_DENIED)
3027 status = -EAGAIN;
3028out: 3330out:
3029 nfs_free_seqid(largs.lock_seqid); 3331 dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status);
3030 return status; 3332}
3333
3334static void nfs4_lock_done(struct rpc_task *task, void *calldata)
3335{
3336 struct nfs4_lockdata *data = calldata;
3337
3338 dprintk("%s: begin!\n", __FUNCTION__);
3339
3340 data->rpc_status = task->tk_status;
3341 if (RPC_ASSASSINATED(task))
3342 goto out;
3343 if (data->arg.new_lock_owner != 0) {
3344 nfs_increment_open_seqid(data->rpc_status, data->arg.open_seqid);
3345 if (data->rpc_status == 0)
3346 nfs_confirm_seqid(&data->lsp->ls_seqid, 0);
3347 else
3348 goto out;
3349 }
3350 if (data->rpc_status == 0) {
3351 memcpy(data->lsp->ls_stateid.data, data->res.stateid.data,
3352 sizeof(data->lsp->ls_stateid.data));
3353 data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
3354 renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
3355 }
3356 nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid);
3357out:
3358 dprintk("%s: done, ret = %d!\n", __FUNCTION__, data->rpc_status);
3359}
3360
3361static void nfs4_lock_release(void *calldata)
3362{
3363 struct nfs4_lockdata *data = calldata;
3364
3365 dprintk("%s: begin!\n", __FUNCTION__);
3366 if (data->arg.open_seqid != NULL)
3367 nfs_free_seqid(data->arg.open_seqid);
3368 if (data->cancelled != 0) {
3369 struct rpc_task *task;
3370 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
3371 data->arg.lock_seqid);
3372 if (!IS_ERR(task))
3373 rpc_release_task(task);
3374 dprintk("%s: cancelling lock!\n", __FUNCTION__);
3375 } else
3376 nfs_free_seqid(data->arg.lock_seqid);
3377 nfs4_put_lock_state(data->lsp);
3378 put_nfs_open_context(data->ctx);
3379 kfree(data);
3380 dprintk("%s: done!\n", __FUNCTION__);
3381}
3382
3383static const struct rpc_call_ops nfs4_lock_ops = {
3384 .rpc_call_prepare = nfs4_lock_prepare,
3385 .rpc_call_done = nfs4_lock_done,
3386 .rpc_release = nfs4_lock_release,
3387};
3388
3389static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim)
3390{
3391 struct nfs4_lockdata *data;
3392 struct rpc_task *task;
3393 int ret;
3394
3395 dprintk("%s: begin!\n", __FUNCTION__);
3396 data = nfs4_alloc_lockdata(fl, fl->fl_file->private_data,
3397 fl->fl_u.nfs4_fl.owner);
3398 if (data == NULL)
3399 return -ENOMEM;
3400 if (IS_SETLKW(cmd))
3401 data->arg.block = 1;
3402 if (reclaim != 0)
3403 data->arg.reclaim = 1;
3404 task = rpc_run_task(NFS_CLIENT(state->inode), RPC_TASK_ASYNC,
3405 &nfs4_lock_ops, data);
3406 if (IS_ERR(task)) {
3407 nfs4_lock_release(data);
3408 return PTR_ERR(task);
3409 }
3410 ret = nfs4_wait_for_completion_rpc_task(task);
3411 if (ret == 0) {
3412 ret = data->rpc_status;
3413 if (ret == -NFS4ERR_DENIED)
3414 ret = -EAGAIN;
3415 } else
3416 data->cancelled = 1;
3417 rpc_release_task(task);
3418 dprintk("%s: done, ret = %d!\n", __FUNCTION__, ret);
3419 return ret;
3031} 3420}
3032 3421
3033static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) 3422static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request)
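
If nfs4_wait_for_completion_rpc_task() is interrupted, the caller above cannot free the lock data — the async path still owns it — so it sets data->cancelled and drops its reference instead; nfs4_lock_release() then fires a compensating unlock via nfs4_do_unlck(), since the server may have granted the LOCK after the waiter gave up. The same ownership rule in a small illustrative model:

#include <stdio.h>
#include <stdlib.h>

struct lockdata {
        int cancelled;  /* set when the submitter stopped waiting */
};

static void send_unlock(void)
{
        puts("issuing compensating unlock");    /* ~ nfs4_do_unlck() */
}

static void lock_release(struct lockdata *d)
{
        if (d->cancelled)
                send_unlock();  /* the server may have granted the lock */
        free(d);
}

int main(void)
{
        struct lockdata *d = calloc(1, sizeof(*d));

        if (!d)
                return 1;
        d->cancelled = 1;       /* ~ interrupted wait_for_completion */
        lock_release(d);        /* final reference drop runs the compensation */
        return 0;
}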
@@ -3071,15 +3460,15 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
3071 struct nfs4_client *clp = state->owner->so_client; 3460 struct nfs4_client *clp = state->owner->so_client;
3072 int status; 3461 int status;
3073 3462
3074 down_read(&clp->cl_sem);
3075 /* Is this a delegated open? */ 3463 /* Is this a delegated open? */
3076 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { 3464 if (NFS_I(state->inode)->delegation_state != 0) {
3077 /* Yes: cache locks! */ 3465 /* Yes: cache locks! */
3078 status = do_vfs_lock(request->fl_file, request); 3466 status = do_vfs_lock(request->fl_file, request);
3079 /* ...but avoid races with delegation recall... */ 3467 /* ...but avoid races with delegation recall... */
3080 if (status < 0 || test_bit(NFS_DELEGATED_STATE, &state->flags)) 3468 if (status < 0 || test_bit(NFS_DELEGATED_STATE, &state->flags))
3081 goto out; 3469 return status;
3082 } 3470 }
3471 down_read(&clp->cl_sem);
3083 status = nfs4_set_lock_state(state, request); 3472 status = nfs4_set_lock_state(state, request);
3084 if (status != 0) 3473 if (status != 0)
3085 goto out; 3474 goto out;
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index a3001628ad3..5d764d8e6d8 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -54,6 +54,7 @@
54#include <linux/nfs4.h> 54#include <linux/nfs4.h>
55#include <linux/nfs_fs.h> 55#include <linux/nfs_fs.h>
56#include "nfs4_fs.h" 56#include "nfs4_fs.h"
57#include "delegation.h"
57 58
58#define NFSDBG_FACILITY NFSDBG_PROC 59#define NFSDBG_FACILITY NFSDBG_PROC
59 60
@@ -61,6 +62,7 @@ void
61nfs4_renew_state(void *data) 62nfs4_renew_state(void *data)
62{ 63{
63 struct nfs4_client *clp = (struct nfs4_client *)data; 64 struct nfs4_client *clp = (struct nfs4_client *)data;
65 struct rpc_cred *cred;
64 long lease, timeout; 66 long lease, timeout;
65 unsigned long last, now; 67 unsigned long last, now;
66 68
@@ -68,7 +70,7 @@ nfs4_renew_state(void *data)
68 dprintk("%s: start\n", __FUNCTION__); 70 dprintk("%s: start\n", __FUNCTION__);
69 /* Are there any active superblocks? */ 71 /* Are there any active superblocks? */
70 if (list_empty(&clp->cl_superblocks)) 72 if (list_empty(&clp->cl_superblocks))
71 goto out; 73 goto out;
72 spin_lock(&clp->cl_lock); 74 spin_lock(&clp->cl_lock);
73 lease = clp->cl_lease_time; 75 lease = clp->cl_lease_time;
74 last = clp->cl_last_renewal; 76 last = clp->cl_last_renewal;
@@ -76,9 +78,17 @@ nfs4_renew_state(void *data)
76 timeout = (2 * lease) / 3 + (long)last - (long)now; 78 timeout = (2 * lease) / 3 + (long)last - (long)now;
77 /* Are we close to a lease timeout? */ 79 /* Are we close to a lease timeout? */
78 if (time_after(now, last + lease/3)) { 80 if (time_after(now, last + lease/3)) {
81 cred = nfs4_get_renew_cred(clp);
82 if (cred == NULL) {
83 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
84 spin_unlock(&clp->cl_lock);
85 nfs_expire_all_delegations(clp);
86 goto out;
87 }
79 spin_unlock(&clp->cl_lock); 88 spin_unlock(&clp->cl_lock);
80 /* Queue an asynchronous RENEW. */ 89 /* Queue an asynchronous RENEW. */
81 nfs4_proc_async_renew(clp); 90 nfs4_proc_async_renew(clp, cred);
91 put_rpccred(cred);
82 timeout = (2 * lease) / 3; 92 timeout = (2 * lease) / 3;
83 spin_lock(&clp->cl_lock); 93 spin_lock(&clp->cl_lock);
84 } else 94 } else
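
The renewal arithmetic above — renew once a third of the lease has elapsed, then re-arm so that the daemon wakes with two thirds of a lease remaining — worked through with concrete numbers:

#include <stdio.h>

int main(void)
{
        long lease = 90;        /* seconds, as granted by the server */
        long last = 1000;       /* time of last successful renewal */
        long now = 1031;        /* current time */

        long timeout = (2 * lease) / 3 + last - now;    /* 60 + 1000 - 1031 = 29 */

        if (now > last + lease / 3) {                   /* 1031 > 1030: renew now */
                puts("queue async RENEW");
                timeout = (2 * lease) / 3;              /* full 60s until next check */
        }
        printf("re-arm renewd in %ld seconds\n", timeout);
        return 0;
}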
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0675f3215e0..afad0255e7d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -43,6 +43,8 @@
43#include <linux/smp_lock.h> 43#include <linux/smp_lock.h>
44#include <linux/nfs_fs.h> 44#include <linux/nfs_fs.h>
45#include <linux/nfs_idmap.h> 45#include <linux/nfs_idmap.h>
46#include <linux/kthread.h>
47#include <linux/module.h>
46#include <linux/workqueue.h> 48#include <linux/workqueue.h>
47#include <linux/bitops.h> 49#include <linux/bitops.h>
48 50
@@ -57,8 +59,6 @@ const nfs4_stateid zero_stateid;
57static DEFINE_SPINLOCK(state_spinlock); 59static DEFINE_SPINLOCK(state_spinlock);
58static LIST_HEAD(nfs4_clientid_list); 60static LIST_HEAD(nfs4_clientid_list);
59 61
60static void nfs4_recover_state(void *);
61
62void 62void
63init_nfsv4_state(struct nfs_server *server) 63init_nfsv4_state(struct nfs_server *server)
64{ 64{
@@ -91,11 +91,10 @@ nfs4_alloc_client(struct in_addr *addr)
91 91
92 if (nfs_callback_up() < 0) 92 if (nfs_callback_up() < 0)
93 return NULL; 93 return NULL;
94 if ((clp = kmalloc(sizeof(*clp), GFP_KERNEL)) == NULL) { 94 if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) {
95 nfs_callback_down(); 95 nfs_callback_down();
96 return NULL; 96 return NULL;
97 } 97 }
98 memset(clp, 0, sizeof(*clp));
99 memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr)); 98 memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
100 init_rwsem(&clp->cl_sem); 99 init_rwsem(&clp->cl_sem);
101 INIT_LIST_HEAD(&clp->cl_delegations); 100 INIT_LIST_HEAD(&clp->cl_delegations);
@@ -103,14 +102,12 @@ nfs4_alloc_client(struct in_addr *addr)
103 INIT_LIST_HEAD(&clp->cl_unused); 102 INIT_LIST_HEAD(&clp->cl_unused);
104 spin_lock_init(&clp->cl_lock); 103 spin_lock_init(&clp->cl_lock);
105 atomic_set(&clp->cl_count, 1); 104 atomic_set(&clp->cl_count, 1);
106 INIT_WORK(&clp->cl_recoverd, nfs4_recover_state, clp);
107 INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp); 105 INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
108 INIT_LIST_HEAD(&clp->cl_superblocks); 106 INIT_LIST_HEAD(&clp->cl_superblocks);
109 init_waitqueue_head(&clp->cl_waitq);
110 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client"); 107 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client");
111 clp->cl_rpcclient = ERR_PTR(-EINVAL); 108 clp->cl_rpcclient = ERR_PTR(-EINVAL);
112 clp->cl_boot_time = CURRENT_TIME; 109 clp->cl_boot_time = CURRENT_TIME;
113 clp->cl_state = 1 << NFS4CLNT_OK; 110 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
114 return clp; 111 return clp;
115} 112}
116 113
@@ -127,8 +124,6 @@ nfs4_free_client(struct nfs4_client *clp)
127 kfree(sp); 124 kfree(sp);
128 } 125 }
129 BUG_ON(!list_empty(&clp->cl_state_owners)); 126 BUG_ON(!list_empty(&clp->cl_state_owners));
130 if (clp->cl_cred)
131 put_rpccred(clp->cl_cred);
132 nfs_idmap_delete(clp); 127 nfs_idmap_delete(clp);
133 if (!IS_ERR(clp->cl_rpcclient)) 128 if (!IS_ERR(clp->cl_rpcclient))
134 rpc_shutdown_client(clp->cl_rpcclient); 129 rpc_shutdown_client(clp->cl_rpcclient);
@@ -193,27 +188,22 @@ nfs4_put_client(struct nfs4_client *clp)
193 list_del(&clp->cl_servers); 188 list_del(&clp->cl_servers);
194 spin_unlock(&state_spinlock); 189 spin_unlock(&state_spinlock);
195 BUG_ON(!list_empty(&clp->cl_superblocks)); 190 BUG_ON(!list_empty(&clp->cl_superblocks));
196 wake_up_all(&clp->cl_waitq);
197 rpc_wake_up(&clp->cl_rpcwaitq); 191 rpc_wake_up(&clp->cl_rpcwaitq);
198 nfs4_kill_renewd(clp); 192 nfs4_kill_renewd(clp);
199 nfs4_free_client(clp); 193 nfs4_free_client(clp);
200} 194}
201 195
202static int __nfs4_init_client(struct nfs4_client *clp) 196static int nfs4_init_client(struct nfs4_client *clp, struct rpc_cred *cred)
203{ 197{
204 int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, nfs_callback_tcpport); 198 int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK,
199 nfs_callback_tcpport, cred);
205 if (status == 0) 200 if (status == 0)
206 status = nfs4_proc_setclientid_confirm(clp); 201 status = nfs4_proc_setclientid_confirm(clp, cred);
207 if (status == 0) 202 if (status == 0)
208 nfs4_schedule_state_renewal(clp); 203 nfs4_schedule_state_renewal(clp);
209 return status; 204 return status;
210} 205}
211 206
212int nfs4_init_client(struct nfs4_client *clp)
213{
214 return nfs4_map_errors(__nfs4_init_client(clp));
215}
216
217u32 207u32
218nfs4_alloc_lockowner_id(struct nfs4_client *clp) 208nfs4_alloc_lockowner_id(struct nfs4_client *clp)
219{ 209{
@@ -235,6 +225,32 @@ nfs4_client_grab_unused(struct nfs4_client *clp, struct rpc_cred *cred)
235 return sp; 225 return sp;
236} 226}
237 227
228struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp)
229{
230 struct nfs4_state_owner *sp;
231 struct rpc_cred *cred = NULL;
232
233 list_for_each_entry(sp, &clp->cl_state_owners, so_list) {
234 if (list_empty(&sp->so_states))
235 continue;
236 cred = get_rpccred(sp->so_cred);
237 break;
238 }
239 return cred;
240}
241
242struct rpc_cred *nfs4_get_setclientid_cred(struct nfs4_client *clp)
243{
244 struct nfs4_state_owner *sp;
245
246 if (!list_empty(&clp->cl_state_owners)) {
247 sp = list_entry(clp->cl_state_owners.next,
248 struct nfs4_state_owner, so_list);
249 return get_rpccred(sp->so_cred);
250 }
251 return NULL;
252}
253
238static struct nfs4_state_owner * 254static struct nfs4_state_owner *
239nfs4_find_state_owner(struct nfs4_client *clp, struct rpc_cred *cred) 255nfs4_find_state_owner(struct nfs4_client *clp, struct rpc_cred *cred)
240{ 256{
@@ -349,14 +365,9 @@ nfs4_alloc_open_state(void)
349{ 365{
350 struct nfs4_state *state; 366 struct nfs4_state *state;
351 367
352 state = kmalloc(sizeof(*state), GFP_KERNEL); 368 state = kzalloc(sizeof(*state), GFP_KERNEL);
353 if (!state) 369 if (!state)
354 return NULL; 370 return NULL;
355 state->state = 0;
356 state->nreaders = 0;
357 state->nwriters = 0;
358 state->flags = 0;
359 memset(state->stateid.data, 0, sizeof(state->stateid.data));
360 atomic_set(&state->count, 1); 371 atomic_set(&state->count, 1);
361 INIT_LIST_HEAD(&state->lock_states); 372 INIT_LIST_HEAD(&state->lock_states);
362 spin_lock_init(&state->state_lock); 373 spin_lock_init(&state->state_lock);
@@ -475,15 +486,23 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode)
475 /* Protect against nfs4_find_state() */ 486 /* Protect against nfs4_find_state() */
476 spin_lock(&owner->so_lock); 487 spin_lock(&owner->so_lock);
477 spin_lock(&inode->i_lock); 488 spin_lock(&inode->i_lock);
478 if (mode & FMODE_READ) 489 switch (mode & (FMODE_READ | FMODE_WRITE)) {
479 state->nreaders--; 490 case FMODE_READ:
480 if (mode & FMODE_WRITE) 491 state->n_rdonly--;
481 state->nwriters--; 492 break;
493 case FMODE_WRITE:
494 state->n_wronly--;
495 break;
496 case FMODE_READ|FMODE_WRITE:
497 state->n_rdwr--;
498 }
482 oldstate = newstate = state->state; 499 oldstate = newstate = state->state;
483 if (state->nreaders == 0) 500 if (state->n_rdwr == 0) {
484 newstate &= ~FMODE_READ; 501 if (state->n_rdonly == 0)
485 if (state->nwriters == 0) 502 newstate &= ~FMODE_READ;
486 newstate &= ~FMODE_WRITE; 503 if (state->n_wronly == 0)
504 newstate &= ~FMODE_WRITE;
505 }
487 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { 506 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
488 nfs4_state_set_mode_locked(state, newstate); 507 nfs4_state_set_mode_locked(state, newstate);
489 oldstate = newstate; 508 oldstate = newstate;
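
Open modes are now tracked with three counters because an O_RDWR open must hold both share-mode bits: FMODE_READ and FMODE_WRITE can be shed only once n_rdwr reaches zero. The state computation stand-alone:

#include <stdio.h>

#define FMODE_READ  1
#define FMODE_WRITE 2

static int n_rdonly, n_wronly, n_rdwr;

static int share_mode(void)
{
        int state = FMODE_READ | FMODE_WRITE;

        if (n_rdwr == 0) {              /* only then may bits be shed */
                if (n_rdonly == 0)
                        state &= ~FMODE_READ;
                if (n_wronly == 0)
                        state &= ~FMODE_WRITE;
        }
        return state;
}

int main(void)
{
        n_rdwr = 1;                             /* one O_RDWR opener left */
        printf("mode = %d\n", share_mode());    /* 3: both bits held */
        n_rdwr = 0;
        printf("mode = %d\n", share_mode());    /* 0: nothing open */
        return 0;
}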
@@ -644,12 +663,15 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
644 663
645struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) 664struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
646{ 665{
666 struct rpc_sequence *sequence = counter->sequence;
647 struct nfs_seqid *new; 667 struct nfs_seqid *new;
648 668
649 new = kmalloc(sizeof(*new), GFP_KERNEL); 669 new = kmalloc(sizeof(*new), GFP_KERNEL);
650 if (new != NULL) { 670 if (new != NULL) {
651 new->sequence = counter; 671 new->sequence = counter;
652 INIT_LIST_HEAD(&new->list); 672 spin_lock(&sequence->lock);
673 list_add_tail(&new->list, &sequence->list);
674 spin_unlock(&sequence->lock);
653 } 675 }
654 return new; 676 return new;
655} 677}
@@ -658,12 +680,10 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
658{ 680{
659 struct rpc_sequence *sequence = seqid->sequence->sequence; 681 struct rpc_sequence *sequence = seqid->sequence->sequence;
660 682
661 if (!list_empty(&seqid->list)) { 683 spin_lock(&sequence->lock);
662 spin_lock(&sequence->lock); 684 list_del(&seqid->list);
663 list_del(&seqid->list); 685 spin_unlock(&sequence->lock);
664 spin_unlock(&sequence->lock); 686 rpc_wake_up(&sequence->wait);
665 }
666 rpc_wake_up_next(&sequence->wait);
667 kfree(seqid); 687 kfree(seqid);
668} 688}
669 689
@@ -722,56 +742,53 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
722 if (sequence->list.next == &seqid->list) 742 if (sequence->list.next == &seqid->list)
723 goto out; 743 goto out;
724 spin_lock(&sequence->lock); 744 spin_lock(&sequence->lock);
725 if (!list_empty(&sequence->list)) { 745 if (sequence->list.next != &seqid->list) {
726 rpc_sleep_on(&sequence->wait, task, NULL, NULL); 746 rpc_sleep_on(&sequence->wait, task, NULL, NULL);
727 status = -EAGAIN; 747 status = -EAGAIN;
728 } else 748 }
729 list_add(&seqid->list, &sequence->list);
730 spin_unlock(&sequence->lock); 749 spin_unlock(&sequence->lock);
731out: 750out:
732 return status; 751 return status;
733} 752}
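
Sequence ids now join the sequence list when allocated, and nfs_wait_on_sequence() lets an RPC proceed only while its seqid sits at the head; freeing a seqid wakes the rest of the queue. A userspace model of that FIFO discipline — an array stands in for the kernel's linked list, and only head-removal is modeled:

#include <stdio.h>

#define NSEQ 3
static int queue[NSEQ] = { 1, 2, 3 };   /* enqueued in allocation order */
static int head;

static int may_proceed(int id)
{
        return queue[head] == id;       /* ~ sequence->list.next == &seqid->list */
}

static void free_seqid(int id)
{
        if (may_proceed(id))
                head++;                 /* ~ list_del() + rpc_wake_up() */
}

int main(void)
{
        printf("seqid 2 may run: %d\n", may_proceed(2));        /* 0: must wait */
        free_seqid(1);
        printf("seqid 2 may run: %d\n", may_proceed(2));        /* 1 */
        return 0;
}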
734 753
735static int reclaimer(void *); 754static int reclaimer(void *);
736struct reclaimer_args { 755
737 struct nfs4_client *clp; 756static inline void nfs4_clear_recover_bit(struct nfs4_client *clp)
738 struct completion complete; 757{
739}; 758 smp_mb__before_clear_bit();
759 clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state);
760 smp_mb__after_clear_bit();
761 wake_up_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER);
762 rpc_wake_up(&clp->cl_rpcwaitq);
763}
740 764
741/* 765/*
742 * State recovery routine 766 * State recovery routine
743 */ 767 */
744void 768static void nfs4_recover_state(struct nfs4_client *clp)
745nfs4_recover_state(void *data)
746{ 769{
747 struct nfs4_client *clp = (struct nfs4_client *)data; 770 struct task_struct *task;
748 struct reclaimer_args args = {
749 .clp = clp,
750 };
751 might_sleep();
752
753 init_completion(&args.complete);
754 771
755 if (kernel_thread(reclaimer, &args, CLONE_KERNEL) < 0) 772 __module_get(THIS_MODULE);
756 goto out_failed_clear; 773 atomic_inc(&clp->cl_count);
757 wait_for_completion(&args.complete); 774 task = kthread_run(reclaimer, clp, "%u.%u.%u.%u-reclaim",
758 return; 775 NIPQUAD(clp->cl_addr));
759out_failed_clear: 776 if (!IS_ERR(task))
760 set_bit(NFS4CLNT_OK, &clp->cl_state); 777 return;
761 wake_up_all(&clp->cl_waitq); 778 nfs4_clear_recover_bit(clp);
762 rpc_wake_up(&clp->cl_rpcwaitq); 779 nfs4_put_client(clp);
780 module_put(THIS_MODULE);
763} 781}
764 782
765/* 783/*
766 * Schedule a state recovery attempt 784 * Schedule a state recovery attempt
767 */ 785 */
768void 786void nfs4_schedule_state_recovery(struct nfs4_client *clp)
769nfs4_schedule_state_recovery(struct nfs4_client *clp)
770{ 787{
771 if (!clp) 788 if (!clp)
772 return; 789 return;
773 if (test_and_clear_bit(NFS4CLNT_OK, &clp->cl_state)) 790 if (test_and_set_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0)
774 schedule_work(&clp->cl_recoverd); 791 nfs4_recover_state(clp);
775} 792}
776 793
777static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_state *state) 794static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_state *state)
@@ -887,18 +904,14 @@ static void nfs4_state_mark_reclaim(struct nfs4_client *clp)
887 904
888static int reclaimer(void *ptr) 905static int reclaimer(void *ptr)
889{ 906{
890 struct reclaimer_args *args = (struct reclaimer_args *)ptr; 907 struct nfs4_client *clp = ptr;
891 struct nfs4_client *clp = args->clp;
892 struct nfs4_state_owner *sp; 908 struct nfs4_state_owner *sp;
893 struct nfs4_state_recovery_ops *ops; 909 struct nfs4_state_recovery_ops *ops;
910 struct rpc_cred *cred;
894 int status = 0; 911 int status = 0;
895 912
896 daemonize("%u.%u.%u.%u-reclaim", NIPQUAD(clp->cl_addr));
897 allow_signal(SIGKILL); 913 allow_signal(SIGKILL);
898 914
899 atomic_inc(&clp->cl_count);
900 complete(&args->complete);
901
902 /* Ensure exclusive access to NFSv4 state */ 915 /* Ensure exclusive access to NFSv4 state */
903 lock_kernel(); 916 lock_kernel();
904 down_write(&clp->cl_sem); 917 down_write(&clp->cl_sem);
@@ -906,20 +919,33 @@ static int reclaimer(void *ptr)
906 if (list_empty(&clp->cl_superblocks)) 919 if (list_empty(&clp->cl_superblocks))
907 goto out; 920 goto out;
908restart_loop: 921restart_loop:
909 status = nfs4_proc_renew(clp); 922 ops = &nfs4_network_partition_recovery_ops;
910 switch (status) { 923 /* Are there any open files on this volume? */
911 case 0: 924 cred = nfs4_get_renew_cred(clp);
912 case -NFS4ERR_CB_PATH_DOWN: 925 if (cred != NULL) {
913 goto out; 926 /* Yes there are: try to renew the old lease */
914 case -NFS4ERR_STALE_CLIENTID: 927 status = nfs4_proc_renew(clp, cred);
915 case -NFS4ERR_LEASE_MOVED: 928 switch (status) {
916 ops = &nfs4_reboot_recovery_ops; 929 case 0:
917 break; 930 case -NFS4ERR_CB_PATH_DOWN:
918 default: 931 put_rpccred(cred);
919 ops = &nfs4_network_partition_recovery_ops; 932 goto out;
920 }; 933 case -NFS4ERR_STALE_CLIENTID:
934 case -NFS4ERR_LEASE_MOVED:
935 ops = &nfs4_reboot_recovery_ops;
936 }
937 } else {
938 /* "reboot" to ensure we clear all state on the server */
939 clp->cl_boot_time = CURRENT_TIME;
940 cred = nfs4_get_setclientid_cred(clp);
941 }
942 /* We're going to have to re-establish a clientid */
921 nfs4_state_mark_reclaim(clp); 943 nfs4_state_mark_reclaim(clp);
922 status = __nfs4_init_client(clp); 944 status = -ENOENT;
945 if (cred != NULL) {
946 status = nfs4_init_client(clp, cred);
947 put_rpccred(cred);
948 }
923 if (status) 949 if (status)
924 goto out_error; 950 goto out_error;
925 /* Mark all delegations for reclaim */ 951 /* Mark all delegations for reclaim */
@@ -940,14 +966,13 @@ restart_loop:
940 } 966 }
941 nfs_delegation_reap_unclaimed(clp); 967 nfs_delegation_reap_unclaimed(clp);
942out: 968out:
943 set_bit(NFS4CLNT_OK, &clp->cl_state);
944 up_write(&clp->cl_sem); 969 up_write(&clp->cl_sem);
945 unlock_kernel(); 970 unlock_kernel();
946 wake_up_all(&clp->cl_waitq);
947 rpc_wake_up(&clp->cl_rpcwaitq);
948 if (status == -NFS4ERR_CB_PATH_DOWN) 971 if (status == -NFS4ERR_CB_PATH_DOWN)
949 nfs_handle_cb_pathdown(clp); 972 nfs_handle_cb_pathdown(clp);
973 nfs4_clear_recover_bit(clp);
950 nfs4_put_client(clp); 974 nfs4_put_client(clp);
975 module_put_and_exit(0);
951 return 0; 976 return 0;
952out_error: 977out_error:
953 printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n", 978 printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n",
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index fbbace8a30c..4bbf5ef5778 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -392,9 +392,11 @@ static int nfs_stat_to_errno(int);
392 decode_getattr_maxsz) 392 decode_getattr_maxsz)
393#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ 393#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \
394 encode_putfh_maxsz + \ 394 encode_putfh_maxsz + \
395 encode_delegreturn_maxsz) 395 encode_delegreturn_maxsz + \
396 encode_getattr_maxsz)
396#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ 397#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
397 decode_delegreturn_maxsz) 398 decode_delegreturn_maxsz + \
399 decode_getattr_maxsz)
398#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ 400#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \
399 encode_putfh_maxsz + \ 401 encode_putfh_maxsz + \
400 encode_getattr_maxsz) 402 encode_getattr_maxsz)
@@ -564,7 +566,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
564 } 566 }
565 if (iap->ia_valid & ATTR_MODE) { 567 if (iap->ia_valid & ATTR_MODE) {
566 bmval1 |= FATTR4_WORD1_MODE; 568 bmval1 |= FATTR4_WORD1_MODE;
567 WRITE32(iap->ia_mode); 569 WRITE32(iap->ia_mode & S_IALLUGO);
568 } 570 }
569 if (iap->ia_valid & ATTR_UID) { 571 if (iap->ia_valid & ATTR_UID) {
570 bmval1 |= FATTR4_WORD1_OWNER; 572 bmval1 |= FATTR4_WORD1_OWNER;
@@ -742,69 +744,80 @@ static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
742 return 0; 744 return 0;
743} 745}
744 746
747static inline int nfs4_lock_type(struct file_lock *fl, int block)
748{
749 if ((fl->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) == F_RDLCK)
750 return block ? NFS4_READW_LT : NFS4_READ_LT;
751 return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT;
752}
753
754static inline uint64_t nfs4_lock_length(struct file_lock *fl)
755{
756 if (fl->fl_end == OFFSET_MAX)
757 return ~(uint64_t)0;
758 return fl->fl_end - fl->fl_start + 1;
759}
760
745/* 761/*
746 * opcode,type,reclaim,offset,length,new_lock_owner = 32 762 * opcode,type,reclaim,offset,length,new_lock_owner = 32
747 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 763 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
748 */ 764 */
749static int encode_lock(struct xdr_stream *xdr, const struct nfs_lockargs *arg) 765static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
750{ 766{
751 uint32_t *p; 767 uint32_t *p;
752 struct nfs_lock_opargs *opargs = arg->u.lock;
753 768
754 RESERVE_SPACE(32); 769 RESERVE_SPACE(32);
755 WRITE32(OP_LOCK); 770 WRITE32(OP_LOCK);
756 WRITE32(arg->type); 771 WRITE32(nfs4_lock_type(args->fl, args->block));
757 WRITE32(opargs->reclaim); 772 WRITE32(args->reclaim);
758 WRITE64(arg->offset); 773 WRITE64(args->fl->fl_start);
759 WRITE64(arg->length); 774 WRITE64(nfs4_lock_length(args->fl));
760 WRITE32(opargs->new_lock_owner); 775 WRITE32(args->new_lock_owner);
761 if (opargs->new_lock_owner){ 776 if (args->new_lock_owner){
762 RESERVE_SPACE(40); 777 RESERVE_SPACE(40);
763 WRITE32(opargs->open_seqid->sequence->counter); 778 WRITE32(args->open_seqid->sequence->counter);
764 WRITEMEM(opargs->open_stateid->data, sizeof(opargs->open_stateid->data)); 779 WRITEMEM(args->open_stateid->data, sizeof(args->open_stateid->data));
765 WRITE32(opargs->lock_seqid->sequence->counter); 780 WRITE32(args->lock_seqid->sequence->counter);
766 WRITE64(opargs->lock_owner.clientid); 781 WRITE64(args->lock_owner.clientid);
767 WRITE32(4); 782 WRITE32(4);
768 WRITE32(opargs->lock_owner.id); 783 WRITE32(args->lock_owner.id);
769 } 784 }
770 else { 785 else {
771 RESERVE_SPACE(20); 786 RESERVE_SPACE(20);
772 WRITEMEM(opargs->lock_stateid->data, sizeof(opargs->lock_stateid->data)); 787 WRITEMEM(args->lock_stateid->data, sizeof(args->lock_stateid->data));
773 WRITE32(opargs->lock_seqid->sequence->counter); 788 WRITE32(args->lock_seqid->sequence->counter);
774 } 789 }
775 790
776 return 0; 791 return 0;
777} 792}
778 793
779static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockargs *arg) 794static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args)
780{ 795{
781 uint32_t *p; 796 uint32_t *p;
782 struct nfs_lowner *opargs = arg->u.lockt;
783 797
784 RESERVE_SPACE(40); 798 RESERVE_SPACE(40);
785 WRITE32(OP_LOCKT); 799 WRITE32(OP_LOCKT);
786 WRITE32(arg->type); 800 WRITE32(nfs4_lock_type(args->fl, 0));
787 WRITE64(arg->offset); 801 WRITE64(args->fl->fl_start);
788 WRITE64(arg->length); 802 WRITE64(nfs4_lock_length(args->fl));
789 WRITE64(opargs->clientid); 803 WRITE64(args->lock_owner.clientid);
790 WRITE32(4); 804 WRITE32(4);
791 WRITE32(opargs->id); 805 WRITE32(args->lock_owner.id);
792 806
793 return 0; 807 return 0;
794} 808}
795 809
796static int encode_locku(struct xdr_stream *xdr, const struct nfs_lockargs *arg) 810static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args)
797{ 811{
798 uint32_t *p; 812 uint32_t *p;
799 struct nfs_locku_opargs *opargs = arg->u.locku;
800 813
801 RESERVE_SPACE(44); 814 RESERVE_SPACE(44);
802 WRITE32(OP_LOCKU); 815 WRITE32(OP_LOCKU);
803 WRITE32(arg->type); 816 WRITE32(nfs4_lock_type(args->fl, 0));
804 WRITE32(opargs->seqid->sequence->counter); 817 WRITE32(args->seqid->sequence->counter);
805 WRITEMEM(opargs->stateid->data, sizeof(opargs->stateid->data)); 818 WRITEMEM(args->stateid->data, sizeof(args->stateid->data));
806 WRITE64(arg->offset); 819 WRITE64(args->fl->fl_start);
807 WRITE64(arg->length); 820 WRITE64(nfs4_lock_length(args->fl));
808 821
809 return 0; 822 return 0;
810} 823}
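
The two helpers introduced above centralize the file_lock-to-NFSv4 mapping that encode_lock/lockt/locku previously received pre-computed through nfs_lockargs. The length conversion turns an inclusive POSIX range into an NFSv4 length, with a lock extending to OFFSET_MAX becoming the all-ones "to EOF" length. A runnable user-space sketch of that conversion (standalone types, OFFSET_MAX stands in for the kernel constant):

    #include <stdio.h>
    #include <stdint.h>

    #define OFFSET_MAX INT64_MAX

    static uint64_t nfs4_lock_length(int64_t fl_start, int64_t fl_end)
    {
        if (fl_end == OFFSET_MAX)       /* lock to EOF => all-ones length */
            return ~(uint64_t)0;
        return (uint64_t)(fl_end - fl_start + 1);
    }

    int main(void)
    {
        printf("%llu\n", (unsigned long long)nfs4_lock_length(0, 99));         /* 100 */
        printf("%llx\n", (unsigned long long)nfs4_lock_length(0, OFFSET_MAX)); /* ffff... */
        return 0;
    }
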
@@ -964,9 +977,9 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
964{ 977{
965 uint32_t *p; 978 uint32_t *p;
966 979
967 RESERVE_SPACE(8+sizeof(arg->stateid.data)); 980 RESERVE_SPACE(8+sizeof(arg->stateid->data));
968 WRITE32(OP_OPEN_CONFIRM); 981 WRITE32(OP_OPEN_CONFIRM);
969 WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); 982 WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data));
970 WRITE32(arg->seqid->sequence->counter); 983 WRITE32(arg->seqid->sequence->counter);
971 984
972 return 0; 985 return 0;
@@ -1499,9 +1512,6 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_opena
1499 }; 1512 };
1500 int status; 1513 int status;
1501 1514
1502 status = nfs_wait_on_sequence(args->seqid, req->rq_task);
1503 if (status != 0)
1504 goto out;
1505 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1515 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1506 encode_compound_hdr(&xdr, &hdr); 1516 encode_compound_hdr(&xdr, &hdr);
1507 status = encode_putfh(&xdr, args->fh); 1517 status = encode_putfh(&xdr, args->fh);
@@ -1538,9 +1548,6 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, uint32_t *p, struct n
1538 }; 1548 };
1539 int status; 1549 int status;
1540 1550
1541 status = nfs_wait_on_sequence(args->seqid, req->rq_task);
1542 if (status != 0)
1543 goto out;
1544 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1551 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1545 encode_compound_hdr(&xdr, &hdr); 1552 encode_compound_hdr(&xdr, &hdr);
1546 status = encode_putfh(&xdr, args->fh); 1553 status = encode_putfh(&xdr, args->fh);
@@ -1558,19 +1565,19 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nf
1558{ 1565{
1559 struct xdr_stream xdr; 1566 struct xdr_stream xdr;
1560 struct compound_hdr hdr = { 1567 struct compound_hdr hdr = {
1561 .nops = 2, 1568 .nops = 3,
1562 }; 1569 };
1563 int status; 1570 int status;
1564 1571
1565 status = nfs_wait_on_sequence(args->seqid, req->rq_task);
1566 if (status != 0)
1567 goto out;
1568 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1572 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1569 encode_compound_hdr(&xdr, &hdr); 1573 encode_compound_hdr(&xdr, &hdr);
1570 status = encode_putfh(&xdr, args->fh); 1574 status = encode_putfh(&xdr, args->fh);
1571 if (status) 1575 if (status)
1572 goto out; 1576 goto out;
1573 status = encode_open(&xdr, args); 1577 status = encode_open(&xdr, args);
1578 if (status)
1579 goto out;
1580 status = encode_getfattr(&xdr, args->bitmask);
1574out: 1581out:
1575 return status; 1582 return status;
1576} 1583}
@@ -1602,21 +1609,14 @@ out:
1602/* 1609/*
1603 * Encode a LOCK request 1610 * Encode a LOCK request
1604 */ 1611 */
1605static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args) 1612static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lock_args *args)
1606{ 1613{
1607 struct xdr_stream xdr; 1614 struct xdr_stream xdr;
1608 struct compound_hdr hdr = { 1615 struct compound_hdr hdr = {
1609 .nops = 2, 1616 .nops = 2,
1610 }; 1617 };
1611 struct nfs_lock_opargs *opargs = args->u.lock;
1612 int status; 1618 int status;
1613 1619
1614 status = nfs_wait_on_sequence(opargs->lock_seqid, req->rq_task);
1615 if (status != 0)
1616 goto out;
1617 /* Do we need to do an open_to_lock_owner? */
1618 if (opargs->lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)
1619 opargs->new_lock_owner = 0;
1620 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1620 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1621 encode_compound_hdr(&xdr, &hdr); 1621 encode_compound_hdr(&xdr, &hdr);
1622 status = encode_putfh(&xdr, args->fh); 1622 status = encode_putfh(&xdr, args->fh);
@@ -1630,7 +1630,7 @@ out:
1630/* 1630/*
1631 * Encode a LOCKT request 1631 * Encode a LOCKT request
1632 */ 1632 */
1633static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args) 1633static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockt_args *args)
1634{ 1634{
1635 struct xdr_stream xdr; 1635 struct xdr_stream xdr;
1636 struct compound_hdr hdr = { 1636 struct compound_hdr hdr = {
@@ -1651,7 +1651,7 @@ out:
1651/* 1651/*
1652 * Encode a LOCKU request 1652 * Encode a LOCKU request
1653 */ 1653 */
1654static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args) 1654static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_locku_args *args)
1655{ 1655{
1656 struct xdr_stream xdr; 1656 struct xdr_stream xdr;
1657 struct compound_hdr hdr = { 1657 struct compound_hdr hdr = {
@@ -1985,14 +1985,20 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, uint32_t *p, const str
1985{ 1985{
1986 struct xdr_stream xdr; 1986 struct xdr_stream xdr;
1987 struct compound_hdr hdr = { 1987 struct compound_hdr hdr = {
1988 .nops = 2, 1988 .nops = 3,
1989 }; 1989 };
1990 int status; 1990 int status;
1991 1991
1992 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1992 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1993 encode_compound_hdr(&xdr, &hdr); 1993 encode_compound_hdr(&xdr, &hdr);
1994 if ((status = encode_putfh(&xdr, args->fhandle)) == 0) 1994 status = encode_putfh(&xdr, args->fhandle);
1995 status = encode_delegreturn(&xdr, args->stateid); 1995 if (status != 0)
1996 goto out;
1997 status = encode_delegreturn(&xdr, args->stateid);
1998 if (status != 0)
1999 goto out;
2000 status = encode_getfattr(&xdr, args->bitmask);
2001out:
1996 return status; 2002 return status;
1997} 2003}
1998 2004
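
Besides the extra GETATTR (note that hdr.nops and both *_maxsz macros change in step, or the RPC layer under-sizes the reply buffer), the encoder is rewritten from a nested assignment-in-if chain to the sequential goto-out style the other encoders in this file use. A standalone illustration of the same shape (step() and fail_at are invented for the demo):

    #include <stdio.h>

    static int step(int fail) { return fail ? -1 : 0; }

    static int encode_compound(int fail_at)
    {
        int status;

        status = step(fail_at == 1);   /* PUTFH */
        if (status != 0)
            goto out;
        status = step(fail_at == 2);   /* DELEGRETURN */
        if (status != 0)
            goto out;
        status = step(fail_at == 3);   /* GETATTR */
    out:
        return status;
    }

    int main(void)
    {
        printf("%d %d\n", encode_compound(0), encode_compound(2)); /* 0 -1 */
        return 0;
    }
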
@@ -2955,55 +2961,64 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
2955/* 2961/*
2956 * We create the owner, so we know a proper owner.id length is 4. 2962 * We create the owner, so we know a proper owner.id length is 4.
2957 */ 2963 */
2958static int decode_lock_denied (struct xdr_stream *xdr, struct nfs_lock_denied *denied) 2964static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
2959{ 2965{
2966 uint64_t offset, length, clientid;
2960 uint32_t *p; 2967 uint32_t *p;
2961 uint32_t namelen; 2968 uint32_t namelen, type;
2962 2969
2963 READ_BUF(32); 2970 READ_BUF(32);
2964 READ64(denied->offset); 2971 READ64(offset);
2965 READ64(denied->length); 2972 READ64(length);
2966 READ32(denied->type); 2973 READ32(type);
2967 READ64(denied->owner.clientid); 2974 if (fl != NULL) {
2975 fl->fl_start = (loff_t)offset;
2976 fl->fl_end = fl->fl_start + (loff_t)length - 1;
2977 if (length == ~(uint64_t)0)
2978 fl->fl_end = OFFSET_MAX;
2979 fl->fl_type = F_WRLCK;
2980 if (type & 1)
2981 fl->fl_type = F_RDLCK;
2982 fl->fl_pid = 0;
2983 }
2984 READ64(clientid);
2968 READ32(namelen); 2985 READ32(namelen);
2969 READ_BUF(namelen); 2986 READ_BUF(namelen);
2970 if (namelen == 4)
2971 READ32(denied->owner.id);
2972 return -NFS4ERR_DENIED; 2987 return -NFS4ERR_DENIED;
2973} 2988}
2974 2989
2975static int decode_lock(struct xdr_stream *xdr, struct nfs_lockres *res) 2990static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
2976{ 2991{
2977 uint32_t *p; 2992 uint32_t *p;
2978 int status; 2993 int status;
2979 2994
2980 status = decode_op_hdr(xdr, OP_LOCK); 2995 status = decode_op_hdr(xdr, OP_LOCK);
2981 if (status == 0) { 2996 if (status == 0) {
2982 READ_BUF(sizeof(res->u.stateid.data)); 2997 READ_BUF(sizeof(res->stateid.data));
2983 COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data)); 2998 COPYMEM(res->stateid.data, sizeof(res->stateid.data));
2984 } else if (status == -NFS4ERR_DENIED) 2999 } else if (status == -NFS4ERR_DENIED)
2985 return decode_lock_denied(xdr, &res->u.denied); 3000 return decode_lock_denied(xdr, NULL);
2986 return status; 3001 return status;
2987} 3002}
2988 3003
2989static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockres *res) 3004static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
2990{ 3005{
2991 int status; 3006 int status;
2992 status = decode_op_hdr(xdr, OP_LOCKT); 3007 status = decode_op_hdr(xdr, OP_LOCKT);
2993 if (status == -NFS4ERR_DENIED) 3008 if (status == -NFS4ERR_DENIED)
2994 return decode_lock_denied(xdr, &res->u.denied); 3009 return decode_lock_denied(xdr, res->denied);
2995 return status; 3010 return status;
2996} 3011}
2997 3012
2998static int decode_locku(struct xdr_stream *xdr, struct nfs_lockres *res) 3013static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
2999{ 3014{
3000 uint32_t *p; 3015 uint32_t *p;
3001 int status; 3016 int status;
3002 3017
3003 status = decode_op_hdr(xdr, OP_LOCKU); 3018 status = decode_op_hdr(xdr, OP_LOCKU);
3004 if (status == 0) { 3019 if (status == 0) {
3005 READ_BUF(sizeof(res->u.stateid.data)); 3020 READ_BUF(sizeof(res->stateid.data));
3006 COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data)); 3021 COPYMEM(res->stateid.data, sizeof(res->stateid.data));
3007 } 3022 }
3008 return status; 3023 return status;
3009} 3024}
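
decode_lock_denied() now writes straight into a struct file_lock, converting the wire (offset, length) pair back into an inclusive POSIX range, with an all-ones length mapping to OFFSET_MAX. A user-space sketch of that conversion, mirroring the order of operations in the hunk above:

    #include <stdio.h>
    #include <stdint.h>

    #define OFFSET_MAX INT64_MAX

    static void denied_to_range(uint64_t offset, uint64_t length,
                                int64_t *start, int64_t *end)
    {
        *start = (int64_t)offset;
        *end = *start + (int64_t)length - 1;
        if (length == ~(uint64_t)0)     /* "whole file" on the wire */
            *end = OFFSET_MAX;
    }

    int main(void)
    {
        int64_t s, e;
        denied_to_range(100, 50, &s, &e);
        printf("%lld..%lld\n", (long long)s, (long long)e); /* 100..149 */
        return 0;
    }
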
@@ -3831,6 +3846,9 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, uint32_t *p, struct
3831 if (status) 3846 if (status)
3832 goto out; 3847 goto out;
3833 status = decode_open(&xdr, res); 3848 status = decode_open(&xdr, res);
3849 if (status)
3850 goto out;
3851 decode_getfattr(&xdr, res->f_attr, res->server);
3834out: 3852out:
3835 return status; 3853 return status;
3836} 3854}
@@ -3864,7 +3882,7 @@ out:
3864/* 3882/*
3865 * Decode LOCK response 3883 * Decode LOCK response
3866 */ 3884 */
3867static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res) 3885static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lock_res *res)
3868{ 3886{
3869 struct xdr_stream xdr; 3887 struct xdr_stream xdr;
3870 struct compound_hdr hdr; 3888 struct compound_hdr hdr;
@@ -3885,7 +3903,7 @@ out:
3885/* 3903/*
3886 * Decode LOCKT response 3904 * Decode LOCKT response
3887 */ 3905 */
3888static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res) 3906static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockt_res *res)
3889{ 3907{
3890 struct xdr_stream xdr; 3908 struct xdr_stream xdr;
3891 struct compound_hdr hdr; 3909 struct compound_hdr hdr;
@@ -3906,7 +3924,7 @@ out:
3906/* 3924/*
3907 * Decode LOCKU response 3925 * Decode LOCKU response
3908 */ 3926 */
3909static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res) 3927static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_locku_res *res)
3910{ 3928{
3911 struct xdr_stream xdr; 3929 struct xdr_stream xdr;
3912 struct compound_hdr hdr; 3930 struct compound_hdr hdr;
@@ -4174,7 +4192,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s
4174/* 4192/*
4175 * DELEGRETURN request 4193 * DELEGRETURN request
4176 */ 4194 */
4177static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, void *dummy) 4195static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_delegreturnres *res)
4178{ 4196{
4179 struct xdr_stream xdr; 4197 struct xdr_stream xdr;
4180 struct compound_hdr hdr; 4198 struct compound_hdr hdr;
@@ -4182,11 +4200,14 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, void *d
4182 4200
4183 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4201 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4184 status = decode_compound_hdr(&xdr, &hdr); 4202 status = decode_compound_hdr(&xdr, &hdr);
4185 if (status == 0) { 4203 if (status != 0)
4186 status = decode_putfh(&xdr); 4204 goto out;
4187 if (status == 0) 4205 status = decode_putfh(&xdr);
4188 status = decode_delegreturn(&xdr); 4206 if (status != 0)
4189 } 4207 goto out;
4208 status = decode_delegreturn(&xdr);
4209 decode_getfattr(&xdr, res->fattr, res->server);
4210out:
4190 return status; 4211 return status;
4191} 4212}
4192 4213
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 1b272a135a3..e897e00c2c9 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -275,7 +275,9 @@ static int __init root_nfs_parse(char *name, char *buf)
275 case Opt_noacl: 275 case Opt_noacl:
276 nfs_data.flags |= NFS_MOUNT_NOACL; 276 nfs_data.flags |= NFS_MOUNT_NOACL;
277 break; 277 break;
278 default : 278 default:
279 printk(KERN_WARNING "Root-NFS: unknown "
280 "option: %s\n", p);
279 return 0; 281 return 0;
280 } 282 }
281 } 283 }
@@ -296,8 +298,8 @@ static int __init root_nfs_name(char *name)
296 nfs_port = -1; 298 nfs_port = -1;
297 nfs_data.version = NFS_MOUNT_VERSION; 299 nfs_data.version = NFS_MOUNT_VERSION;
298 nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ 300 nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */
299 nfs_data.rsize = NFS_DEF_FILE_IO_BUFFER_SIZE; 301 nfs_data.rsize = NFS_DEF_FILE_IO_SIZE;
300 nfs_data.wsize = NFS_DEF_FILE_IO_BUFFER_SIZE; 302 nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
301 nfs_data.acregmin = 3; 303 nfs_data.acregmin = 3;
302 nfs_data.acregmax = 60; 304 nfs_data.acregmax = 60;
303 nfs_data.acdirmin = 30; 305 nfs_data.acdirmin = 30;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index a48a003242c..f5150d71c03 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -111,6 +111,9 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
111 }; 111 };
112 int status; 112 int status;
113 113
 114 /* Mask out everything but the mode bits from sattr->ia_mode */
115 sattr->ia_mode &= S_IALLUGO;
116
114 dprintk("NFS call setattr\n"); 117 dprintk("NFS call setattr\n");
115 nfs_fattr_init(fattr); 118 nfs_fattr_init(fattr);
116 status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0); 119 status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0);
@@ -375,6 +378,7 @@ nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
375 378
376 dprintk("NFS call link %s\n", name->name); 379 dprintk("NFS call link %s\n", name->name);
377 status = rpc_call(NFS_CLIENT(inode), NFSPROC_LINK, &arg, NULL, 0); 380 status = rpc_call(NFS_CLIENT(inode), NFSPROC_LINK, &arg, NULL, 0);
381 nfs_mark_for_revalidate(inode);
378 nfs_mark_for_revalidate(dir); 382 nfs_mark_for_revalidate(dir);
379 dprintk("NFS reply link: %d\n", status); 383 dprintk("NFS reply link: %d\n", status);
380 return status; 384 return status;
@@ -546,10 +550,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
546 550
547extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); 551extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
548 552
549static void 553static void nfs_read_done(struct rpc_task *task, void *calldata)
550nfs_read_done(struct rpc_task *task)
551{ 554{
552 struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; 555 struct nfs_read_data *data = calldata;
553 556
554 if (task->tk_status >= 0) { 557 if (task->tk_status >= 0) {
555 nfs_refresh_inode(data->inode, data->res.fattr); 558 nfs_refresh_inode(data->inode, data->res.fattr);
@@ -559,9 +562,14 @@ nfs_read_done(struct rpc_task *task)
559 if (data->args.offset + data->args.count >= data->res.fattr->size) 562 if (data->args.offset + data->args.count >= data->res.fattr->size)
560 data->res.eof = 1; 563 data->res.eof = 1;
561 } 564 }
562 nfs_readpage_result(task); 565 nfs_readpage_result(task, calldata);
563} 566}
564 567
568static const struct rpc_call_ops nfs_read_ops = {
569 .rpc_call_done = nfs_read_done,
570 .rpc_release = nfs_readdata_release,
571};
572
565static void 573static void
566nfs_proc_read_setup(struct nfs_read_data *data) 574nfs_proc_read_setup(struct nfs_read_data *data)
567{ 575{
@@ -579,20 +587,24 @@ nfs_proc_read_setup(struct nfs_read_data *data)
579 flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); 587 flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
580 588
581 /* Finalize the task. */ 589 /* Finalize the task. */
582 rpc_init_task(task, NFS_CLIENT(inode), nfs_read_done, flags); 590 rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_read_ops, data);
583 rpc_call_setup(task, &msg, 0); 591 rpc_call_setup(task, &msg, 0);
584} 592}
585 593
586static void 594static void nfs_write_done(struct rpc_task *task, void *calldata)
587nfs_write_done(struct rpc_task *task)
588{ 595{
589 struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; 596 struct nfs_write_data *data = calldata;
590 597
591 if (task->tk_status >= 0) 598 if (task->tk_status >= 0)
592 nfs_post_op_update_inode(data->inode, data->res.fattr); 599 nfs_post_op_update_inode(data->inode, data->res.fattr);
593 nfs_writeback_done(task); 600 nfs_writeback_done(task, calldata);
594} 601}
595 602
603static const struct rpc_call_ops nfs_write_ops = {
604 .rpc_call_done = nfs_write_done,
605 .rpc_release = nfs_writedata_release,
606};
607
596static void 608static void
597nfs_proc_write_setup(struct nfs_write_data *data, int how) 609nfs_proc_write_setup(struct nfs_write_data *data, int how)
598{ 610{
@@ -613,7 +625,7 @@ nfs_proc_write_setup(struct nfs_write_data *data, int how)
613 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; 625 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
614 626
615 /* Finalize the task. */ 627 /* Finalize the task. */
616 rpc_init_task(task, NFS_CLIENT(inode), nfs_write_done, flags); 628 rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_write_ops, data);
617 rpc_call_setup(task, &msg, 0); 629 rpc_call_setup(task, &msg, 0);
618} 630}
619 631
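
This is the recurring conversion in the series: per-task function pointers (tk_calldata, tk_release, a done callback passed to rpc_init_task) are replaced by one shared, const callback table handed to rpc_init_task together with the calldata pointer. A user-space sketch of the shape (the _sk types are stand-ins, not the real RPC client API):

    #include <stdio.h>

    struct rpc_task_sk;                     /* opaque here, as in the sketch */

    struct rpc_call_ops_sk {
        void (*rpc_call_done)(struct rpc_task_sk *, void *calldata);
        void (*rpc_release)(void *calldata);
    };

    static void read_done(struct rpc_task_sk *task, void *calldata)
    {
        (void)task;
        printf("done: %s\n", (const char *)calldata);
    }

    static void read_release(void *calldata)
    {
        printf("release: %s\n", (const char *)calldata);
    }

    static const struct rpc_call_ops_sk read_ops = {
        .rpc_call_done = read_done,
        .rpc_release   = read_release,
    };

    int main(void)
    {
        char data[] = "nfs_read_data";      /* plays the role of tk_calldata */
        read_ops.rpc_call_done(NULL, data); /* engine calls this on completion */
        read_ops.rpc_release(data);         /* then frees via the shared table */
        return 0;
    }
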
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 5f20eafba8e..05eb43fadf8 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -42,9 +42,8 @@ mempool_t *nfs_rdata_mempool;
42 42
43#define MIN_POOL_READ (32) 43#define MIN_POOL_READ (32)
44 44
45void nfs_readdata_release(struct rpc_task *task) 45void nfs_readdata_release(void *data)
46{ 46{
47 struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata;
48 nfs_readdata_free(data); 47 nfs_readdata_free(data);
49} 48}
50 49
@@ -84,7 +83,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
84 int result; 83 int result;
85 struct nfs_read_data *rdata; 84 struct nfs_read_data *rdata;
86 85
87 rdata = nfs_readdata_alloc(); 86 rdata = nfs_readdata_alloc(1);
88 if (!rdata) 87 if (!rdata)
89 return -ENOMEM; 88 return -ENOMEM;
90 89
@@ -220,9 +219,6 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
220 NFS_PROTO(inode)->read_setup(data); 219 NFS_PROTO(inode)->read_setup(data);
221 220
222 data->task.tk_cookie = (unsigned long)inode; 221 data->task.tk_cookie = (unsigned long)inode;
223 data->task.tk_calldata = data;
224 /* Release requests */
225 data->task.tk_release = nfs_readdata_release;
226 222
227 dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", 223 dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
228 data->task.tk_pid, 224 data->task.tk_pid,
@@ -287,7 +283,7 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
287 283
288 nbytes = req->wb_bytes; 284 nbytes = req->wb_bytes;
289 for(;;) { 285 for(;;) {
290 data = nfs_readdata_alloc(); 286 data = nfs_readdata_alloc(1);
291 if (!data) 287 if (!data)
292 goto out_bad; 288 goto out_bad;
293 INIT_LIST_HEAD(&data->pages); 289 INIT_LIST_HEAD(&data->pages);
@@ -343,7 +339,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode)
343 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) 339 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
344 return nfs_pagein_multi(head, inode); 340 return nfs_pagein_multi(head, inode);
345 341
346 data = nfs_readdata_alloc(); 342 data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages);
347 if (!data) 343 if (!data)
348 goto out_bad; 344 goto out_bad;
349 345
@@ -452,9 +448,9 @@ static void nfs_readpage_result_full(struct nfs_read_data *data, int status)
452 * This is the callback from RPC telling us whether a reply was 448 * This is the callback from RPC telling us whether a reply was
453 * received or some error occurred (timeout or socket shutdown). 449 * received or some error occurred (timeout or socket shutdown).
454 */ 450 */
455void nfs_readpage_result(struct rpc_task *task) 451void nfs_readpage_result(struct rpc_task *task, void *calldata)
456{ 452{
457 struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; 453 struct nfs_read_data *data = calldata;
458 struct nfs_readargs *argp = &data->args; 454 struct nfs_readargs *argp = &data->args;
459 struct nfs_readres *resp = &data->res; 455 struct nfs_readres *resp = &data->res;
460 int status = task->tk_status; 456 int status = task->tk_status;
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
new file mode 100644
index 00000000000..4c486eb867c
--- /dev/null
+++ b/fs/nfs/sysctl.c
@@ -0,0 +1,84 @@
1/*
2 * linux/fs/nfs/sysctl.c
3 *
4 * Sysctl interface to NFS parameters
5 */
6#include <linux/config.h>
7#include <linux/types.h>
8#include <linux/linkage.h>
9#include <linux/ctype.h>
10#include <linux/fs.h>
11#include <linux/sysctl.h>
12#include <linux/module.h>
13#include <linux/nfs4.h>
14#include <linux/nfs_idmap.h>
15
16#include "callback.h"
17
18static const int nfs_set_port_min = 0;
19static const int nfs_set_port_max = 65535;
20static struct ctl_table_header *nfs_callback_sysctl_table;
21/*
22 * Something that isn't CTL_ANY, CTL_NONE or a value that may clash.
23 * Use the same values as fs/lockd/svc.c
24 */
25#define CTL_UNNUMBERED -2
26
27static ctl_table nfs_cb_sysctls[] = {
28#ifdef CONFIG_NFS_V4
29 {
30 .ctl_name = CTL_UNNUMBERED,
31 .procname = "nfs_callback_tcpport",
32 .data = &nfs_callback_set_tcpport,
33 .maxlen = sizeof(int),
34 .mode = 0644,
35 .proc_handler = &proc_dointvec_minmax,
36 .extra1 = (int *)&nfs_set_port_min,
37 .extra2 = (int *)&nfs_set_port_max,
38 },
39 {
40 .ctl_name = CTL_UNNUMBERED,
41 .procname = "idmap_cache_timeout",
42 .data = &nfs_idmap_cache_timeout,
43 .maxlen = sizeof(int),
44 .mode = 0644,
45 .proc_handler = &proc_dointvec_jiffies,
46 .strategy = &sysctl_jiffies,
47 },
48#endif
49 { .ctl_name = 0 }
50};
51
52static ctl_table nfs_cb_sysctl_dir[] = {
53 {
54 .ctl_name = CTL_UNNUMBERED,
55 .procname = "nfs",
56 .mode = 0555,
57 .child = nfs_cb_sysctls,
58 },
59 { .ctl_name = 0 }
60};
61
62static ctl_table nfs_cb_sysctl_root[] = {
63 {
64 .ctl_name = CTL_FS,
65 .procname = "fs",
66 .mode = 0555,
67 .child = nfs_cb_sysctl_dir,
68 },
69 { .ctl_name = 0 }
70};
71
72int nfs_register_sysctl(void)
73{
74 nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root, 0);
75 if (nfs_callback_sysctl_table == NULL)
76 return -ENOMEM;
77 return 0;
78}
79
80void nfs_unregister_sysctl(void)
81{
82 unregister_sysctl_table(nfs_callback_sysctl_table);
83 nfs_callback_sysctl_table = NULL;
84}
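
Once registered, the two knobs above surface as /proc/sys/fs/nfs/nfs_callback_tcpport and /proc/sys/fs/nfs/idmap_cache_timeout (the former only with CONFIG_NFS_V4). A small user-space reader, assuming the table is registered and the file exists:

    #include <stdio.h>

    int main(void)
    {
        char buf[32];
        FILE *f = fopen("/proc/sys/fs/nfs/nfs_callback_tcpport", "r");

        if (f == NULL)
            return 1;                       /* sysctl not registered */
        if (fgets(buf, sizeof(buf), f) != NULL)
            printf("callback port: %s", buf);
        fclose(f);
        return 0;
    }
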
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index d639d172d56..a65c7b53d55 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -87,10 +87,9 @@ nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data)
87 * We delay initializing RPC info until after the call to dentry_iput() 87 * We delay initializing RPC info until after the call to dentry_iput()
88 * in order to minimize races against rename(). 88 * in order to minimize races against rename().
89 */ 89 */
90static void 90static void nfs_async_unlink_init(struct rpc_task *task, void *calldata)
91nfs_async_unlink_init(struct rpc_task *task)
92{ 91{
93 struct nfs_unlinkdata *data = (struct nfs_unlinkdata *)task->tk_calldata; 92 struct nfs_unlinkdata *data = calldata;
94 struct dentry *dir = data->dir; 93 struct dentry *dir = data->dir;
95 struct rpc_message msg = { 94 struct rpc_message msg = {
96 .rpc_cred = data->cred, 95 .rpc_cred = data->cred,
@@ -116,10 +115,9 @@ nfs_async_unlink_init(struct rpc_task *task)
116 * 115 *
117 * Do the directory attribute update. 116 * Do the directory attribute update.
118 */ 117 */
119static void 118static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
120nfs_async_unlink_done(struct rpc_task *task)
121{ 119{
122 struct nfs_unlinkdata *data = (struct nfs_unlinkdata *)task->tk_calldata; 120 struct nfs_unlinkdata *data = calldata;
123 struct dentry *dir = data->dir; 121 struct dentry *dir = data->dir;
124 struct inode *dir_i; 122 struct inode *dir_i;
125 123
@@ -141,13 +139,18 @@ nfs_async_unlink_done(struct rpc_task *task)
141 * We need to call nfs_put_unlinkdata as a 'tk_release' task since the 139 * We need to call nfs_put_unlinkdata as a 'tk_release' task since the
142 * rpc_task would be freed too. 140 * rpc_task would be freed too.
143 */ 141 */
144static void 142static void nfs_async_unlink_release(void *calldata)
145nfs_async_unlink_release(struct rpc_task *task)
146{ 143{
147 struct nfs_unlinkdata *data = (struct nfs_unlinkdata *)task->tk_calldata; 144 struct nfs_unlinkdata *data = calldata;
148 nfs_put_unlinkdata(data); 145 nfs_put_unlinkdata(data);
149} 146}
150 147
148static const struct rpc_call_ops nfs_unlink_ops = {
149 .rpc_call_prepare = nfs_async_unlink_init,
150 .rpc_call_done = nfs_async_unlink_done,
151 .rpc_release = nfs_async_unlink_release,
152};
153
151/** 154/**
152 * nfs_async_unlink - asynchronous unlinking of a file 155 * nfs_async_unlink - asynchronous unlinking of a file
153 * @dentry: dentry to unlink 156 * @dentry: dentry to unlink
@@ -157,7 +160,6 @@ nfs_async_unlink(struct dentry *dentry)
157{ 160{
158 struct dentry *dir = dentry->d_parent; 161 struct dentry *dir = dentry->d_parent;
159 struct nfs_unlinkdata *data; 162 struct nfs_unlinkdata *data;
160 struct rpc_task *task;
161 struct rpc_clnt *clnt = NFS_CLIENT(dir->d_inode); 163 struct rpc_clnt *clnt = NFS_CLIENT(dir->d_inode);
162 int status = -ENOMEM; 164 int status = -ENOMEM;
163 165
@@ -178,17 +180,13 @@ nfs_async_unlink(struct dentry *dentry)
178 nfs_deletes = data; 180 nfs_deletes = data;
179 data->count = 1; 181 data->count = 1;
180 182
181 task = &data->task; 183 rpc_init_task(&data->task, clnt, RPC_TASK_ASYNC, &nfs_unlink_ops, data);
182 rpc_init_task(task, clnt, nfs_async_unlink_done , RPC_TASK_ASYNC);
183 task->tk_calldata = data;
184 task->tk_action = nfs_async_unlink_init;
185 task->tk_release = nfs_async_unlink_release;
186 184
187 spin_lock(&dentry->d_lock); 185 spin_lock(&dentry->d_lock);
188 dentry->d_flags |= DCACHE_NFSFS_RENAMED; 186 dentry->d_flags |= DCACHE_NFSFS_RENAMED;
189 spin_unlock(&dentry->d_lock); 187 spin_unlock(&dentry->d_lock);
190 188
191 rpc_sleep_on(&nfs_delete_queue, task, NULL, NULL); 189 rpc_sleep_on(&nfs_delete_queue, &data->task, NULL, NULL);
192 status = 0; 190 status = 0;
193 out: 191 out:
194 return status; 192 return status;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 8f71e766cc5..9449b683550 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -89,24 +89,38 @@ static mempool_t *nfs_commit_mempool;
89 89
90static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion); 90static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
91 91
92static inline struct nfs_write_data *nfs_commit_alloc(void) 92static inline struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
93{ 93{
94 struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS); 94 struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS);
95
95 if (p) { 96 if (p) {
96 memset(p, 0, sizeof(*p)); 97 memset(p, 0, sizeof(*p));
97 INIT_LIST_HEAD(&p->pages); 98 INIT_LIST_HEAD(&p->pages);
99 if (pagecount < NFS_PAGEVEC_SIZE)
100 p->pagevec = &p->page_array[0];
101 else {
102 size_t size = ++pagecount * sizeof(struct page *);
103 p->pagevec = kmalloc(size, GFP_NOFS);
104 if (p->pagevec) {
105 memset(p->pagevec, 0, size);
106 } else {
107 mempool_free(p, nfs_commit_mempool);
108 p = NULL;
109 }
110 }
98 } 111 }
99 return p; 112 return p;
100} 113}
101 114
102static inline void nfs_commit_free(struct nfs_write_data *p) 115static inline void nfs_commit_free(struct nfs_write_data *p)
103{ 116{
117 if (p && (p->pagevec != &p->page_array[0]))
118 kfree(p->pagevec);
104 mempool_free(p, nfs_commit_mempool); 119 mempool_free(p, nfs_commit_mempool);
105} 120}
106 121
107static void nfs_writedata_release(struct rpc_task *task) 122void nfs_writedata_release(void *wdata)
108{ 123{
109 struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata;
110 nfs_writedata_free(wdata); 124 nfs_writedata_free(wdata);
111} 125}
112 126
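
The allocation strategy above: small I/Os use the page array embedded in the write data, larger ones fall back to a heap allocation (GFP_NOFS in the kernel), and the free path checks which case it got. A runnable user-space sketch (NFS_PAGEVEC_SIZE's real value lives in the kernel headers; 8 here is just for the demo):

    #include <stdio.h>
    #include <stdlib.h>

    #define NFS_PAGEVEC_SIZE 8

    struct write_data_sk {
        void **pagevec;
        void *page_array[NFS_PAGEVEC_SIZE];
    };

    static int setup_pagevec(struct write_data_sk *p, unsigned int pagecount)
    {
        if (pagecount < NFS_PAGEVEC_SIZE) {
            p->pagevec = &p->page_array[0];        /* embedded array */
        } else {
            size_t size = ++pagecount * sizeof(void *);
            p->pagevec = calloc(1, size);          /* zeroed, like the memset */
            if (p->pagevec == NULL)
                return -1;
        }
        return 0;
    }

    static void free_pagevec(struct write_data_sk *p)
    {
        if (p->pagevec != &p->page_array[0])
            free(p->pagevec);
    }

    int main(void)
    {
        struct write_data_sk d = { 0 };
        if (setup_pagevec(&d, 32) == 0) {
            printf("heap vec: %d\n", d.pagevec != &d.page_array[0]); /* 1 */
            free_pagevec(&d);
        }
        return 0;
    }
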
@@ -168,7 +182,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
168 int result, written = 0; 182 int result, written = 0;
169 struct nfs_write_data *wdata; 183 struct nfs_write_data *wdata;
170 184
171 wdata = nfs_writedata_alloc(); 185 wdata = nfs_writedata_alloc(1);
172 if (!wdata) 186 if (!wdata)
173 return -ENOMEM; 187 return -ENOMEM;
174 188
@@ -189,6 +203,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
189 (long long)NFS_FILEID(inode), 203 (long long)NFS_FILEID(inode),
190 count, (long long)(page_offset(page) + offset)); 204 count, (long long)(page_offset(page) + offset));
191 205
206 set_page_writeback(page);
192 nfs_begin_data_update(inode); 207 nfs_begin_data_update(inode);
193 do { 208 do {
194 if (count < wsize) 209 if (count < wsize)
@@ -221,6 +236,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
221 236
222io_error: 237io_error:
223 nfs_end_data_update(inode); 238 nfs_end_data_update(inode);
239 end_page_writeback(page);
224 nfs_writedata_free(wdata); 240 nfs_writedata_free(wdata);
225 return written ? written : result; 241 return written ? written : result;
226} 242}
@@ -230,19 +246,16 @@ static int nfs_writepage_async(struct nfs_open_context *ctx,
230 unsigned int offset, unsigned int count) 246 unsigned int offset, unsigned int count)
231{ 247{
232 struct nfs_page *req; 248 struct nfs_page *req;
233 int status;
234 249
235 req = nfs_update_request(ctx, inode, page, offset, count); 250 req = nfs_update_request(ctx, inode, page, offset, count);
236 status = (IS_ERR(req)) ? PTR_ERR(req) : 0; 251 if (IS_ERR(req))
237 if (status < 0) 252 return PTR_ERR(req);
238 goto out;
239 /* Update file length */ 253 /* Update file length */
240 nfs_grow_file(page, offset, count); 254 nfs_grow_file(page, offset, count);
241 /* Set the PG_uptodate flag? */ 255 /* Set the PG_uptodate flag? */
242 nfs_mark_uptodate(page, offset, count); 256 nfs_mark_uptodate(page, offset, count);
243 nfs_unlock_request(req); 257 nfs_unlock_request(req);
244 out: 258 return 0;
245 return status;
246} 259}
247 260
248static int wb_priority(struct writeback_control *wbc) 261static int wb_priority(struct writeback_control *wbc)
@@ -302,11 +315,8 @@ do_it:
302 lock_kernel(); 315 lock_kernel();
303 if (!IS_SYNC(inode) && inode_referenced) { 316 if (!IS_SYNC(inode) && inode_referenced) {
304 err = nfs_writepage_async(ctx, inode, page, 0, offset); 317 err = nfs_writepage_async(ctx, inode, page, 0, offset);
305 if (err >= 0) { 318 if (!wbc->for_writepages)
306 err = 0; 319 nfs_flush_inode(inode, 0, 0, wb_priority(wbc));
307 if (wbc->for_reclaim)
308 nfs_flush_inode(inode, 0, 0, FLUSH_STABLE);
309 }
310 } else { 320 } else {
311 err = nfs_writepage_sync(ctx, inode, page, 0, 321 err = nfs_writepage_sync(ctx, inode, page, 0,
312 offset, priority); 322 offset, priority);
@@ -875,9 +885,6 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
875 885
876 data->task.tk_priority = flush_task_priority(how); 886 data->task.tk_priority = flush_task_priority(how);
877 data->task.tk_cookie = (unsigned long)inode; 887 data->task.tk_cookie = (unsigned long)inode;
878 data->task.tk_calldata = data;
879 /* Release requests */
880 data->task.tk_release = nfs_writedata_release;
881 888
882 dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n", 889 dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
883 data->task.tk_pid, 890 data->task.tk_pid,
@@ -917,7 +924,7 @@ static int nfs_flush_multi(struct list_head *head, struct inode *inode, int how)
917 924
918 nbytes = req->wb_bytes; 925 nbytes = req->wb_bytes;
919 for (;;) { 926 for (;;) {
920 data = nfs_writedata_alloc(); 927 data = nfs_writedata_alloc(1);
921 if (!data) 928 if (!data)
922 goto out_bad; 929 goto out_bad;
923 list_add(&data->pages, &list); 930 list_add(&data->pages, &list);
@@ -929,7 +936,7 @@ static int nfs_flush_multi(struct list_head *head, struct inode *inode, int how)
929 atomic_set(&req->wb_complete, requests); 936 atomic_set(&req->wb_complete, requests);
930 937
931 ClearPageError(page); 938 ClearPageError(page);
932 SetPageWriteback(page); 939 set_page_writeback(page);
933 offset = 0; 940 offset = 0;
934 nbytes = req->wb_bytes; 941 nbytes = req->wb_bytes;
935 do { 942 do {
@@ -981,7 +988,7 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how)
981 if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE) 988 if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE)
982 return nfs_flush_multi(head, inode, how); 989 return nfs_flush_multi(head, inode, how);
983 990
984 data = nfs_writedata_alloc(); 991 data = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
985 if (!data) 992 if (!data)
986 goto out_bad; 993 goto out_bad;
987 994
@@ -992,7 +999,7 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how)
992 nfs_list_remove_request(req); 999 nfs_list_remove_request(req);
993 nfs_list_add_request(req, &data->pages); 1000 nfs_list_add_request(req, &data->pages);
994 ClearPageError(req->wb_page); 1001 ClearPageError(req->wb_page);
995 SetPageWriteback(req->wb_page); 1002 set_page_writeback(req->wb_page);
996 *pages++ = req->wb_page; 1003 *pages++ = req->wb_page;
997 count += req->wb_bytes; 1004 count += req->wb_bytes;
998 } 1005 }
@@ -1135,9 +1142,9 @@ static void nfs_writeback_done_full(struct nfs_write_data *data, int status)
1135/* 1142/*
1136 * This function is called when the WRITE call is complete. 1143 * This function is called when the WRITE call is complete.
1137 */ 1144 */
1138void nfs_writeback_done(struct rpc_task *task) 1145void nfs_writeback_done(struct rpc_task *task, void *calldata)
1139{ 1146{
1140 struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; 1147 struct nfs_write_data *data = calldata;
1141 struct nfs_writeargs *argp = &data->args; 1148 struct nfs_writeargs *argp = &data->args;
1142 struct nfs_writeres *resp = &data->res; 1149 struct nfs_writeres *resp = &data->res;
1143 1150
@@ -1204,9 +1211,8 @@ void nfs_writeback_done(struct rpc_task *task)
1204 1211
1205 1212
1206#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1213#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1207static void nfs_commit_release(struct rpc_task *task) 1214void nfs_commit_release(void *wdata)
1208{ 1215{
1209 struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata;
1210 nfs_commit_free(wdata); 1216 nfs_commit_free(wdata);
1211} 1217}
1212 1218
@@ -1242,9 +1248,6 @@ static void nfs_commit_rpcsetup(struct list_head *head,
1242 1248
1243 data->task.tk_priority = flush_task_priority(how); 1249 data->task.tk_priority = flush_task_priority(how);
1244 data->task.tk_cookie = (unsigned long)inode; 1250 data->task.tk_cookie = (unsigned long)inode;
1245 data->task.tk_calldata = data;
1246 /* Release requests */
1247 data->task.tk_release = nfs_commit_release;
1248 1251
1249 dprintk("NFS: %4d initiated commit call\n", data->task.tk_pid); 1252 dprintk("NFS: %4d initiated commit call\n", data->task.tk_pid);
1250} 1253}
@@ -1253,12 +1256,12 @@ static void nfs_commit_rpcsetup(struct list_head *head,
1253 * Commit dirty pages 1256 * Commit dirty pages
1254 */ 1257 */
1255static int 1258static int
1256nfs_commit_list(struct list_head *head, int how) 1259nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1257{ 1260{
1258 struct nfs_write_data *data; 1261 struct nfs_write_data *data;
1259 struct nfs_page *req; 1262 struct nfs_page *req;
1260 1263
1261 data = nfs_commit_alloc(); 1264 data = nfs_commit_alloc(NFS_SERVER(inode)->wpages);
1262 1265
1263 if (!data) 1266 if (!data)
1264 goto out_bad; 1267 goto out_bad;
@@ -1281,10 +1284,9 @@ nfs_commit_list(struct list_head *head, int how)
1281/* 1284/*
1282 * COMMIT call returned 1285 * COMMIT call returned
1283 */ 1286 */
1284void 1287void nfs_commit_done(struct rpc_task *task, void *calldata)
1285nfs_commit_done(struct rpc_task *task)
1286{ 1288{
1287 struct nfs_write_data *data = (struct nfs_write_data *)task->tk_calldata; 1289 struct nfs_write_data *data = calldata;
1288 struct nfs_page *req; 1290 struct nfs_page *req;
1289 int res = 0; 1291 int res = 0;
1290 1292
@@ -1364,7 +1366,7 @@ int nfs_commit_inode(struct inode *inode, int how)
1364 res = nfs_scan_commit(inode, &head, 0, 0); 1366 res = nfs_scan_commit(inode, &head, 0, 0);
1365 spin_unlock(&nfsi->req_lock); 1367 spin_unlock(&nfsi->req_lock);
1366 if (res) { 1368 if (res) {
1367 error = nfs_commit_list(&head, how); 1369 error = nfs_commit_list(inode, &head, how);
1368 if (error < 0) 1370 if (error < 0)
1369 return error; 1371 return error;
1370 } 1372 }
@@ -1375,22 +1377,23 @@ int nfs_commit_inode(struct inode *inode, int how)
1375int nfs_sync_inode(struct inode *inode, unsigned long idx_start, 1377int nfs_sync_inode(struct inode *inode, unsigned long idx_start,
1376 unsigned int npages, int how) 1378 unsigned int npages, int how)
1377{ 1379{
1378 int error, 1380 int nocommit = how & FLUSH_NOCOMMIT;
1379 wait; 1381 int wait = how & FLUSH_WAIT;
1382 int error;
1380 1383
1381 wait = how & FLUSH_WAIT; 1384 how &= ~(FLUSH_WAIT|FLUSH_NOCOMMIT);
1382 how &= ~FLUSH_WAIT;
1383 1385
1384 do { 1386 do {
1385 error = 0; 1387 if (wait) {
1386 if (wait)
1387 error = nfs_wait_on_requests(inode, idx_start, npages); 1388 error = nfs_wait_on_requests(inode, idx_start, npages);
1388 if (error == 0) 1389 if (error != 0)
1389 error = nfs_flush_inode(inode, idx_start, npages, how); 1390 continue;
1390#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1391 }
1391 if (error == 0) 1392 error = nfs_flush_inode(inode, idx_start, npages, how);
1393 if (error != 0)
1394 continue;
1395 if (!nocommit)
1392 error = nfs_commit_inode(inode, how); 1396 error = nfs_commit_inode(inode, how);
1393#endif
1394 } while (error > 0); 1397 } while (error > 0);
1395 return error; 1398 return error;
1396} 1399}
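
The restructured sync loop: wait, flush, then commit unless FLUSH_NOCOMMIT was requested, where a positive return from any stage means work was done and the loop goes around again. A standalone sketch of the same control flow (flag values are illustrative; the real ones live in the NFS headers):

    #include <stdio.h>

    #define FLUSH_WAIT     1
    #define FLUSH_NOCOMMIT 2

    static int sync_inode_sk(int how, int (*wait)(void),
                             int (*flush)(void), int (*commit)(void))
    {
        int nocommit = how & FLUSH_NOCOMMIT;
        int dowait = how & FLUSH_WAIT;
        int error;

        do {
            if (dowait) {
                error = wait();
                if (error != 0)
                    continue;      /* positive: more requests, go around */
            }
            error = flush();
            if (error != 0)
                continue;
            if (!nocommit)
                error = commit();
        } while (error > 0);
        return error;
    }

    static int no_work(void) { return 0; }

    int main(void)
    {
        printf("%d\n", sync_inode_sk(FLUSH_WAIT | FLUSH_NOCOMMIT,
                                     no_work, no_work, no_work)); /* 0 */
        return 0;
    }
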
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index 0b14938b5b6..0d4cf948606 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -5,6 +5,7 @@
5 * 5 *
6 */ 6 */
7#include <linux/config.h> 7#include <linux/config.h>
8#include <linux/types.h>
8#include <linux/file.h> 9#include <linux/file.h>
9#include <linux/fs.h> 10#include <linux/fs.h>
10#include <linux/sunrpc/svc.h> 11#include <linux/sunrpc/svc.h>
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 7cbf0682b2f..fc95c4df669 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -107,7 +107,7 @@ static int nfsacld_proc_setacl(struct svc_rqst * rqstp,
107 dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); 107 dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh));
108 108
109 fh = fh_copy(&resp->fh, &argp->fh); 109 fh = fh_copy(&resp->fh, &argp->fh);
110 nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); 110 nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
111 111
112 if (!nfserr) { 112 if (!nfserr) {
113 nfserr = nfserrno( nfsd_set_posix_acl( 113 nfserr = nfserrno( nfsd_set_posix_acl(
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 64ba40572fe..16e10c170ae 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -101,7 +101,7 @@ static int nfsd3_proc_setacl(struct svc_rqst * rqstp,
101 int nfserr = 0; 101 int nfserr = 0;
102 102
103 fh = fh_copy(&resp->fh, &argp->fh); 103 fh = fh_copy(&resp->fh, &argp->fh);
104 nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); 104 nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
105 105
106 if (!nfserr) { 106 if (!nfserr) {
107 nfserr = nfserrno( nfsd_set_posix_acl( 107 nfserr = nfserrno( nfsd_set_posix_acl(
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 041380fe667..6d2dfed1de0 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -56,13 +56,20 @@ static int
56nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, 56nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
57 struct nfsd3_attrstat *resp) 57 struct nfsd3_attrstat *resp)
58{ 58{
59 int nfserr; 59 int err, nfserr;
60 60
61 dprintk("nfsd: GETATTR(3) %s\n", 61 dprintk("nfsd: GETATTR(3) %s\n",
62 SVCFH_fmt(&argp->fh)); 62 SVCFH_fmt(&argp->fh));
63 63
64 fh_copy(&resp->fh, &argp->fh); 64 fh_copy(&resp->fh, &argp->fh);
65 nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); 65 nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
66 if (nfserr)
67 RETURN_STATUS(nfserr);
68
69 err = vfs_getattr(resp->fh.fh_export->ex_mnt,
70 resp->fh.fh_dentry, &resp->stat);
71 nfserr = nfserrno(err);
72
66 RETURN_STATUS(nfserr); 73 RETURN_STATUS(nfserr);
67} 74}
68 75
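
Together with the nfs3xdr.c hunks below, this moves the attribute fetch out of the XDR encoder: the proc handler calls vfs_getattr() once and stashes the kstat in resp->stat, so encode_fattr3() becomes pure serialization that can no longer fail mid-encode. A simplified user-space sketch of the split (types and the fake getattr are illustrative):

    #include <stdio.h>

    struct kstat_sk { unsigned int mode; long long size; };
    struct attrstat_sk { int status; struct kstat_sk stat; };

    static int fake_getattr(struct kstat_sk *st)   /* stands in for vfs_getattr() */
    {
        st->mode = 0100644;
        st->size = 42;
        return 0;
    }

    /* proc handler: does the I/O once, stashes the result in the response */
    static int proc_getattr(struct attrstat_sk *resp)
    {
        resp->status = fake_getattr(&resp->stat);
        return resp->status;
    }

    /* encoder: serializes the stashed kstat, no I/O, no failure path */
    static unsigned int *encode_fattr3_sk(unsigned int *p, const struct kstat_sk *st)
    {
        *p++ = st->mode;
        *p++ = (unsigned int)st->size;
        return p;
    }

    int main(void)
    {
        struct attrstat_sk resp;
        unsigned int buf[2] = { 0, 0 };

        if (proc_getattr(&resp) == 0)
            encode_fattr3_sk(buf, &resp.stat);
        printf("%o %u\n", buf[0], buf[1]);   /* 100644 42 */
        return 0;
    }
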
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9147b8524d0..243d94b9653 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -154,37 +154,34 @@ decode_sattr3(u32 *p, struct iattr *iap)
154} 154}
155 155
156static inline u32 * 156static inline u32 *
157encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) 157encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
158 struct kstat *stat)
158{ 159{
159 struct vfsmount *mnt = fhp->fh_export->ex_mnt;
160 struct dentry *dentry = fhp->fh_dentry; 160 struct dentry *dentry = fhp->fh_dentry;
161 struct kstat stat;
162 struct timespec time; 161 struct timespec time;
163 162
164 vfs_getattr(mnt, dentry, &stat); 163 *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
165 164 *p++ = htonl((u32) stat->mode);
166 *p++ = htonl(nfs3_ftypes[(stat.mode & S_IFMT) >> 12]); 165 *p++ = htonl((u32) stat->nlink);
167 *p++ = htonl((u32) stat.mode); 166 *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
168 *p++ = htonl((u32) stat.nlink); 167 *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
169 *p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid)); 168 if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
170 *p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid));
171 if (S_ISLNK(stat.mode) && stat.size > NFS3_MAXPATHLEN) {
172 p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); 169 p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
173 } else { 170 } else {
174 p = xdr_encode_hyper(p, (u64) stat.size); 171 p = xdr_encode_hyper(p, (u64) stat->size);
175 } 172 }
176 p = xdr_encode_hyper(p, ((u64)stat.blocks) << 9); 173 p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
177 *p++ = htonl((u32) MAJOR(stat.rdev)); 174 *p++ = htonl((u32) MAJOR(stat->rdev));
178 *p++ = htonl((u32) MINOR(stat.rdev)); 175 *p++ = htonl((u32) MINOR(stat->rdev));
179 if (is_fsid(fhp, rqstp->rq_reffh)) 176 if (is_fsid(fhp, rqstp->rq_reffh))
180 p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); 177 p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
181 else 178 else
182 p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat.dev)); 179 p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat->dev));
183 p = xdr_encode_hyper(p, (u64) stat.ino); 180 p = xdr_encode_hyper(p, (u64) stat->ino);
184 p = encode_time3(p, &stat.atime); 181 p = encode_time3(p, &stat->atime);
185 lease_get_mtime(dentry->d_inode, &time); 182 lease_get_mtime(dentry->d_inode, &time);
186 p = encode_time3(p, &time); 183 p = encode_time3(p, &time);
187 p = encode_time3(p, &stat.ctime); 184 p = encode_time3(p, &stat->ctime);
188 185
189 return p; 186 return p;
190} 187}
@@ -232,8 +229,14 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
232{ 229{
233 struct dentry *dentry = fhp->fh_dentry; 230 struct dentry *dentry = fhp->fh_dentry;
234 if (dentry && dentry->d_inode != NULL) { 231 if (dentry && dentry->d_inode != NULL) {
235 *p++ = xdr_one; /* attributes follow */ 232 int err;
236 return encode_fattr3(rqstp, p, fhp); 233 struct kstat stat;
234
235 err = vfs_getattr(fhp->fh_export->ex_mnt, dentry, &stat);
236 if (!err) {
237 *p++ = xdr_one; /* attributes follow */
238 return encode_fattr3(rqstp, p, fhp, &stat);
239 }
237 } 240 }
238 *p++ = xdr_zero; 241 *p++ = xdr_zero;
239 return p; 242 return p;
@@ -616,7 +619,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
616 struct nfsd3_attrstat *resp) 619 struct nfsd3_attrstat *resp)
617{ 620{
618 if (resp->status == 0) 621 if (resp->status == 0)
619 p = encode_fattr3(rqstp, p, &resp->fh); 622 p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat);
620 return xdr_ressize_check(rqstp, p); 623 return xdr_ressize_check(rqstp, p);
621} 624}
622 625
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 583c0710e45..d828662d737 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -53,7 +53,7 @@
53#define NFSPROC4_CB_COMPOUND 1 53#define NFSPROC4_CB_COMPOUND 1
54 54
55/* declarations */ 55/* declarations */
56static void nfs4_cb_null(struct rpc_task *task); 56static const struct rpc_call_ops nfs4_cb_null_ops;
57 57
58/* Index of predefined Linux callback client operations */ 58/* Index of predefined Linux callback client operations */
59 59
@@ -431,7 +431,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
431 } 431 }
432 clnt->cl_intr = 0; 432 clnt->cl_intr = 0;
433 clnt->cl_softrtry = 1; 433 clnt->cl_softrtry = 1;
434 clnt->cl_chatty = 1;
435 434
436 /* Kick rpciod, put the call on the wire. */ 435 /* Kick rpciod, put the call on the wire. */
437 436
@@ -447,7 +446,7 @@ nfsd4_probe_callback(struct nfs4_client *clp)
447 msg.rpc_cred = nfsd4_lookupcred(clp,0); 446 msg.rpc_cred = nfsd4_lookupcred(clp,0);
448 if (IS_ERR(msg.rpc_cred)) 447 if (IS_ERR(msg.rpc_cred))
449 goto out_rpciod; 448 goto out_rpciod;
450 status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, nfs4_cb_null, NULL); 449 status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL);
451 put_rpccred(msg.rpc_cred); 450 put_rpccred(msg.rpc_cred);
452 451
453 if (status != 0) { 452 if (status != 0) {
@@ -469,7 +468,7 @@ out_err:
469} 468}
470 469
471static void 470static void
472nfs4_cb_null(struct rpc_task *task) 471nfs4_cb_null(struct rpc_task *task, void *dummy)
473{ 472{
474 struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; 473 struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
475 struct nfs4_callback *cb = &clp->cl_callback; 474 struct nfs4_callback *cb = &clp->cl_callback;
@@ -488,6 +487,10 @@ out:
488 put_nfs4_client(clp); 487 put_nfs4_client(clp);
489} 488}
490 489
490static const struct rpc_call_ops nfs4_cb_null_ops = {
491 .rpc_call_done = nfs4_cb_null,
492};
493
491/* 494/*
492 * called with dp->dl_count inc'ed. 495 * called with dp->dl_count inc'ed.
493 * nfs4_lock_state() may or may not have been called. 496 * nfs4_lock_state() may or may not have been called.
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 361b4007d4a..a00fe868629 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -192,6 +192,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
192 } 192 }
193 if (status) 193 if (status)
194 goto out; 194 goto out;
195
196 /* Openowner is now set, so sequence id will get bumped. Now we need
197 * these checks before we do any creates: */
198 if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
199 return nfserr_grace;
200 if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
201 return nfserr_no_grace;
202
195 switch (open->op_claim_type) { 203 switch (open->op_claim_type) {
196 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 204 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
197 status = nfserr_inval; 205 status = nfserr_inval;
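
The gate added above enforces the grace-period rule once the openowner is set and the sequence id is committed to bumping: during the grace window only reclaim-type opens (CLAIM_PREVIOUS) proceed, and outside it reclaims are refused. A standalone sketch of the check (error values per RFC 3530, given here only for the demo):

    #include <stdio.h>

    enum claim { CLAIM_NULL, CLAIM_PREVIOUS };
    enum { NFS_OK = 0, ERR_GRACE = 10013, ERR_NO_GRACE = 10033 };

    static int grace_gate(int in_grace, enum claim c)
    {
        if (in_grace && c != CLAIM_PREVIOUS)
            return ERR_GRACE;      /* new opens must wait out the grace period */
        if (!in_grace && c == CLAIM_PREVIOUS)
            return ERR_NO_GRACE;   /* too late to reclaim pre-reboot state */
        return NFS_OK;
    }

    int main(void)
    {
        printf("%d %d\n", grace_gate(1, CLAIM_NULL),
                          grace_gate(0, CLAIM_PREVIOUS)); /* 10013 10033 */
        return 0;
    }
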
@@ -210,6 +218,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
210 goto out; 218 goto out;
211 break; 219 break;
212 case NFS4_OPEN_CLAIM_PREVIOUS: 220 case NFS4_OPEN_CLAIM_PREVIOUS:
221 open->op_stateowner->so_confirmed = 1;
213 /* 222 /*
214 * The CURRENT_FH is already set to the file being 223 * The CURRENT_FH is already set to the file being
215 * opened. (1) set open->op_cinfo, (2) set 224 * opened. (1) set open->op_cinfo, (2) set
@@ -221,6 +230,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
221 goto out; 230 goto out;
222 break; 231 break;
223 case NFS4_OPEN_CLAIM_DELEGATE_PREV: 232 case NFS4_OPEN_CLAIM_DELEGATE_PREV:
233 open->op_stateowner->so_confirmed = 1;
224 printk("NFSD: unsupported OPEN claim type %d\n", 234 printk("NFSD: unsupported OPEN claim type %d\n",
225 open->op_claim_type); 235 open->op_claim_type);
226 status = nfserr_notsupp; 236 status = nfserr_notsupp;
@@ -584,31 +594,23 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_se
584{ 594{
585 int status = nfs_ok; 595 int status = nfs_ok;
586 596
587 if (!current_fh->fh_dentry)
588 return nfserr_nofilehandle;
589
590 status = nfs_ok;
591 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { 597 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
592 nfs4_lock_state(); 598 nfs4_lock_state();
593 if ((status = nfs4_preprocess_stateid_op(current_fh, 599 status = nfs4_preprocess_stateid_op(current_fh,
594 &setattr->sa_stateid, 600 &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL);
595 CHECK_FH | WR_STATE, NULL))) {
596 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
597 goto out_unlock;
598 }
599 nfs4_unlock_state(); 601 nfs4_unlock_state();
602 if (status) {
603 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!");
604 return status;
605 }
600 } 606 }
601 status = nfs_ok; 607 status = nfs_ok;
602 if (setattr->sa_acl != NULL) 608 if (setattr->sa_acl != NULL)
603 status = nfsd4_set_nfs4_acl(rqstp, current_fh, setattr->sa_acl); 609 status = nfsd4_set_nfs4_acl(rqstp, current_fh, setattr->sa_acl);
604 if (status) 610 if (status)
605 goto out; 611 return status;
606 status = nfsd_setattr(rqstp, current_fh, &setattr->sa_iattr, 612 status = nfsd_setattr(rqstp, current_fh, &setattr->sa_iattr,
607 0, (time_t)0); 613 0, (time_t)0);
608out:
609 return status;
610out_unlock:
611 nfs4_unlock_state();
612 return status; 614 return status;
613} 615}
614 616
@@ -626,15 +628,17 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ
626 return nfserr_inval; 628 return nfserr_inval;
627 629
628 nfs4_lock_state(); 630 nfs4_lock_state();
629 if ((status = nfs4_preprocess_stateid_op(current_fh, stateid, 631 status = nfs4_preprocess_stateid_op(current_fh, stateid,
630 CHECK_FH | WR_STATE, &filp))) { 632 CHECK_FH | WR_STATE, &filp);
631 dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
632 goto out;
633 }
634 if (filp) 633 if (filp)
635 get_file(filp); 634 get_file(filp);
636 nfs4_unlock_state(); 635 nfs4_unlock_state();
637 636
637 if (status) {
638 dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
639 return status;
640 }
641
638 write->wr_bytes_written = write->wr_buflen; 642 write->wr_bytes_written = write->wr_buflen;
639 write->wr_how_written = write->wr_stable_how; 643 write->wr_how_written = write->wr_stable_how;
640 p = (u32 *)write->wr_verifier.data; 644 p = (u32 *)write->wr_verifier.data;
@@ -650,9 +654,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ
650 if (status == nfserr_symlink) 654 if (status == nfserr_symlink)
651 status = nfserr_inval; 655 status = nfserr_inval;
652 return status; 656 return status;
653out:
654 nfs4_unlock_state();
655 return status;
656} 657}
657 658
658/* This routine never returns NFS_OK! If there are no other errors, it 659/* This routine never returns NFS_OK! If there are no other errors, it
@@ -768,6 +769,8 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
768 while (!status && resp->opcnt < args->opcnt) { 769 while (!status && resp->opcnt < args->opcnt) {
769 op = &args->ops[resp->opcnt++]; 770 op = &args->ops[resp->opcnt++];
770 771
772 dprintk("nfsv4 compound op #%d: %d\n", resp->opcnt, op->opnum);
773
771 /* 774 /*
772 * The XDR decode routines may have pre-set op->status; 775 * The XDR decode routines may have pre-set op->status;
773 * for example, if there is a miscellaneous XDR error 776 * for example, if there is a miscellaneous XDR error
@@ -792,17 +795,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
792 /* All operations except RENEW, SETCLIENTID, RESTOREFH 795 /* All operations except RENEW, SETCLIENTID, RESTOREFH
793 * SETCLIENTID_CONFIRM, PUTFH and PUTROOTFH 796 * SETCLIENTID_CONFIRM, PUTFH and PUTROOTFH
794 * require a valid current filehandle 797 * require a valid current filehandle
795 *
796 * SETATTR NOFILEHANDLE error handled in nfsd4_setattr
797 * due to required returned bitmap argument
798 */ 798 */
799 if ((!current_fh->fh_dentry) && 799 if ((!current_fh->fh_dentry) &&
800 !((op->opnum == OP_PUTFH) || (op->opnum == OP_PUTROOTFH) || 800 !((op->opnum == OP_PUTFH) || (op->opnum == OP_PUTROOTFH) ||
801 (op->opnum == OP_SETCLIENTID) || 801 (op->opnum == OP_SETCLIENTID) ||
802 (op->opnum == OP_SETCLIENTID_CONFIRM) || 802 (op->opnum == OP_SETCLIENTID_CONFIRM) ||
803 (op->opnum == OP_RENEW) || (op->opnum == OP_RESTOREFH) || 803 (op->opnum == OP_RENEW) || (op->opnum == OP_RESTOREFH) ||
804 (op->opnum == OP_RELEASE_LOCKOWNER) || 804 (op->opnum == OP_RELEASE_LOCKOWNER))) {
805 (op->opnum == OP_SETATTR))) {
806 op->status = nfserr_nofilehandle; 805 op->status = nfserr_nofilehandle;
807 goto encode_op; 806 goto encode_op;
808 } 807 }
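
The two nfs4proc.c hunks above share one shape: nfs4_preprocess_stateid_op() is called for its return value, nfs4_unlock_state() then runs unconditionally, and only afterwards is the status tested, which is what lets the out:/out_unlock: label pairs disappear. A minimal sketch of the pattern (lookup_stateid() and do_op() are hypothetical stand-ins, not nfsd functions):

	static int stateful_op(void)
	{
		int status;

		nfs4_lock_state();
		status = lookup_stateid();	/* may fail */
		nfs4_unlock_state();		/* dropped unconditionally */
		if (status)
			return status;		/* early return, no out_unlock label */
		return do_op();
	}
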
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 954cf893d50..06da7506363 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -121,9 +121,9 @@ out:
 static void
 nfsd4_sync_rec_dir(void)
 {
-	down(&rec_dir.dentry->d_inode->i_sem);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
 	nfsd_sync_dir(rec_dir.dentry);
-	up(&rec_dir.dentry->d_inode->i_sem);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 }
 
 int
@@ -143,7 +143,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 	nfs4_save_user(&uid, &gid);
 
 	/* lock the parent */
-	down(&rec_dir.dentry->d_inode->i_sem);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
 
 	dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
 	if (IS_ERR(dentry)) {
@@ -159,7 +159,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 out_put:
 	dput(dentry);
 out_unlock:
-	up(&rec_dir.dentry->d_inode->i_sem);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	if (status == 0) {
 		clp->cl_firststate = 1;
 		nfsd4_sync_rec_dir();
@@ -222,8 +222,7 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 
 	nfs4_save_user(&uid, &gid);
 
-	filp = dentry_open(dget(dir), mntget(rec_dir.mnt),
-			O_RDWR);
+	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY);
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
 		goto out;
@@ -259,9 +258,9 @@ nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry)
 		printk("nfsd4: non-file found in client recovery directory\n");
 		return -EINVAL;
 	}
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	status = vfs_unlink(dir->d_inode, dentry);
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	return status;
 }
 
@@ -274,9 +273,9 @@ nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry)
 	 * any regular files anyway, just in case the directory was created by
 	 * a kernel from the future.... */
 	nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file);
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	status = vfs_rmdir(dir->d_inode, dentry);
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	return status;
 }
 
@@ -288,9 +287,9 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 
 	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
 
-	down(&rec_dir.dentry->d_inode->i_sem);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
 	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
-	up(&rec_dir.dentry->d_inode->i_sem);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		return status;
@@ -400,9 +399,10 @@ nfsd4_init_recdir(char *rec_dirname)
 
 	nfs4_save_user(&uid, &gid);
 
-	status = path_lookup(rec_dirname, LOOKUP_FOLLOW, &rec_dir);
-	if (status == -ENOENT)
-		printk("NFSD: recovery directory %s doesn't exist\n",
+	status = path_lookup(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
+			&rec_dir);
+	if (status)
+		printk("NFSD: unable to find recovery directory %s\n",
 				rec_dirname);
 
 	if (!status)
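
All the down()/up() replacements in nfs4recover.c are instances of the 2.6.16-era conversion of the inode semaphore i_sem to the dedicated mutex i_mutex; the critical sections themselves are unchanged. The shape of the conversion, using the unlink case above as the model (a sketch, not new nfsd code):

	/* Before:
	 *	down(&dir->d_inode->i_sem);
	 *	status = vfs_unlink(dir->d_inode, dentry);
	 *	up(&dir->d_inode->i_sem);
	 * After: same critical section, inode mutex API. */
	mutex_lock(&dir->d_inode->i_mutex);
	status = vfs_unlink(dir->d_inode, dentry);
	mutex_unlock(&dir->d_inode->i_mutex);
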
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6bbefd06f10..1143cfb6454 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1088,7 +1088,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
 	sop->so_seqid = open->op_seqid;
 	sop->so_confirmed = 0;
 	rp = &sop->so_replay;
-	rp->rp_status = NFSERR_SERVERFAULT;
+	rp->rp_status = nfserr_serverfault;
 	rp->rp_buflen = 0;
 	rp->rp_buf = rp->rp_ibuf;
 	return sop;
@@ -1178,7 +1178,6 @@ release_stateid(struct nfs4_stateid *stp, int flags)
 		locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
 	put_nfs4_file(stp->st_file);
 	kmem_cache_free(stateid_slab, stp);
-	stp = NULL;
 }
 
 static void
@@ -1191,22 +1190,6 @@ move_to_close_lru(struct nfs4_stateowner *sop)
 	sop->so_time = get_seconds();
 }
 
-static void
-release_state_owner(struct nfs4_stateid *stp, int flag)
-{
-	struct nfs4_stateowner *sop = stp->st_stateowner;
-
-	dprintk("NFSD: release_state_owner\n");
-	release_stateid(stp, flag);
-
-	/* place unused nfs4_stateowners on so_close_lru list to be
-	 * released by the laundromat service after the lease period
-	 * to enable us to handle CLOSE replay
-	 */
-	if (sop->so_confirmed && list_empty(&sop->so_stateids))
-		move_to_close_lru(sop);
-}
-
 static int
 cmp_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, clientid_t *clid) {
 	return ((sop->so_owner.len == owner->len) &&
@@ -1446,92 +1429,61 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
 };
 
 
-/*
- * nfsd4_process_open1()
- *	lookup stateowner.
- *		found:
- *			check confirmed
- *				confirmed:
- *					check seqid
- *				not confirmed:
- *					delete owner
- *					create new owner
- *		notfound:
- *			verify clientid
- *			create new owner
- *
- * called with nfs4_lock_state() held.
- */
 int
 nfsd4_process_open1(struct nfsd4_open *open)
 {
-	int status;
 	clientid_t *clientid = &open->op_clientid;
 	struct nfs4_client *clp = NULL;
 	unsigned int strhashval;
 	struct nfs4_stateowner *sop = NULL;
 
-	status = nfserr_inval;
 	if (!check_name(open->op_owner))
-		goto out;
+		return nfserr_inval;
 
 	if (STALE_CLIENTID(&open->op_clientid))
 		return nfserr_stale_clientid;
 
 	strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner);
 	sop = find_openstateowner_str(strhashval, open);
-	if (sop) {
-		open->op_stateowner = sop;
-		/* check for replay */
-		if (open->op_seqid == sop->so_seqid - 1){
-			if (sop->so_replay.rp_buflen)
-				return NFSERR_REPLAY_ME;
-			else {
-				/* The original OPEN failed so spectacularly
-				 * that we don't even have replay data saved!
-				 * Therefore, we have no choice but to continue
-				 * processing this OPEN; presumably, we'll
-				 * fail again for the same reason.
-				 */
-				dprintk("nfsd4_process_open1:"
-					" replay with no replay cache\n");
-				goto renew;
-			}
-		} else if (sop->so_confirmed) {
-			if (open->op_seqid == sop->so_seqid)
-				goto renew;
-			status = nfserr_bad_seqid;
-			goto out;
-		} else {
-			/* If we get here, we received an OPEN for an
-			 * unconfirmed nfs4_stateowner. Since the seqid's are
-			 * different, purge the existing nfs4_stateowner, and
-			 * instantiate a new one.
-			 */
-			clp = sop->so_client;
-			release_stateowner(sop);
-		}
-	} else {
-		/* nfs4_stateowner not found.
-		 * Verify clientid and instantiate new nfs4_stateowner.
-		 * If verify fails this is presumably the result of the
-		 * client's lease expiring.
-		 */
-		status = nfserr_expired;
+	open->op_stateowner = sop;
+	if (!sop) {
+		/* Make sure the client's lease hasn't expired. */
 		clp = find_confirmed_client(clientid);
 		if (clp == NULL)
-			goto out;
+			return nfserr_expired;
+		goto renew;
 	}
-	status = nfserr_resource;
-	sop = alloc_init_open_stateowner(strhashval, clp, open);
-	if (sop == NULL)
-		goto out;
-	open->op_stateowner = sop;
+	if (!sop->so_confirmed) {
+		/* Replace unconfirmed owners without checking for replay. */
+		clp = sop->so_client;
+		release_stateowner(sop);
+		open->op_stateowner = NULL;
+		goto renew;
+	}
+	if (open->op_seqid == sop->so_seqid - 1) {
+		if (sop->so_replay.rp_buflen)
+			return NFSERR_REPLAY_ME;
+		/* The original OPEN failed so spectacularly
+		 * that we don't even have replay data saved!
+		 * Therefore, we have no choice but to continue
+		 * processing this OPEN; presumably, we'll
+		 * fail again for the same reason.
+		 */
+		dprintk("nfsd4_process_open1: replay with no replay cache\n");
+		goto renew;
+	}
+	if (open->op_seqid != sop->so_seqid)
+		return nfserr_bad_seqid;
 renew:
-	status = nfs_ok;
+	if (open->op_stateowner == NULL) {
+		sop = alloc_init_open_stateowner(strhashval, clp, open);
+		if (sop == NULL)
+			return nfserr_resource;
+		open->op_stateowner = sop;
+	}
+	list_del_init(&sop->so_close_lru);
 	renew_client(sop->so_client);
-out:
-	return status;
+	return nfs_ok;
 }
 
 static inline int
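
The seqid tests in the rewritten nfsd4_process_open1() encode the NFSv4 open-owner ordering rule: so_seqid is the value expected from the next new request, so a request carrying so_seqid - 1 is a retransmission (answered from the replay cache when rp_buflen is non-zero), and anything other than so_seqid itself is a sequencing error. The classification in isolation (hypothetical helper, not part of the patch):

	enum seq_class { SEQ_REPLAY, SEQ_OK, SEQ_BAD };

	static enum seq_class classify_open_seqid(u32 op_seqid, u32 so_seqid)
	{
		if (op_seqid == so_seqid - 1)
			return SEQ_REPLAY;	/* retransmitted OPEN */
		if (op_seqid == so_seqid)
			return SEQ_OK;		/* next in-order request */
		return SEQ_BAD;			/* nfserr_bad_seqid */
	}
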
@@ -1648,7 +1600,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 	if (!open->op_truncate)
 		return 0;
 	if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
-		return -EINVAL;
+		return nfserr_inval;
 	return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0);
 }
 
@@ -1657,26 +1609,26 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta
 {
 	struct file *filp = stp->st_vfs_file;
 	struct inode *inode = filp->f_dentry->d_inode;
-	unsigned int share_access;
+	unsigned int share_access, new_writer;
 	int status;
 
 	set_access(&share_access, stp->st_access_bmap);
-	share_access = ~share_access;
-	share_access &= open->op_share_access;
-
-	if (!(share_access & NFS4_SHARE_ACCESS_WRITE))
-		return nfsd4_truncate(rqstp, cur_fh, open);
+	new_writer = (~share_access) & open->op_share_access
+			& NFS4_SHARE_ACCESS_WRITE;
 
-	status = get_write_access(inode);
-	if (status)
-		return nfserrno(status);
+	if (new_writer) {
+		status = get_write_access(inode);
+		if (status)
+			return nfserrno(status);
+	}
 	status = nfsd4_truncate(rqstp, cur_fh, open);
 	if (status) {
-		put_write_access(inode);
+		if (new_writer)
+			put_write_access(inode);
 		return status;
 	}
 	/* remember the open */
-	filp->f_mode = (filp->f_mode | FMODE_WRITE) & ~FMODE_READ;
+	filp->f_mode |= open->op_share_access;
 	set_bit(open->op_share_access, &stp->st_access_bmap);
 	set_bit(open->op_share_deny, &stp->st_deny_bmap);
 
@@ -1780,12 +1732,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	struct nfs4_delegation *dp = NULL;
 	int status;
 
-	if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
-		return nfserr_grace;
-
-	if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
-		return nfserr_no_grace;
-
 	status = nfserr_inval;
 	if (!TEST_ACCESS(open->op_share_access) || !TEST_DENY(open->op_share_deny))
 		goto out;
@@ -2423,15 +2369,19 @@ nfsd4_close(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_clos
 			CHECK_FH | OPEN_STATE | CLOSE_STATE,
 			&close->cl_stateowner, &stp, NULL)))
 		goto out;
-	/*
-	 * Return success, but first update the stateid.
-	 */
 	status = nfs_ok;
 	update_stateid(&stp->st_stateid);
 	memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
 
-	/* release_state_owner() calls nfsd_close() if needed */
-	release_state_owner(stp, OPEN_STATE);
+	/* release_stateid() calls nfsd_close() if needed */
+	release_stateid(stp, OPEN_STATE);
+
+	/* place unused nfs4_stateowners on so_close_lru list to be
+	 * released by the laundromat service after the lease period
+	 * to enable us to handle CLOSE replay
+	 */
+	if (list_empty(&close->cl_stateowner->so_stateids))
+		move_to_close_lru(close->cl_stateowner);
 out:
 	if (close->cl_stateowner) {
 		nfs4_get_stateowner(close->cl_stateowner);
@@ -2633,7 +2583,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
 	sop->so_seqid = lock->lk_new_lock_seqid + 1;
 	sop->so_confirmed = 1;
 	rp = &sop->so_replay;
-	rp->rp_status = NFSERR_SERVERFAULT;
+	rp->rp_status = nfserr_serverfault;
 	rp->rp_buflen = 0;
 	rp->rp_buf = rp->rp_ibuf;
 	return sop;
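
In the nfs4_upgrade_open() hunk, new_writer is non-zero exactly when this upgrade adds WRITE access the stateid did not already hold, so the get_write_access()/put_write_access() calls stay balanced. Worked through with the RFC 3530 share-access values (READ = 1, WRITE = 2), as an illustration:

	/* held = modes already in st_access_bmap; want = open->op_share_access */
	unsigned int new_writer = (~held) & want & NFS4_SHARE_ACCESS_WRITE;

	/* held READ (1), want READ|WRITE (3):  ~1 & 3 & 2 == 2 -> first writer  */
	/* held READ|WRITE (3), want WRITE (2): ~3 & 2 & 2 == 0 -> already writer */
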
@@ -2700,6 +2650,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 	if (check_lock_length(lock->lk_offset, lock->lk_length))
 		 return nfserr_inval;
 
+	if ((status = fh_verify(rqstp, current_fh, S_IFREG, MAY_LOCK))) {
+		dprintk("NFSD: nfsd4_lock: permission denied!\n");
+		return status;
+	}
+
 	nfs4_lock_state();
 
 	if (lock->lk_is_new) {
@@ -2720,11 +2675,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 					lock->lk_new_open_seqid,
 					&lock->lk_new_open_stateid,
 					CHECK_FH | OPEN_STATE,
-					&lock->lk_stateowner, &open_stp,
+					&lock->lk_replay_owner, &open_stp,
 					lock);
 		if (status)
 			goto out;
-		open_sop = lock->lk_stateowner;
+		open_sop = lock->lk_replay_owner;
 		/* create lockowner and lock stateid */
 		fp = open_stp->st_file;
 		strhashval = lock_ownerstr_hashval(fp->fi_inode,
@@ -2739,29 +2694,22 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 		if (lock_sop == NULL)
 			goto out;
 		lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp);
-		if (lock_stp == NULL) {
-			release_stateowner(lock_sop);
+		if (lock_stp == NULL)
 			goto out;
-		}
 	} else {
 		/* lock (lock owner + lock stateid) already exists */
 		status = nfs4_preprocess_seqid_op(current_fh,
 				       lock->lk_old_lock_seqid,
 				       &lock->lk_old_lock_stateid,
 				       CHECK_FH | LOCK_STATE,
-				       &lock->lk_stateowner, &lock_stp, lock);
+				       &lock->lk_replay_owner, &lock_stp, lock);
 		if (status)
 			goto out;
-		lock_sop = lock->lk_stateowner;
+		lock_sop = lock->lk_replay_owner;
 	}
-	/* lock->lk_stateowner and lock_stp have been created or found */
+	/* lock->lk_replay_owner and lock_stp have been created or found */
 	filp = lock_stp->st_vfs_file;
 
-	if ((status = fh_verify(rqstp, current_fh, S_IFREG, MAY_LOCK))) {
-		dprintk("NFSD: nfsd4_lock: permission denied!\n");
-		goto out;
-	}
-
 	status = nfserr_grace;
 	if (nfs4_in_grace() && !lock->lk_reclaim)
 		goto out;
@@ -2802,8 +2750,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 	 */
 
 	status = posix_lock_file(filp, &file_lock);
-	if (file_lock.fl_ops && file_lock.fl_ops->fl_release_private)
-		file_lock.fl_ops->fl_release_private(&file_lock);
 	dprintk("NFSD: nfsd4_lock: posix_lock_file status %d\n",status);
 	switch (-status) {
 	case 0: /* success! */
@@ -2815,9 +2761,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 		goto conflicting_lock;
 	case (EDEADLK):
 		status = nfserr_deadlock;
+		dprintk("NFSD: nfsd4_lock: posix_lock_file() failed! status %d\n",status);
+		goto out;
 	default:
+		status = nfserrno(status);
 		dprintk("NFSD: nfsd4_lock: posix_lock_file() failed! status %d\n",status);
-		goto out_destroy_new_stateid;
+		goto out;
 	}
 
 conflicting_lock:
@@ -2831,20 +2780,12 @@ conflicting_lock:
 		goto out;
 	}
 	nfs4_set_lock_denied(conflock, &lock->lk_denied);
-
-out_destroy_new_stateid:
-	if (lock->lk_is_new) {
-		dprintk("NFSD: nfsd4_lock: destroy new stateid!\n");
-		/*
-		 * An error encountered after instantiation of the new
-		 * stateid has forced us to destroy it.
-		 */
-		release_state_owner(lock_stp, LOCK_STATE);
-	}
 out:
-	if (lock->lk_stateowner) {
-		nfs4_get_stateowner(lock->lk_stateowner);
-		*replay_owner = lock->lk_stateowner;
+	if (status && lock->lk_is_new && lock_sop)
+		release_stateowner(lock_sop);
+	if (lock->lk_replay_owner) {
+		nfs4_get_stateowner(lock->lk_replay_owner);
+		*replay_owner = lock->lk_replay_owner;
 	}
 	nfs4_unlock_state();
 	return status;
@@ -2977,8 +2918,6 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 	 * Try to unlock the file in the VFS.
 	 */
 	status = posix_lock_file(filp, &file_lock);
-	if (file_lock.fl_ops && file_lock.fl_ops->fl_release_private)
-		file_lock.fl_ops->fl_release_private(&file_lock);
 	if (status) {
 		dprintk("NFSD: nfs4_locku: posix_lock_file failed!\n");
 		goto out_nfserr;
@@ -3016,9 +2955,10 @@ check_for_locks(struct file *filp, struct nfs4_stateowner *lowner)
 
 	lock_kernel();
 	for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
-		if ((*flpp)->fl_owner == (fl_owner_t)lowner)
+		if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
 			status = 1;
 			goto out;
+		}
 	}
 out:
 	unlock_kernel();
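
The check_for_locks() change is a missing-braces fix: in the old code only `status = 1;` was governed by the if, so the `goto out;` ran unconditionally on the first loop iteration and the scan never examined a second lock. The bug pattern, reduced to its essentials:

	for (i = 0; i < n; i++) {
		if (match(i))
			found = 1;
			goto out;	/* BUG: indented under the if, but not guarded */
	}
	/* The fix is braces, so the goto fires only on an actual match. */
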
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index dcd67318694..69d3501173a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -528,7 +528,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
 {
 	DECODE_HEAD;
 
-	lock->lk_stateowner = NULL;
+	lock->lk_replay_owner = NULL;
 	/*
 	 * type, reclaim(boolean), offset, length, new_lock_owner(boolean)
 	 */
@@ -1764,10 +1764,11 @@ nfsd4_encode_dirent(struct readdir_cd *ccd, const char *name, int namlen,
 		 */
 		if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR))
 			goto fail;
-		nfserr = nfserr_toosmall;
 		p = nfsd4_encode_rdattr_error(p, buflen, nfserr);
-		if (p == NULL)
+		if (p == NULL) {
+			nfserr = nfserr_toosmall;
 			goto fail;
+		}
 	}
 	cd->buflen -= (p - cd->buffer);
 	cd->buffer = p;
@@ -1895,7 +1896,6 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
 static void
 nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock *lock)
 {
-
 	ENCODE_SEQID_OP_HEAD;
 
 	if (!nfserr) {
@@ -1906,7 +1906,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock
 	} else if (nfserr == nfserr_denied)
 		nfsd4_encode_lock_denied(resp, &lock->lk_denied);
 
-	ENCODE_SEQID_OP_TAIL(lock->lk_stateowner);
+	ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
 }
 
 static void
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0aa1b9603d7..3e6b75cd90f 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -36,6 +36,22 @@ nfsd_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 	return nfs_ok;
 }
 
+static int
+nfsd_return_attrs(int err, struct nfsd_attrstat *resp)
+{
+	if (err) return err;
+	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
+				    resp->fh.fh_dentry,
+				    &resp->stat));
+}
+static int
+nfsd_return_dirop(int err, struct nfsd_diropres *resp)
+{
+	if (err) return err;
+	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
+				    resp->fh.fh_dentry,
+				    &resp->stat));
+}
 /*
  * Get a file's attributes
  * N.B. After this call resp->fh needs an fh_put
@@ -44,10 +60,12 @@ static int
 nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 					  struct nfsd_attrstat *resp)
 {
+	int nfserr;
 	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
-	return fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+	return nfsd_return_attrs(nfserr, resp);
 }
 
 /*
@@ -58,12 +76,14 @@ static int
 nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
 					  struct nfsd_attrstat  *resp)
 {
+	int nfserr;
 	dprintk("nfsd: SETATTR  %s, valid=%x, size=%ld\n",
 		SVCFH_fmt(&argp->fh),
 		argp->attrs.ia_valid, (long) argp->attrs.ia_size);
 
 	fh_copy(&resp->fh, &argp->fh);
-	return nfsd_setattr(rqstp, &resp->fh, &argp->attrs,0, (time_t)0);
+	nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,0, (time_t)0);
+	return nfsd_return_attrs(nfserr, resp);
 }
 
 /*
@@ -86,7 +106,7 @@ nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 				 &resp->fh);
 
 	fh_put(&argp->fh);
-	return nfserr;
+	return nfsd_return_dirop(nfserr, resp);
 }
 
 /*
@@ -142,7 +162,10 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 				  argp->vec, argp->vlen,
 				  &resp->count);
 
-	return nfserr;
+	if (nfserr) return nfserr;
+	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
+				    resp->fh.fh_dentry,
+				    &resp->stat));
 }
 
 /*
@@ -165,7 +188,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 				   argp->vec, argp->vlen,
 				   argp->len,
 				   &stable);
-	return nfserr;
+	return nfsd_return_attrs(nfserr, resp);
 }
 
 /*
@@ -322,7 +345,7 @@ out_unlock:
 
 done:
 	fh_put(dirfhp);
-	return nfserr;
+	return nfsd_return_dirop(nfserr, resp);
 }
 
 static int
@@ -425,7 +448,7 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 	nfserr = nfsd_create(rqstp, &argp->fh, argp->name, argp->len,
 				    &argp->attrs, S_IFDIR, 0, &resp->fh);
 	fh_put(&argp->fh);
-	return nfserr;
+	return nfsd_return_dirop(nfserr, resp);
 }
 
 /*
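
The new nfsd_return_attrs()/nfsd_return_dirop() wrappers give every converted NFSv2 procedure the same tail: if the operation succeeded, refresh resp->stat with vfs_getattr() so the reply encoder works from attributes fetched exactly once. The template each procedure now follows (do_operation() is a stand-in for fh_verify(), nfsd_setattr(), and friends):

	static int proc_template(struct svc_rqst *rqstp, struct nfsd_attrstat *resp)
	{
		int nfserr = do_operation(rqstp, &resp->fh);

		if (nfserr)
			return nfserr;
		return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
					    resp->fh.fh_dentry, &resp->stat));
	}
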
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b45999ff33e..e3a0797dd56 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -37,7 +37,7 @@ static u32 nfs_ftypes[] = {
 /*
  * XDR functions for basic NFS types
  */
-static inline u32 *
+static u32 *
 decode_fh(u32 *p, struct svc_fh *fhp)
 {
 	fh_init(fhp, NFS_FHSIZE);
@@ -151,47 +151,45 @@ decode_sattr(u32 *p, struct iattr *iap)
 	return p;
 }
 
-static inline u32 *
-encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static u32 *
+encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+	struct kstat *stat)
 {
-	struct vfsmount *mnt = fhp->fh_export->ex_mnt;
 	struct dentry	*dentry = fhp->fh_dentry;
-	struct kstat stat;
 	int type;
 	struct timespec time;
 
-	vfs_getattr(mnt, dentry, &stat);
-	type = (stat.mode & S_IFMT);
+	type = (stat->mode & S_IFMT);
 
 	*p++ = htonl(nfs_ftypes[type >> 12]);
-	*p++ = htonl((u32) stat.mode);
-	*p++ = htonl((u32) stat.nlink);
-	*p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid));
-	*p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid));
+	*p++ = htonl((u32) stat->mode);
+	*p++ = htonl((u32) stat->nlink);
+	*p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
+	*p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
 
-	if (S_ISLNK(type) && stat.size > NFS_MAXPATHLEN) {
+	if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
 		*p++ = htonl(NFS_MAXPATHLEN);
 	} else {
-		*p++ = htonl((u32) stat.size);
+		*p++ = htonl((u32) stat->size);
 	}
-	*p++ = htonl((u32) stat.blksize);
+	*p++ = htonl((u32) stat->blksize);
 	if (S_ISCHR(type) || S_ISBLK(type))
-		*p++ = htonl(new_encode_dev(stat.rdev));
+		*p++ = htonl(new_encode_dev(stat->rdev));
 	else
 		*p++ = htonl(0xffffffff);
-	*p++ = htonl((u32) stat.blocks);
+	*p++ = htonl((u32) stat->blocks);
 	if (is_fsid(fhp, rqstp->rq_reffh))
 		*p++ = htonl((u32) fhp->fh_export->ex_fsid);
 	else
-		*p++ = htonl(new_encode_dev(stat.dev));
-	*p++ = htonl((u32) stat.ino);
-	*p++ = htonl((u32) stat.atime.tv_sec);
-	*p++ = htonl(stat.atime.tv_nsec ? stat.atime.tv_nsec / 1000 : 0);
+		*p++ = htonl(new_encode_dev(stat->dev));
+	*p++ = htonl((u32) stat->ino);
+	*p++ = htonl((u32) stat->atime.tv_sec);
+	*p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
 	lease_get_mtime(dentry->d_inode, &time);
 	*p++ = htonl((u32) time.tv_sec);
 	*p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0);
-	*p++ = htonl((u32) stat.ctime.tv_sec);
-	*p++ = htonl(stat.ctime.tv_nsec ? stat.ctime.tv_nsec / 1000 : 0);
+	*p++ = htonl((u32) stat->ctime.tv_sec);
+	*p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0);
 
 	return p;
 }
@@ -199,7 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 /* Helper function for NFSv2 ACL code */
 u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 {
-	return encode_fattr(rqstp, p, fhp);
+	struct kstat stat;
+	vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat);
+	return encode_fattr(rqstp, p, fhp, &stat);
 }
 
 /*
@@ -394,7 +394,7 @@ int
 nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_attrstat *resp)
 {
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
@@ -403,7 +403,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_diropres *resp)
 {
 	p = encode_fh(p, &resp->fh);
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
@@ -428,7 +428,7 @@ int
 nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_readres *resp)
 {
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	*p++ = htonl(resp->count);
 	xdr_ressize_check(rqstp, p);
 
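
With the struct kstat hoisted out of encode_fattr(), the attribute fetch happens while a failure can still be turned into an NFS status; the encoder, which has no way to report an error, then only formats a kstat its caller already owns. A caller-side sketch of the new contract:

	struct kstat stat;
	int err = vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat);

	if (err)
		return nfserrno(err);			/* fail before encoding */
	p = encode_fattr(rqstp, p, fhp, &stat);		/* pure formatting */
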
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index af7c3c3074b..5320e5afadd 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -48,8 +48,8 @@
 #include <linux/fsnotify.h>
 #include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
-#ifdef CONFIG_NFSD_V4
 #include <linux/xattr.h>
+#ifdef CONFIG_NFSD_V4
 #include <linux/nfs4.h>
 #include <linux/nfs4_acl.h>
 #include <linux/nfsd_idmap.h>
@@ -365,8 +365,30 @@ out_nfserr:
 	goto out;
 }
 
-#if defined(CONFIG_NFSD_V4)
+#if defined(CONFIG_NFSD_V2_ACL) || \
+    defined(CONFIG_NFSD_V3_ACL) || \
+    defined(CONFIG_NFSD_V4)
+static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
+{
+	ssize_t buflen;
+	int error;
+
+	buflen = vfs_getxattr(dentry, key, NULL, 0);
+	if (buflen <= 0)
+		return buflen;
+
+	*buf = kmalloc(buflen, GFP_KERNEL);
+	if (!*buf)
+		return -ENOMEM;
+
+	error = vfs_getxattr(dentry, key, *buf, buflen);
+	if (error < 0)
+		return error;
+	return buflen;
+}
+#endif
 
+#if defined(CONFIG_NFSD_V4)
 static int
 set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
 {
@@ -374,7 +396,6 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
 	size_t buflen;
 	char *buf = NULL;
 	int error = 0;
-	struct inode *inode = dentry->d_inode;
 
 	buflen = posix_acl_xattr_size(pacl->a_count);
 	buf = kmalloc(buflen, GFP_KERNEL);
@@ -388,15 +409,7 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
 		goto out;
 	}
 
-	error = -EOPNOTSUPP;
-	if (inode->i_op && inode->i_op->setxattr) {
-		down(&inode->i_sem);
-		security_inode_setxattr(dentry, key, buf, len, 0);
-		error = inode->i_op->setxattr(dentry, key, buf, len, 0);
-		if (!error)
-			security_inode_post_setxattr(dentry, key, buf, len, 0);
-		up(&inode->i_sem);
-	}
+	error = vfs_setxattr(dentry, key, buf, len, 0);
 out:
 	kfree(buf);
 	return error;
@@ -455,44 +468,19 @@ out_nfserr:
 static struct posix_acl *
 _get_posix_acl(struct dentry *dentry, char *key)
 {
-	struct inode *inode = dentry->d_inode;
-	char *buf = NULL;
-	int buflen, error = 0;
+	void *buf = NULL;
 	struct posix_acl *pacl = NULL;
+	int buflen;
 
-	error = -EOPNOTSUPP;
-	if (inode->i_op == NULL)
-		goto out_err;
-	if (inode->i_op->getxattr == NULL)
-		goto out_err;
-
-	error = security_inode_getxattr(dentry, key);
-	if (error)
-		goto out_err;
-
-	buflen = inode->i_op->getxattr(dentry, key, NULL, 0);
-	if (buflen <= 0) {
-		error = buflen < 0 ? buflen : -ENODATA;
-		goto out_err;
-	}
-
-	buf = kmalloc(buflen, GFP_KERNEL);
-	if (buf == NULL) {
-		error = -ENOMEM;
-		goto out_err;
-	}
-
-	error = inode->i_op->getxattr(dentry, key, buf, buflen);
-	if (error < 0)
-		goto out_err;
+	buflen = nfsd_getxattr(dentry, key, &buf);
+	if (!buflen)
+		buflen = -ENODATA;
+	if (buflen <= 0)
+		return ERR_PTR(buflen);
 
 	pacl = posix_acl_from_xattr(buf, buflen);
- out:
 	kfree(buf);
 	return pacl;
- out_err:
-	pacl = ERR_PTR(error);
-	goto out;
 }
 
 int
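
The new nfsd_getxattr() wraps the standard two-call xattr protocol: a first vfs_getxattr() with a NULL buffer returns the value's size, a buffer of that size is allocated, and a second call fetches the value; the caller receives the length (or a negative errno) and owns *buf afterwards. Typical use, modelled on _get_posix_acl() above:

	void *buf = NULL;
	ssize_t len = nfsd_getxattr(dentry, POSIX_ACL_XATTR_ACCESS, &buf);

	if (len > 0) {
		struct posix_acl *acl = posix_acl_from_xattr(buf, len);
		/* use acl, then posix_acl_release(acl) */
	}
	kfree(buf);	/* kfree(NULL) is a no-op, so this is always safe */
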
@@ -717,33 +705,40 @@ nfsd_close(struct file *filp)
  * As this calls fsync (not fdatasync) there is no need for a write_inode
  * after it.
  */
-static inline void nfsd_dosync(struct file *filp, struct dentry *dp,
+static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
 			       struct file_operations *fop)
 {
 	struct inode *inode = dp->d_inode;
 	int (*fsync) (struct file *, struct dentry *, int);
+	int err;
 
-	filemap_fdatawrite(inode->i_mapping);
-	if (fop && (fsync = fop->fsync))
-		fsync(filp, dp, 0);
-	filemap_fdatawait(inode->i_mapping);
+	err = filemap_fdatawrite(inode->i_mapping);
+	if (err == 0 && fop && (fsync = fop->fsync))
+		err = fsync(filp, dp, 0);
+	if (err == 0)
+		err = filemap_fdatawait(inode->i_mapping);
+
+	return err;
 }
 
 
-static void
+static int
 nfsd_sync(struct file *filp)
 {
+	int err;
 	struct inode *inode = filp->f_dentry->d_inode;
 	dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name);
-	down(&inode->i_sem);
-	nfsd_dosync(filp, filp->f_dentry, filp->f_op);
-	up(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
+	err=nfsd_dosync(filp, filp->f_dentry, filp->f_op);
+	mutex_unlock(&inode->i_mutex);
+
+	return err;
 }
 
-void
+int
 nfsd_sync_dir(struct dentry *dp)
 {
-	nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
+	return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
 }
 
 /*
@@ -820,7 +815,7 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset
 	return size;
 }
 
-static inline int
+static int
 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
               loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
 {
@@ -874,7 +869,17 @@ out:
 	return err;
 }
 
-static inline int
+static void kill_suid(struct dentry *dentry)
+{
+	struct iattr	ia;
+	ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
+
+	mutex_lock(&dentry->d_inode->i_mutex);
+	notify_change(dentry, &ia);
+	mutex_unlock(&dentry->d_inode->i_mutex);
+}
+
+static int
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 				loff_t offset, struct kvec *vec, int vlen,
 				unsigned long cnt, int *stablep)
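
nfsd_dosync() now propagates the first failure from the classic start/flush/wait sequence instead of returning void: kick off writeback, call the file system's ->fsync only if that succeeded, then wait for the pages. The sequence in isolation, mirroring the new code above:

	err = filemap_fdatawrite(inode->i_mapping);	/* start writeback */
	if (err == 0 && fop && (fsync = fop->fsync))
		err = fsync(filp, dp, 0);		/* flush metadata */
	if (err == 0)
		err = filemap_fdatawait(inode->i_mapping);	/* wait for I/O */
	return err;					/* first error wins */
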
@@ -886,9 +891,9 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	int			err = 0;
 	int			stable = *stablep;
 
+#ifdef MSNFS
 	err = nfserr_perm;
 
-#ifdef MSNFS
 	if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
 		(!lock_may_write(file->f_dentry->d_inode, offset, cnt)))
 		goto out;
@@ -927,14 +932,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	}
 
 	/* clear setuid/setgid flag after write */
-	if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) {
-		struct iattr	ia;
-		ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
-
-		down(&inode->i_sem);
-		notify_change(dentry, &ia);
-		up(&inode->i_sem);
-	}
+	if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
+		kill_suid(dentry);
 
 	if (err >= 0 && stable) {
 		static	ino_t	last_ino;
@@ -962,7 +961,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 		if (inode->i_state & I_DIRTY) {
 			dprintk("nfsd: write sync %d\n", current->pid);
-			nfsd_sync(file);
+			err=nfsd_sync(file);
 		}
 #if 0
 		wake_up(&inode->i_wait);
@@ -1066,7 +1065,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		return err;
 	if (EX_ISSYNC(fhp->fh_export)) {
 		if (file->f_op && file->f_op->fsync) {
-			nfsd_sync(file);
+			err = nfserrno(nfsd_sync(file));
 		} else {
 			err = nfserr_notsupp;
 		}
@@ -1134,7 +1133,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				"nfsd_create: parent %s/%s not locked!\n",
 				dentry->d_parent->d_name.name,
 				dentry->d_name.name);
-			err = -EIO;
+			err = nfserr_io;
 			goto out;
 		}
 	}
@@ -1177,7 +1176,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out_nfserr;
 
 	if (EX_ISSYNC(fhp->fh_export)) {
-		nfsd_sync_dir(dentry);
+		err = nfserrno(nfsd_sync_dir(dentry));
 		write_inode_now(dchild->d_inode, 1);
 	}
 
@@ -1187,9 +1186,11 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 * send along the gid when it tries to implement setgid
 	 * directories via NFS.
 	 */
-	err = 0;
-	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0)
-		err = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
+		int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+		if (err2)
+			err = err2;
+	}
 	/*
 	 * Update the file handle to get the new inode info.
 	 */
@@ -1308,17 +1309,10 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out_nfserr;
 
 	if (EX_ISSYNC(fhp->fh_export)) {
-		nfsd_sync_dir(dentry);
+		err = nfserrno(nfsd_sync_dir(dentry));
 		/* setattr will sync the child (or not) */
 	}
 
-	/*
-	 * Update the filehandle to get the new inode info.
-	 */
-	err = fh_update(resfhp);
-	if (err)
-		goto out;
-
 	if (createmode == NFS3_CREATE_EXCLUSIVE) {
 		/* Cram the verifier into atime/mtime/mode */
 		iap->ia_valid = ATTR_MTIME|ATTR_ATIME
@@ -1339,8 +1333,17 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 * implement setgid directories via NFS. Clear out all that cruft.
 	 */
  set_attr:
-	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0)
-		err = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0) {
+		int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+		if (err2)
+			err = err2;
+	}
+
+	/*
+	 * Update the filehandle to get the new inode info.
+	 */
+	if (!err)
+		err = fh_update(resfhp);
 
  out:
 	fh_unlock(fhp);
@@ -1449,10 +1452,10 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	} else
 		err = vfs_symlink(dentry->d_inode, dnew, path, mode);
 
-	if (!err) {
+	if (!err)
 		if (EX_ISSYNC(fhp->fh_export))
-			nfsd_sync_dir(dentry);
-	} else
+			err = nfsd_sync_dir(dentry);
+	if (err)
 		err = nfserrno(err);
 	fh_unlock(fhp);
 
@@ -1508,7 +1511,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	err = vfs_link(dold, dirp, dnew);
 	if (!err) {
 		if (EX_ISSYNC(ffhp->fh_export)) {
-			nfsd_sync_dir(ddir);
+			err = nfserrno(nfsd_sync_dir(ddir));
 			write_inode_now(dest, 1);
 		}
 	} else {
@@ -1592,13 +1595,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
 		((atomic_read(&odentry->d_count) > 1)
 		 || (atomic_read(&ndentry->d_count) > 1))) {
-			err = nfserr_perm;
+			err = -EPERM;
 	} else
 #endif
 	err = vfs_rename(fdir, odentry, tdir, ndentry);
 	if (!err && EX_ISSYNC(tfhp->fh_export)) {
-		nfsd_sync_dir(tdentry);
-		nfsd_sync_dir(fdentry);
+		err = nfsd_sync_dir(tdentry);
+		if (!err)
+			err = nfsd_sync_dir(fdentry);
 	}
 
  out_dput_new:
@@ -1663,7 +1667,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 #ifdef MSNFS
 	if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
 		(atomic_read(&rdentry->d_count) > 1)) {
-		err = nfserr_perm;
+		err = -EPERM;
 	} else
 #endif
 	err = vfs_unlink(dirp, rdentry);
@@ -1673,17 +1677,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 
 	dput(rdentry);
 
-	if (err)
-		goto out_nfserr;
-	if (EX_ISSYNC(fhp->fh_export))
-		nfsd_sync_dir(dentry);
-
-out:
-	return err;
+	if (err == 0 &&
+	    EX_ISSYNC(fhp->fh_export))
+			err = nfsd_sync_dir(dentry);
 
 out_nfserr:
 	err = nfserrno(err);
-	goto out;
+out:
+	return err;
 }
 
 /*
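
The nfsd_rename()/nfsd_unlink() hunks also stop mixing error spaces: inside the helpers everything stays a negative kernel errno (-EPERM rather than the on-the-wire nfserr_perm), and a single nfserrno() call at the exit maps whatever accumulated into an NFS status. The convention, sketched (export_is_sync stands in for EX_ISSYNC(fhp->fh_export)):

	err = vfs_unlink(dir, dentry);		/* 0 or a negative errno */
	if (err == 0 && export_is_sync)
		err = nfsd_sync_dir(dentry);	/* still an errno */
	return nfserrno(err);			/* one errno -> nfserr mapping */
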
@@ -1874,39 +1875,25 @@ nfsd_get_posix_acl(struct svc_fh *fhp, int type)
 	ssize_t size;
 	struct posix_acl *acl;
 
-	if (!IS_POSIXACL(inode) || !inode->i_op || !inode->i_op->getxattr)
+	if (!IS_POSIXACL(inode))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		break;
+	default:
 		return ERR_PTR(-EOPNOTSUPP);
-	switch(type) {
-		case ACL_TYPE_ACCESS:
-			name = POSIX_ACL_XATTR_ACCESS;
-			break;
-		case ACL_TYPE_DEFAULT:
-			name = POSIX_ACL_XATTR_DEFAULT;
-			break;
-		default:
-			return ERR_PTR(-EOPNOTSUPP);
 	}
 
-	size = inode->i_op->getxattr(fhp->fh_dentry, name, NULL, 0);
+	size = nfsd_getxattr(fhp->fh_dentry, name, &value);
+	if (size < 0)
+		return ERR_PTR(size);
 
-	if (size < 0) {
-		acl = ERR_PTR(size);
-		goto getout;
-	} else if (size > 0) {
-		value = kmalloc(size, GFP_KERNEL);
-		if (!value) {
-			acl = ERR_PTR(-ENOMEM);
-			goto getout;
-		}
-		size = inode->i_op->getxattr(fhp->fh_dentry, name, value, size);
-		if (size < 0) {
-			acl = ERR_PTR(size);
-			goto getout;
-		}
-	}
 	acl = posix_acl_from_xattr(value, size);
-
-getout:
 	kfree(value);
 	return acl;
 }
@@ -1947,16 +1934,13 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
 	} else
 		size = 0;
 
-	if (!fhp->fh_locked)
-		fh_lock(fhp); /* unlocking is done automatically */
 	if (size)
-		error = inode->i_op->setxattr(fhp->fh_dentry, name,
-					      value, size, 0);
+		error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
 	else {
 		if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT)
 			error = 0;
 		else {
-			error = inode->i_op->removexattr(fhp->fh_dentry, name);
+			error = vfs_removexattr(fhp->fh_dentry, name);
 			if (error == -ENODATA)
 				error = 0;
 		}
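
Switching the ACL paths from raw inode->i_op->setxattr/removexattr calls to vfs_setxattr()/vfs_removexattr() moves the i_mutex locking and the security-hook calls into the VFS helpers, which is why the hand-rolled fh_lock()/down()/security_inode_setxattr() choreography could be deleted above. The caller's view after the change, as a sketch:

	/* vfs_removexattr() takes i_mutex and runs the security hooks itself,
	 * so the caller needs no locking of its own. */
	error = vfs_removexattr(dentry, name);
	if (error == -ENODATA)
		error = 0;	/* attribute absent: removing it is a no-op */
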
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 50a7749cfca..02f44094bda 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -884,7 +884,7 @@ ToDo/Notes:
 
 	- Add handling for initialized_size != data_size in compressed files.
 	- Reduce function local stack usage from 0x3d4 bytes to just noise in
-	  fs/ntfs/upcase.c. (Randy Dunlap <rddunlap@osdl.ord>)
+	  fs/ntfs/upcase.c. (Randy Dunlap <rdunlap@xenotime.net>)
 	- Remove compiler warnings for newer gcc.
 	- Pages are no longer kmapped by mm/filemap.c::generic_file_write()
 	  around calls to ->{prepare,commit}_write. Adapt NTFS appropriately
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index eda056bac25..9480a0526cd 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1532,7 +1532,7 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
  * NOTE to self: No changes in the attribute list are required to move from
  * a resident to a non-resident attribute.
  *
- * Locking: - The caller must hold i_sem on the inode.
+ * Locking: - The caller must hold i_mutex on the inode.
  */
 int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
 {
@@ -1728,7 +1728,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
 	/*
 	 * This needs to be last since the address space operations ->readpage
 	 * and ->writepage can run concurrently with us as they are not
-	 * serialized on i_sem. Note, we are not allowed to fail once we flip
+	 * serialized on i_mutex. Note, we are not allowed to fail once we flip
 	 * this switch, which is another reason to do this last.
 	 */
 	NInoSetNonResident(ni);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 795c3d1930f..b0690d4c890 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -69,7 +69,7 @@ ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
  * work but we don't care for how quickly one can access them. This also fixes
  * the dcache aliasing issues.
  *
- * Locking: - Caller must hold i_sem on the directory.
+ * Locking: - Caller must hold i_mutex on the directory.
  *	    - Each page cache page in the index allocation mapping must be
  *	      locked whilst being accessed otherwise we may find a corrupt
  *	      page due to it being under ->writepage at the moment which
@@ -1085,11 +1085,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
  * While this will return the names in random order this doesn't matter for
  * ->readdir but OTOH results in a faster ->readdir.
  *
- * VFS calls ->readdir without BKL but with i_sem held. This protects the VFS
+ * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS
  * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
  * modifications).
  *
- * Locking: - Caller must hold i_sem on the directory.
+ * Locking: - Caller must hold i_mutex on the directory.
  *	    - Each page cache page in the index allocation mapping must be
  *	      locked whilst being accessed otherwise we may find a corrupt
  *	      page due to it being under ->writepage at the moment which
@@ -1520,7 +1520,7 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
  * Note: In the past @filp could be NULL so we ignore it as we don't need it
  * anyway.
  *
- * Locking: Caller must hold i_sem on the inode.
+ * Locking: Caller must hold i_mutex on the inode.
  *
  * TODO: We should probably also write all attribute/index inodes associated
  * with this inode but since we have no simple way of getting to them we ignore
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 72753389181..fb413d3d861 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -106,7 +106,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
  * this is the case, the necessary zeroing will also have happened and that all
  * metadata is self-consistent.
  *
- * Locking: i_sem on the vfs inode corrseponsind to the ntfs inode @ni must be
+ * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
  * held by the caller.
  */
 static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
@@ -473,7 +473,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
  * @bytes:	number of bytes to be written
  *
  * This is called for non-resident attributes from ntfs_file_buffered_write()
- * with i_sem held on the inode (@pages[0]->mapping->host). There are
+ * with i_mutex held on the inode (@pages[0]->mapping->host). There are
  * @nr_pages pages in @pages which are locked but not kmap()ped. The source
  * data has not yet been copied into the @pages.
  *
@@ -1637,7 +1637,7 @@ err_out:
  * @pos:	byte position in file at which the write begins
  * @bytes:	number of bytes to be written
  *
- * This is called from ntfs_file_buffered_write() with i_sem held on the inode
+ * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
  * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
  * locked but not kmap()ped. The source data has already been copied into the
  * @page. ntfs_prepare_pages_for_non_resident_write() has been called before
@@ -1814,7 +1814,7 @@ err_out:
 /**
  * ntfs_file_buffered_write -
  *
- * Locking: The vfs is holding ->i_sem on the inode.
+ * Locking: The vfs is holding ->i_mutex on the inode.
  */
 static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 		const struct iovec *iov, unsigned long nr_segs,
@@ -2173,7 +2173,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
 	err = remove_suid(file->f_dentry);
 	if (err)
 		goto out;
-	inode_update_time(inode, 1);
+	file_update_time(file);
 	written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
 			count);
 out:
@@ -2196,9 +2196,9 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf,
 
 	BUG_ON(iocb->ki_pos != pos);
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = sync_page_range(inode, mapping, pos, ret);
 		if (err < 0)
@@ -2221,12 +2221,12 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
 	struct kiocb kiocb;
 	ssize_t ret;
 
-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
 	init_sync_kiocb(&kiocb, file);
 	ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
 	if (ret == -EIOCBQUEUED)
 		ret = wait_on_sync_kiocb(&kiocb);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = sync_page_range(inode, mapping, *ppos - ret, ret);
 		if (err < 0)
@@ -2269,7 +2269,7 @@ static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
  * Note: In the past @filp could be NULL so we ignore it as we don't need it
  * anyway.
  *
- * Locking: Caller must hold i_sem on the inode.
+ * Locking: Caller must hold i_mutex on the inode.
  *
  * TODO: We should probably also write all attribute/index inodes associated
  * with this inode but since we have no simple way of getting to them we ignore
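
Besides the mutex conversion, the write path above switches from inode_update_time(inode, 1) to file_update_time(file), the newer helper that takes the struct file rather than the bare inode. A rough sketch of where that call sits in a buffered write path (illustrative only; do_buffered_write is a made-up name standing in for ntfs_file_aio_write_nolock()):

	static ssize_t do_buffered_write(struct kiocb *iocb, struct file *file,
			const struct iovec *iov, unsigned long nr_segs,
			loff_t pos, loff_t *ppos, size_t count)
	{
		int err;

		err = remove_suid(file->f_dentry);	/* drop setuid bits on write */
		if (err)
			return err;
		file_update_time(file);			/* mtime/ctime update, was inode_update_time() */
		return ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
				count);
	}
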
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 8f2d5727546..9f5427c2d10 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -32,7 +32,7 @@
  * Allocate a new index context, initialize it with @idx_ni and return it.
  * Return NULL if allocation failed.
  *
- * Locking: Caller must hold i_sem on the index inode.
+ * Locking: Caller must hold i_mutex on the index inode.
  */
 ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
 {
@@ -50,7 +50,7 @@ ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
  *
  * Release the index context @ictx, releasing all associated resources.
  *
- * Locking: Caller must hold i_sem on the index inode.
+ * Locking: Caller must hold i_mutex on the index inode.
  */
 void ntfs_index_ctx_put(ntfs_index_context *ictx)
 {
@@ -106,7 +106,7 @@ void ntfs_index_ctx_put(ntfs_index_context *ictx)
  * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
  * ensure that the changes are written to disk.
  *
- * Locking: - Caller must hold i_sem on the index inode.
+ * Locking: - Caller must hold i_mutex on the index inode.
  *	    - Each page cache page in the index allocation mapping must be
  *	      locked whilst being accessed otherwise we may find a corrupt
  *	      page due to it being under ->writepage at the moment which
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index b24f4c4b2c5..ea1bd3feea1 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2125,13 +2125,13 @@ void ntfs_put_inode(struct inode *vi)
 	ntfs_inode *ni = NTFS_I(vi);
 	if (NInoIndexAllocPresent(ni)) {
 		struct inode *bvi = NULL;
-		down(&vi->i_sem);
+		mutex_lock(&vi->i_mutex);
 		if (atomic_read(&vi->i_count) == 2) {
 			bvi = ni->itype.index.bmp_ino;
 			if (bvi)
 				ni->itype.index.bmp_ino = NULL;
 		}
-		up(&vi->i_sem);
+		mutex_unlock(&vi->i_mutex);
 		if (bvi)
 			iput(bvi);
 	}
@@ -2311,7 +2311,7 @@ static const char *es = " Leaving inconsistent metadata. Unmount and run "
  *
  * Returns 0 on success or -errno on error.
  *
- * Called with ->i_sem held. In all but one case ->i_alloc_sem is held for
+ * Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for
  * writing. The only case in the kernel where ->i_alloc_sem is not held is
  * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
  * with the current i_size as the offset. The analogous place in NTFS is in
@@ -2767,7 +2767,25 @@ unm_done:
 	up_write(&ni->runlist.lock);
done:
 	/* Update the mtime and ctime on the base inode. */
-	inode_update_time(VFS_I(base_ni), 1);
+	/* normally ->truncate shouldn't update ctime or mtime,
+	 * but ntfs did before so it got a copy & paste version
+	 * of file_update_time.  one day someone should fix this
+	 * for real.
+	 */
+	if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
+		struct timespec now = current_fs_time(VFS_I(base_ni)->i_sb);
+		int sync_it = 0;
+
+		if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) ||
+		    !timespec_equal(&VFS_I(base_ni)->i_ctime, &now))
+			sync_it = 1;
+		VFS_I(base_ni)->i_mtime = now;
+		VFS_I(base_ni)->i_ctime = now;
+
+		if (sync_it)
+			mark_inode_dirty_sync(VFS_I(base_ni));
+	}
+
 	if (likely(!err)) {
 		NInoClearTruncateFailed(ni);
 		ntfs_debug("Done.");
@@ -2831,7 +2849,7 @@ void ntfs_truncate_vfs(struct inode *vi) {
  * We also abort all changes of user, group, and mode as we do not implement
  * the NTFS ACLs yet.
  *
- * Called with ->i_sem held. For the ATTR_SIZE (i.e. ->truncate) case, also
+ * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also
  * called with ->i_alloc_sem held for writing.
  *
  * Basically this is a copy of generic notify_change() and inode_setattr()
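
The block added to ntfs_truncate() above is, as its own comment admits, an open-coded file_update_time(): ->truncate has no struct file to pass, so the patch inlines the equivalent inode-based logic. Reduced to its core, the pattern looks like this (a sketch, with vi standing in for VFS_I(base_ni)):

	struct inode *vi = VFS_I(base_ni);

	if (!IS_NOCMTIME(vi) && !IS_RDONLY(vi)) {
		struct timespec now = current_fs_time(vi->i_sb);
		/* only queue a writeback if the stamps actually change */
		int sync_it = !timespec_equal(&vi->i_mtime, &now) ||
			      !timespec_equal(&vi->i_ctime, &now);

		vi->i_mtime = now;
		vi->i_ctime = now;
		if (sync_it)
			mark_inode_dirty_sync(vi);
	}
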
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 351dbc3b6e4..5ea9eb93af6 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -96,7 +96,7 @@
  * name. We then convert the name to the current NLS code page, and proceed
  * searching for a dentry with this name, etc, as in case 2), above.
  *
- * Locking: Caller must hold i_sem on the directory.
+ * Locking: Caller must hold i_mutex on the directory.
  */
 static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
 		struct nameidata *nd)
@@ -254,7 +254,7 @@ handle_name:
 	nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
 
 	/*
-	 * Note: No need for dent->d_lock lock as i_sem is held on the
+	 * Note: No need for dent->d_lock lock as i_mutex is held on the
 	 * parent inode.
 	 */
 
@@ -374,7 +374,7 @@ struct inode_operations ntfs_dir_inode_ops = {
  * The code is based on the ext3 ->get_parent() implementation found in
  * fs/ext3/namei.c::ext3_get_parent().
  *
- * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_sem down.
+ * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_mutex down.
  *
  * Return the dentry of the parent directory on success or the error code on
  * error (IS_ERR() is true).
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
index 833df2a4e9f..d0ef4182147 100644
--- a/fs/ntfs/quota.c
+++ b/fs/ntfs/quota.c
@@ -48,7 +48,7 @@ BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
 		ntfs_error(vol->sb, "Quota inodes are not open.");
 		return FALSE;
 	}
-	down(&vol->quota_q_ino->i_sem);
+	mutex_lock(&vol->quota_q_ino->i_mutex);
 	ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
 	if (!ictx) {
 		ntfs_error(vol->sb, "Failed to get index context.");
@@ -98,7 +98,7 @@ BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
 	ntfs_index_entry_mark_dirty(ictx);
set_done:
 	ntfs_index_ctx_put(ictx);
-	up(&vol->quota_q_ino->i_sem);
+	mutex_unlock(&vol->quota_q_ino->i_mutex);
 	/*
 	 * We set the flag so we do not try to mark the quotas out of date
 	 * again on remount.
@@ -110,7 +110,7 @@ done:
err_out:
 	if (ictx)
 		ntfs_index_ctx_put(ictx);
-	up(&vol->quota_q_ino->i_sem);
+	mutex_unlock(&vol->quota_q_ino->i_mutex);
 	return FALSE;
 }
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 6c16db9e1a8..c3a3f1a8310 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -443,8 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 	ntfs_debug("Entering with remount options string: %s", opt);
 #ifndef NTFS_RW
-	/* For read-only compiled driver, enforce all read-only flags. */
-	*flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+	/* For read-only compiled driver, enforce read-only flag. */
+	*flags |= MS_RDONLY;
 #else /* NTFS_RW */
 	/*
 	 * For the read-write compiled driver, if we are remounting read-write,
@@ -1213,10 +1213,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
 	 * Find the inode number for the hibernation file by looking up the
 	 * filename hiberfil.sys in the root directory.
 	 */
-	down(&vol->root_ino->i_sem);
+	mutex_lock(&vol->root_ino->i_mutex);
 	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
 			&name);
-	up(&vol->root_ino->i_sem);
+	mutex_unlock(&vol->root_ino->i_mutex);
 	if (IS_ERR_MREF(mref)) {
 		ret = MREF_ERR(mref);
 		/* If the file does not exist, Windows is not hibernated. */
@@ -1307,10 +1307,10 @@ static BOOL load_and_init_quota(ntfs_volume *vol)
 	 * Find the inode number for the quota file by looking up the filename
 	 * $Quota in the extended system files directory $Extend.
 	 */
-	down(&vol->extend_ino->i_sem);
+	mutex_lock(&vol->extend_ino->i_mutex);
 	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
 			&name);
-	up(&vol->extend_ino->i_sem);
+	mutex_unlock(&vol->extend_ino->i_mutex);
 	if (IS_ERR_MREF(mref)) {
 		/*
 		 * If the file does not exist, quotas are disabled and have
@@ -1390,10 +1390,10 @@ static BOOL load_and_init_usnjrnl(ntfs_volume *vol)
 	 * Find the inode number for the transaction log file by looking up the
 	 * filename $UsnJrnl in the extended system files directory $Extend.
 	 */
-	down(&vol->extend_ino->i_sem);
+	mutex_lock(&vol->extend_ino->i_mutex);
 	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
 			&name);
-	up(&vol->extend_ino->i_sem);
+	mutex_unlock(&vol->extend_ino->i_mutex);
 	if (IS_ERR_MREF(mref)) {
 		/*
 		 * If the file does not exist, transaction logging is disabled,
@@ -1721,7 +1721,7 @@ static BOOL load_system_files(ntfs_volume *vol)
 					es3);
 			goto iput_mirr_err_out;
 		}
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		ntfs_error(sb, "%s. Mounting read-only%s",
 				!vol->mftmirr_ino ? es1 : es2, es3);
 	} else
@@ -1837,7 +1837,7 @@ get_ctx_vol_failed:
 					es1, es2);
 			goto iput_vol_err_out;
 		}
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
 	} else
 		ntfs_warning(sb, "%s. Will not be able to remount "
@@ -1874,7 +1874,7 @@ get_ctx_vol_failed:
 			}
 			goto iput_logfile_err_out;
 		}
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
 	} else
 		ntfs_warning(sb, "%s. Will not be able to remount "
@@ -1919,7 +1919,7 @@ get_ctx_vol_failed:
 					es1, es2);
 			goto iput_root_err_out;
 		}
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
 	} else
 		ntfs_warning(sb, "%s. Will not be able to remount "
@@ -1943,7 +1943,7 @@ get_ctx_vol_failed:
 			goto iput_root_err_out;
 		}
 		ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		/*
 		 * Do not set NVolErrors() because ntfs_remount() might manage
 		 * to set the dirty flag in which case all would be well.
@@ -1970,7 +1970,7 @@ get_ctx_vol_failed:
 			goto iput_root_err_out;
 		}
 		ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		NVolSetErrors(vol);
 	}
 #endif
@@ -1989,7 +1989,7 @@ get_ctx_vol_failed:
 			goto iput_root_err_out;
 		}
 		ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		NVolSetErrors(vol);
 	}
 #endif /* NTFS_RW */
@@ -2030,7 +2030,7 @@ get_ctx_vol_failed:
 					es1, es2);
 			goto iput_quota_err_out;
 		}
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
 	} else
 		ntfs_warning(sb, "%s. Will not be able to remount "
@@ -2053,7 +2053,7 @@ get_ctx_vol_failed:
 			goto iput_quota_err_out;
 		}
 		ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		NVolSetErrors(vol);
 	}
 	/*
@@ -2074,7 +2074,7 @@ get_ctx_vol_failed:
 					es1, es2);
 			goto iput_usnjrnl_err_out;
 		}
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
 	} else
 		ntfs_warning(sb, "%s. Will not be able to remount "
@@ -2097,7 +2097,7 @@ get_ctx_vol_failed:
 			goto iput_usnjrnl_err_out;
 		}
 		ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		NVolSetErrors(vol);
 	}
 #endif /* NTFS_RW */
@@ -2312,9 +2312,9 @@ static void ntfs_put_super(struct super_block *sb)
 	if (!list_empty(&sb->s_dirty)) {
 		const char *s1, *s2;
 
-		down(&vol->mft_ino->i_sem);
+		mutex_lock(&vol->mft_ino->i_mutex);
 		truncate_inode_pages(vol->mft_ino->i_mapping, 0);
-		up(&vol->mft_ino->i_sem);
+		mutex_unlock(&vol->mft_ino->i_mutex);
 		write_inode_now(vol->mft_ino, 1);
 		if (!list_empty(&sb->s_dirty)) {
 			static const char *_s1 = "inodes";
@@ -2689,7 +2689,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 
 	ntfs_debug("Entering.");
 #ifndef NTFS_RW
-	sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+	sb->s_flags |= MS_RDONLY;
 #endif /* ! NTFS_RW */
 	/* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
 	sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
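
A second, independent cleanup runs through super.c above: wherever the driver used to force MS_RDONLY | MS_NOATIME | MS_NODIRATIME, it now forces only MS_RDONLY, leaving atime behaviour to the user's mount options. The resulting idiom in ntfs_fill_super() (directly from the patch, shown in isolation):

	#ifndef NTFS_RW
		/* Read-only build: only the read-only flag is forced now;
		 * noatime/nodiratime remain the user's choice. */
		sb->s_flags |= MS_RDONLY;
	#endif
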
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
new file mode 100644
index 00000000000..7d3be845a61
--- /dev/null
+++ b/fs/ocfs2/Makefile
@@ -0,0 +1,33 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+
+ocfs2-objs := \
+	alloc.o 		\
+	aops.o 			\
+	buffer_head_io.o	\
+	dcache.o 		\
+	dir.o 			\
+	dlmglue.o 		\
+	export.o 		\
+	extent_map.o 		\
+	file.o 			\
+	heartbeat.o 		\
+	inode.o 		\
+	journal.o 		\
+	localalloc.o 		\
+	mmap.o 			\
+	namei.o 		\
+	slot_map.o 		\
+	suballoc.o 		\
+	super.o 		\
+	symlink.o 		\
+	sysfile.o 		\
+	uptodate.o 		\
+	ver.o 			\
+	vote.o
+
+obj-$(CONFIG_OCFS2_FS) += cluster/
+obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
new file mode 100644
index 00000000000..6b9812db377
--- /dev/null
+++ b/fs/ocfs2/alloc.c
@@ -0,0 +1,2040 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.c
+ *
+ * Extent allocs and frees
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "sysfile.h"
+#include "file.h"
+#include "super.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_extent_contig(struct inode *inode,
+			       struct ocfs2_extent_rec *ext,
+			       u64 blkno);
+
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     int wanted,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct buffer_head *bhs[]);
+
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    struct ocfs2_alloc_context *meta_ac);
+
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  struct ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh);
+
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 blkno,
+				  u32 new_clusters);
+
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh);
+
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+				       struct inode *inode,
+				       struct ocfs2_dinode *fe,
+				       unsigned int new_i_clusters,
+				       struct buffer_head *old_last_eb,
+				       struct buffer_head **new_last_eb);
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+
+static int ocfs2_extent_contig(struct inode *inode,
+			       struct ocfs2_extent_rec *ext,
+			       u64 blkno)
+{
+	return blkno == (le64_to_cpu(ext->e_blkno) +
+			 ocfs2_clusters_to_blocks(inode->i_sb,
+						  le32_to_cpu(ext->e_clusters)));
+}
+
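
ocfs2_extent_contig() above treats a new allocation as contiguous with an extent record exactly when it starts one block past the end of that record. A worked example of the arithmetic (editorial; the constants are made up):

	/* Suppose 8 blocks per cluster (e.g. 4K blocks, 32K clusters). */
	u64 e_blkno    = 100;	/* record starts at block 100 */
	u32 e_clusters = 2;	/* and covers 2 clusters = 16 blocks */

	/* The record occupies blocks 100..115, so only blkno == 116 is
	 * contiguous and lets the record be widened in place rather
	 * than consuming a fresh record slot. */
	u64 next = e_blkno + (u64)e_clusters * 8;	/* == 116 */
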
+/*
+ * How many free extents have we got before we need more meta data?
+ */
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct ocfs2_dinode *fe)
+{
+	int retval;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_block *eb;
+	struct buffer_head *eb_bh = NULL;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		retval = -EIO;
+		goto bail;
+	}
+
+	if (fe->i_last_eb_blk) {
+		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &eb_bh, OCFS2_BH_CACHED, inode);
+		if (retval < 0) {
+			mlog_errno(retval);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+
+	BUG_ON(el->l_tree_depth != 0);
+
+	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
+bail:
+	if (eb_bh)
+		brelse(eb_bh);
+
+	mlog_exit(retval);
+	return retval;
+}
+
+/* expects array to already be allocated
+ *
+ * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
+ * l_count for you
+ */
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     int wanted,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct buffer_head *bhs[])
+{
+	int count, status, i;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	struct ocfs2_extent_block *eb;
+
+	mlog_entry_void();
+
+	count = 0;
+	while (count < wanted) {
+		status = ocfs2_claim_metadata(osb,
+					      handle,
+					      meta_ac,
+					      wanted - count,
+					      &suballoc_bit_start,
+					      &num_got,
+					      &first_blkno);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		for(i = count; i < (num_got + count); i++) {
+			bhs[i] = sb_getblk(osb->sb, first_blkno);
+			if (bhs[i] == NULL) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+
+			status = ocfs2_journal_access(handle, inode, bhs[i],
+						      OCFS2_JOURNAL_ACCESS_CREATE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+
+			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
+			/* Ok, setup the minimal stuff here. */
+			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+			eb->h_blkno = cpu_to_le64(first_blkno);
+			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+			/* we always use slot zero's suballocator */
+			eb->h_suballoc_slot = 0;
+#else
+			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+#endif
+			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+			eb->h_list.l_count =
+				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+			suballoc_bit_start++;
+			first_blkno++;
+
+			/* We'll also be dirtied by the caller, so
+			 * this isn't absolutely necessary. */
+			status = ocfs2_journal_dirty(handle, bhs[i]);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		count += num_got;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		for(i = 0; i < wanted; i++) {
+			if (bhs[i])
+				brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+	}
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Add an entire tree branch to our inode. eb_bh is the extent block
+ * to start at, if we don't want to start the branch at the dinode
+ * structure.
+ *
+ * last_eb_bh is required as we have to update it's next_leaf pointer
+ * for the new last extent block.
+ *
+ * the new branch will be 'empty' in the sense that every block will
+ * contain a single record with e_clusters == 0.
+ */
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    struct ocfs2_alloc_context *meta_ac)
+{
+	int status, new_blocks, i;
+	u64 next_blkno, new_last_eb_blk;
+	struct buffer_head *bh;
+	struct buffer_head **new_eb_bhs = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *eb_el;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	BUG_ON(!last_eb_bh);
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (eb_bh) {
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+
+	/* we never add a branch to a leaf. */
+	BUG_ON(!el->l_tree_depth);
+
+	new_blocks = le16_to_cpu(el->l_tree_depth);
+
+	/* allocate the number of new eb blocks we need */
+	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
+			     GFP_KERNEL);
+	if (!new_eb_bhs) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
+					   meta_ac, new_eb_bhs);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+	 * linked with the rest of the tree.
+	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
+	 *
+	 * when we leave the loop, new_last_eb_blk will point to the
+	 * newest leaf, and next_blkno will point to the topmost extent
+	 * block. */
+	next_blkno = new_last_eb_blk = 0;
+	for(i = 0; i < new_blocks; i++) {
+		bh = new_eb_bhs[i];
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		eb_el = &eb->h_list;
+
+		status = ocfs2_journal_access(handle, inode, bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb->h_next_leaf_blk = 0;
+		eb_el->l_tree_depth = cpu_to_le16(i);
+		eb_el->l_next_free_rec = cpu_to_le16(1);
+		eb_el->l_recs[0].e_cpos = fe->i_clusters;
+		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
+		eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+		if (!eb_el->l_tree_depth)
+			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
+
+		status = ocfs2_journal_dirty(handle, bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		next_blkno = le64_to_cpu(eb->h_blkno);
+	}
+
+	/* This is a bit hairy. We want to update up to three blocks
+	 * here without leaving any of them in an inconsistent state
+	 * in case of error. We don't have to worry about
+	 * journal_dirty erroring as it won't unless we've aborted the
+	 * handle (in which case we would never be here) so reserving
+	 * the write with journal_access is all we need to do. */
+	status = ocfs2_journal_access(handle, inode, last_eb_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (eb_bh) {
+		status = ocfs2_journal_access(handle, inode, eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Link the new branch into the rest of the tree (el will
+	 * either be on the fe, or the extent block passed in. */
+	i = le16_to_cpu(el->l_next_free_rec);
+	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
+	el->l_recs[i].e_cpos = fe->i_clusters;
+	el->l_recs[i].e_clusters = 0;
+	le16_add_cpu(&el->l_next_free_rec, 1);
+
+	/* fe needs a new last extent block pointer, as does the
+	 * next_leaf on the previously last-extent-block. */
+	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+
+	eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
+
+	status = ocfs2_journal_dirty(handle, last_eb_bh);
+	if (status < 0)
+		mlog_errno(status);
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+	if (eb_bh) {
+		status = ocfs2_journal_dirty(handle, eb_bh);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	status = 0;
+bail:
+	if (new_eb_bhs) {
+		for (i = 0; i < new_blocks; i++)
+			if (new_eb_bhs[i])
+				brelse(new_eb_bhs[i]);
+		kfree(new_eb_bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
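
The "bit hairy" comment in ocfs2_add_branch() above describes the journaling discipline used throughout this file: reserve write access to every buffer that will change before modifying any of them, so that an error partway through cannot leave a half-journaled tree. The skeleton of the protocol, distilled from the calls in this file (a sketch, not a standalone function):

	/* 1. Reserve: declare every buffer we will touch to the journal. */
	status = ocfs2_journal_access(handle, inode, bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto bail;

	/* 2. Modify: only now is it safe to edit the buffer contents. */
	/* ... update on-disk structures in bh->b_data ... */

	/* 3. Dirty: hand the modified buffer back to the journal. */
	status = ocfs2_journal_dirty(handle, bh);
	if (status < 0)
		goto bail;
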
+/*
+ * adds another level to the allocation tree.
+ * returns back the new extent block so you can add a branch to it
+ * after this call.
+ */
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  struct ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh)
+{
+	int status, i;
+	struct buffer_head *new_eb_bh = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *fe_el;
+	struct ocfs2_extent_list  *eb_el;
+
+	mlog_entry_void();
+
+	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
+					   &new_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+		status = -EIO;
+		goto bail;
+	}
+
+	eb_el = &eb->h_list;
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	fe_el = &fe->id2.i_list;
+
+	status = ocfs2_journal_access(handle, inode, new_eb_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* copy the fe data into the new extent block */
+	eb_el->l_tree_depth = fe_el->l_tree_depth;
+	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
+	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
+		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
+		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
+	}
+
+	status = ocfs2_journal_dirty(handle, new_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* update fe now */
+	le16_add_cpu(&fe_el->l_tree_depth, 1);
+	fe_el->l_recs[0].e_cpos = 0;
+	fe_el->l_recs[0].e_blkno = eb->h_blkno;
+	fe_el->l_recs[0].e_clusters = fe->i_clusters;
+	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+		fe_el->l_recs[i].e_cpos = 0;
+		fe_el->l_recs[i].e_clusters = 0;
+		fe_el->l_recs[i].e_blkno = 0;
+	}
+	fe_el->l_next_free_rec = cpu_to_le16(1);
+
+	/* If this is our 1st tree depth shift, then last_eb_blk
+	 * becomes the allocated extent block */
+	if (fe_el->l_tree_depth == cpu_to_le16(1))
+		fe->i_last_eb_blk = eb->h_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*ret_new_eb_bh = new_eb_bh;
+	new_eb_bh = NULL;
+	status = 0;
+bail:
+	if (new_eb_bh)
+		brelse(new_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Expects the tree to already have room in the rightmost leaf for the
+ * extent. Updates all the extent blocks (and the dinode) on the way
+ * down.
+ */
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 start_blk,
+				  u32 new_clusters)
+{
+	int status, i, num_bhs = 0;
+	u64 next_blkno;
+	u16 next_free;
+	struct buffer_head **eb_bhs = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+	if (el->l_tree_depth) {
+		/* This is another operation where we want to be
+		 * careful about our tree updates. An error here means
+		 * none of the previous changes we made should roll
+		 * forward. As a result, we have to record the buffers
+		 * for this part of the tree in an array and reserve a
+		 * journal write to them before making any changes. */
+		num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+		eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
+				 GFP_KERNEL);
+		if (!eb_bhs) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		i = 0;
+		while(el->l_tree_depth) {
+			next_free = le16_to_cpu(el->l_next_free_rec);
+			if (next_free == 0) {
+				ocfs2_error(inode->i_sb,
+					    "Dinode %"MLFu64" has a bad "
+					    "extent list",
+					    OCFS2_I(inode)->ip_blkno);
+				status = -EIO;
+				goto bail;
+			}
+			next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
+
+			BUG_ON(i >= num_bhs);
+			status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
+						  OCFS2_BH_CACHED, inode);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+			if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+								 eb);
+				status = -EIO;
+				goto bail;
+			}
+
+			status = ocfs2_journal_access(handle, inode, eb_bhs[i],
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+
+			el = &eb->h_list;
+			i++;
+			/* When we leave this loop, eb_bhs[num_bhs - 1] will
+			 * hold the bottom-most leaf extent block. */
+		}
+		BUG_ON(el->l_tree_depth);
+
+		el = &fe->id2.i_list;
+		/* If we have tree depth, then the fe update is
+		 * trivial, and we want to switch el out for the
+		 * bottom-most leaf in order to update it with the
+		 * actual extent data below. */
+		next_free = le16_to_cpu(el->l_next_free_rec);
+		if (next_free == 0) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %"MLFu64" has a bad "
+				    "extent list",
+				    OCFS2_I(inode)->ip_blkno);
+			status = -EIO;
+			goto bail;
+		}
+		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+			     new_clusters);
+		/* (num_bhs - 1) to avoid the leaf */
+		for(i = 0; i < (num_bhs - 1); i++) {
+			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+			el = &eb->h_list;
+
+			/* finally, make our actual change to the
+			 * intermediate extent blocks. */
+			next_free = le16_to_cpu(el->l_next_free_rec);
+			le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+				     new_clusters);
+
+			status = ocfs2_journal_dirty(handle, eb_bhs[i]);
+			if (status < 0)
+				mlog_errno(status);
+		}
+		BUG_ON(i != (num_bhs - 1));
+		/* note that the leaf block wasn't touched in
+		 * the loop above */
+		eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
+		el = &eb->h_list;
+		BUG_ON(el->l_tree_depth);
+	}
+
+	/* yay, we can finally add the actual extent now! */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le16_to_cpu(el->l_next_free_rec) &&
+	    ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
+		le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
+	} else if (le16_to_cpu(el->l_next_free_rec) &&
+		   (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
+		/* having an empty extent at eof is legal. */
+		if (el->l_recs[i].e_cpos != fe->i_clusters) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %"MLFu64" trailing extent is bad: "
+				    "cpos (%u) != number of clusters (%u)",
+				    le32_to_cpu(el->l_recs[i].e_cpos),
+				    le32_to_cpu(fe->i_clusters));
+			status = -EIO;
+			goto bail;
+		}
+		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+	} else {
+		/* No contiguous record, or no empty record at eof, so
+		 * we add a new one. */
+
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
+		       le16_to_cpu(el->l_count));
+		i = le16_to_cpu(el->l_next_free_rec);
+
+		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+		el->l_recs[i].e_cpos = fe->i_clusters;
+		le16_add_cpu(&el->l_next_free_rec, 1);
+	}
+
+	/*
+	 * extent_map errors are not fatal, so they are ignored outside
+	 * of flushing the thing.
+	 */
+	status = ocfs2_extent_map_append(inode, &el->l_recs[i],
+					 new_clusters);
+	if (status) {
+		mlog_errno(status);
+		ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
+	}
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+	if (fe->id2.i_list.l_tree_depth) {
+		status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	status = 0;
+bail:
+	if (eb_bhs) {
+		for (i = 0; i < num_bhs; i++)
+			if (eb_bhs[i])
+				brelse(eb_bhs[i]);
+		kfree(eb_bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Should only be called when there is no space left in any of the
+ * leaf nodes. What we want to do is find the lowest tree depth
+ * non-leaf extent block with room for new records. There are three
+ * valid results of this search:
+ *
+ * 1) a lowest extent block is found, then we pass it back in
+ *    *lowest_eb_bh and return '0'
+ *
+ * 2) the search fails to find anything, but the dinode has room. We
+ *    pass NULL back in *lowest_eb_bh, but still return '0'
+ *
+ * 3) the search fails to find anything AND the dinode is full, in
+ *    which case we return > 0
+ *
+ * return status < 0 indicates an error.
+ */
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh)
+{
+	int status = 0, i;
+	u64 blkno;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *lowest_bh = NULL;
+
+	mlog_entry_void();
+
+	*target_bh = NULL;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+
+	while(le16_to_cpu(el->l_tree_depth) > 1) {
+		if (le16_to_cpu(el->l_next_free_rec) == 0) {
+			ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty "
+				    "extent list (next_free_rec == 0)",
+				    OCFS2_I(inode)->ip_blkno);
+			status = -EIO;
+			goto bail;
+		}
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+		if (!blkno) {
+			ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent "
+				    "list where extent # %d has no physical "
+				    "block start",
+				    OCFS2_I(inode)->ip_blkno, i);
+			status = -EIO;
+			goto bail;
+		}
+
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
+					  inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &eb->h_list;
+
+		if (le16_to_cpu(el->l_next_free_rec) <
+		    le16_to_cpu(el->l_count)) {
+			if (lowest_bh)
+				brelse(lowest_bh);
+			lowest_bh = bh;
+			get_bh(lowest_bh);
+		}
+	}
+
+	/* If we didn't find one and the fe doesn't have any room,
+	 * then return '1' */
+	if (!lowest_bh
+	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+		status = 1;
+
+	*target_bh = lowest_bh;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* the caller needs to update fe->i_clusters */
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 start_blk,
+			u32 new_clusters,
+			struct ocfs2_alloc_context *meta_ac)
+{
+	int status, i, shift;
+	struct buffer_head *last_eb_bh = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	mlog(0, "add %u clusters starting at block %"MLFu64" to "
+	     "inode %"MLFu64"\n",
+	     new_clusters, start_blk, OCFS2_I(inode)->ip_blkno);
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+
+	if (el->l_tree_depth) {
+		/* jump to end of tree */
+		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_exit(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		el = &eb->h_list;
+	}
+
+	/* Can we allocate without adding/shifting tree bits? */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le16_to_cpu(el->l_next_free_rec) == 0
+	    || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
+	    || le32_to_cpu(el->l_recs[i].e_clusters) == 0
+	    || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
+		goto out_add;
+
+	mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
+	     "tree now.\n");
+
+	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
+	if (shift < 0) {
+		status = shift;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* We traveled all the way to the bottom of the allocation tree
+	 * and didn't find room for any more extents - we need to add
+	 * another tree level */
+	if (shift) {
+		/* if we hit a leaf, we'd better be empty :) */
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
+		       le16_to_cpu(el->l_count));
+		BUG_ON(bh);
+		mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
+		     "(current = %u)\n",
+		     le16_to_cpu(fe->id2.i_list.l_tree_depth));
+
+		/* ocfs2_shift_tree_depth will return us a buffer with
+		 * the new extent block (so we can pass that to
+		 * ocfs2_add_branch). */
+		status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
+						meta_ac, &bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		/* Special case: we have room now if we shifted from
+		 * tree_depth 0 */
+		if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+			goto out_add;
+	}
+
+	/* call ocfs2_add_branch to add the final part of the tree with
+	 * the new data. */
+	mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
+	status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
+				  meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+out_add:
+	/* Finally, we can add clusters. */
+	status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
+					start_blk, new_clusters);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (bh)
+		brelse(bh);
+
+	if (last_eb_bh)
+		brelse(last_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+{
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+
+	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
+			"slot %d, invalid truncate log parameters: used = "
+			"%u, count = %u\n", osb->slot_num,
+			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
+	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
+}
+
+static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
+					   unsigned int new_start)
+{
+	unsigned int tail_index;
+	unsigned int current_tail;
+
+	/* No records, nothing to coalesce */
+	if (!le16_to_cpu(tl->tl_used))
+		return 0;
+
+	tail_index = le16_to_cpu(tl->tl_used) - 1;
+	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
+	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
+
+	return current_tail == new_start;
+}
+
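
ocfs2_truncate_log_can_coalesce() above checks whether a newly freed range starts exactly where the last log record ends; if so, ocfs2_truncate_log_append() below widens that record instead of consuming a new slot. A worked example (editorial; the numbers are made up):

	/* Last record: starts at cluster 200, covers 50 clusters (200..249). */
	unsigned int t_start = 200, t_clusters = 50, new_start = 250;

	if (t_start + t_clusters == new_start)	/* tail ends where new begins */
		t_clusters += 30;	/* absorb a new 30-cluster range in place */
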
+static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     u64 start_blk,
+				     unsigned int num_clusters)
+{
+	int status, index;
+	unsigned int start_cluster, tl_count;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk,
+		   num_clusters);
+
+	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+
+	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	tl_count = le16_to_cpu(tl->tl_count);
+	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
+			tl_count == 0,
+			"Truncate record count on #%"MLFu64" invalid ("
+			"wanted %u, actual %u\n", OCFS2_I(tl_inode)->ip_blkno,
+			ocfs2_truncate_recs_per_inode(osb->sb),
+			le16_to_cpu(tl->tl_count));
+
+	/* Caller should have known to flush before calling us. */
+	index = le16_to_cpu(tl->tl_used);
+	if (index >= tl_count) {
+		status = -ENOSPC;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
+	     "%"MLFu64" (index = %d)\n", num_clusters, start_cluster,
+	     OCFS2_I(tl_inode)->ip_blkno, index);
+
+	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
+		/*
+		 * Move index back to the record we are coalescing with.
+		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
+		 */
+		index--;
+
+		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
+		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
+		     index, le32_to_cpu(tl->tl_recs[index].t_start),
+		     num_clusters);
+	} else {
+		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
+		tl->tl_used = cpu_to_le16(index + 1);
+	}
+	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
+
+	status = ocfs2_journal_dirty(handle, tl_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
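
Note the lock assertion that opens ocfs2_truncate_log_append() above (and __ocfs2_flush_truncate_log() below): BUG_ON(mutex_trylock(&tl_inode->i_mutex)). mutex_trylock() returns 1 only when it acquires the mutex, i.e. only when the caller had NOT already taken it, which is exactly the broken case, so the kernel halts rather than corrupt the log. Isolated for clarity (a sketch of the idiom, which predates dedicated lock-held assertion helpers):

	/* Caller contract: tl_inode->i_mutex must already be held.
	 * If the trylock succeeds, the contract was violated. */
	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
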
1036static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
1037 struct ocfs2_journal_handle *handle,
1038 struct inode *data_alloc_inode,
1039 struct buffer_head *data_alloc_bh)
1040{
1041 int status = 0;
1042 int i;
1043 unsigned int num_clusters;
1044 u64 start_blk;
1045 struct ocfs2_truncate_rec rec;
1046 struct ocfs2_dinode *di;
1047 struct ocfs2_truncate_log *tl;
1048 struct inode *tl_inode = osb->osb_tl_inode;
1049 struct buffer_head *tl_bh = osb->osb_tl_bh;
1050
1051 mlog_entry_void();
1052
1053 di = (struct ocfs2_dinode *) tl_bh->b_data;
1054 tl = &di->id2.i_dealloc;
1055 i = le16_to_cpu(tl->tl_used) - 1;
1056 while (i >= 0) {
1057 /* Caller has given us at least enough credits to
1058 * update the truncate log dinode */
1059 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
1060 OCFS2_JOURNAL_ACCESS_WRITE);
1061 if (status < 0) {
1062 mlog_errno(status);
1063 goto bail;
1064 }
1065
1066 tl->tl_used = cpu_to_le16(i);
1067
1068 status = ocfs2_journal_dirty(handle, tl_bh);
1069 if (status < 0) {
1070 mlog_errno(status);
1071 goto bail;
1072 }
1073
1074 /* TODO: Perhaps we can calculate the bulk of the
1075 * credits up front rather than extending like
1076 * this. */
1077 status = ocfs2_extend_trans(handle,
1078 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto bail;
1082 }
1083
1084 rec = tl->tl_recs[i];
1085 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
1086 le32_to_cpu(rec.t_start));
1087 num_clusters = le32_to_cpu(rec.t_clusters);
1088
1089 /* if start_blk is not set, we ignore the record as
1090 * invalid. */
1091 if (start_blk) {
1092 mlog(0, "free record %d, start = %u, clusters = %u\n",
1093 i, le32_to_cpu(rec.t_start), num_clusters);
1094
1095 status = ocfs2_free_clusters(handle, data_alloc_inode,
1096 data_alloc_bh, start_blk,
1097 num_clusters);
1098 if (status < 0) {
1099 mlog_errno(status);
1100 goto bail;
1101 }
1102 }
1103 i--;
1104 }
1105
1106bail:
1107 mlog_exit(status);
1108 return status;
1109}
1110
1111/* Expects you to already be holding tl_inode->i_mutex */
1112static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1113{
1114 int status;
1115 unsigned int num_to_flush;
1116 struct ocfs2_journal_handle *handle = NULL;
1117 struct inode *tl_inode = osb->osb_tl_inode;
1118 struct inode *data_alloc_inode = NULL;
1119 struct buffer_head *tl_bh = osb->osb_tl_bh;
1120 struct buffer_head *data_alloc_bh = NULL;
1121 struct ocfs2_dinode *di;
1122 struct ocfs2_truncate_log *tl;
1123
1124 mlog_entry_void();
1125
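	/* mutex_trylock() returns nonzero on success, so this fires (and
	 * would leave the mutex held) only when the caller violated the
	 * locking contract documented above. */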
1126 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
1127
1128 di = (struct ocfs2_dinode *) tl_bh->b_data;
1129 tl = &di->id2.i_dealloc;
1130 if (!OCFS2_IS_VALID_DINODE(di)) {
1131 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
1132 status = -EIO;
1133 goto bail;
1134 }
1135
1136 num_to_flush = le16_to_cpu(tl->tl_used);
1137 mlog(0, "Flush %u records from truncate log #%"MLFu64"\n",
1138 num_to_flush, OCFS2_I(tl_inode)->ip_blkno);
1139 if (!num_to_flush) {
1140 status = 0;
1141 goto bail;
1142 }
1143
1144 handle = ocfs2_alloc_handle(osb);
1145 if (!handle) {
1146 status = -ENOMEM;
1147 mlog_errno(status);
1148 goto bail;
1149 }
1150
1151 data_alloc_inode = ocfs2_get_system_file_inode(osb,
1152 GLOBAL_BITMAP_SYSTEM_INODE,
1153 OCFS2_INVALID_SLOT);
1154 if (!data_alloc_inode) {
1155 status = -EINVAL;
1156 mlog(ML_ERROR, "Could not get bitmap inode!\n");
1157 goto bail;
1158 }
1159
1160 ocfs2_handle_add_inode(handle, data_alloc_inode);
1161 status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
1162 if (status < 0) {
1163 mlog_errno(status);
1164 goto bail;
1165 }
1166
1167 handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
1168 if (IS_ERR(handle)) {
1169 status = PTR_ERR(handle);
1170 handle = NULL;
1171 mlog_errno(status);
1172 goto bail;
1173 }
1174
1175 status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
1176 data_alloc_bh);
1177 if (status < 0) {
1178 mlog_errno(status);
1179 goto bail;
1180 }
1181
1182bail:
1183 if (handle)
1184 ocfs2_commit_trans(handle);
1185
1186 if (data_alloc_inode)
1187 iput(data_alloc_inode);
1188
1189 if (data_alloc_bh)
1190 brelse(data_alloc_bh);
1191
1192 mlog_exit(status);
1193 return status;
1194}
1195
1196int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1197{
1198 int status;
1199 struct inode *tl_inode = osb->osb_tl_inode;
1200
1201 mutex_lock(&tl_inode->i_mutex);
1202 status = __ocfs2_flush_truncate_log(osb);
1203 mutex_unlock(&tl_inode->i_mutex);
1204
1205 return status;
1206}
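
/*
 * Usage convention: paths that already hold tl_inode->i_mutex (see
 * ocfs2_commit_truncate and ocfs2_complete_truncate_log_recovery below)
 * call __ocfs2_flush_truncate_log() directly; everything else goes
 * through this wrapper, e.g. the deferred flush in
 * ocfs2_truncate_log_worker:
 *
 *	status = ocfs2_flush_truncate_log(osb);
 *	if (status < 0)
 *		mlog_errno(status);
 */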
1207
1208static void ocfs2_truncate_log_worker(void *data)
1209{
1210 int status;
1211 struct ocfs2_super *osb = data;
1212
1213 mlog_entry_void();
1214
1215 status = ocfs2_flush_truncate_log(osb);
1216 if (status < 0)
1217 mlog_errno(status);
1218
1219 mlog_exit(status);
1220}
1221
1222#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
1223void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
1224 int cancel)
1225{
1226 if (osb->osb_tl_inode) {
1227 /* We want to push off log flushes while truncates are
1228 * still running. */
1229 if (cancel)
1230 cancel_delayed_work(&osb->osb_truncate_log_wq);
1231
1232 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
1233 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
1234 }
1235}
1236
1237static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
1238 int slot_num,
1239 struct inode **tl_inode,
1240 struct buffer_head **tl_bh)
1241{
1242 int status;
1243 struct inode *inode = NULL;
1244 struct buffer_head *bh = NULL;
1245
1246 inode = ocfs2_get_system_file_inode(osb,
1247 TRUNCATE_LOG_SYSTEM_INODE,
1248 slot_num);
1249 if (!inode) {
1250 status = -EINVAL;
1251 mlog(ML_ERROR, "Could not load truncate log inode!\n");
1252 goto bail;
1253 }
1254
1255 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
1256 OCFS2_BH_CACHED, inode);
1257 if (status < 0) {
1258 iput(inode);
1259 mlog_errno(status);
1260 goto bail;
1261 }
1262
1263 *tl_inode = inode;
1264 *tl_bh = bh;
1265bail:
1266 mlog_exit(status);
1267 return status;
1268}
1269
1270/* called during the 1st stage of node recovery. we stamp a clean
1271 * truncate log and pass back a copy for processing later. if the
1272 * truncate log does not require processing, *tl_copy is set to
1273 * NULL. */
1274int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
1275 int slot_num,
1276 struct ocfs2_dinode **tl_copy)
1277{
1278 int status;
1279 struct inode *tl_inode = NULL;
1280 struct buffer_head *tl_bh = NULL;
1281 struct ocfs2_dinode *di;
1282 struct ocfs2_truncate_log *tl;
1283
1284 *tl_copy = NULL;
1285
1286 mlog(0, "recover truncate log from slot %d\n", slot_num);
1287
1288 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
1289 if (status < 0) {
1290 mlog_errno(status);
1291 goto bail;
1292 }
1293
1294 di = (struct ocfs2_dinode *) tl_bh->b_data;
1295 tl = &di->id2.i_dealloc;
1296 if (!OCFS2_IS_VALID_DINODE(di)) {
1297 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
1298 status = -EIO;
1299 goto bail;
1300 }
1301
1302 if (le16_to_cpu(tl->tl_used)) {
1303 mlog(0, "We'll have %u logs to recover\n",
1304 le16_to_cpu(tl->tl_used));
1305
1306 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
1307 if (!(*tl_copy)) {
1308 status = -ENOMEM;
1309 mlog_errno(status);
1310 goto bail;
1311 }
1312
1313 /* Assuming the write-out below goes well, this copy
1314 * will be passed back to recovery for processing. */
1315 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
1316
1317 /* All we need to do to clear the truncate log is set
1318 * tl_used. */
1319 tl->tl_used = 0;
1320
1321 status = ocfs2_write_block(osb, tl_bh, tl_inode);
1322 if (status < 0) {
1323 mlog_errno(status);
1324 goto bail;
1325 }
1326 }
1327
1328bail:
1329 if (tl_inode)
1330 iput(tl_inode);
1331 if (tl_bh)
1332 brelse(tl_bh);
1333
1334 if (status < 0 && (*tl_copy)) {
1335 kfree(*tl_copy);
1336 *tl_copy = NULL;
1337 }
1338
1339 mlog_exit(status);
1340 return status;
1341}
1342
1343int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
1344 struct ocfs2_dinode *tl_copy)
1345{
1346 int status = 0;
1347 int i;
1348 unsigned int clusters, num_recs, start_cluster;
1349 u64 start_blk;
1350 struct ocfs2_journal_handle *handle;
1351 struct inode *tl_inode = osb->osb_tl_inode;
1352 struct ocfs2_truncate_log *tl;
1353
1354 mlog_entry_void();
1355
1356 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
1357 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
1358 return -EINVAL;
1359 }
1360
1361 tl = &tl_copy->id2.i_dealloc;
1362 num_recs = le16_to_cpu(tl->tl_used);
1363 mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs,
1364 tl_copy->i_blkno);
1365
1366 mutex_lock(&tl_inode->i_mutex);
1367 for(i = 0; i < num_recs; i++) {
1368 if (ocfs2_truncate_log_needs_flush(osb)) {
1369 status = __ocfs2_flush_truncate_log(osb);
1370 if (status < 0) {
1371 mlog_errno(status);
1372 goto bail_up;
1373 }
1374 }
1375
1376 handle = ocfs2_start_trans(osb, NULL,
1377 OCFS2_TRUNCATE_LOG_UPDATE);
1378 if (IS_ERR(handle)) {
1379 status = PTR_ERR(handle);
1380 mlog_errno(status);
1381 goto bail_up;
1382 }
1383
1384 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
1385 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
1386 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
1387
1388 status = ocfs2_truncate_log_append(osb, handle,
1389 start_blk, clusters);
1390 ocfs2_commit_trans(handle);
1391 if (status < 0) {
1392 mlog_errno(status);
1393 goto bail_up;
1394 }
1395 }
1396
1397bail_up:
1398 mutex_unlock(&tl_inode->i_mutex);
1399
1400 mlog_exit(status);
1401 return status;
1402}
1403
1404void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
1405{
1406 int status;
1407 struct inode *tl_inode = osb->osb_tl_inode;
1408
1409 mlog_entry_void();
1410
1411 if (tl_inode) {
1412 cancel_delayed_work(&osb->osb_truncate_log_wq);
1413 flush_workqueue(ocfs2_wq);
1414
1415 status = ocfs2_flush_truncate_log(osb);
1416 if (status < 0)
1417 mlog_errno(status);
1418
1419 brelse(osb->osb_tl_bh);
1420 iput(osb->osb_tl_inode);
1421 }
1422
1423 mlog_exit_void();
1424}
1425
1426int ocfs2_truncate_log_init(struct ocfs2_super *osb)
1427{
1428 int status;
1429 struct inode *tl_inode = NULL;
1430 struct buffer_head *tl_bh = NULL;
1431
1432 mlog_entry_void();
1433
1434 status = ocfs2_get_truncate_log_info(osb,
1435 osb->slot_num,
1436 &tl_inode,
1437 &tl_bh);
1438 if (status < 0)
1439 mlog_errno(status);
1440
1441 /* ocfs2_truncate_log_shutdown keys on the existence of
1442 * osb->osb_tl_inode so we don't set any of the osb variables
1443 * until we're sure all is well. */
1444 INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb);
1445 osb->osb_tl_bh = tl_bh;
1446 osb->osb_tl_inode = tl_inode;
1447
1448 mlog_exit(status);
1449 return status;
1450}
1451
1452/* This function will figure out whether the currently last extent
1453 * block will be deleted, and if it will, what the new last extent
1454 * block will be so we can update its h_next_leaf_blk field, as well
1455 * as the dinode's i_last_eb_blk. */
1456static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
1457 struct inode *inode,
1458 struct ocfs2_dinode *fe,
1459 u32 new_i_clusters,
1460 struct buffer_head *old_last_eb,
1461 struct buffer_head **new_last_eb)
1462{
1463 int i, status = 0;
1464 u64 block = 0;
1465 struct ocfs2_extent_block *eb;
1466 struct ocfs2_extent_list *el;
1467 struct buffer_head *bh = NULL;
1468
1469 *new_last_eb = NULL;
1470
1471 if (!OCFS2_IS_VALID_DINODE(fe)) {
1472 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1473 status = -EIO;
1474 goto bail;
1475 }
1476
1477 /* we have no tree, so of course, no last_eb. */
1478 if (!fe->id2.i_list.l_tree_depth)
1479 goto bail;
1480
1481 /* trunc to zero special case - this makes tree_depth = 0
1482 * regardless of what it is. */
1483 if (!new_i_clusters)
1484 goto bail;
1485
1486 eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
1487 el = &(eb->h_list);
1488 BUG_ON(!el->l_next_free_rec);
1489
1490 /* Make sure that this guy will actually be empty after we
1491 * clear away the data. */
1492 if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
1493 goto bail;
1494
1495 /* Ok, at this point, we know that last_eb will definitely
1496 * change, so let's traverse the tree and find the second to
1497 * last extent block. */
1498 el = &(fe->id2.i_list);
1499 /* go down the tree. */
1500 do {
1501 for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
1502 if (le32_to_cpu(el->l_recs[i].e_cpos) <
1503 new_i_clusters) {
1504 block = le64_to_cpu(el->l_recs[i].e_blkno);
1505 break;
1506 }
1507 }
1508 BUG_ON(i < 0);
1509
1510 if (bh) {
1511 brelse(bh);
1512 bh = NULL;
1513 }
1514
1515 status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
1516 inode);
1517 if (status < 0) {
1518 mlog_errno(status);
1519 goto bail;
1520 }
1521 eb = (struct ocfs2_extent_block *) bh->b_data;
1522 el = &eb->h_list;
1523 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1524 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1525 status = -EIO;
1526 goto bail;
1527 }
1528 } while (el->l_tree_depth);
1529
1530 *new_last_eb = bh;
1531 get_bh(*new_last_eb);
1532 mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno));
1533bail:
1534 if (bh)
1535 brelse(bh);
1536
1537 return status;
1538}
1539
1540static int ocfs2_do_truncate(struct ocfs2_super *osb,
1541 unsigned int clusters_to_del,
1542 struct inode *inode,
1543 struct buffer_head *fe_bh,
1544 struct buffer_head *old_last_eb_bh,
1545 struct ocfs2_journal_handle *handle,
1546 struct ocfs2_truncate_context *tc)
1547{
1548 int status, i, depth;
1549 struct ocfs2_dinode *fe;
1550 struct ocfs2_extent_block *eb;
1551 struct ocfs2_extent_block *last_eb = NULL;
1552 struct ocfs2_extent_list *el;
1553 struct buffer_head *eb_bh = NULL;
1554 struct buffer_head *last_eb_bh = NULL;
1555 u64 next_eb = 0;
1556 u64 delete_blk = 0;
1557
1558 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1559
1560 status = ocfs2_find_new_last_ext_blk(osb,
1561 inode,
1562 fe,
1563 le32_to_cpu(fe->i_clusters) -
1564 clusters_to_del,
1565 old_last_eb_bh,
1566 &last_eb_bh);
1567 if (status < 0) {
1568 mlog_errno(status);
1569 goto bail;
1570 }
1571 if (last_eb_bh)
1572 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1573
1574 status = ocfs2_journal_access(handle, inode, fe_bh,
1575 OCFS2_JOURNAL_ACCESS_WRITE);
1576 if (status < 0) {
1577 mlog_errno(status);
1578 goto bail;
1579 }
1580 el = &(fe->id2.i_list);
1581
1582 spin_lock(&OCFS2_I(inode)->ip_lock);
1583 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
1584 clusters_to_del;
1585 spin_unlock(&OCFS2_I(inode)->ip_lock);
1586 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
1587 fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
1588 fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
1589
1590 i = le16_to_cpu(el->l_next_free_rec) - 1;
1591
1592 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1593 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1594 /* tree depth zero, we can just delete the clusters, otherwise
1595 * we need to record the offset of the next level extent block
1596 * as we may overwrite it. */
1597 if (!el->l_tree_depth)
1598 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1599 + ocfs2_clusters_to_blocks(osb->sb,
1600 le32_to_cpu(el->l_recs[i].e_clusters));
1601 else
1602 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1603
1604 if (!el->l_recs[i].e_clusters) {
1605 /* if we deleted the whole extent record, then clear
1606 * out the other fields and update the extent
1607 * list. For depth > 0 trees, we've already recorded
1608 * the extent block in 'next_eb' */
1609 el->l_recs[i].e_cpos = 0;
1610 el->l_recs[i].e_blkno = 0;
1611 BUG_ON(!el->l_next_free_rec);
1612 le16_add_cpu(&el->l_next_free_rec, -1);
1613 }
1614
1615 depth = le16_to_cpu(el->l_tree_depth);
1616 if (!fe->i_clusters) {
1617 /* trunc to zero is a special case. */
1618 el->l_tree_depth = 0;
1619 fe->i_last_eb_blk = 0;
1620 } else if (last_eb)
1621 fe->i_last_eb_blk = last_eb->h_blkno;
1622
1623 status = ocfs2_journal_dirty(handle, fe_bh);
1624 if (status < 0) {
1625 mlog_errno(status);
1626 goto bail;
1627 }
1628
1629 if (last_eb) {
1630 /* If there will be a new last extent block, then by
1631 * definition, there cannot be any leaves to the right of
1632 * it. */
1633 status = ocfs2_journal_access(handle, inode, last_eb_bh,
1634 OCFS2_JOURNAL_ACCESS_WRITE);
1635 if (status < 0) {
1636 mlog_errno(status);
1637 goto bail;
1638 }
1639 last_eb->h_next_leaf_blk = 0;
1640 status = ocfs2_journal_dirty(handle, last_eb_bh);
1641 if (status < 0) {
1642 mlog_errno(status);
1643 goto bail;
1644 }
1645 }
1646
1647 /* if our tree depth > 0, update all the tree blocks below us. */
1648 while (depth) {
1649 mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n",
1650 depth, next_eb);
1651 status = ocfs2_read_block(osb, next_eb, &eb_bh,
1652 OCFS2_BH_CACHED, inode);
1653 if (status < 0) {
1654 mlog_errno(status);
1655 goto bail;
1656 }
1657 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
1658 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1659 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1660 status = -EIO;
1661 goto bail;
1662 }
1663 el = &(eb->h_list);
1664
1665 status = ocfs2_journal_access(handle, inode, eb_bh,
1666 OCFS2_JOURNAL_ACCESS_WRITE);
1667 if (status < 0) {
1668 mlog_errno(status);
1669 goto bail;
1670 }
1671
1672 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1673 BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
1674
1675 i = le16_to_cpu(el->l_next_free_rec) - 1;
1676
1677 mlog(0, "extent block %"MLFu64", before: record %d: "
1678 "(%u, %u, %"MLFu64"), next = %u\n",
1679 le64_to_cpu(eb->h_blkno), i,
1680 le32_to_cpu(el->l_recs[i].e_cpos),
1681 le32_to_cpu(el->l_recs[i].e_clusters),
1682 le64_to_cpu(el->l_recs[i].e_blkno),
1683 le16_to_cpu(el->l_next_free_rec));
1684
1685 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1686 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1687
1688 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1689 /* bottom-most block requires us to delete data.*/
1690 if (!el->l_tree_depth)
1691 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1692 + ocfs2_clusters_to_blocks(osb->sb,
1693 le32_to_cpu(el->l_recs[i].e_clusters));
1694 if (!el->l_recs[i].e_clusters) {
1695 el->l_recs[i].e_cpos = 0;
1696 el->l_recs[i].e_blkno = 0;
1697 BUG_ON(!el->l_next_free_rec);
1698 le16_add_cpu(&el->l_next_free_rec, -1);
1699 }
1700 mlog(0, "extent block %"MLFu64", after: record %d: "
1701 "(%u, %u, %"MLFu64"), next = %u\n",
1702 le64_to_cpu(eb->h_blkno), i,
1703 le32_to_cpu(el->l_recs[i].e_cpos),
1704 le32_to_cpu(el->l_recs[i].e_clusters),
1705 le64_to_cpu(el->l_recs[i].e_blkno),
1706 le16_to_cpu(el->l_next_free_rec));
1707
1708 status = ocfs2_journal_dirty(handle, eb_bh);
1709 if (status < 0) {
1710 mlog_errno(status);
1711 goto bail;
1712 }
1713
1714 if (!el->l_next_free_rec) {
1715 mlog(0, "deleting this extent block.\n");
1716
1717 ocfs2_remove_from_cache(inode, eb_bh);
1718
1719 BUG_ON(eb->h_suballoc_slot);
1720 BUG_ON(el->l_recs[0].e_clusters);
1721 BUG_ON(el->l_recs[0].e_cpos);
1722 BUG_ON(el->l_recs[0].e_blkno);
1723 status = ocfs2_free_extent_block(handle,
1724 tc->tc_ext_alloc_inode,
1725 tc->tc_ext_alloc_bh,
1726 eb);
1727 if (status < 0) {
1728 mlog_errno(status);
1729 goto bail;
1730 }
1731 }
1732 brelse(eb_bh);
1733 eb_bh = NULL;
1734 depth--;
1735 }
1736
1737 BUG_ON(!delete_blk);
1738 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
1739 clusters_to_del);
1740 if (status < 0) {
1741 mlog_errno(status);
1742 goto bail;
1743 }
1744 status = 0;
1745bail:
1746 if (!status)
1747 ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
1748 else
1749 ocfs2_extent_map_drop(inode, 0);
1750 mlog_exit(status);
1751 return status;
1752}
1753
1754/*
1755 * It is expected that, by the time you call this function,
1756 * inode->i_size and fe->i_size have been adjusted.
1757 *
1758 * WARNING: This will kfree the truncate context
1759 */
1760int ocfs2_commit_truncate(struct ocfs2_super *osb,
1761 struct inode *inode,
1762 struct buffer_head *fe_bh,
1763 struct ocfs2_truncate_context *tc)
1764{
1765 int status, i, credits, tl_sem = 0;
1766 u32 clusters_to_del, target_i_clusters;
1767 u64 last_eb = 0;
1768 struct ocfs2_dinode *fe;
1769 struct ocfs2_extent_block *eb;
1770 struct ocfs2_extent_list *el;
1771 struct buffer_head *last_eb_bh;
1772 struct ocfs2_journal_handle *handle = NULL;
1773 struct inode *tl_inode = osb->osb_tl_inode;
1774
1775 mlog_entry_void();
1776
1777 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1778
1779 target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1780 i_size_read(inode));
1781
1782 last_eb_bh = tc->tc_last_eb_bh;
1783 tc->tc_last_eb_bh = NULL;
1784
1785 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1786
1787 if (fe->id2.i_list.l_tree_depth) {
1788 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1789 el = &eb->h_list;
1790 } else
1791 el = &fe->id2.i_list;
1792 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1793start:
1794 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
1795 "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", "
1796 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
1797 le32_to_cpu(fe->i_clusters), last_eb,
1798 le64_to_cpu(fe->i_last_eb_blk),
1799 le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
1800
1801 if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1802 mlog(0, "last_eb changed!\n");
1803 BUG_ON(!fe->id2.i_list.l_tree_depth);
1804 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1805 /* i_last_eb_blk may have changed, read it if
1806 * necessary. We don't have to worry about the
1807 * truncate to zero case here (where there is no longer a
1808 * last_eb) because we never loop back after our work
1809 * is done. */
1810 if (last_eb_bh) {
1811 brelse(last_eb_bh);
1812 last_eb_bh = NULL;
1813 }
1814
1815 status = ocfs2_read_block(osb, last_eb,
1816 &last_eb_bh, OCFS2_BH_CACHED,
1817 inode);
1818 if (status < 0) {
1819 mlog_errno(status);
1820 goto bail;
1821 }
1822 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1823 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1824 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1825 status = -EIO;
1826 goto bail;
1827 }
1828 el = &(eb->h_list);
1829 }
1830
1831 /* by now, el will point to the extent list on the bottom-most
1832 * portion of this tree. */
1833 i = le16_to_cpu(el->l_next_free_rec) - 1;
1834 if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
1835 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
1836 else
1837 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
1838 le32_to_cpu(el->l_recs[i].e_cpos)) -
1839 target_i_clusters;
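
	/*
	 * Worked example for the computation above: with a rightmost
	 * record of (e_cpos = 100, e_clusters = 50) and
	 * target_i_clusters = 120, this pass deletes
	 * (50 + 100) - 120 = 30 clusters; with target_i_clusters <= 100
	 * the whole 50-cluster record goes.
	 */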
1840
1841 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
1842
1843 mutex_lock(&tl_inode->i_mutex);
1844 tl_sem = 1;
1845 /* ocfs2_truncate_log_needs_flush guarantees us at least one
1846 * record is free for use. If there isn't any, we flush to get
1847 * an empty truncate log. */
1848 if (ocfs2_truncate_log_needs_flush(osb)) {
1849 status = __ocfs2_flush_truncate_log(osb);
1850 if (status < 0) {
1851 mlog_errno(status);
1852 goto bail;
1853 }
1854 }
1855
1856 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1857 fe, el);
1858 handle = ocfs2_start_trans(osb, NULL, credits);
1859 if (IS_ERR(handle)) {
1860 status = PTR_ERR(handle);
1861 handle = NULL;
1862 mlog_errno(status);
1863 goto bail;
1864 }
1865
1866 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1867 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
1868 if (status < 0)
1869 mlog_errno(status);
1870
1871 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1872 last_eb_bh, handle, tc);
1873 if (status < 0) {
1874 mlog_errno(status);
1875 goto bail;
1876 }
1877
1878 mutex_unlock(&tl_inode->i_mutex);
1879 tl_sem = 0;
1880
1881 ocfs2_commit_trans(handle);
1882 handle = NULL;
1883
1884 BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
1885 if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
1886 goto start;
1887bail:
1888 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1889
1890 ocfs2_schedule_truncate_log_flush(osb, 1);
1891
1892 if (tl_sem)
1893 mutex_unlock(&tl_inode->i_mutex);
1894
1895 if (handle)
1896 ocfs2_commit_trans(handle);
1897
1898 if (last_eb_bh)
1899 brelse(last_eb_bh);
1900
1901 /* This will drop the ext_alloc cluster lock for us */
1902 ocfs2_free_truncate_context(tc);
1903
1904 mlog_exit(status);
1905 return status;
1906}
1907
1908
1909/*
1910 * Expects the inode to already be locked. This will figure out which
1911 * inodes need to be locked and will put them on the returned truncate
1912 * context.
1913 */
1914int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1915 struct inode *inode,
1916 struct buffer_head *fe_bh,
1917 struct ocfs2_truncate_context **tc)
1918{
1919 int status, metadata_delete;
1920 unsigned int new_i_clusters;
1921 struct ocfs2_dinode *fe;
1922 struct ocfs2_extent_block *eb;
1923 struct ocfs2_extent_list *el;
1924 struct buffer_head *last_eb_bh = NULL;
1925 struct inode *ext_alloc_inode = NULL;
1926 struct buffer_head *ext_alloc_bh = NULL;
1927
1928 mlog_entry_void();
1929
1930 *tc = NULL;
1931
1932 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1933 i_size_read(inode));
1934 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1935
1936 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
1937 "%"MLFu64"\n", le32_to_cpu(fe->i_clusters), new_i_clusters, le64_to_cpu(fe->i_size));
1938
1939 if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1940 ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count "
1941 "%u and size %"MLFu64" whereas struct inode has "
1942 "cluster count %u and size %llu which caused an "
1943 "invalid truncate to %u clusters.",
1944 le64_to_cpu(fe->i_blkno),
1945 le32_to_cpu(fe->i_clusters),
1946 le64_to_cpu(fe->i_size),
1947 OCFS2_I(inode)->ip_clusters, i_size_read(inode),
1948 new_i_clusters);
1949 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
1950 status = -EIO;
1951 goto bail;
1952 }
1953
1954 *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
1955 if (!(*tc)) {
1956 status = -ENOMEM;
1957 mlog_errno(status);
1958 goto bail;
1959 }
1960
1961 metadata_delete = 0;
1962 if (fe->id2.i_list.l_tree_depth) {
1963 /* If we have a tree, then the truncate may result in
1964 * metadata deletes. Figure this out from the
1965 * rightmost leaf block.*/
1966 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
1967 &last_eb_bh, OCFS2_BH_CACHED, inode);
1968 if (status < 0) {
1969 mlog_errno(status);
1970 goto bail;
1971 }
1972 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1973 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1974 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1975
1976 brelse(last_eb_bh);
1977 status = -EIO;
1978 goto bail;
1979 }
1980 el = &(eb->h_list);
1981 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
1982 metadata_delete = 1;
1983 }
1984
1985 (*tc)->tc_last_eb_bh = last_eb_bh;
1986
1987 if (metadata_delete) {
1988 mlog(0, "Will have to delete metadata for this trunc. "
1989 "locking allocator.\n");
1990 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
1991 if (!ext_alloc_inode) {
1992 status = -ENOMEM;
1993 mlog_errno(status);
1994 goto bail;
1995 }
1996
1997 mutex_lock(&ext_alloc_inode->i_mutex);
1998 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
1999
2000 status = ocfs2_meta_lock(ext_alloc_inode,
2001 NULL,
2002 &ext_alloc_bh,
2003 1);
2004 if (status < 0) {
2005 mlog_errno(status);
2006 goto bail;
2007 }
2008 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
2009 (*tc)->tc_ext_alloc_locked = 1;
2010 }
2011
2012 status = 0;
2013bail:
2014 if (status < 0) {
2015 if (*tc)
2016 ocfs2_free_truncate_context(*tc);
2017 *tc = NULL;
2018 }
2019 mlog_exit(status);
2020 return status;
2021}
2022
2023static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
2024{
2025 if (tc->tc_ext_alloc_inode) {
2026 if (tc->tc_ext_alloc_locked)
2027 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
2028
2029 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
2030 iput(tc->tc_ext_alloc_inode);
2031 }
2032
2033 if (tc->tc_ext_alloc_bh)
2034 brelse(tc->tc_ext_alloc_bh);
2035
2036 if (tc->tc_last_eb_bh)
2037 brelse(tc->tc_last_eb_bh);
2038
2039 kfree(tc);
2040}
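
/*
 * Lifecycle note: ocfs2_prepare_truncate() allocates and populates the
 * context, and ocfs2_commit_truncate() consumes it and ends by calling
 * this helper, so callers never free a truncate context themselves.
 */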
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
new file mode 100644
index 00000000000..12ba897743f
--- /dev/null
+++ b/fs/ocfs2/alloc.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * alloc.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_ALLOC_H
27#define OCFS2_ALLOC_H
28
29struct ocfs2_alloc_context;
30int ocfs2_insert_extent(struct ocfs2_super *osb,
31 struct ocfs2_journal_handle *handle,
32 struct inode *inode,
33 struct buffer_head *fe_bh,
34 u64 blkno,
35 u32 new_clusters,
36 struct ocfs2_alloc_context *meta_ac);
37int ocfs2_num_free_extents(struct ocfs2_super *osb,
38 struct inode *inode,
39 struct ocfs2_dinode *fe);
40/* how many new metadata chunks would an allocation need at maximum? */
41static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
42{
43 /*
44 * Rather than do all the work of determining how much we need
45 * (involves a ton of reads and locks), just ask for the
46 * maximal limit. That's a tree depth shift. So, one block for
47 * each level of the tree (current l_tree_depth), one block for the
48 * new tree_depth==0 extent_block, and one block at the new
49 * top of the tree.
50 */
51 return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
52}
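/*
 * e.g. a dinode with l_tree_depth == 2 yields 2 + 2 = 4 blocks: one per
 * existing tree level, one for the new depth-zero extent block, and one
 * for the new top of the tree.
 */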
53
54int ocfs2_truncate_log_init(struct ocfs2_super *osb);
55void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
56void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
57 int cancel);
58int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
59int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
60 int slot_num,
61 struct ocfs2_dinode **tl_copy);
62int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
63 struct ocfs2_dinode *tl_copy);
64
65struct ocfs2_truncate_context {
66 struct inode *tc_ext_alloc_inode;
67 struct buffer_head *tc_ext_alloc_bh;
68 int tc_ext_alloc_locked; /* is it cluster locked? */
69 /* these get destroyed once it's passed to ocfs2_commit_truncate. */
70 struct buffer_head *tc_last_eb_bh;
71};
72
73int ocfs2_prepare_truncate(struct ocfs2_super *osb,
74 struct inode *inode,
75 struct buffer_head *fe_bh,
76 struct ocfs2_truncate_context **tc);
77int ocfs2_commit_truncate(struct ocfs2_super *osb,
78 struct inode *inode,
79 struct buffer_head *fe_bh,
80 struct ocfs2_truncate_context *tc);
81
82#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
new file mode 100644
index 00000000000..8f4467a930a
--- /dev/null
+++ b/fs/ocfs2/aops.c
@@ -0,0 +1,643 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#include <linux/fs.h>
23#include <linux/slab.h>
24#include <linux/highmem.h>
25#include <linux/pagemap.h>
26#include <asm/byteorder.h>
27
28#define MLOG_MASK_PREFIX ML_FILE_IO
29#include <cluster/masklog.h>
30
31#include "ocfs2.h"
32
33#include "alloc.h"
34#include "aops.h"
35#include "dlmglue.h"
36#include "extent_map.h"
37#include "file.h"
38#include "inode.h"
39#include "journal.h"
40#include "super.h"
41#include "symlink.h"
42
43#include "buffer_head_io.h"
44
45static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
46 struct buffer_head *bh_result, int create)
47{
48 int err = -EIO;
49 int status;
50 struct ocfs2_dinode *fe = NULL;
51 struct buffer_head *bh = NULL;
52 struct buffer_head *buffer_cache_bh = NULL;
53 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
54 void *kaddr;
55
56 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
57 (unsigned long long)iblock, bh_result, create);
58
59 BUG_ON(ocfs2_inode_is_fast_symlink(inode));
60
61 if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
62 mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
63 (unsigned long long)iblock);
64 goto bail;
65 }
66
67 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
68 OCFS2_I(inode)->ip_blkno,
69 &bh, OCFS2_BH_CACHED, inode);
70 if (status < 0) {
71 mlog_errno(status);
72 goto bail;
73 }
74 fe = (struct ocfs2_dinode *) bh->b_data;
75
76 if (!OCFS2_IS_VALID_DINODE(fe)) {
77 mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
78 fe->i_blkno, 7, fe->i_signature);
79 goto bail;
80 }
81
82 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
83 le32_to_cpu(fe->i_clusters))) {
84 mlog(ML_ERROR, "block offset is outside the allocated size: "
85 "%llu\n", (unsigned long long)iblock);
86 goto bail;
87 }
88
89 /* We don't use the page cache to create symlink data, so if
90 * need be, copy it over from the buffer cache. */
91 if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
92 u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
93 iblock;
94 buffer_cache_bh = sb_getblk(osb->sb, blkno);
95 if (!buffer_cache_bh) {
96 mlog(ML_ERROR, "couldn't getblock for symlink!\n");
97 goto bail;
98 }
99
100 /* we haven't locked out transactions, so a commit
101 * could've happened. Since we've got a reference on
102 * the bh, even if it commits while we're doing the
103 * copy, the data is still good. */
104 if (buffer_jbd(buffer_cache_bh)
105 && ocfs2_inode_is_new(inode)) {
106 kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
107 if (!kaddr) {
108 mlog(ML_ERROR, "couldn't kmap!\n");
109 goto bail;
110 }
111 memcpy(kaddr + (bh_result->b_size * iblock),
112 buffer_cache_bh->b_data,
113 bh_result->b_size);
114 kunmap_atomic(kaddr, KM_USER0);
115 set_buffer_uptodate(bh_result);
116 }
117 brelse(buffer_cache_bh);
118 }
119
120 map_bh(bh_result, inode->i_sb,
121 le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
122
123 err = 0;
124
125bail:
126 if (bh)
127 brelse(bh);
128
129 mlog_exit(err);
130 return err;
131}
132
133static int ocfs2_get_block(struct inode *inode, sector_t iblock,
134 struct buffer_head *bh_result, int create)
135{
136 int err = 0;
137 u64 p_blkno, past_eof;
138
139 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
140 (unsigned long long)iblock, bh_result, create);
141
142 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
143 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
144 inode, inode->i_ino);
145
146 if (S_ISLNK(inode->i_mode)) {
147 /* this always does I/O for some reason. */
148 err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
149 goto bail;
150 }
151
152 /* this can happen if another node truncs after our extend! */
153 spin_lock(&OCFS2_I(inode)->ip_lock);
154 if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
155 OCFS2_I(inode)->ip_clusters))
156 err = -EIO;
157 spin_unlock(&OCFS2_I(inode)->ip_lock);
158 if (err)
159 goto bail;
160
161 err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
162 NULL);
163 if (err) {
164 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
165 "%"MLFu64", NULL)\n", err, inode,
166 (unsigned long long)iblock, p_blkno);
167 goto bail;
168 }
169
170 map_bh(bh_result, inode->i_sb, p_blkno);
171
172 if (bh_result->b_blocknr == 0) {
173 err = -EIO;
174 mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
175 "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
176 p_blkno, OCFS2_I(inode)->ip_blkno);
177 }
178
179 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
180 mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
181
182 if (create && (iblock >= past_eof))
183 set_buffer_new(bh_result);
184
185bail:
186 if (err < 0)
187 err = -EIO;
188
189 mlog_exit(err);
190 return err;
191}
192
193static int ocfs2_readpage(struct file *file, struct page *page)
194{
195 struct inode *inode = page->mapping->host;
196 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
197 int ret, unlock = 1;
198
199 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
200
201 ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
202 if (ret != 0) {
203 if (ret == AOP_TRUNCATED_PAGE)
204 unlock = 0;
205 mlog_errno(ret);
206 goto out;
207 }
208
209 down_read(&OCFS2_I(inode)->ip_alloc_sem);
210
211 /*
212 * i_size might have just been updated as we grabbed the meta lock. We
213 * might now be discovering a truncate that hit on another node.
214 * block_read_full_page->get_block freaks out if it is asked to read
215 * beyond the end of a file, so we check here. Callers
216 * (generic_file_read, fault->nopage) are clever enough to check i_size
217 * and notice that the page they just read isn't needed.
218 *
219 * XXX sys_readahead() seems to get that wrong?
220 */
221 if (start >= i_size_read(inode)) {
222 char *addr = kmap(page);
223 memset(addr, 0, PAGE_SIZE);
224 flush_dcache_page(page);
225 kunmap(page);
226 SetPageUptodate(page);
227 ret = 0;
228 goto out_alloc;
229 }
230
231 ret = ocfs2_data_lock_with_page(inode, 0, page);
232 if (ret != 0) {
233 if (ret == AOP_TRUNCATED_PAGE)
234 unlock = 0;
235 mlog_errno(ret);
236 goto out_alloc;
237 }
238
239 ret = block_read_full_page(page, ocfs2_get_block);
240 unlock = 0;
241
242 ocfs2_data_unlock(inode, 0);
243out_alloc:
244 up_read(&OCFS2_I(inode)->ip_alloc_sem);
245 ocfs2_meta_unlock(inode, 0);
246out:
247 if (unlock)
248 unlock_page(page);
249 mlog_exit(ret);
250 return ret;
251}
252
253/* Note: Because we don't support holes, our allocation has
254 * already happened (allocation writes zeros to the file data)
255 * so we don't have to worry about ordered writes in
256 * ocfs2_writepage.
257 *
258 * ->writepage is called during the process of invalidating the page cache
259 * during blocked lock processing. It can't block on any cluster locks
260 * during block mapping. It's relying on the fact that the block
261 * mapping can't have disappeared under the dirty pages that it is
262 * being asked to write back.
263 */
264static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
265{
266 int ret;
267
268 mlog_entry("(0x%p)\n", page);
269
270 ret = block_write_full_page(page, ocfs2_get_block, wbc);
271
272 mlog_exit(ret);
273
274 return ret;
275}
276
277/*
278 * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
279 * from loopback. It must be able to perform its own locking around
280 * ocfs2_get_block().
281 */
282int ocfs2_prepare_write(struct file *file, struct page *page,
283 unsigned from, unsigned to)
284{
285 struct inode *inode = page->mapping->host;
286 int ret;
287
288 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
289
290 ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
291 if (ret != 0) {
292 mlog_errno(ret);
293 goto out;
294 }
295
296 down_read(&OCFS2_I(inode)->ip_alloc_sem);
297
298 ret = block_prepare_write(page, from, to, ocfs2_get_block);
299
300 up_read(&OCFS2_I(inode)->ip_alloc_sem);
301
302 ocfs2_meta_unlock(inode, 0);
303out:
304 mlog_exit(ret);
305 return ret;
306}
307
308/* Taken from ext3. We don't necessarily need the full-blown
309 * functionality yet, but IMHO it's better to cut and paste the whole
310 * thing so we can avoid introducing our own bugs (and easily pick up
311 * their fixes when they happen) --Mark */
312static int walk_page_buffers( handle_t *handle,
313 struct buffer_head *head,
314 unsigned from,
315 unsigned to,
316 int *partial,
317 int (*fn)( handle_t *handle,
318 struct buffer_head *bh))
319{
320 struct buffer_head *bh;
321 unsigned block_start, block_end;
322 unsigned blocksize = head->b_size;
323 int err, ret = 0;
324 struct buffer_head *next;
325
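	/* Walk the page's circular b_this_page ring exactly once: the
	 * (bh != head || !block_start) test admits the first pass at
	 * 'head' and terminates when the walk wraps back around. */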
326 for ( bh = head, block_start = 0;
327 ret == 0 && (bh != head || !block_start);
328 block_start = block_end, bh = next)
329 {
330 next = bh->b_this_page;
331 block_end = block_start + blocksize;
332 if (block_end <= from || block_start >= to) {
333 if (partial && !buffer_uptodate(bh))
334 *partial = 1;
335 continue;
336 }
337 err = (*fn)(handle, bh);
338 if (!ret)
339 ret = err;
340 }
341 return ret;
342}
343
344struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
345 struct page *page,
346 unsigned from,
347 unsigned to)
348{
349 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
350 struct ocfs2_journal_handle *handle = NULL;
351 int ret = 0;
352
353 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
354 if (!handle) {
355 ret = -ENOMEM;
356 mlog_errno(ret);
357 goto out;
358 }
359
360 if (ocfs2_should_order_data(inode)) {
361 ret = walk_page_buffers(handle->k_handle,
362 page_buffers(page),
363 from, to, NULL,
364 ocfs2_journal_dirty_data);
365 if (ret < 0)
366 mlog_errno(ret);
367 }
368out:
369 if (ret) {
370 if (handle)
371 ocfs2_commit_trans(handle);
372 handle = ERR_PTR(ret);
373 }
374 return handle;
375}
376
377static int ocfs2_commit_write(struct file *file, struct page *page,
378 unsigned from, unsigned to)
379{
380 int ret, extending = 0, locklevel = 0;
381 loff_t new_i_size;
382 struct buffer_head *di_bh = NULL;
383 struct inode *inode = page->mapping->host;
384 struct ocfs2_journal_handle *handle = NULL;
385
386 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
387
388 /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
389 * us to sample inode->i_size here without the metadata lock:
390 *
391 * 1) We're currently holding the inode alloc lock, so no
392 * nodes can change it underneath us.
393 *
394 * 2) We've had to take the metadata lock at least once
395 * already to check for extending writes, hence ensuring
396 * that our current copy is also up to date.
397 */
398 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
399 if (new_i_size > i_size_read(inode)) {
400 extending = 1;
401 locklevel = 1;
402 }
403
404 ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
405 if (ret != 0) {
406 mlog_errno(ret);
407 goto out;
408 }
409
410 ret = ocfs2_data_lock_with_page(inode, 1, page);
411 if (ret != 0) {
412 mlog_errno(ret);
413 goto out_unlock_meta;
414 }
415
416 if (extending) {
417 handle = ocfs2_start_walk_page_trans(inode, page, from, to);
418 if (IS_ERR(handle)) {
419 ret = PTR_ERR(handle);
420 handle = NULL;
421 goto out_unlock_data;
422 }
423
424 /* Mark our buffer early. We'd rather catch this error up here
425 * as opposed to after a successful commit_write which would
426 * require us to set back inode->i_size. */
427 ret = ocfs2_journal_access(handle, inode, di_bh,
428 OCFS2_JOURNAL_ACCESS_WRITE);
429 if (ret < 0) {
430 mlog_errno(ret);
431 goto out_commit;
432 }
433 }
434
435 /* might update i_size */
436 ret = generic_commit_write(file, page, from, to);
437 if (ret < 0) {
438 mlog_errno(ret);
439 goto out_commit;
440 }
441
442 if (extending) {
443 loff_t size = (u64) i_size_read(inode);
444 struct ocfs2_dinode *di =
445 (struct ocfs2_dinode *)di_bh->b_data;
446
447 /* ocfs2_mark_inode_dirty is too heavy to use here. */
448 inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
449 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
450
451 di->i_size = cpu_to_le64(size);
452 di->i_ctime = di->i_mtime =
453 cpu_to_le64(inode->i_mtime.tv_sec);
454 di->i_ctime_nsec = di->i_mtime_nsec =
455 cpu_to_le32(inode->i_mtime.tv_nsec);
456
457 ret = ocfs2_journal_dirty(handle, di_bh);
458 if (ret < 0) {
459 mlog_errno(ret);
460 goto out_commit;
461 }
462 }
463
464 BUG_ON(extending && (i_size_read(inode) != new_i_size));
465
466out_commit:
467 if (handle)
468 ocfs2_commit_trans(handle);
469out_unlock_data:
470 ocfs2_data_unlock(inode, 1);
471out_unlock_meta:
472 ocfs2_meta_unlock(inode, locklevel);
473out:
474 if (di_bh)
475 brelse(di_bh);
476
477 mlog_exit(ret);
478 return ret;
479}
480
481static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
482{
483 sector_t status;
484 u64 p_blkno = 0;
485 int err = 0;
486 struct inode *inode = mapping->host;
487
488 mlog_entry("(block = %llu)\n", (unsigned long long)block);
489
490 /* We don't need to lock journal system files, since they aren't
491 * accessed concurrently from multiple nodes.
492 */
493 if (!INODE_JOURNAL(inode)) {
494 err = ocfs2_meta_lock(inode, NULL, NULL, 0);
495 if (err) {
496 if (err != -ENOENT)
497 mlog_errno(err);
498 goto bail;
499 }
500 down_read(&OCFS2_I(inode)->ip_alloc_sem);
501 }
502
503 err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
504 NULL);
505
506 if (!INODE_JOURNAL(inode)) {
507 up_read(&OCFS2_I(inode)->ip_alloc_sem);
508 ocfs2_meta_unlock(inode, 0);
509 }
510
511 if (err) {
512 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
513 (unsigned long long)block);
514 mlog_errno(err);
515 goto bail;
516 }
517
518
519bail:
520 status = err ? 0 : p_blkno;
521
522 mlog_exit((int)status);
523
524 return status;
525}
526
527/*
528 * TODO: Make this into a generic get_blocks function.
529 *
530 * From do_direct_io in direct-io.c:
531 * "So what we do is to permit the ->get_blocks function to populate
532 * bh.b_size with the size of IO which is permitted at this offset and
533 * this i_blkbits."
534 *
535 * This function is called directly from get_more_blocks in direct-io.c.
536 *
537 * called like this: dio->get_blocks(dio->inode, fs_startblk,
538 * fs_count, map_bh, dio->rw == WRITE);
539 */
540static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
541 unsigned long max_blocks,
542 struct buffer_head *bh_result, int create)
543{
544 int ret;
545 u64 vbo_max; /* file offset, max_blocks from iblock */
546 u64 p_blkno;
547 int contig_blocks;
548 unsigned char blocksize_bits;
549
550 if (!inode || !bh_result) {
551 mlog(ML_ERROR, "inode or bh_result is null\n");
552 return -EIO;
553 }
554
555 blocksize_bits = inode->i_sb->s_blocksize_bits;
556
557 /* This function won't even be called if the request isn't all
558 * nicely aligned and of the right size, so there's no need
559 * for us to check any of that. */
560
561 vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
562
563 spin_lock(&OCFS2_I(inode)->ip_lock);
564 if ((iblock + max_blocks) >
565 ocfs2_clusters_to_blocks(inode->i_sb,
566 OCFS2_I(inode)->ip_clusters)) {
567 spin_unlock(&OCFS2_I(inode)->ip_lock);
568 ret = -EIO;
569 goto bail;
570 }
571 spin_unlock(&OCFS2_I(inode)->ip_lock);
572
573 /* This figures out the size of the next contiguous block, and
574 * our logical offset */
575 ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
576 &contig_blocks);
577 if (ret) {
578 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
579 (unsigned long long)iblock);
580 ret = -EIO;
581 goto bail;
582 }
583
584 map_bh(bh_result, inode->i_sb, p_blkno);
585
586 /* make sure we don't map more than max_blocks blocks here as
587 that's all the kernel will handle at this point. */
588 if (max_blocks < contig_blocks)
589 contig_blocks = max_blocks;
590 bh_result->b_size = contig_blocks << blocksize_bits;
591bail:
592 return ret;
593}
594
595/*
596 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
597 * particularly interested in the aio/dio case. Like the core uses
598 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
599 * truncation on another.
600 */
601static void ocfs2_dio_end_io(struct kiocb *iocb,
602 loff_t offset,
603 ssize_t bytes,
604 void *private)
605{
606 struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
607
608 /* this io's submitter should not have unlocked this before we could */
609 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
610 ocfs2_iocb_clear_rw_locked(iocb);
611 up_read(&inode->i_alloc_sem);
612 ocfs2_rw_unlock(inode, 0);
613}
614
615static ssize_t ocfs2_direct_IO(int rw,
616 struct kiocb *iocb,
617 const struct iovec *iov,
618 loff_t offset,
619 unsigned long nr_segs)
620{
621 struct file *file = iocb->ki_filp;
622 struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
623 int ret;
624
625 mlog_entry_void();
626 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
627 inode->i_sb->s_bdev, iov, offset,
628 nr_segs,
629 ocfs2_direct_IO_get_blocks,
630 ocfs2_dio_end_io);
631 mlog_exit(ret);
632 return ret;
633}
634
635struct address_space_operations ocfs2_aops = {
636 .readpage = ocfs2_readpage,
637 .writepage = ocfs2_writepage,
638 .prepare_write = ocfs2_prepare_write,
639 .commit_write = ocfs2_commit_write,
640 .bmap = ocfs2_bmap,
641 .sync_page = block_sync_page,
642 .direct_IO = ocfs2_direct_IO
643};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
new file mode 100644
index 00000000000..d40456d509a
--- /dev/null
+++ b/fs/ocfs2/aops.h
@@ -0,0 +1,41 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef OCFS2_AOPS_H
23#define OCFS2_AOPS_H
24
25int ocfs2_prepare_write(struct file *file, struct page *page,
26 unsigned from, unsigned to);
27
28struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
29 struct page *page,
30 unsigned from,
31 unsigned to);
32
33/* all ocfs2_dio_end_io()'s fault */
34#define ocfs2_iocb_is_rw_locked(iocb) \
35 test_bit(0, (unsigned long *)&iocb->private)
36#define ocfs2_iocb_set_rw_locked(iocb) \
37 set_bit(0, (unsigned long *)&iocb->private)
38#define ocfs2_iocb_clear_rw_locked(iocb) \
39 clear_bit(0, (unsigned long *)&iocb->private)
40
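/*
 * These pack a "rw cluster lock held" flag into bit 0 of iocb->private.
 * The submitting path (presumably the file read/write code, which is
 * not part of this diff hunk) sets the bit before issuing I/O, and
 * ocfs2_dio_end_io() in aops.c asserts and clears it on completion:
 *
 *	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
 *	ocfs2_iocb_clear_rw_locked(iocb);
 */
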
41#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
new file mode 100644
index 00000000000..d424041b38e
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.c
@@ -0,0 +1,232 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * io.c
5 *
6 * Buffer cache handling
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#include <cluster/masklog.h>
32
33#include "ocfs2.h"
34
35#include "alloc.h"
36#include "inode.h"
37#include "journal.h"
38#include "uptodate.h"
39
40#include "buffer_head_io.h"
41
42int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
43 struct inode *inode)
44{
45 int ret = 0;
46
47 mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
48 (unsigned long long)bh->b_blocknr, inode);
49
50 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
51 BUG_ON(buffer_jbd(bh));
52
53 /* No need to check for a soft readonly file system here. Non-
54 * journalled writes are only ever done on system files which
55 * can get modified during recovery even if read-only. */
56 if (ocfs2_is_hard_readonly(osb)) {
57 ret = -EROFS;
58 goto out;
59 }
60
61 down(&OCFS2_I(inode)->ip_io_sem);
62
63 lock_buffer(bh);
64 set_buffer_uptodate(bh);
65
66 /* remove from dirty list before I/O. */
67 clear_buffer_dirty(bh);
68
69 get_bh(bh); /* for end_buffer_write_sync() */
70 bh->b_end_io = end_buffer_write_sync;
71 submit_bh(WRITE, bh);
72
73 wait_on_buffer(bh);
74
75 if (buffer_uptodate(bh)) {
76 ocfs2_set_buffer_uptodate(inode, bh);
77 } else {
78 /* We don't need to remove the clustered uptodate
79 * information for this bh as it's not marked locally
80 * uptodate. */
81 ret = -EIO;
82 brelse(bh);
83 }
84
85 up(&OCFS2_I(inode)->ip_io_sem);
86out:
87 mlog_exit(ret);
88 return ret;
89}
90
91int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
92 struct buffer_head *bhs[], int flags,
93 struct inode *inode)
94{
95 int status = 0;
96 struct super_block *sb;
97 int i, ignore_cache = 0;
98 struct buffer_head *bh;
99
100 mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
101 block, nr, flags, inode);
102
103 if (osb == NULL || osb->sb == NULL || bhs == NULL) {
104 status = -EINVAL;
105 mlog_errno(status);
106 goto bail;
107 }
108
109 if (nr < 0) {
110 mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
111 status = -EINVAL;
112 mlog_errno(status);
113 goto bail;
114 }
115
116 if (nr == 0) {
117 mlog(ML_BH_IO, "No buffers will be read!\n");
118 status = 0;
119 goto bail;
120 }
121
122 sb = osb->sb;
123
124 if (flags & OCFS2_BH_CACHED && !inode)
125 flags &= ~OCFS2_BH_CACHED;
126
127 if (inode)
128 down(&OCFS2_I(inode)->ip_io_sem);
129 for (i = 0 ; i < nr ; i++) {
130 if (bhs[i] == NULL) {
131 bhs[i] = sb_getblk(sb, block++);
132 if (bhs[i] == NULL) {
133 if (inode)
134 up(&OCFS2_I(inode)->ip_io_sem);
135 status = -EIO;
136 mlog_errno(status);
137 goto bail;
138 }
139 }
140 bh = bhs[i];
141 ignore_cache = 0;
142
143 if (flags & OCFS2_BH_CACHED &&
144 !ocfs2_buffer_uptodate(inode, bh)) {
145 mlog(ML_UPTODATE,
146 "bh (%llu), inode %"MLFu64" not uptodate\n",
147 (unsigned long long)bh->b_blocknr,
148 OCFS2_I(inode)->ip_blkno);
149 ignore_cache = 1;
150 }
151
152 /* XXX: Can we ever get this and *not* have the cached
153 * flag set? */
154 if (buffer_jbd(bh)) {
155 if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
156 mlog(ML_BH_IO, "trying to sync read a jbd "
157 "managed bh (blocknr = %llu)\n",
158 (unsigned long long)bh->b_blocknr);
159 continue;
160 }
161
162 if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
163 if (buffer_dirty(bh)) {
164 /* This should probably be a BUG, or
165 * at least return an error. */
166 mlog(ML_BH_IO, "asking me to sync read a dirty "
167 "buffer! (blocknr = %llu)\n",
168 (unsigned long long)bh->b_blocknr);
169 continue;
170 }
171
172 lock_buffer(bh);
173 if (buffer_jbd(bh)) {
174#ifdef CATCH_BH_JBD_RACES
175 mlog(ML_ERROR, "block %llu had the JBD bit set "
176 "while I was in lock_buffer!",
177 (unsigned long long)bh->b_blocknr);
178 BUG();
179#else
180 unlock_buffer(bh);
181 continue;
182#endif
183 }
184 clear_buffer_uptodate(bh);
185 get_bh(bh); /* for end_buffer_read_sync() */
186 bh->b_end_io = end_buffer_read_sync;
187 if (flags & OCFS2_BH_READAHEAD)
188 submit_bh(READA, bh);
189 else
190 submit_bh(READ, bh);
191 continue;
192 }
193 }
194
195 status = 0;
196
197 for (i = (nr - 1); i >= 0; i--) {
198 bh = bhs[i];
199
200 /* We know this can't have changed as we hold the
201 * inode sem. Avoid doing any work on the bh if the
202 * journal has it. */
203 if (!buffer_jbd(bh))
204 wait_on_buffer(bh);
205
206 if (!buffer_uptodate(bh)) {
207 /* Status won't be cleared from here on out,
208 * so we can safely record this and loop back
209 * to cleanup the other buffers. Don't need to
210 * remove the clustered uptodate information
211 * for this bh as it's not marked locally
212 * uptodate. */
213 status = -EIO;
214 brelse(bh);
215 bhs[i] = NULL;
216 continue;
217 }
218
219 if (inode)
220 ocfs2_set_buffer_uptodate(inode, bh);
221 }
222 if (inode)
223 up(&OCFS2_I(inode)->ip_io_sem);
224
225 mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
226 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
227
228bail:
229
230 mlog_exit(status);
231 return status;
232}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
new file mode 100644
index 00000000000..6ecb90937b6
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.h
@@ -0,0 +1,73 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_buffer_head.h
5 *
6 * Buffer cache handling functions defined
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_BUFFER_HEAD_IO_H
27#define OCFS2_BUFFER_HEAD_IO_H
28
29#include <linux/buffer_head.h>
30
31void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
32 int uptodate);
33
34static inline int ocfs2_read_block(struct ocfs2_super *osb,
35 u64 off,
36 struct buffer_head **bh,
37 int flags,
38 struct inode *inode);
39
40int ocfs2_write_block(struct ocfs2_super *osb,
41 struct buffer_head *bh,
42 struct inode *inode);
43int ocfs2_read_blocks(struct ocfs2_super *osb,
44 u64 block,
45 int nr,
46 struct buffer_head *bhs[],
47 int flags,
48 struct inode *inode);
49
50
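/* OCFS2_BH_CACHED asks ocfs2_read_blocks() to consult the cluster
 * uptodate cache first; without it every read is forced to disk. */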
51#define OCFS2_BH_CACHED 1
52#define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */
53
54static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
55 struct buffer_head **bh, int flags,
56 struct inode *inode)
57{
58 int status = 0;
59
60 if (bh == NULL) {
61		printk(KERN_ERR "ocfs2: bh == NULL\n");
62 status = -EINVAL;
63 goto bail;
64 }
65
66 status = ocfs2_read_blocks(osb, off, 1, bh,
67 flags, inode);
68
69bail:
70 return status;
71}
72
73#endif /* OCFS2_BUFFER_HEAD_IO_H */
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
new file mode 100644
index 00000000000..cdd162f1365
--- /dev/null
+++ b/fs/ocfs2/cluster/Makefile
@@ -0,0 +1,4 @@
1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
2
3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
4 quorum.o tcp.o ver.o
diff --git a/fs/ocfs2/cluster/endian.h b/fs/ocfs2/cluster/endian.h
new file mode 100644
index 00000000000..2df9082f4e3
--- /dev/null
+++ b/fs/ocfs2/cluster/endian.h
@@ -0,0 +1,30 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#ifndef OCFS2_CLUSTER_ENDIAN_H
23#define OCFS2_CLUSTER_ENDIAN_H
24
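/* Add a cpu-order value to a big-endian field in place; a small helper
 * for bumping on-disk big-endian counters without open-coding the
 * conversion at every call site. */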
25static inline void be32_add_cpu(__be32 *var, u32 val)
26{
27 *var = cpu_to_be32(be32_to_cpu(*var) + val);
28}
29
30#endif /* OCFS2_CLUSTER_ENDIAN_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
new file mode 100644
index 00000000000..7307ba52891
--- /dev/null
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -0,0 +1,1797 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#include <linux/kernel.h>
23#include <linux/sched.h>
24#include <linux/jiffies.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27#include <linux/bio.h>
28#include <linux/blkdev.h>
29#include <linux/delay.h>
30#include <linux/file.h>
31#include <linux/kthread.h>
32#include <linux/configfs.h>
33#include <linux/random.h>
34#include <linux/crc32.h>
35#include <linux/time.h>
36
37#include "heartbeat.h"
38#include "tcp.h"
39#include "nodemanager.h"
40#include "quorum.h"
41
42#include "masklog.h"
43
44
45/*
46 * The first heartbeat pass had one global thread that would serialize all hb
47 * callback calls. This global serializing sem should only be removed once
48 * we've made sure that all callees can deal with being called concurrently
49 * from multiple hb region threads.
50 */
51static DECLARE_RWSEM(o2hb_callback_sem);
52
53/*
54 * multiple hb threads are watching multiple regions. A node is live
55 * whenever any of the threads sees activity from the node in its region.
56 */
57static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
58static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
59static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
60static LIST_HEAD(o2hb_node_events);
61static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
62
63static LIST_HEAD(o2hb_all_regions);
64
65static struct o2hb_callback {
66 struct list_head list;
67} o2hb_callbacks[O2HB_NUM_CB];
68
69static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
70
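/* default heartbeat block size: 512 bytes (1 << 9) */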
71#define O2HB_DEFAULT_BLOCK_BITS 9
72
73unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
74
75/* Only sets a new threshold if there are no active regions.
76 *
77 * No locking or otherwise interesting code is required for reading
78 * o2hb_dead_threshold as it can't change once regions are active and
79 * it's not interesting to anyone until then anyway. */
80static void o2hb_dead_threshold_set(unsigned int threshold)
81{
82 if (threshold > O2HB_MIN_DEAD_THRESHOLD) {
83 spin_lock(&o2hb_live_lock);
84 if (list_empty(&o2hb_all_regions))
85 o2hb_dead_threshold = threshold;
86 spin_unlock(&o2hb_live_lock);
87 }
88}
89
90struct o2hb_node_event {
91 struct list_head hn_item;
92 enum o2hb_callback_type hn_event_type;
93 struct o2nm_node *hn_node;
94 int hn_node_num;
95};
96
97struct o2hb_disk_slot {
98 struct o2hb_disk_heartbeat_block *ds_raw_block;
99 u8 ds_node_num;
100 u64 ds_last_time;
101 u64 ds_last_generation;
102 u16 ds_equal_samples;
103 u16 ds_changed_samples;
104 struct list_head ds_live_item;
105};
106
107/* each thread owns a region.. when we're asked to tear down the region
108 * we ask the thread to stop, which then cleans up the region */
109struct o2hb_region {
110 struct config_item hr_item;
111
112 struct list_head hr_all_item;
113 unsigned hr_unclean_stop:1;
114
115	/* protected by the global o2hb_callback_sem */
116 struct task_struct *hr_task;
117
118 unsigned int hr_blocks;
119 unsigned long long hr_start_block;
120
121 unsigned int hr_block_bits;
122 unsigned int hr_block_bytes;
123
124 unsigned int hr_slots_per_page;
125 unsigned int hr_num_pages;
126
127 struct page **hr_slot_data;
128 struct block_device *hr_bdev;
129 struct o2hb_disk_slot *hr_slots;
130
131	/* let the person setting up hb wait until it has reached a
132	 * 'steady' state before returning. This will be fixed when we have
133	 * a more complete api that doesn't lead to this sort of fragility. */
134 atomic_t hr_steady_iterations;
135
136 char hr_dev_name[BDEVNAME_SIZE];
137
138 unsigned int hr_timeout_ms;
139
140 /* randomized as the region goes up and down so that a node
141 * recognizes a node going up and down in one iteration */
142 u64 hr_generation;
143
144 struct work_struct hr_write_timeout_work;
145 unsigned long hr_last_timeout_start;
146
147 /* Used during o2hb_check_slot to hold a copy of the block
148 * being checked because we temporarily have to zero out the
149 * crc field. */
150 struct o2hb_disk_heartbeat_block *hr_tmp_block;
151};
152
153struct o2hb_bio_wait_ctxt {
154 atomic_t wc_num_reqs;
155 struct completion wc_io_complete;
156};
157
158static void o2hb_write_timeout(void *arg)
159{
160 struct o2hb_region *reg = arg;
161
162 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
163 "milliseconds\n", reg->hr_dev_name,
164 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
165 o2quo_disk_timeout();
166}
167
168static void o2hb_arm_write_timeout(struct o2hb_region *reg)
169{
170 mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS);
171
172 cancel_delayed_work(&reg->hr_write_timeout_work);
173 reg->hr_last_timeout_start = jiffies;
174 schedule_delayed_work(&reg->hr_write_timeout_work,
175 msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
176}
177
178static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
179{
180 cancel_delayed_work(&reg->hr_write_timeout_work);
181 flush_scheduled_work();
182}
183
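/* The wait context is a small completion counter: o2hb_bio_wait_init()
 * arms it for num_ios requests, each bio completion drops one via
 * o2hb_bio_wait_dec(), and the final decrement fires wc_io_complete
 * for o2hb_wait_on_io(). */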
184static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
185 unsigned int num_ios)
186{
187 atomic_set(&wc->wc_num_reqs, num_ios);
188 init_completion(&wc->wc_io_complete);
189}
190
191/* Used in error paths too */
192static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
193 unsigned int num)
194{
195 /* sadly atomic_sub_and_test() isn't available on all platforms. The
196 * good news is that the fast path only completes one at a time */
197 while(num--) {
198 if (atomic_dec_and_test(&wc->wc_num_reqs)) {
199 BUG_ON(num > 0);
200 complete(&wc->wc_io_complete);
201 }
202 }
203}
204
205static void o2hb_wait_on_io(struct o2hb_region *reg,
206 struct o2hb_bio_wait_ctxt *wc)
207{
208 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
209
210 blk_run_address_space(mapping);
211
212 wait_for_completion(&wc->wc_io_complete);
213}
214
215static int o2hb_bio_end_io(struct bio *bio,
216 unsigned int bytes_done,
217 int error)
218{
219 struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
220
221 if (error)
222 mlog(ML_ERROR, "IO Error %d\n", error);
223
224 if (bio->bi_size)
225 return 1;
226
227 o2hb_bio_wait_dec(wc, 1);
228 return 0;
229}
230
231/* Setup a Bio to cover I/O against num_slots slots starting at
232 * start_slot. */
233static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
234 struct o2hb_bio_wait_ctxt *wc,
235 unsigned int start_slot,
236 unsigned int num_slots)
237{
238 int i, nr_vecs, len, first_page, last_page;
239 unsigned int vec_len, vec_start;
240 unsigned int bits = reg->hr_block_bits;
241 unsigned int spp = reg->hr_slots_per_page;
242 struct bio *bio;
243 struct page *page;
244
245 nr_vecs = (num_slots + spp - 1) / spp;
246
247 /* Testing has shown this allocation to take long enough under
248 * GFP_KERNEL that the local node can get fenced. It would be
249 * nicest if we could pre-allocate these bios and avoid this
250	 * altogether. */
251 bio = bio_alloc(GFP_ATOMIC, nr_vecs);
252 if (!bio) {
253 mlog(ML_ERROR, "Could not alloc slots BIO!\n");
254 bio = ERR_PTR(-ENOMEM);
255 goto bail;
256 }
257
258 /* Must put everything in 512 byte sectors for the bio... */
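	/* shifting by (bits - 9) converts hr_block_bits-sized blocks
	 * into 512-byte sectors */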
259 bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9);
260 bio->bi_bdev = reg->hr_bdev;
261 bio->bi_private = wc;
262 bio->bi_end_io = o2hb_bio_end_io;
263
264 first_page = start_slot / spp;
265 last_page = first_page + nr_vecs;
266 vec_start = (start_slot << bits) % PAGE_CACHE_SIZE;
267 for(i = first_page; i < last_page; i++) {
268 page = reg->hr_slot_data[i];
269
270 vec_len = PAGE_CACHE_SIZE;
271 /* last page might be short */
272 if (((i + 1) * spp) > (start_slot + num_slots))
273 vec_len = ((num_slots + start_slot) % spp) << bits;
274 vec_len -= vec_start;
275
276 mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
277 i, vec_len, vec_start);
278
279 len = bio_add_page(bio, page, vec_len, vec_start);
280 if (len != vec_len) {
281 bio_put(bio);
282 bio = ERR_PTR(-EIO);
283
284 mlog(ML_ERROR, "Error adding page to bio i = %d, "
285			     "vec_len = %u, len = %d, start = %u\n",
286 i, vec_len, len, vec_start);
287 goto bail;
288 }
289
290 vec_start = 0;
291 }
292
293bail:
294 return bio;
295}
296
297/*
298 * Compute the maximum number of sectors the bdev can handle in one bio,
299 * as a power of two.
300 *
301 * Stolen from oracleasm, thanks Joel!
302 */
303static int compute_max_sectors(struct block_device *bdev)
304{
305 int max_pages, max_sectors, pow_two_sectors;
306
307 struct request_queue *q;
308
309 q = bdev_get_queue(bdev);
310 max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
311 if (max_pages > BIO_MAX_PAGES)
312 max_pages = BIO_MAX_PAGES;
313 if (max_pages > q->max_phys_segments)
314 max_pages = q->max_phys_segments;
315 if (max_pages > q->max_hw_segments)
316 max_pages = q->max_hw_segments;
317 max_pages--; /* Handle I/Os that straddle a page */
318
319 max_sectors = max_pages << (PAGE_SHIFT - 9);
320
321 /* Why is fls() 1-based???? */
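	/* fls() returns the 1-based index of the highest set bit, so
	 * this rounds max_sectors down to a power of two */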
322 pow_two_sectors = 1 << (fls(max_sectors) - 1);
323
324 return pow_two_sectors;
325}
326
327static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
328 unsigned int num_slots,
329 unsigned int *num_bios,
330 unsigned int *slots_per_bio)
331{
332 unsigned int max_sectors, io_sectors;
333
334 max_sectors = compute_max_sectors(reg->hr_bdev);
335
336 io_sectors = num_slots << (reg->hr_block_bits - 9);
337
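	/* round up so the tail slots still get a (possibly short) bio */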
338 *num_bios = (io_sectors + max_sectors - 1) / max_sectors;
339 *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
340
341 mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
342 "device can handle %u sectors of I/O\n", io_sectors, num_slots,
343 max_sectors);
344 mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
345 *num_bios, *slots_per_bio);
346}
347
348static int o2hb_read_slots(struct o2hb_region *reg,
349 unsigned int max_slots)
350{
351 unsigned int num_bios, slots_per_bio, start_slot, num_slots;
352 int i, status;
353 struct o2hb_bio_wait_ctxt wc;
354 struct bio **bios;
355 struct bio *bio;
356
357 o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio);
358
359 bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL);
360 if (!bios) {
361 status = -ENOMEM;
362 mlog_errno(status);
363 return status;
364 }
365
366 o2hb_bio_wait_init(&wc, num_bios);
367
368 num_slots = slots_per_bio;
369 for(i = 0; i < num_bios; i++) {
370 start_slot = i * slots_per_bio;
371
372 /* adjust num_slots at last bio */
373 if (max_slots < (start_slot + num_slots))
374 num_slots = max_slots - start_slot;
375
376 bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots);
377 if (IS_ERR(bio)) {
378 o2hb_bio_wait_dec(&wc, num_bios - i);
379
380 status = PTR_ERR(bio);
381 mlog_errno(status);
382 goto bail_and_wait;
383 }
384 bios[i] = bio;
385
386 submit_bio(READ, bio);
387 }
388
389 status = 0;
390
391bail_and_wait:
392 o2hb_wait_on_io(reg, &wc);
393
394 if (bios) {
395 for(i = 0; i < num_bios; i++)
396 if (bios[i])
397 bio_put(bios[i]);
398 kfree(bios);
399 }
400
401 return status;
402}
403
404static int o2hb_issue_node_write(struct o2hb_region *reg,
405 struct bio **write_bio,
406 struct o2hb_bio_wait_ctxt *write_wc)
407{
408 int status;
409 unsigned int slot;
410 struct bio *bio;
411
412 o2hb_bio_wait_init(write_wc, 1);
413
414 slot = o2nm_this_node();
415
416 bio = o2hb_setup_one_bio(reg, write_wc, slot, 1);
417 if (IS_ERR(bio)) {
418 status = PTR_ERR(bio);
419 mlog_errno(status);
420 goto bail;
421 }
422
423 submit_bio(WRITE, bio);
424
425 *write_bio = bio;
426 status = 0;
427bail:
428 return status;
429}
430
431static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
432 struct o2hb_disk_heartbeat_block *hb_block)
433{
434 __le32 old_cksum;
435 u32 ret;
436
437 /* We want to compute the block crc with a 0 value in the
438 * hb_cksum field. Save it off here and replace after the
439 * crc. */
440 old_cksum = hb_block->hb_cksum;
441 hb_block->hb_cksum = 0;
442
443 ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);
444
445 hb_block->hb_cksum = old_cksum;
446
447 return ret;
448}
449
450static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block)
451{
452 mlog(ML_ERROR, "Dump slot information: seq = 0x%"MLFx64", node = %u, "
453 "cksum = 0x%x, generation 0x%"MLFx64"\n",
454 le64_to_cpu(hb_block->hb_seq), hb_block->hb_node,
455 le32_to_cpu(hb_block->hb_cksum),
456 le64_to_cpu(hb_block->hb_generation));
457}
458
459static int o2hb_verify_crc(struct o2hb_region *reg,
460 struct o2hb_disk_heartbeat_block *hb_block)
461{
462 u32 read, computed;
463
464 read = le32_to_cpu(hb_block->hb_cksum);
465 computed = o2hb_compute_block_crc_le(reg, hb_block);
466
467 return read == computed;
468}
469
470/* We want to make sure that nobody is heartbeating on top of us --
471 * this will help detect an invalid configuration. */
472static int o2hb_check_last_timestamp(struct o2hb_region *reg)
473{
474 int node_num, ret;
475 struct o2hb_disk_slot *slot;
476 struct o2hb_disk_heartbeat_block *hb_block;
477
478 node_num = o2nm_this_node();
479
480 ret = 1;
481 slot = &reg->hr_slots[node_num];
482 /* Don't check on our 1st timestamp */
483 if (slot->ds_last_time) {
484 hb_block = slot->ds_raw_block;
485
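		/* if the on-disk sequence isn't the value we last
		 * wrote, another node has written into our slot */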
486 if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
487 ret = 0;
488 }
489
490 return ret;
491}
492
493static inline void o2hb_prepare_block(struct o2hb_region *reg,
494 u64 generation)
495{
496 int node_num;
497 u64 cputime;
498 struct o2hb_disk_slot *slot;
499 struct o2hb_disk_heartbeat_block *hb_block;
500
501 node_num = o2nm_this_node();
502 slot = &reg->hr_slots[node_num];
503
504 hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
505 memset(hb_block, 0, reg->hr_block_bytes);
506 /* TODO: time stuff */
507 cputime = CURRENT_TIME.tv_sec;
508 if (!cputime)
509 cputime = 1;
510
511 hb_block->hb_seq = cpu_to_le64(cputime);
512 hb_block->hb_node = node_num;
513 hb_block->hb_generation = cpu_to_le64(generation);
514
515 /* This step must always happen last! */
516 hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
517 hb_block));
518
519 mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n",
520 cpu_to_le64(generation), le32_to_cpu(hb_block->hb_cksum));
521}
522
523static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
524 struct o2nm_node *node,
525 int idx)
526{
527 struct list_head *iter;
528 struct o2hb_callback_func *f;
529
530 list_for_each(iter, &hbcall->list) {
531 f = list_entry(iter, struct o2hb_callback_func, hc_item);
532 mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
533 (f->hc_func)(node, idx, f->hc_data);
534 }
535}
536
537/* Will run the list in order until we process the passed event */
538static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
539{
540 int empty;
541 struct o2hb_callback *hbcall;
542 struct o2hb_node_event *event;
543
544 spin_lock(&o2hb_live_lock);
545 empty = list_empty(&queued_event->hn_item);
546 spin_unlock(&o2hb_live_lock);
547 if (empty)
548 return;
549
550 /* Holding callback sem assures we don't alter the callback
551 * lists when doing this, and serializes ourselves with other
552 * processes wanting callbacks. */
553 down_write(&o2hb_callback_sem);
554
555 spin_lock(&o2hb_live_lock);
556 while (!list_empty(&o2hb_node_events)
557 && !list_empty(&queued_event->hn_item)) {
558 event = list_entry(o2hb_node_events.next,
559 struct o2hb_node_event,
560 hn_item);
561 list_del_init(&event->hn_item);
562 spin_unlock(&o2hb_live_lock);
563
564 mlog(ML_HEARTBEAT, "Node %s event for %d\n",
565 event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN",
566 event->hn_node_num);
567
568 hbcall = hbcall_from_type(event->hn_event_type);
569
570 /* We should *never* have gotten on to the list with a
571 * bad type... This isn't something that we should try
572 * to recover from. */
573 BUG_ON(IS_ERR(hbcall));
574
575 o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num);
576
577 spin_lock(&o2hb_live_lock);
578 }
579 spin_unlock(&o2hb_live_lock);
580
581 up_write(&o2hb_callback_sem);
582}
583
584static void o2hb_queue_node_event(struct o2hb_node_event *event,
585 enum o2hb_callback_type type,
586 struct o2nm_node *node,
587 int node_num)
588{
589 assert_spin_locked(&o2hb_live_lock);
590
591 event->hn_event_type = type;
592 event->hn_node = node;
593 event->hn_node_num = node_num;
594
595 mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n",
596 type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num);
597
598 list_add_tail(&event->hn_item, &o2hb_node_events);
599}
600
601static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
602{
603 struct o2hb_node_event event =
604 { .hn_item = LIST_HEAD_INIT(event.hn_item), };
605 struct o2nm_node *node;
606
607 node = o2nm_get_node_by_num(slot->ds_node_num);
608 if (!node)
609 return;
610
611 spin_lock(&o2hb_live_lock);
612 if (!list_empty(&slot->ds_live_item)) {
613 mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n",
614 slot->ds_node_num);
615
616 list_del_init(&slot->ds_live_item);
617
618 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
619 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
620
621 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
622 slot->ds_node_num);
623 }
624 }
625 spin_unlock(&o2hb_live_lock);
626
627 o2hb_run_event_list(&event);
628
629 o2nm_node_put(node);
630}
631
632static int o2hb_check_slot(struct o2hb_region *reg,
633 struct o2hb_disk_slot *slot)
634{
635 int changed = 0, gen_changed = 0;
636 struct o2hb_node_event event =
637 { .hn_item = LIST_HEAD_INIT(event.hn_item), };
638 struct o2nm_node *node;
639 struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
640 u64 cputime;
641
642 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
643
644 /* Is this correct? Do we assume that the node doesn't exist
645	 * if we're not configured for it? */
646 node = o2nm_get_node_by_num(slot->ds_node_num);
647 if (!node)
648 return 0;
649
650 if (!o2hb_verify_crc(reg, hb_block)) {
651 /* all paths from here will drop o2hb_live_lock for
652 * us. */
653 spin_lock(&o2hb_live_lock);
654
655 /* Don't print an error on the console in this case -
656 * a freshly formatted heartbeat area will not have a
657 * crc set on it. */
658 if (list_empty(&slot->ds_live_item))
659 goto out;
660
661 /* The node is live but pushed out a bad crc. We
662 * consider it a transient miss but don't populate any
663 * other values as they may be junk. */
664 mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
665 slot->ds_node_num, reg->hr_dev_name);
666 o2hb_dump_slot(hb_block);
667
668 slot->ds_equal_samples++;
669 goto fire_callbacks;
670 }
671
672 /* we don't care if these wrap.. the state transitions below
673 * clear at the right places */
674 cputime = le64_to_cpu(hb_block->hb_seq);
675 if (slot->ds_last_time != cputime)
676 slot->ds_changed_samples++;
677 else
678 slot->ds_equal_samples++;
679 slot->ds_last_time = cputime;
680
681 /* The node changed heartbeat generations. We assume this to
682 * mean it dropped off but came back before we timed out. We
683 * want to consider it down for the time being but don't want
684 * to lose any changed_samples state we might build up to
685 * considering it live again. */
686 if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
687 gen_changed = 1;
688 slot->ds_equal_samples = 0;
689 mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" "
690 "to 0x%"MLFx64")\n", slot->ds_node_num,
691 slot->ds_last_generation,
692 le64_to_cpu(hb_block->hb_generation));
693 }
694
695 slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
696
697 mlog(ML_HEARTBEAT, "Slot %d gen 0x%"MLFx64" cksum 0x%x "
698 "seq %"MLFu64" last %"MLFu64" changed %u equal %u\n",
699 slot->ds_node_num, slot->ds_last_generation,
700 le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq),
701 slot->ds_last_time, slot->ds_changed_samples,
702 slot->ds_equal_samples);
703
704 spin_lock(&o2hb_live_lock);
705
706fire_callbacks:
707 /* dead nodes only come to life after some number of
708 * changes at any time during their dead time */
709 if (list_empty(&slot->ds_live_item) &&
710 slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
711 mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my "
712 "region\n", slot->ds_node_num, slot->ds_last_generation);
713
714 /* first on the list generates a callback */
715 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
716 set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
717
718 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
719 slot->ds_node_num);
720
721 changed = 1;
722 }
723
724 list_add_tail(&slot->ds_live_item,
725 &o2hb_live_slots[slot->ds_node_num]);
726
727 slot->ds_equal_samples = 0;
728 goto out;
729 }
730
731 /* if the list is dead, we're done.. */
732 if (list_empty(&slot->ds_live_item))
733 goto out;
734
735	/* live nodes only go dead after enough consecutive missed
736 * samples.. reset the missed counter whenever we see
737 * activity */
738 if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) {
739 mlog(ML_HEARTBEAT, "Node %d left my region\n",
740 slot->ds_node_num);
741
742 /* last off the live_slot generates a callback */
743 list_del_init(&slot->ds_live_item);
744 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
745 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
746
747 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
748 slot->ds_node_num);
749
750 changed = 1;
751 }
752
753 /* We don't clear this because the node is still
754 * actually writing new blocks. */
755 if (!gen_changed)
756 slot->ds_changed_samples = 0;
757 goto out;
758 }
759 if (slot->ds_changed_samples) {
760 slot->ds_changed_samples = 0;
761 slot->ds_equal_samples = 0;
762 }
763out:
764 spin_unlock(&o2hb_live_lock);
765
766 o2hb_run_event_list(&event);
767
768 o2nm_node_put(node);
769 return changed;
770}
771
772/* This could be faster if we just implemented a find_last_bit, but I
773 * don't think the circumstances warrant it. */
774static int o2hb_highest_node(unsigned long *nodes,
775 int numbits)
776{
777 int highest, node;
778
779 highest = numbits;
780 node = -1;
781 while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
782 if (node >= numbits)
783 break;
784
785 highest = node;
786 }
787
788 return highest;
789}
790
791static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
792{
793 int i, ret, highest_node, change = 0;
794 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
795 struct bio *write_bio;
796 struct o2hb_bio_wait_ctxt write_wc;
797
798 if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes)))
799 return;
800
801 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
802 if (highest_node >= O2NM_MAX_NODES) {
803 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
804 return;
805 }
806
807 /* No sense in reading the slots of nodes that don't exist
808 * yet. Of course, if the node definitions have holes in them
809 * then we're reading an empty slot anyway... Consider this
810 * best-effort. */
811 ret = o2hb_read_slots(reg, highest_node + 1);
812 if (ret < 0) {
813 mlog_errno(ret);
814 return;
815 }
816
817 /* With an up to date view of the slots, we can check that no
818 * other node has been improperly configured to heartbeat in
819 * our slot. */
820 if (!o2hb_check_last_timestamp(reg))
821 mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
822 "in our slot!\n", reg->hr_dev_name);
823
824 /* fill in the proper info for our next heartbeat */
825 o2hb_prepare_block(reg, reg->hr_generation);
826
827 /* And fire off the write. Note that we don't wait on this I/O
828 * until later. */
829 ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
830 if (ret < 0) {
831 mlog_errno(ret);
832 return;
833 }
834
835 i = -1;
836 while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
837
838 change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
839 }
840
841 /*
842 * We have to be sure we've advertised ourselves on disk
843 * before we can go to steady state. This ensures that
844 * people we find in our steady state have seen us.
845 */
846 o2hb_wait_on_io(reg, &write_wc);
847 bio_put(write_bio);
848 o2hb_arm_write_timeout(reg);
849
850 /* let the person who launched us know when things are steady */
851 if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
852 if (atomic_dec_and_test(&reg->hr_steady_iterations))
853 wake_up(&o2hb_steady_queue);
854 }
855}
856
857/* Subtract b from a, storing the result in a. If b is later than a,
858 * the result is clamped to zero. */
859static void o2hb_tv_subtract(struct timeval *a,
860 struct timeval *b)
861{
862	/* just return 0 when b is after a */
863 if (a->tv_sec < b->tv_sec ||
864 (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
865 a->tv_sec = 0;
866 a->tv_usec = 0;
867 return;
868 }
869
870 a->tv_sec -= b->tv_sec;
871 a->tv_usec -= b->tv_usec;
872 while ( a->tv_usec < 0 ) {
873 a->tv_sec--;
874 a->tv_usec += 1000000;
875 }
876}
877
878static unsigned int o2hb_elapsed_msecs(struct timeval *start,
879 struct timeval *end)
880{
881 struct timeval res = *end;
882
883 o2hb_tv_subtract(&res, start);
884
885 return res.tv_sec * 1000 + res.tv_usec / 1000;
886}
887
888/*
889 * we ride the region ref that the region dir holds. before the region
890 * dir is removed and drops its ref it will wait to tear down this
891 * thread.
892 */
893static int o2hb_thread(void *data)
894{
895 int i, ret;
896 struct o2hb_region *reg = data;
897 struct bio *write_bio;
898 struct o2hb_bio_wait_ctxt write_wc;
899 struct timeval before_hb, after_hb;
900 unsigned int elapsed_msec;
901
902 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
903
904 set_user_nice(current, -20);
905
906 while (!kthread_should_stop() && !reg->hr_unclean_stop) {
907 /* We track the time spent inside
908		 * o2hb_do_disk_heartbeat so that we avoid more than
909 * hr_timeout_ms between disk writes. On busy systems
910 * this should result in a heartbeat which is less
911 * likely to time itself out. */
912 do_gettimeofday(&before_hb);
913
914 o2hb_do_disk_heartbeat(reg);
915
916 do_gettimeofday(&after_hb);
917 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
918
919 mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
920 before_hb.tv_sec, before_hb.tv_usec,
921 after_hb.tv_sec, after_hb.tv_usec, elapsed_msec);
922
923 if (elapsed_msec < reg->hr_timeout_ms) {
924 /* the kthread api has blocked signals for us so no
925 * need to record the return value. */
926 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
927 }
928 }
929
930 o2hb_disarm_write_timeout(reg);
931
932	/* unclean stop is only used in very bad situations */
933 for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
934 o2hb_shutdown_slot(&reg->hr_slots[i]);
935
936 /* Explicit down notification - avoid forcing the other nodes
937 * to timeout on this region when we could just as easily
938 * write a clear generation - thus indicating to them that
939 * this node has left this region.
940 *
941 * XXX: Should we skip this on unclean_stop? */
942 o2hb_prepare_block(reg, 0);
943 ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
944 if (ret == 0) {
945 o2hb_wait_on_io(reg, &write_wc);
946 bio_put(write_bio);
947 } else {
948 mlog_errno(ret);
949 }
950
951 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
952
953 return 0;
954}
955
956void o2hb_init(void)
957{
958 int i;
959
960 for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++)
961 INIT_LIST_HEAD(&o2hb_callbacks[i].list);
962
963 for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
964 INIT_LIST_HEAD(&o2hb_live_slots[i]);
965
966 INIT_LIST_HEAD(&o2hb_node_events);
967
968 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
969}
970
971/* if we're already in a callback then we're already serialized by the sem */
972static void o2hb_fill_node_map_from_callback(unsigned long *map,
973 unsigned bytes)
974{
975 BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
976
977 memcpy(map, &o2hb_live_node_bitmap, bytes);
978}
979
980/*
981 * get a map of all nodes that are heartbeating in any regions
982 */
983void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
984{
985 /* callers want to serialize this map and callbacks so that they
986 * can trust that they don't miss nodes coming to the party */
987 down_read(&o2hb_callback_sem);
988 spin_lock(&o2hb_live_lock);
989 o2hb_fill_node_map_from_callback(map, bytes);
990 spin_unlock(&o2hb_live_lock);
991 up_read(&o2hb_callback_sem);
992}
993EXPORT_SYMBOL_GPL(o2hb_fill_node_map);
994
995/*
996 * heartbeat configfs bits. The heartbeat set is a default set under
997 * the cluster set in nodemanager.c.
998 */
999
1000static struct o2hb_region *to_o2hb_region(struct config_item *item)
1001{
1002 return item ? container_of(item, struct o2hb_region, hr_item) : NULL;
1003}
1004
1005/* drop_item only drops its ref after killing the thread, nothing should
1006 * be using the region anymore. this has to clean up any state that
1007 * attributes might have built up. */
1008static void o2hb_region_release(struct config_item *item)
1009{
1010 int i;
1011 struct page *page;
1012 struct o2hb_region *reg = to_o2hb_region(item);
1013
1014 if (reg->hr_tmp_block)
1015 kfree(reg->hr_tmp_block);
1016
1017 if (reg->hr_slot_data) {
1018 for (i = 0; i < reg->hr_num_pages; i++) {
1019 page = reg->hr_slot_data[i];
1020 if (page)
1021 __free_page(page);
1022 }
1023 kfree(reg->hr_slot_data);
1024 }
1025
1026 if (reg->hr_bdev)
1027 blkdev_put(reg->hr_bdev);
1028
1029 if (reg->hr_slots)
1030 kfree(reg->hr_slots);
1031
1032 spin_lock(&o2hb_live_lock);
1033 list_del(&reg->hr_all_item);
1034 spin_unlock(&o2hb_live_lock);
1035
1036 kfree(reg);
1037}
1038
1039static int o2hb_read_block_input(struct o2hb_region *reg,
1040 const char *page,
1041 size_t count,
1042 unsigned long *ret_bytes,
1043 unsigned int *ret_bits)
1044{
1045 unsigned long bytes;
1046 char *p = (char *)page;
1047
1048 bytes = simple_strtoul(p, &p, 0);
1049 if (!p || (*p && (*p != '\n')))
1050 return -EINVAL;
1051
1052 /* Heartbeat and fs min / max block sizes are the same. */
1053 if (bytes > 4096 || bytes < 512)
1054 return -ERANGE;
1055 if (hweight16(bytes) != 1)
1056 return -EINVAL;
1057
1058 if (ret_bytes)
1059 *ret_bytes = bytes;
1060 if (ret_bits)
1061 *ret_bits = ffs(bytes) - 1;
1062
1063 return 0;
1064}
1065
1066static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
1067 char *page)
1068{
1069 return sprintf(page, "%u\n", reg->hr_block_bytes);
1070}
1071
1072static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
1073 const char *page,
1074 size_t count)
1075{
1076 int status;
1077 unsigned long block_bytes;
1078 unsigned int block_bits;
1079
1080 if (reg->hr_bdev)
1081 return -EINVAL;
1082
1083 status = o2hb_read_block_input(reg, page, count,
1084 &block_bytes, &block_bits);
1085 if (status)
1086 return status;
1087
1088 reg->hr_block_bytes = (unsigned int)block_bytes;
1089 reg->hr_block_bits = block_bits;
1090
1091 return count;
1092}
1093
1094static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
1095 char *page)
1096{
1097 return sprintf(page, "%llu\n", reg->hr_start_block);
1098}
1099
1100static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
1101 const char *page,
1102 size_t count)
1103{
1104 unsigned long long tmp;
1105 char *p = (char *)page;
1106
1107 if (reg->hr_bdev)
1108 return -EINVAL;
1109
1110 tmp = simple_strtoull(p, &p, 0);
1111 if (!p || (*p && (*p != '\n')))
1112 return -EINVAL;
1113
1114 reg->hr_start_block = tmp;
1115
1116 return count;
1117}
1118
1119static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
1120 char *page)
1121{
1122 return sprintf(page, "%d\n", reg->hr_blocks);
1123}
1124
1125static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
1126 const char *page,
1127 size_t count)
1128{
1129 unsigned long tmp;
1130 char *p = (char *)page;
1131
1132 if (reg->hr_bdev)
1133 return -EINVAL;
1134
1135 tmp = simple_strtoul(p, &p, 0);
1136 if (!p || (*p && (*p != '\n')))
1137 return -EINVAL;
1138
1139 if (tmp > O2NM_MAX_NODES || tmp == 0)
1140 return -ERANGE;
1141
1142 reg->hr_blocks = (unsigned int)tmp;
1143
1144 return count;
1145}
1146
1147static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
1148 char *page)
1149{
1150 unsigned int ret = 0;
1151
1152 if (reg->hr_bdev)
1153 ret = sprintf(page, "%s\n", reg->hr_dev_name);
1154
1155 return ret;
1156}
1157
1158static void o2hb_init_region_params(struct o2hb_region *reg)
1159{
1160 reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
1161 reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;
1162
1163 mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
1164 reg->hr_start_block, reg->hr_blocks);
1165 mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n",
1166 reg->hr_block_bytes, reg->hr_block_bits);
1167 mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms);
1168 mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold);
1169}
1170
1171static int o2hb_map_slot_data(struct o2hb_region *reg)
1172{
1173 int i, j;
1174 unsigned int last_slot;
1175 unsigned int spp = reg->hr_slots_per_page;
1176 struct page *page;
1177 char *raw;
1178 struct o2hb_disk_slot *slot;
1179
1180 reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
1181 if (reg->hr_tmp_block == NULL) {
1182 mlog_errno(-ENOMEM);
1183 return -ENOMEM;
1184 }
1185
1186 reg->hr_slots = kcalloc(reg->hr_blocks,
1187 sizeof(struct o2hb_disk_slot), GFP_KERNEL);
1188 if (reg->hr_slots == NULL) {
1189 mlog_errno(-ENOMEM);
1190 return -ENOMEM;
1191 }
1192
1193 for(i = 0; i < reg->hr_blocks; i++) {
1194 slot = &reg->hr_slots[i];
1195 slot->ds_node_num = i;
1196 INIT_LIST_HEAD(&slot->ds_live_item);
1197 slot->ds_raw_block = NULL;
1198 }
1199
1200 reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp;
1201 mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks "
1202 "at %u blocks per page\n",
1203 reg->hr_num_pages, reg->hr_blocks, spp);
1204
1205 reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
1206 GFP_KERNEL);
1207 if (!reg->hr_slot_data) {
1208 mlog_errno(-ENOMEM);
1209 return -ENOMEM;
1210 }
1211
1212 for(i = 0; i < reg->hr_num_pages; i++) {
1213 page = alloc_page(GFP_KERNEL);
1214 if (!page) {
1215 mlog_errno(-ENOMEM);
1216 return -ENOMEM;
1217 }
1218
1219 reg->hr_slot_data[i] = page;
1220
1221 last_slot = i * spp;
1222 raw = page_address(page);
1223 for (j = 0;
1224 (j < spp) && ((j + last_slot) < reg->hr_blocks);
1225 j++) {
1226 BUG_ON((j + last_slot) >= reg->hr_blocks);
1227
1228 slot = &reg->hr_slots[j + last_slot];
1229 slot->ds_raw_block =
1230 (struct o2hb_disk_heartbeat_block *) raw;
1231
1232 raw += reg->hr_block_bytes;
1233 }
1234 }
1235
1236 return 0;
1237}
1238
1239/* Read in all the slots available and populate the tracking
1240 * structures so that we can start with a baseline idea of what's
1241 * there. */
1242static int o2hb_populate_slot_data(struct o2hb_region *reg)
1243{
1244 int ret, i;
1245 struct o2hb_disk_slot *slot;
1246 struct o2hb_disk_heartbeat_block *hb_block;
1247
1248 mlog_entry_void();
1249
1250 ret = o2hb_read_slots(reg, reg->hr_blocks);
1251 if (ret) {
1252 mlog_errno(ret);
1253 goto out;
1254 }
1255
1256 /* We only want to get an idea of the values initially in each
1257 * slot, so we do no verification - o2hb_check_slot will
1258 * actually determine if each configured slot is valid and
1259 * whether any values have changed. */
1260 for(i = 0; i < reg->hr_blocks; i++) {
1261 slot = &reg->hr_slots[i];
1262 hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block;
1263
1264 /* Only fill the values that o2hb_check_slot uses to
1265 * determine changing slots */
1266 slot->ds_last_time = le64_to_cpu(hb_block->hb_seq);
1267 slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
1268 }
1269
1270out:
1271 mlog_exit(ret);
1272 return ret;
1273}
1274
1275/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
1276static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1277 const char *page,
1278 size_t count)
1279{
1280 long fd;
1281 int sectsize;
1282 char *p = (char *)page;
1283 struct file *filp = NULL;
1284 struct inode *inode = NULL;
1285 ssize_t ret = -EINVAL;
1286
1287 if (reg->hr_bdev)
1288 goto out;
1289
1290 /* We can't heartbeat without having had our node number
1291 * configured yet. */
1292 if (o2nm_this_node() == O2NM_MAX_NODES)
1293 goto out;
1294
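	/* userspace hands us the number of an already-open file
	 * descriptor for the region's block device */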
1295 fd = simple_strtol(p, &p, 0);
1296 if (!p || (*p && (*p != '\n')))
1297 goto out;
1298
1299 if (fd < 0 || fd >= INT_MAX)
1300 goto out;
1301
1302 filp = fget(fd);
1303 if (filp == NULL)
1304 goto out;
1305
1306 if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
1307 reg->hr_block_bytes == 0)
1308 goto out;
1309
1310 inode = igrab(filp->f_mapping->host);
1311 if (inode == NULL)
1312 goto out;
1313
1314 if (!S_ISBLK(inode->i_mode))
1315 goto out;
1316
1317 reg->hr_bdev = I_BDEV(filp->f_mapping->host);
1318 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0);
1319 if (ret) {
1320 reg->hr_bdev = NULL;
1321 goto out;
1322 }
1323 inode = NULL;
1324
1325 bdevname(reg->hr_bdev, reg->hr_dev_name);
1326
1327 sectsize = bdev_hardsect_size(reg->hr_bdev);
1328 if (sectsize != reg->hr_block_bytes) {
1329 mlog(ML_ERROR,
1330 "blocksize %u incorrect for device, expected %d",
1331 reg->hr_block_bytes, sectsize);
1332 ret = -EINVAL;
1333 goto out;
1334 }
1335
1336 o2hb_init_region_params(reg);
1337
1338 /* Generation of zero is invalid */
1339 do {
1340 get_random_bytes(&reg->hr_generation,
1341 sizeof(reg->hr_generation));
1342 } while (reg->hr_generation == 0);
1343
1344 ret = o2hb_map_slot_data(reg);
1345 if (ret) {
1346 mlog_errno(ret);
1347 goto out;
1348 }
1349
1350 ret = o2hb_populate_slot_data(reg);
1351 if (ret) {
1352 mlog_errno(ret);
1353 goto out;
1354 }
1355
1356 INIT_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout, reg);
1357
1358 /*
1359 * A node is considered live after it has beat LIVE_THRESHOLD
1360 * times. We're not steady until we've given them a chance
1361 * _after_ our first read.
1362 */
1363 atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1);
1364
1365 reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
1366 reg->hr_item.ci_name);
1367 if (IS_ERR(reg->hr_task)) {
1368 ret = PTR_ERR(reg->hr_task);
1369 mlog_errno(ret);
1370 reg->hr_task = NULL;
1371 goto out;
1372 }
1373
1374 ret = wait_event_interruptible(o2hb_steady_queue,
1375 atomic_read(&reg->hr_steady_iterations) == 0);
1376 if (ret) {
1377 kthread_stop(reg->hr_task);
1378 reg->hr_task = NULL;
1379 goto out;
1380 }
1381
1382 ret = count;
1383out:
1384 if (filp)
1385 fput(filp);
1386 if (inode)
1387 iput(inode);
1388 if (ret < 0) {
1389 if (reg->hr_bdev) {
1390 blkdev_put(reg->hr_bdev);
1391 reg->hr_bdev = NULL;
1392 }
1393 }
1394 return ret;
1395}
1396
1397struct o2hb_region_attribute {
1398 struct configfs_attribute attr;
1399 ssize_t (*show)(struct o2hb_region *, char *);
1400 ssize_t (*store)(struct o2hb_region *, const char *, size_t);
1401};
1402
1403static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
1404 .attr = { .ca_owner = THIS_MODULE,
1405 .ca_name = "block_bytes",
1406 .ca_mode = S_IRUGO | S_IWUSR },
1407 .show = o2hb_region_block_bytes_read,
1408 .store = o2hb_region_block_bytes_write,
1409};
1410
1411static struct o2hb_region_attribute o2hb_region_attr_start_block = {
1412 .attr = { .ca_owner = THIS_MODULE,
1413 .ca_name = "start_block",
1414 .ca_mode = S_IRUGO | S_IWUSR },
1415 .show = o2hb_region_start_block_read,
1416 .store = o2hb_region_start_block_write,
1417};
1418
1419static struct o2hb_region_attribute o2hb_region_attr_blocks = {
1420 .attr = { .ca_owner = THIS_MODULE,
1421 .ca_name = "blocks",
1422 .ca_mode = S_IRUGO | S_IWUSR },
1423 .show = o2hb_region_blocks_read,
1424 .store = o2hb_region_blocks_write,
1425};
1426
1427static struct o2hb_region_attribute o2hb_region_attr_dev = {
1428 .attr = { .ca_owner = THIS_MODULE,
1429 .ca_name = "dev",
1430 .ca_mode = S_IRUGO | S_IWUSR },
1431 .show = o2hb_region_dev_read,
1432 .store = o2hb_region_dev_write,
1433};
1434
1435static struct configfs_attribute *o2hb_region_attrs[] = {
1436 &o2hb_region_attr_block_bytes.attr,
1437 &o2hb_region_attr_start_block.attr,
1438 &o2hb_region_attr_blocks.attr,
1439 &o2hb_region_attr_dev.attr,
1440 NULL,
1441};
1442
1443static ssize_t o2hb_region_show(struct config_item *item,
1444 struct configfs_attribute *attr,
1445 char *page)
1446{
1447 struct o2hb_region *reg = to_o2hb_region(item);
1448 struct o2hb_region_attribute *o2hb_region_attr =
1449 container_of(attr, struct o2hb_region_attribute, attr);
1450 ssize_t ret = 0;
1451
1452 if (o2hb_region_attr->show)
1453 ret = o2hb_region_attr->show(reg, page);
1454 return ret;
1455}
1456
1457static ssize_t o2hb_region_store(struct config_item *item,
1458 struct configfs_attribute *attr,
1459 const char *page, size_t count)
1460{
1461 struct o2hb_region *reg = to_o2hb_region(item);
1462 struct o2hb_region_attribute *o2hb_region_attr =
1463 container_of(attr, struct o2hb_region_attribute, attr);
1464 ssize_t ret = -EINVAL;
1465
1466 if (o2hb_region_attr->store)
1467 ret = o2hb_region_attr->store(reg, page, count);
1468 return ret;
1469}
1470
1471static struct configfs_item_operations o2hb_region_item_ops = {
1472 .release = o2hb_region_release,
1473 .show_attribute = o2hb_region_show,
1474 .store_attribute = o2hb_region_store,
1475};
1476
1477static struct config_item_type o2hb_region_type = {
1478 .ct_item_ops = &o2hb_region_item_ops,
1479 .ct_attrs = o2hb_region_attrs,
1480 .ct_owner = THIS_MODULE,
1481};
1482
1483/* heartbeat set */
1484
1485struct o2hb_heartbeat_group {
1486 struct config_group hs_group;
1487 /* some stuff? */
1488};
1489
1490static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group)
1491{
1492 return group ?
1493 container_of(group, struct o2hb_heartbeat_group, hs_group)
1494 : NULL;
1495}
1496
1497static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
1498 const char *name)
1499{
1500 struct o2hb_region *reg = NULL;
1501 struct config_item *ret = NULL;
1502
1503 reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL);
1504 if (reg == NULL)
1505 goto out; /* ENOMEM */
1506
1507 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
1508
1509 ret = &reg->hr_item;
1510
1511 spin_lock(&o2hb_live_lock);
1512 list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
1513 spin_unlock(&o2hb_live_lock);
1514out:
1515 if (ret == NULL)
1516 kfree(reg);
1517
1518 return ret;
1519}
1520
1521static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1522 struct config_item *item)
1523{
1524 struct o2hb_region *reg = to_o2hb_region(item);
1525
1526 /* stop the thread when the user removes the region dir */
1527 if (reg->hr_task) {
1528 kthread_stop(reg->hr_task);
1529 reg->hr_task = NULL;
1530 }
1531
1532 config_item_put(item);
1533}
1534
1535struct o2hb_heartbeat_group_attribute {
1536 struct configfs_attribute attr;
1537 ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
1538 ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
1539};
1540
1541static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
1542 struct configfs_attribute *attr,
1543 char *page)
1544{
1545 struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
1546 struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
1547 container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
1548 ssize_t ret = 0;
1549
1550 if (o2hb_heartbeat_group_attr->show)
1551 ret = o2hb_heartbeat_group_attr->show(reg, page);
1552 return ret;
1553}
1554
1555static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
1556 struct configfs_attribute *attr,
1557 const char *page, size_t count)
1558{
1559 struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
1560 struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
1561 container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
1562 ssize_t ret = -EINVAL;
1563
1564 if (o2hb_heartbeat_group_attr->store)
1565 ret = o2hb_heartbeat_group_attr->store(reg, page, count);
1566 return ret;
1567}
1568
1569static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
1570 char *page)
1571{
1572 return sprintf(page, "%u\n", o2hb_dead_threshold);
1573}
1574
1575static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
1576 const char *page,
1577 size_t count)
1578{
1579 unsigned long tmp;
1580 char *p = (char *)page;
1581
1582 tmp = simple_strtoul(p, &p, 10);
1583 if (!p || (*p && (*p != '\n')))
1584 return -EINVAL;
1585
1586 /* this will validate ranges for us. */
1587 o2hb_dead_threshold_set((unsigned int) tmp);
1588
1589 return count;
1590}
1591
1592static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
1593 .attr = { .ca_owner = THIS_MODULE,
1594 .ca_name = "dead_threshold",
1595 .ca_mode = S_IRUGO | S_IWUSR },
1596 .show = o2hb_heartbeat_group_threshold_show,
1597 .store = o2hb_heartbeat_group_threshold_store,
1598};
1599
1600static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
1601 &o2hb_heartbeat_group_attr_threshold.attr,
1602 NULL,
1603};
1604
1605static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
1606 .show_attribute = o2hb_heartbeat_group_show,
1607 .store_attribute = o2hb_heartbeat_group_store,
1608};
1609
1610static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
1611 .make_item = o2hb_heartbeat_group_make_item,
1612 .drop_item = o2hb_heartbeat_group_drop_item,
1613};
1614
1615static struct config_item_type o2hb_heartbeat_group_type = {
1616 .ct_group_ops = &o2hb_heartbeat_group_group_ops,
1617	.ct_item_ops	= &o2hb_heartbeat_group_item_ops,
1618 .ct_attrs = o2hb_heartbeat_group_attrs,
1619 .ct_owner = THIS_MODULE,
1620};
1621
1622/* this is just here to avoid touching group in heartbeat.h which the
1623 * entire damn world #includes */
1624struct config_group *o2hb_alloc_hb_set(void)
1625{
1626 struct o2hb_heartbeat_group *hs = NULL;
1627 struct config_group *ret = NULL;
1628
1629 hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
1630 if (hs == NULL)
1631 goto out;
1632
1633 config_group_init_type_name(&hs->hs_group, "heartbeat",
1634 &o2hb_heartbeat_group_type);
1635
1636 ret = &hs->hs_group;
1637out:
1638 if (ret == NULL)
1639 kfree(hs);
1640 return ret;
1641}
1642
1643void o2hb_free_hb_set(struct config_group *group)
1644{
1645 struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group);
1646 kfree(hs);
1647}
1648
1649/* hb callback registration and issuing */
1650
1651static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
1652{
1653 if (type == O2HB_NUM_CB)
1654 return ERR_PTR(-EINVAL);
1655
1656 return &o2hb_callbacks[type];
1657}
1658
1659void o2hb_setup_callback(struct o2hb_callback_func *hc,
1660 enum o2hb_callback_type type,
1661 o2hb_cb_func *func,
1662 void *data,
1663 int priority)
1664{
1665 INIT_LIST_HEAD(&hc->hc_item);
1666 hc->hc_func = func;
1667 hc->hc_data = data;
1668 hc->hc_priority = priority;
1669 hc->hc_type = type;
1670 hc->hc_magic = O2HB_CB_MAGIC;
1671}
1672EXPORT_SYMBOL_GPL(o2hb_setup_callback);
1673
1674int o2hb_register_callback(struct o2hb_callback_func *hc)
1675{
1676 struct o2hb_callback_func *tmp;
1677 struct list_head *iter;
1678 struct o2hb_callback *hbcall;
1679 int ret;
1680
1681 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1682 BUG_ON(!list_empty(&hc->hc_item));
1683
1684 hbcall = hbcall_from_type(hc->hc_type);
1685 if (IS_ERR(hbcall)) {
1686 ret = PTR_ERR(hbcall);
1687 goto out;
1688 }
1689
1690 down_write(&o2hb_callback_sem);
1691
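	/* keep the list sorted by ascending hc_priority: insert before
	 * the first entry with a higher priority, else append below */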
1692 list_for_each(iter, &hbcall->list) {
1693 tmp = list_entry(iter, struct o2hb_callback_func, hc_item);
1694 if (hc->hc_priority < tmp->hc_priority) {
1695 list_add_tail(&hc->hc_item, iter);
1696 break;
1697 }
1698 }
1699 if (list_empty(&hc->hc_item))
1700 list_add_tail(&hc->hc_item, &hbcall->list);
1701
1702 up_write(&o2hb_callback_sem);
1703 ret = 0;
1704out:
1705 mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
1706 ret, __builtin_return_address(0), hc);
1707 return ret;
1708}
1709EXPORT_SYMBOL_GPL(o2hb_register_callback);
1710
1711int o2hb_unregister_callback(struct o2hb_callback_func *hc)
1712{
1713 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1714
1715 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
1716 __builtin_return_address(0), hc);
1717
1718 if (list_empty(&hc->hc_item))
1719 return 0;
1720
1721 down_write(&o2hb_callback_sem);
1722
1723 list_del_init(&hc->hc_item);
1724
1725 up_write(&o2hb_callback_sem);
1726
1727 return 0;
1728}
1729EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
1730
1731int o2hb_check_node_heartbeating(u8 node_num)
1732{
1733 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
1734
1735 o2hb_fill_node_map(testing_map, sizeof(testing_map));
1736 if (!test_bit(node_num, testing_map)) {
1737 mlog(ML_HEARTBEAT,
1738 "node (%u) does not have heartbeating enabled.\n",
1739 node_num);
1740 return 0;
1741 }
1742
1743 return 1;
1744}
1745EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
1746
1747int o2hb_check_node_heartbeating_from_callback(u8 node_num)
1748{
1749 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
1750
1751 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
1752 if (!test_bit(node_num, testing_map)) {
1753 mlog(ML_HEARTBEAT,
1754 "node (%u) does not have heartbeating enabled.\n",
1755 node_num);
1756 return 0;
1757 }
1758
1759 return 1;
1760}
1761EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
1762
1763/* Makes sure our local node is configured with a node number, and is
1764 * heartbeating. */
1765int o2hb_check_local_node_heartbeating(void)
1766{
1767 u8 node_num;
1768
1769 /* if this node was set then we have networking */
1770 node_num = o2nm_this_node();
1771 if (node_num == O2NM_MAX_NODES) {
1772 mlog(ML_HEARTBEAT, "this node has not been configured.\n");
1773 return 0;
1774 }
1775
1776 return o2hb_check_node_heartbeating(node_num);
1777}
1778EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
1779
1780/*
1781 * this is just a hack until we get the plumbing which flips file systems
1782 * read only and drops the hb ref instead of killing the node dead.
1783 */
1784void o2hb_stop_all_regions(void)
1785{
1786 struct o2hb_region *reg;
1787
1788 mlog(ML_ERROR, "stopping heartbeat on all active regions.\n");
1789
1790 spin_lock(&o2hb_live_lock);
1791
1792 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item)
1793 reg->hr_unclean_stop = 1;
1794
1795 spin_unlock(&o2hb_live_lock);
1796}
1797EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
new file mode 100644
index 00000000000..cac6223206a
--- /dev/null
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * heartbeat.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_HEARTBEAT_H
28#define O2CLUSTER_HEARTBEAT_H
29
30#include "ocfs2_heartbeat.h"
31
32#define O2HB_REGION_TIMEOUT_MS 2000
33
34/* number of changes to be seen as live */
35#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */
37extern unsigned int o2hb_dead_threshold;
38#define O2HB_DEFAULT_DEAD_THRESHOLD 7
39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
40#define O2HB_MIN_DEAD_THRESHOLD 2
41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
42
43#define O2HB_CB_MAGIC 0x51d1e4ec
44
45/* callback stuff */
46enum o2hb_callback_type {
47 O2HB_NODE_DOWN_CB = 0,
48 O2HB_NODE_UP_CB,
49 O2HB_NUM_CB
50};
51
52struct o2nm_node;
53typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *);
54
55struct o2hb_callback_func {
56 u32 hc_magic;
57 struct list_head hc_item;
58 o2hb_cb_func *hc_func;
59 void *hc_data;
60 int hc_priority;
61 enum o2hb_callback_type hc_type;
62};
63
64struct config_group *o2hb_alloc_hb_set(void);
65void o2hb_free_hb_set(struct config_group *group);
66
67void o2hb_setup_callback(struct o2hb_callback_func *hc,
68 enum o2hb_callback_type type,
69 o2hb_cb_func *func,
70 void *data,
71 int priority);
72int o2hb_register_callback(struct o2hb_callback_func *hc);
73int o2hb_unregister_callback(struct o2hb_callback_func *hc);
74void o2hb_fill_node_map(unsigned long *map,
75 unsigned bytes);
76void o2hb_init(void);
77int o2hb_check_node_heartbeating(u8 node_num);
78int o2hb_check_node_heartbeating_from_callback(u8 node_num);
79int o2hb_check_local_node_heartbeating(void);
80void o2hb_stop_all_regions(void);
81
82#endif /* O2CLUSTER_HEARTBEAT_H */
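
The callback API declared above is how other cluster components react to node liveness changes. Below is a minimal sketch of registering a node-down handler in kernel context; the my_* names are hypothetical, but the o2hb_* calls and the o2hb_cb_func signature are exactly the ones prototyped in this header:

#include "heartbeat.h"

static struct o2hb_callback_func my_down_cb;

/* matches the o2hb_cb_func typedef above */
static void my_node_down(struct o2nm_node *node, int node_num, void *data)
{
	/* e.g. begin recovery for node_num */
}

static int my_hb_attach(void)
{
	/* type, handler, private data, priority -- as declared above */
	o2hb_setup_callback(&my_down_cb, O2HB_NODE_DOWN_CB,
			    my_node_down, NULL, 0);
	return o2hb_register_callback(&my_down_cb);
}

static void my_hb_detach(void)
{
	o2hb_unregister_callback(&my_down_cb);
}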
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
new file mode 100644
index 00000000000..fd741cea570
--- /dev/null
+++ b/fs/ocfs2/cluster/masklog.c
@@ -0,0 +1,166 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/proc_fs.h>
25#include <linux/seq_file.h>
26#include <linux/string.h>
27#include <asm/uaccess.h>
28
29#include "masklog.h"
30
31struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
32EXPORT_SYMBOL_GPL(mlog_and_bits);
33struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
34EXPORT_SYMBOL_GPL(mlog_not_bits);
35
36static ssize_t mlog_mask_show(u64 mask, char *buf)
37{
38 char *state;
39
40 if (__mlog_test_u64(mask, mlog_and_bits))
41 state = "allow";
42 else if (__mlog_test_u64(mask, mlog_not_bits))
43 state = "deny";
44 else
45 state = "off";
46
47 return snprintf(buf, PAGE_SIZE, "%s\n", state);
48}
49
50static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
51{
52 if (!strnicmp(buf, "allow", 5)) {
53 __mlog_set_u64(mask, mlog_and_bits);
54 __mlog_clear_u64(mask, mlog_not_bits);
55 } else if (!strnicmp(buf, "deny", 4)) {
56 __mlog_set_u64(mask, mlog_not_bits);
57 __mlog_clear_u64(mask, mlog_and_bits);
58 } else if (!strnicmp(buf, "off", 3)) {
59 __mlog_clear_u64(mask, mlog_not_bits);
60 __mlog_clear_u64(mask, mlog_and_bits);
61 } else
62 return -EINVAL;
63
64 return count;
65}
66
67struct mlog_attribute {
68 struct attribute attr;
69 u64 mask;
70};
71
72#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)
73
74#define define_mask(_name) { \
75 .attr = { \
76 .name = #_name, \
77 .mode = S_IRUGO | S_IWUSR, \
78 }, \
79 .mask = ML_##_name, \
80}
81
82static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
83 define_mask(ENTRY),
84 define_mask(EXIT),
85 define_mask(TCP),
86 define_mask(MSG),
87 define_mask(SOCKET),
88 define_mask(HEARTBEAT),
89 define_mask(HB_BIO),
90 define_mask(DLMFS),
91 define_mask(DLM),
92 define_mask(DLM_DOMAIN),
93 define_mask(DLM_THREAD),
94 define_mask(DLM_MASTER),
95 define_mask(DLM_RECOVERY),
96 define_mask(AIO),
97 define_mask(JOURNAL),
98 define_mask(DISK_ALLOC),
99 define_mask(SUPER),
100 define_mask(FILE_IO),
101 define_mask(EXTENT_MAP),
102 define_mask(DLM_GLUE),
103 define_mask(BH_IO),
104 define_mask(UPTODATE),
105 define_mask(NAMEI),
106 define_mask(INODE),
107 define_mask(VOTE),
108 define_mask(DCACHE),
109 define_mask(CONN),
110 define_mask(QUORUM),
111 define_mask(EXPORT),
112 define_mask(ERROR),
113 define_mask(NOTICE),
114 define_mask(KTHREAD),
115};
116
117static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
118
119static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
120 char *buf)
121{
122 struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
123
124 return mlog_mask_show(mlog_attr->mask, buf);
125}
126
127static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
128 const char *buf, size_t count)
129{
130 struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
131
132 return mlog_mask_store(mlog_attr->mask, buf, count);
133}
134
135static struct sysfs_ops mlog_attr_ops = {
136 .show = mlog_show,
137 .store = mlog_store,
138};
139
140static struct kobj_type mlog_ktype = {
141 .default_attrs = mlog_attr_ptrs,
142 .sysfs_ops = &mlog_attr_ops,
143};
144
145static struct kset mlog_kset = {
146 .kobj = {.name = "logmask", .ktype = &mlog_ktype},
147};
148
149int mlog_sys_init(struct subsystem *o2cb_subsys)
150{
151 int i = 0;
152
153 while (mlog_attrs[i].attr.mode) {
154 mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
155 i++;
156 }
157 mlog_attr_ptrs[i] = NULL;
158
159 mlog_kset.subsys = o2cb_subsys;
160 return kset_register(&mlog_kset);
161}
162
163void mlog_sys_shutdown(void)
164{
165 kset_unregister(&mlog_kset);
166}
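
From userspace, each mask in mlog_attrs[] becomes one writable file in the "logmask" kset registered by mlog_sys_init(). A tiny sketch of flipping one mask; the path assumes the kset lands under the o2cb subsystem at /sys/o2cb/logmask/, which is an assumption about where subsystem registration places it, not something this file states:

#include <stdio.h>

int main(void)
{
	/* mlog_mask_store() recognizes exactly "allow", "deny" and "off" */
	FILE *f = fopen("/sys/o2cb/logmask/HEARTBEAT", "w");

	if (!f)
		return 1;
	fputs("allow", f);
	return fclose(f) ? 1 : 0;
}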
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
new file mode 100644
index 00000000000..e8c56a3d9c6
--- /dev/null
+++ b/fs/ocfs2/cluster/masklog.h
@@ -0,0 +1,274 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef O2CLUSTER_MASKLOG_H
23#define O2CLUSTER_MASKLOG_H
24
25/*
26 * For now this is a trivial wrapper around printk() that gives the critical
27 * ability to enable sets of debugging output at run-time. In the future this
28 * will almost certainly be redirected to relayfs so that it can pay a
29 * substantially lower Heisenberg tax.
30 *
31 * Callers associate the message with a bitmask and a global bitmask is
32 * maintained with help from /proc. If any of the bits match, the message is
33 * output.
34 *
35 * We must have efficient bit tests on i386 and it seems gcc still emits crazy
36 * code for the 64bit compare. It emits very good code for the dual unsigned
37 * long tests, though, completely avoiding tests that can never pass if the
38 * caller gives a constant bitmask that fills one of the longs with all 0s. So
39 * the desire is to have almost all of the calls decided on by comparing just
40 * one of the longs. This leads to having infrequently given bits that are
41 * frequently matched in the high bits.
42 *
43 * _ERROR and _NOTICE are used for messages that always go to the console and
44 * have appropriate KERN_ prefixes. We wrap these in our function instead of
45 * just calling printk() so that this can eventually make its way through
46 * relayfs along with the debugging messages. Everything else gets KERN_DEBUG.
47 * The inline tests and macro dance give GCC the opportunity to quite cleverly
48 * only emit the appropriate printk() when the caller passes in a constant
49 * mask, as is almost always the case.
50 *
51 * All this bitmask nonsense is hidden from the /proc interface so that Joel
52 * doesn't have an aneurysm. Reading the file gives a straightforward
53 * indication of which bits are on or off:
54 * ENTRY off
55 * EXIT off
56 * TCP off
57 * MSG off
58 * SOCKET off
59 * ERROR off
60 * NOTICE on
61 *
62 * Writing changes the state of a given bit and requires a strictly formatted
63 * single write() call:
64 *
65 * write(fd, "ENTRY on", 8);
66 *
67 * would turn the entry bit on. "1" is also accepted in the place of "on", and
68 * "off" and "0" behave as expected.
69 *
70 * Some trivial shell can flip all the bits on or off:
71 *
72 * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
73 * cat $log_mask | (
74 * while read bit status; do
75 * # $1 is "on" or "off", say
76 * echo "$bit $1" > $log_mask
77 * done
78 * )
79 */
80
81/* for task_struct */
82#include <linux/sched.h>
83
84/* bits that are frequently given and infrequently matched in the low word */
85/* NOTE: If you add a flag, you need to also update masklog.c! */
86#define ML_ENTRY 0x0000000000000001ULL /* func call entry */
87#define ML_EXIT 0x0000000000000002ULL /* func call exit */
88#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */
89#define ML_MSG 0x0000000000000008ULL /* net network messages */
90#define ML_SOCKET 0x0000000000000010ULL /* net socket lifetime */
91#define ML_HEARTBEAT 0x0000000000000020ULL /* hb all heartbeat tracking */
92#define ML_HB_BIO 0x0000000000000040ULL /* hb io tracing */
93#define ML_DLMFS 0x0000000000000080ULL /* dlm user dlmfs */
94#define ML_DLM 0x0000000000000100ULL /* dlm general debugging */
95#define ML_DLM_DOMAIN 0x0000000000000200ULL /* dlm domain debugging */
96#define ML_DLM_THREAD 0x0000000000000400ULL /* dlm domain thread */
97#define ML_DLM_MASTER 0x0000000000000800ULL /* dlm master functions */
98#define ML_DLM_RECOVERY 0x0000000000001000ULL /* dlm recovery functions */
99#define ML_AIO 0x0000000000002000ULL /* ocfs2 aio read and write */
100#define ML_JOURNAL 0x0000000000004000ULL /* ocfs2 journalling functions */
101#define ML_DISK_ALLOC 0x0000000000008000ULL /* ocfs2 disk allocation */
102#define ML_SUPER 0x0000000000010000ULL /* ocfs2 mount / umount */
103#define ML_FILE_IO 0x0000000000020000ULL /* ocfs2 file I/O */
104#define ML_EXTENT_MAP 0x0000000000040000ULL /* ocfs2 extent map caching */
105#define ML_DLM_GLUE 0x0000000000080000ULL /* ocfs2 dlm glue layer */
106#define ML_BH_IO 0x0000000000100000ULL /* ocfs2 buffer I/O */
107#define ML_UPTODATE 0x0000000000200000ULL /* ocfs2 caching sequence #'s */
108#define ML_NAMEI 0x0000000000400000ULL /* ocfs2 directory / namespace */
109#define ML_INODE 0x0000000000800000ULL /* ocfs2 inode manipulation */
110#define ML_VOTE 0x0000000001000000ULL /* ocfs2 node messaging */
111#define ML_DCACHE 0x0000000002000000ULL /* ocfs2 dcache operations */
112#define ML_CONN 0x0000000004000000ULL /* net connection management */
113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
115/* bits that are infrequently given and frequently matched in the high word */
116#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
117#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
118#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
119
120#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
121#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
122#ifndef MLOG_MASK_PREFIX
123#define MLOG_MASK_PREFIX 0
124#endif
125
126#define MLOG_MAX_BITS 64
127
128struct mlog_bits {
129 unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
130};
131
132extern struct mlog_bits mlog_and_bits, mlog_not_bits;
133
134#if BITS_PER_LONG == 32
135
136#define __mlog_test_u64(mask, bits) \
137 ( (u32)(mask & 0xffffffff) & bits.words[0] || \
138 ((u64)(mask) >> 32) & bits.words[1] )
139#define __mlog_set_u64(mask, bits) do { \
140 bits.words[0] |= (u32)(mask & 0xffffffff); \
141 bits.words[1] |= (u64)(mask) >> 32; \
142} while (0)
143#define __mlog_clear_u64(mask, bits) do { \
144 bits.words[0] &= ~((u32)(mask & 0xffffffff)); \
145 bits.words[1] &= ~((u64)(mask) >> 32); \
146} while (0)
147#define MLOG_BITS_RHS(mask) { \
148 { \
149 [0] = (u32)(mask & 0xffffffff), \
150 [1] = (u64)(mask) >> 32, \
151 } \
152}
153
154#else /* 32bit long above, 64bit long below */
155
156#define __mlog_test_u64(mask, bits) ((mask) & bits.words[0])
157#define __mlog_set_u64(mask, bits) do { \
158 bits.words[0] |= (mask); \
159} while (0)
160#define __mlog_clear_u64(mask, bits) do { \
161 bits.words[0] &= ~(mask); \
162} while (0)
163#define MLOG_BITS_RHS(mask) { { (mask) } }
164
165#endif
166
167/*
168 * smp_processor_id() "helpfully" screams when called outside preemptible
169 * regions in current kernels. sles doesn't have the variants that don't
170 * scream. just do this instead of trying to guess which we're building
171 * against.. *sigh*.
172 */
173#define __mlog_cpu_guess ({ \
174 unsigned long _cpu = get_cpu(); \
175 put_cpu(); \
176 _cpu; \
177})
178
179/* In the following two macros, the whitespace after the ',' just
180 * before ##args is intentional. Otherwise, gcc 2.95 will eat the
181 * previous token if args expands to nothing.
182 */
183#define __mlog_printk(level, fmt, args...) \
184 printk(level "(%u,%lu):%s:%d " fmt, current->pid, \
185 __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \
186 ##args)
187
188#define mlog(mask, fmt, args...) do { \
189 u64 __m = MLOG_MASK_PREFIX | (mask); \
190 if (__mlog_test_u64(__m, mlog_and_bits) && \
191 !__mlog_test_u64(__m, mlog_not_bits)) { \
192 if (__m & ML_ERROR) \
193 __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \
194 else if (__m & ML_NOTICE) \
195 __mlog_printk(KERN_NOTICE, fmt , ##args); \
196 else __mlog_printk(KERN_INFO, fmt , ##args); \
197 } \
198} while (0)
199
200#define mlog_errno(st) do { \
201 int _st = (st); \
202 if (_st != -ERESTARTSYS && _st != -EINTR && \
203 _st != AOP_TRUNCATED_PAGE) \
204 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
205} while (0)
206
207#define mlog_entry(fmt, args...) do { \
208 mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \
209} while (0)
210
211#define mlog_entry_void() do { \
212 mlog(ML_ENTRY, "ENTRY:\n"); \
213} while (0)
214
215/*
216 * We disable this for sparse.
217 */
218#if !defined(__CHECKER__)
219#define mlog_exit(st) do { \
220 if (__builtin_types_compatible_p(typeof(st), unsigned long)) \
221 mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \
222 else if (__builtin_types_compatible_p(typeof(st), signed long)) \
223 mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st)); \
224 else if (__builtin_types_compatible_p(typeof(st), unsigned int) \
225 || __builtin_types_compatible_p(typeof(st), unsigned short) \
226 || __builtin_types_compatible_p(typeof(st), unsigned char)) \
227 mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st)); \
228 else if (__builtin_types_compatible_p(typeof(st), signed int) \
229 || __builtin_types_compatible_p(typeof(st), signed short) \
230 || __builtin_types_compatible_p(typeof(st), signed char)) \
231 mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st)); \
232 else if (__builtin_types_compatible_p(typeof(st), long long)) \
233 mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
234 else \
235 mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \
236} while (0)
237#else
238#define mlog_exit(st) do { \
239 mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
240} while (0)
241#endif
242
243#define mlog_exit_ptr(ptr) do { \
244 mlog(ML_EXIT, "EXIT: %p\n", ptr); \
245} while (0)
246
247#define mlog_exit_void() do { \
248 mlog(ML_EXIT, "EXIT\n"); \
249} while (0)
250
251#define mlog_bug_on_msg(cond, fmt, args...) do { \
252 if (cond) { \
253 mlog(ML_ERROR, "bug expression: " #cond "\n"); \
254 mlog(ML_ERROR, fmt, ##args); \
255 BUG(); \
256 } \
257} while (0)
258
259#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64)
260#define MLFi64 "lld"
261#define MLFu64 "llu"
262#define MLFx64 "llx"
263#else
264#define MLFi64 "ld"
265#define MLFu64 "lu"
266#define MLFx64 "lx"
267#endif
268
269#include <linux/kobject.h>
270#include <linux/sysfs.h>
271int mlog_sys_init(struct subsystem *o2cb_subsys);
272void mlog_sys_shutdown(void);
273
274#endif /* O2CLUSTER_MASKLOG_H */
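
A short usage sketch tying the pieces above together: a file picks its default mask by defining MLOG_MASK_PREFIX before including this header, then calls mlog()/mlog_errno(). Note that the emit test is "any bit allowed AND no bit denied", so a single denied bit suppresses a message even when another of its bits is allowed. Illustrative only:

#define MLOG_MASK_PREFIX ML_TCP	/* every plain mlog(0, ...) carries ML_TCP */
#include "masklog.h"

static void example(int status)
{
	/* emitted only if ML_TCP is in mlog_and_bits and not in
	 * mlog_not_bits */
	mlog(0, "sending a message\n");

	/* carries ML_TCP|ML_MSG: suppressed if either bit is denied */
	mlog(ML_MSG, "tcp and msg tracing\n");

	/* ML_ERROR starts in MLOG_INITIAL_AND_MASK, so errors reach the
	 * console by default */
	if (status < 0)
		mlog_errno(status);
}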
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
new file mode 100644
index 00000000000..cf7828f2336
--- /dev/null
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -0,0 +1,791 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/sysctl.h>
25#include <linux/configfs.h>
26
27#include "endian.h"
28#include "tcp.h"
29#include "nodemanager.h"
30#include "heartbeat.h"
31#include "masklog.h"
32#include "sys.h"
33#include "ver.h"
34
35/* for now we operate under the assertion that there can be only one
36 * cluster active at a time. Changing this will require trickling
37 * cluster references throughout where nodes are looked up */
38static struct o2nm_cluster *o2nm_single_cluster = NULL;
39
40#define OCFS2_MAX_HB_CTL_PATH 256
41static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
42
43static ctl_table ocfs2_nm_table[] = {
44 {
45 .ctl_name = 1,
46 .procname = "hb_ctl_path",
47 .data = ocfs2_hb_ctl_path,
48 .maxlen = OCFS2_MAX_HB_CTL_PATH,
49 .mode = 0644,
50 .proc_handler = &proc_dostring,
51 .strategy = &sysctl_string,
52 },
53 { .ctl_name = 0 }
54};
55
56static ctl_table ocfs2_mod_table[] = {
57 {
58 .ctl_name = KERN_OCFS2_NM,
59 .procname = "nm",
60 .data = NULL,
61 .maxlen = 0,
62 .mode = 0555,
63 .child = ocfs2_nm_table
64 },
65 { .ctl_name = 0}
66};
67
68static ctl_table ocfs2_kern_table[] = {
69 {
70 .ctl_name = KERN_OCFS2,
71 .procname = "ocfs2",
72 .data = NULL,
73 .maxlen = 0,
74 .mode = 0555,
75 .child = ocfs2_mod_table
76 },
77 { .ctl_name = 0}
78};
79
80static ctl_table ocfs2_root_table[] = {
81 {
82 .ctl_name = CTL_FS,
83 .procname = "fs",
84 .data = NULL,
85 .maxlen = 0,
86 .mode = 0555,
87 .child = ocfs2_kern_table
88 },
89 { .ctl_name = 0 }
90};
91
92static struct ctl_table_header *ocfs2_table_header = NULL;
93
94const char *o2nm_get_hb_ctl_path(void)
95{
96 return ocfs2_hb_ctl_path;
97}
98EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path);
99
100struct o2nm_cluster {
101 struct config_group cl_group;
102 unsigned cl_has_local:1;
103 u8 cl_local_node;
104 rwlock_t cl_nodes_lock;
105 struct o2nm_node *cl_nodes[O2NM_MAX_NODES];
106 struct rb_root cl_node_ip_tree;
107 /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
108 unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
109};
110
111struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
112{
113 struct o2nm_node *node = NULL;
114
115 if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL)
116 goto out;
117
118 read_lock(&o2nm_single_cluster->cl_nodes_lock);
119 node = o2nm_single_cluster->cl_nodes[node_num];
120 if (node)
121 config_item_get(&node->nd_item);
122 read_unlock(&o2nm_single_cluster->cl_nodes_lock);
123out:
124 return node;
125}
126EXPORT_SYMBOL_GPL(o2nm_get_node_by_num);
127
128int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
129{
130 struct o2nm_cluster *cluster = o2nm_single_cluster;
131
132 BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap)));
133
134 if (cluster == NULL)
135 return -EINVAL;
136
137 read_lock(&cluster->cl_nodes_lock);
138 memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
139 read_unlock(&cluster->cl_nodes_lock);
140
141 return 0;
142}
143EXPORT_SYMBOL_GPL(o2nm_configured_node_map);
144
145static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
146 __be32 ip_needle,
147 struct rb_node ***ret_p,
148 struct rb_node **ret_parent)
149{
150 struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
151 struct rb_node *parent = NULL;
152 struct o2nm_node *node, *ret = NULL;
153
154 while (*p) {
155 parent = *p;
156 node = rb_entry(parent, struct o2nm_node, nd_ip_node);
157
158 if (memcmp(&ip_needle, &node->nd_ipv4_address,
159 sizeof(ip_needle)) < 0)
160 p = &(*p)->rb_left;
161 else if (memcmp(&ip_needle, &node->nd_ipv4_address,
162 sizeof(ip_needle)) > 0)
163 p = &(*p)->rb_right;
164 else {
165 ret = node;
166 break;
167 }
168 }
169
170 if (ret_p != NULL)
171 *ret_p = p;
172 if (ret_parent != NULL)
173 *ret_parent = parent;
174
175 return ret;
176}
177
178struct o2nm_node *o2nm_get_node_by_ip(__be32 addr)
179{
180 struct o2nm_node *node = NULL;
181 struct o2nm_cluster *cluster = o2nm_single_cluster;
182
183 if (cluster == NULL)
184 goto out;
185
186 read_lock(&cluster->cl_nodes_lock);
187 node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL);
188 if (node)
189 config_item_get(&node->nd_item);
190 read_unlock(&cluster->cl_nodes_lock);
191
192out:
193 return node;
194}
195EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip);
196
197void o2nm_node_put(struct o2nm_node *node)
198{
199 config_item_put(&node->nd_item);
200}
201EXPORT_SYMBOL_GPL(o2nm_node_put);
202
203void o2nm_node_get(struct o2nm_node *node)
204{
205 config_item_get(&node->nd_item);
206}
207EXPORT_SYMBOL_GPL(o2nm_node_get);
208
209u8 o2nm_this_node(void)
210{
211 u8 node_num = O2NM_MAX_NODES;
212
213 if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local)
214 node_num = o2nm_single_cluster->cl_local_node;
215
216 return node_num;
217}
218EXPORT_SYMBOL_GPL(o2nm_this_node);
219
220/* node configfs bits */
221
222static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item)
223{
224 return item ?
225 container_of(to_config_group(item), struct o2nm_cluster,
226 cl_group)
227 : NULL;
228}
229
230static struct o2nm_node *to_o2nm_node(struct config_item *item)
231{
232 return item ? container_of(item, struct o2nm_node, nd_item) : NULL;
233}
234
235static void o2nm_node_release(struct config_item *item)
236{
237 struct o2nm_node *node = to_o2nm_node(item);
238 kfree(node);
239}
240
241static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
242{
243 return sprintf(page, "%d\n", node->nd_num);
244}
245
246static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
247{
248 /* through the first node_set .parent
249 * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
250 return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
251}
252
253enum {
254 O2NM_NODE_ATTR_NUM = 0,
255 O2NM_NODE_ATTR_PORT,
256 O2NM_NODE_ATTR_ADDRESS,
257 O2NM_NODE_ATTR_LOCAL,
258};
259
260static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
261 size_t count)
262{
263 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
264 unsigned long tmp;
265 char *p = (char *)page;
266
267 tmp = simple_strtoul(p, &p, 0);
268 if (!p || (*p && (*p != '\n')))
269 return -EINVAL;
270
271 if (tmp >= O2NM_MAX_NODES)
272 return -ERANGE;
273
274 /* once we're in the cl_nodes tree networking can look us up by
275 * node number and try to use our address and port attributes
276 * to connect to this node.. make sure that they've been set
277 * before writing the node attribute? */
278 if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
279 !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
280 return -EINVAL; /* XXX */
281
282 write_lock(&cluster->cl_nodes_lock);
283 if (cluster->cl_nodes[tmp])
284 p = NULL;
285 else {
286 cluster->cl_nodes[tmp] = node;
287 node->nd_num = tmp;
288 set_bit(tmp, cluster->cl_nodes_bitmap);
289 }
290 write_unlock(&cluster->cl_nodes_lock);
291 if (p == NULL)
292 return -EEXIST;
293
294 return count;
295}
296static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
297{
298 return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
299}
300
301static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
302 const char *page, size_t count)
303{
304 unsigned long tmp;
305 char *p = (char *)page;
306
307 tmp = simple_strtoul(p, &p, 0);
308 if (!p || (*p && (*p != '\n')))
309 return -EINVAL;
310
311 if (tmp == 0)
312 return -EINVAL;
313 if (tmp >= (u16)-1)
314 return -ERANGE;
315
316 node->nd_ipv4_port = htons(tmp);
317
318 return count;
319}
320
321static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
322{
323 return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address));
324}
325
326static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
327 const char *page,
328 size_t count)
329{
330 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
331 int ret, i;
332 struct rb_node **p, *parent;
333 unsigned int octets[4];
334 __be32 ipv4_addr = 0;
335
336 ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2],
337 &octets[1], &octets[0]);
338 if (ret != 4)
339 return -EINVAL;
340
341 for (i = 0; i < ARRAY_SIZE(octets); i++) {
342 if (octets[i] > 255)
343 return -ERANGE;
344 be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
345 }
346
347 ret = 0;
348 write_lock(&cluster->cl_nodes_lock);
349 if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
350 ret = -EEXIST;
351 else {
352 rb_link_node(&node->nd_ip_node, parent, p);
353 rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
354 }
355 write_unlock(&cluster->cl_nodes_lock);
356 if (ret)
357 return ret;
358
359 memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr));
360
361 return count;
362}
363
364static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
365{
366 return sprintf(page, "%d\n", node->nd_local);
367}
368
369static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
370 size_t count)
371{
372 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
373 unsigned long tmp;
374 char *p = (char *)page;
375 ssize_t ret;
376
377 tmp = simple_strtoul(p, &p, 0);
378 if (!p || (*p && (*p != '\n')))
379 return -EINVAL;
380
381 tmp = !!tmp; /* boolean of whether this node wants to be local */
382
383 /* setting local turns on networking rx for now so we require having
384 * set everything else first */
385 if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
386 !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) ||
387 !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
388 return -EINVAL; /* XXX */
389
390 /* the only failure case is trying to set a new local node
391 * when a different one is already set */
392 if (tmp && tmp == cluster->cl_has_local &&
393 cluster->cl_local_node != node->nd_num)
394 return -EBUSY;
395
396 /* bring up the rx thread if we're setting the new local node. */
397 if (tmp && !cluster->cl_has_local) {
398 ret = o2net_start_listening(node);
399 if (ret)
400 return ret;
401 }
402
403 if (!tmp && cluster->cl_has_local &&
404 cluster->cl_local_node == node->nd_num) {
405 o2net_stop_listening(node);
406 cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
407 }
408
409 node->nd_local = tmp;
410 if (node->nd_local) {
411 cluster->cl_has_local = tmp;
412 cluster->cl_local_node = node->nd_num;
413 }
414
415 return count;
416}
417
418struct o2nm_node_attribute {
419 struct configfs_attribute attr;
420 ssize_t (*show)(struct o2nm_node *, char *);
421 ssize_t (*store)(struct o2nm_node *, const char *, size_t);
422};
423
424static struct o2nm_node_attribute o2nm_node_attr_num = {
425 .attr = { .ca_owner = THIS_MODULE,
426 .ca_name = "num",
427 .ca_mode = S_IRUGO | S_IWUSR },
428 .show = o2nm_node_num_read,
429 .store = o2nm_node_num_write,
430};
431
432static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
433 .attr = { .ca_owner = THIS_MODULE,
434 .ca_name = "ipv4_port",
435 .ca_mode = S_IRUGO | S_IWUSR },
436 .show = o2nm_node_ipv4_port_read,
437 .store = o2nm_node_ipv4_port_write,
438};
439
440static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
441 .attr = { .ca_owner = THIS_MODULE,
442 .ca_name = "ipv4_address",
443 .ca_mode = S_IRUGO | S_IWUSR },
444 .show = o2nm_node_ipv4_address_read,
445 .store = o2nm_node_ipv4_address_write,
446};
447
448static struct o2nm_node_attribute o2nm_node_attr_local = {
449 .attr = { .ca_owner = THIS_MODULE,
450 .ca_name = "local",
451 .ca_mode = S_IRUGO | S_IWUSR },
452 .show = o2nm_node_local_read,
453 .store = o2nm_node_local_write,
454};
455
456static struct configfs_attribute *o2nm_node_attrs[] = {
457 [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
458 [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
459 [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
460 [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
461 NULL,
462};
463
464static int o2nm_attr_index(struct configfs_attribute *attr)
465{
466 int i;
467 for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
468 if (attr == o2nm_node_attrs[i])
469 return i;
470 }
471 BUG();
472 return 0;
473}
474
475static ssize_t o2nm_node_show(struct config_item *item,
476 struct configfs_attribute *attr,
477 char *page)
478{
479 struct o2nm_node *node = to_o2nm_node(item);
480 struct o2nm_node_attribute *o2nm_node_attr =
481 container_of(attr, struct o2nm_node_attribute, attr);
482 ssize_t ret = 0;
483
484 if (o2nm_node_attr->show)
485 ret = o2nm_node_attr->show(node, page);
486 return ret;
487}
488
489static ssize_t o2nm_node_store(struct config_item *item,
490 struct configfs_attribute *attr,
491 const char *page, size_t count)
492{
493 struct o2nm_node *node = to_o2nm_node(item);
494 struct o2nm_node_attribute *o2nm_node_attr =
495 container_of(attr, struct o2nm_node_attribute, attr);
496 ssize_t ret;
497 int attr_index = o2nm_attr_index(attr);
498
499 if (o2nm_node_attr->store == NULL) {
500 ret = -EINVAL;
501 goto out;
502 }
503
504 if (test_bit(attr_index, &node->nd_set_attributes))
505 return -EBUSY;
506
507 ret = o2nm_node_attr->store(node, page, count);
508 if (ret < count)
509 goto out;
510
511 set_bit(attr_index, &node->nd_set_attributes);
512out:
513 return ret;
514}
515
516static struct configfs_item_operations o2nm_node_item_ops = {
517 .release = o2nm_node_release,
518 .show_attribute = o2nm_node_show,
519 .store_attribute = o2nm_node_store,
520};
521
522static struct config_item_type o2nm_node_type = {
523 .ct_item_ops = &o2nm_node_item_ops,
524 .ct_attrs = o2nm_node_attrs,
525 .ct_owner = THIS_MODULE,
526};
527
528/* node set */
529
530struct o2nm_node_group {
531 struct config_group ns_group;
532 /* some stuff? */
533};
534
535#if 0
536static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
537{
538 return group ?
539 container_of(group, struct o2nm_node_group, ns_group)
540 : NULL;
541}
542#endif
543
544static struct config_item *o2nm_node_group_make_item(struct config_group *group,
545 const char *name)
546{
547 struct o2nm_node *node = NULL;
548 struct config_item *ret = NULL;
549
550 if (strlen(name) > O2NM_MAX_NAME_LEN)
551 goto out; /* ENAMETOOLONG */
552
553 node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL);
554 if (node == NULL)
555 goto out; /* ENOMEM */
556
557 strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
558 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
559 spin_lock_init(&node->nd_lock);
560
561 ret = &node->nd_item;
562
563out:
564 if (ret == NULL)
565 kfree(node);
566
567 return ret;
568}
569
570static void o2nm_node_group_drop_item(struct config_group *group,
571 struct config_item *item)
572{
573 struct o2nm_node *node = to_o2nm_node(item);
574 struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
575
576 o2net_disconnect_node(node);
577
578 if (cluster->cl_has_local &&
579 (cluster->cl_local_node == node->nd_num)) {
580 cluster->cl_has_local = 0;
581 cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
582 o2net_stop_listening(node);
583 }
584
585 /* XXX call into net to stop this node from trading messages */
586
587 write_lock(&cluster->cl_nodes_lock);
588
589 /* XXX sloppy */
590 if (node->nd_ipv4_address)
591 rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
592
593 /* nd_num might be 0 if the node number hasn't been set.. */
594 if (cluster->cl_nodes[node->nd_num] == node) {
595 cluster->cl_nodes[node->nd_num] = NULL;
596 clear_bit(node->nd_num, cluster->cl_nodes_bitmap);
597 }
598 write_unlock(&cluster->cl_nodes_lock);
599
600 config_item_put(item);
601}
602
603static struct configfs_group_operations o2nm_node_group_group_ops = {
604 .make_item = o2nm_node_group_make_item,
605 .drop_item = o2nm_node_group_drop_item,
606};
607
608static struct config_item_type o2nm_node_group_type = {
609 .ct_group_ops = &o2nm_node_group_group_ops,
610 .ct_owner = THIS_MODULE,
611};
612
613/* cluster */
614
615static void o2nm_cluster_release(struct config_item *item)
616{
617 struct o2nm_cluster *cluster = to_o2nm_cluster(item);
618
619 kfree(cluster->cl_group.default_groups);
620 kfree(cluster);
621}
622
623static struct configfs_item_operations o2nm_cluster_item_ops = {
624 .release = o2nm_cluster_release,
625};
626
627static struct config_item_type o2nm_cluster_type = {
628 .ct_item_ops = &o2nm_cluster_item_ops,
629 .ct_owner = THIS_MODULE,
630};
631
632/* cluster set */
633
634struct o2nm_cluster_group {
635 struct configfs_subsystem cs_subsys;
636 /* some stuff? */
637};
638
639#if 0
640static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group)
641{
642 return group ?
643 container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys)
644 : NULL;
645}
646#endif
647
648static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
649 const char *name)
650{
651 struct o2nm_cluster *cluster = NULL;
652 struct o2nm_node_group *ns = NULL;
653 struct config_group *o2hb_group = NULL, *ret = NULL;
654 void *defs = NULL;
655
656 /* this runs under the parent dir's i_mutex; there can be only
657 * one caller in here at a time */
658 if (o2nm_single_cluster)
659 goto out; /* ENOSPC */
660
661 cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL);
662 ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL);
663 defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
664 o2hb_group = o2hb_alloc_hb_set();
665 if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
666 goto out;
667
668 config_group_init_type_name(&cluster->cl_group, name,
669 &o2nm_cluster_type);
670 config_group_init_type_name(&ns->ns_group, "node",
671 &o2nm_node_group_type);
672
673 cluster->cl_group.default_groups = defs;
674 cluster->cl_group.default_groups[0] = &ns->ns_group;
675 cluster->cl_group.default_groups[1] = o2hb_group;
676 cluster->cl_group.default_groups[2] = NULL;
677 rwlock_init(&cluster->cl_nodes_lock);
678 cluster->cl_node_ip_tree = RB_ROOT;
679
680 ret = &cluster->cl_group;
681 o2nm_single_cluster = cluster;
682
683out:
684 if (ret == NULL) {
685 kfree(cluster);
686 kfree(ns);
687 o2hb_free_hb_set(o2hb_group);
688 kfree(defs);
689 }
690
691 return ret;
692}
693
694static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
695{
696 struct o2nm_cluster *cluster = to_o2nm_cluster(item);
697 int i;
698 struct config_item *killme;
699
700 BUG_ON(o2nm_single_cluster != cluster);
701 o2nm_single_cluster = NULL;
702
703 for (i = 0; cluster->cl_group.default_groups[i]; i++) {
704 killme = &cluster->cl_group.default_groups[i]->cg_item;
705 cluster->cl_group.default_groups[i] = NULL;
706 config_item_put(killme);
707 }
708
709 config_item_put(item);
710}
711
712static struct configfs_group_operations o2nm_cluster_group_group_ops = {
713 .make_group = o2nm_cluster_group_make_group,
714 .drop_item = o2nm_cluster_group_drop_item,
715};
716
717static struct config_item_type o2nm_cluster_group_type = {
718 .ct_group_ops = &o2nm_cluster_group_group_ops,
719 .ct_owner = THIS_MODULE,
720};
721
722static struct o2nm_cluster_group o2nm_cluster_group = {
723 .cs_subsys = {
724 .su_group = {
725 .cg_item = {
726 .ci_namebuf = "cluster",
727 .ci_type = &o2nm_cluster_group_type,
728 },
729 },
730 },
731};
732
733static void __exit exit_o2nm(void)
734{
735 if (ocfs2_table_header)
736 unregister_sysctl_table(ocfs2_table_header);
737
738 /* XXX sync with hb callbacks and shut down hb? */
739 o2net_unregister_hb_callbacks();
740 configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
741 o2cb_sys_shutdown();
742
743 o2net_exit();
744}
745
746static int __init init_o2nm(void)
747{
748 int ret = -1;
749
750 cluster_print_version();
751
752 o2hb_init();
753 o2net_init();
754
755 ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0);
756 if (!ocfs2_table_header) {
757 printk(KERN_ERR "nodemanager: unable to register sysctl\n");
758 ret = -ENOMEM; /* or something. */
759 goto out;
760 }
761
762 ret = o2net_register_hb_callbacks();
763 if (ret)
764 goto out_sysctl;
765
766 config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
767 init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
768 ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
769 if (ret) {
770 printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
771 goto out_callbacks;
772 }
773
774 ret = o2cb_sys_init();
775 if (!ret)
776 goto out;
777
778 configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
779out_callbacks:
780 o2net_unregister_hb_callbacks();
781out_sysctl:
782 unregister_sysctl_table(ocfs2_table_header);
783out:
784 return ret;
785}
786
787MODULE_AUTHOR("Oracle");
788MODULE_LICENSE("GPL");
789
790module_init(init_o2nm)
791module_exit(exit_o2nm)
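
The configfs layout built above is <configfs>/cluster/<name>/node/<node>/{num,ipv4_address,ipv4_port,local}, plus the heartbeat group from o2hb_alloc_hb_set(). Below is a userspace sketch of configuring one local node; the /config mount point and all names are assumptions. The write order matters: o2nm_node_num_write() refuses a node number before the address and port are set, and "local" requires all three:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

static int put(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* triggers o2nm_cluster_group_make_group() */
	mkdir("/config/cluster/mycluster", 0755);
	/* triggers o2nm_node_group_make_item() */
	mkdir("/config/cluster/mycluster/node/node0", 0755);

	/* each attribute may be written only once (nd_set_attributes) */
	put("/config/cluster/mycluster/node/node0/ipv4_address",
	    "192.168.0.1");
	put("/config/cluster/mycluster/node/node0/ipv4_port", "7777");
	put("/config/cluster/mycluster/node/node0/num", "0");
	/* writing 1 to local starts the o2net rx thread */
	put("/config/cluster/mycluster/node/node0/local", "1");
	return 0;
}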
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
new file mode 100644
index 00000000000..fce8033c310
--- /dev/null
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -0,0 +1,64 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * nodemanager.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_NODEMANAGER_H
28#define O2CLUSTER_NODEMANAGER_H
29
30#include "ocfs2_nodemanager.h"
31
32/* This totally doesn't belong here. */
33#include <linux/configfs.h>
34#include <linux/rbtree.h>
35
36#define KERN_OCFS2 988
37#define KERN_OCFS2_NM 1
38
39const char *o2nm_get_hb_ctl_path(void);
40
41struct o2nm_node {
42 spinlock_t nd_lock;
43 struct config_item nd_item;
44 char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */
45 __u8 nd_num;
46 /* only one address per node, as attributes, for now. */
47 __be32 nd_ipv4_address;
48 __be16 nd_ipv4_port;
49 struct rb_node nd_ip_node;
50 /* there can be only one local node for now */
51 int nd_local;
52
53 unsigned long nd_set_attributes;
54};
55
56u8 o2nm_this_node(void);
57
58int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
59struct o2nm_node *o2nm_get_node_by_num(u8 node_num);
60struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
61void o2nm_node_get(struct o2nm_node *node);
62void o2nm_node_put(struct o2nm_node *node);
63
64#endif /* O2CLUSTER_NODEMANAGER_H */
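
A kernel-side sketch of walking the configured node map with the accessors above; illustrative only. The get/put pairing mirrors the config_item reference counting in nodemanager.c:

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include "nodemanager.h"

static void walk_configured_nodes(void)
{
	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	struct o2nm_node *node;
	int i;

	if (o2nm_configured_node_map(map, sizeof(map)))
		return;	/* no cluster configured yet */

	for (i = find_first_bit(map, O2NM_MAX_NODES);
	     i < O2NM_MAX_NODES;
	     i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) {
		node = o2nm_get_node_by_num(i);
		if (!node)
			continue;	/* node was removed after the copy */
		/* ... use node->nd_ipv4_address, nd_ipv4_port ... */
		o2nm_node_put(node);
	}
}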
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h
new file mode 100644
index 00000000000..94096069cb4
--- /dev/null
+++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h
@@ -0,0 +1,37 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_heartbeat.h
5 *
6 * On-disk structures for ocfs2_heartbeat
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef _OCFS2_HEARTBEAT_H
27#define _OCFS2_HEARTBEAT_H
28
29struct o2hb_disk_heartbeat_block {
30 __le64 hb_seq;
31 __u8 hb_node;
32 __u8 hb_pad1[3];
33 __le32 hb_cksum;
34 __le64 hb_generation;
35};
36
37#endif /* _OCFS2_HEARTBEAT_H */
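
All fields in the on-disk block are little-endian, so readers convert before comparing. A minimal sketch of the liveness test that the thresholds in heartbeat.h imply: a slot counts as "changing" when its sequence number has moved between samples. The helper is illustrative; the real sampling and checksum logic live in heartbeat.c:

#include <linux/types.h>
#include <asm/byteorder.h>
#include "ocfs2_heartbeat.h"

/* did the owning node stamp its slot since we last looked? */
static int o2hb_slot_changed(struct o2hb_disk_heartbeat_block *hb,
			     u64 last_seq)
{
	return le64_to_cpu(hb->hb_seq) != last_seq;
}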
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
new file mode 100644
index 00000000000..5b9854bad57
--- /dev/null
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -0,0 +1,39 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_nodemanager.h
5 *
6 * Header describing the interface between userspace and the kernel
7 * for the ocfs2_nodemanager module.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 *
26 */
27
28#ifndef _OCFS2_NODEMANAGER_H
29#define _OCFS2_NODEMANAGER_H
30
31#define O2NM_API_VERSION 5
32
33#define O2NM_MAX_NODES 255
34#define O2NM_INVALID_NODE_NUM 255
35
36/* host name, group name, cluster name all 64 bytes */
37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
38
39#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
new file mode 100644
index 00000000000..7bba98fbfc1
--- /dev/null
+++ b/fs/ocfs2/cluster/quorum.c
@@ -0,0 +1,315 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 *
3 * vim: noexpandtab sw=8 ts=8 sts=0:
4 *
5 * Copyright (C) 2005 Oracle. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public
18 * License along with this program; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 021110-1307, USA.
21 */
22
23/* This quorum hack is only here until we transition to some more rational
24 * approach that is driven from userspace. Honest. No foolin'.
25 *
26 * Imagine two nodes lose network connectivity to each other but they're still
27 * up and operating in every other way. Presumably a network timeout indicates
28 * that a node is broken and should be recovered. They can't both recover each
29 * other and both carry on without serialising their access to the file system.
30 * They need to decide who is authoritative. Now extend that problem to
31 * arbitrary groups of nodes losing connectivity between each other.
32 *
33 * So we declare that a node which has given up on connecting to a majority
34 * of nodes who are still heartbeating will fence itself.
35 *
36 * There are huge opportunities for races here. After we give up on a node's
37 * connection we need to wait long enough to give heartbeat an opportunity
38 * to declare the node as truly dead. We also need to be careful with the
39 * race between when we see a node start heartbeating and when we connect
40 * to it.
41 *
42 * So nodes that are in this transition put a hold on the quorum decision
43 * with a counter. As they fall out of this transition they drop the count
44 * and if they're the last, they fire off the decision.
45 */
46#include <linux/kernel.h>
47#include <linux/slab.h>
48#include <linux/workqueue.h>
49
50#include "heartbeat.h"
51#include "nodemanager.h"
52#define MLOG_MASK_PREFIX ML_QUORUM
53#include "masklog.h"
54#include "quorum.h"
55
56static struct o2quo_state {
57 spinlock_t qs_lock;
58 struct work_struct qs_work;
59 int qs_pending;
60 int qs_heartbeating;
61 unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
62 int qs_connected;
63 unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
64 int qs_holds;
65 unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
66} o2quo_state;
67
68/* this is horribly heavy-handed. It should instead flip the file
69 * system RO and call some userspace script. */
70static void o2quo_fence_self(void)
71{
72 /* panic spins with interrupts enabled. with preempt
73 * threads can still schedule, etc, etc */
74 o2hb_stop_all_regions();
75	panic("ocfs2 is very sorry to be fencing this system by panicking\n");
76}
77
78/* Indicate that a timeout occurred on a heartbeat region write. The
79 * other nodes in the cluster may consider us dead at that time so we
80 * want to "fence" ourselves so that we don't scribble on the disk
81 * after they think they've recovered us. This can't solve all
82 * problems related to writeout after recovery but this hack can at
83 * least close some of those gaps. When we have real fencing, this can
84 * go away as our node would be fenced externally before other nodes
85 * begin recovery. */
86void o2quo_disk_timeout(void)
87{
88 o2quo_fence_self();
89}
90
91static void o2quo_make_decision(void *arg)
92{
93 int quorum;
94 int lowest_hb, lowest_reachable = 0, fence = 0;
95 struct o2quo_state *qs = &o2quo_state;
96
97 spin_lock(&qs->qs_lock);
98
99 lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
100 if (lowest_hb != O2NM_MAX_NODES)
101 lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
102
103 mlog(0, "heartbeating: %d, connected: %d, "
104 "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
105 qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
106
107 if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
108 qs->qs_heartbeating == 1)
109 goto out;
110
111 if (qs->qs_heartbeating & 1) {
112		/* the odd-numbered cluster case is straightforward --
113 * if we can't talk to the majority we're hosed */
114 quorum = (qs->qs_heartbeating + 1)/2;
115 if (qs->qs_connected < quorum) {
116 mlog(ML_ERROR, "fencing this node because it is "
117 "only connected to %u nodes and %u is needed "
118 "to make a quorum out of %u heartbeating nodes\n",
119 qs->qs_connected, quorum,
120 qs->qs_heartbeating);
121 fence = 1;
122 }
123 } else {
124		/* the even-numbered cluster adds the possibility of each half
125 * of the cluster being able to talk amongst themselves.. in
126 * that case we're hosed if we can't talk to the group that has
127 * the lowest numbered node */
128 quorum = qs->qs_heartbeating / 2;
129 if (qs->qs_connected < quorum) {
130 mlog(ML_ERROR, "fencing this node because it is "
131 "only connected to %u nodes and %u is needed "
132 "to make a quorum out of %u heartbeating nodes\n",
133 qs->qs_connected, quorum,
134 qs->qs_heartbeating);
135 fence = 1;
136 }
137 else if ((qs->qs_connected == quorum) &&
138 !lowest_reachable) {
139 mlog(ML_ERROR, "fencing this node because it is "
140 "connected to a half-quorum of %u out of %u "
141 "nodes which doesn't include the lowest active "
142 "node %u\n", quorum, qs->qs_heartbeating,
143 lowest_hb);
144 fence = 1;
145 }
146 }
147
148out:
149 spin_unlock(&qs->qs_lock);
150 if (fence)
151 o2quo_fence_self();
152}
153
154static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
155{
156 assert_spin_locked(&qs->qs_lock);
157
158 if (!test_and_set_bit(node, qs->qs_hold_bm)) {
159 qs->qs_holds++;
160 mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
161 "node %u\n", node);
162 mlog(0, "node %u, %d total\n", node, qs->qs_holds);
163 }
164}
165
166static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
167{
168 assert_spin_locked(&qs->qs_lock);
169
170 if (test_and_clear_bit(node, qs->qs_hold_bm)) {
171 mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
172 if (--qs->qs_holds == 0) {
173 if (qs->qs_pending) {
174 qs->qs_pending = 0;
175 schedule_work(&qs->qs_work);
176 }
177 }
178 mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
179 node, qs->qs_holds);
180 }
181}
182
183/* as a node comes up we delay the quorum decision until we know the fate of
184 * the connection. the hold will be dropped in conn_up or hb_down. it might be
185 * perpetuated by conn_err until hb_down. if we already have a conn, we might
186 * be dropping a hold that conn_up got. */
187void o2quo_hb_up(u8 node)
188{
189 struct o2quo_state *qs = &o2quo_state;
190
191 spin_lock(&qs->qs_lock);
192
193 qs->qs_heartbeating++;
194 mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
195 "node %u\n", node);
196 mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
197 set_bit(node, qs->qs_hb_bm);
198
199 mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
200
201 if (!test_bit(node, qs->qs_conn_bm))
202 o2quo_set_hold(qs, node);
203 else
204 o2quo_clear_hold(qs, node);
205
206 spin_unlock(&qs->qs_lock);
207}
208
209/* hb going down releases any holds we might have had due to this node from
210 * conn_up, conn_err, or hb_up */
211void o2quo_hb_down(u8 node)
212{
213 struct o2quo_state *qs = &o2quo_state;
214
215 spin_lock(&qs->qs_lock);
216
217 qs->qs_heartbeating--;
218 mlog_bug_on_msg(qs->qs_heartbeating < 0,
219 "node %u, %d heartbeating\n",
220 node, qs->qs_heartbeating);
221 mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
222 clear_bit(node, qs->qs_hb_bm);
223
224 mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
225
226 o2quo_clear_hold(qs, node);
227
228 spin_unlock(&qs->qs_lock);
229}
230
231/* this tells us that we've decided that the node is still heartbeating
232 * even though we've lost its conn. it must only be called after conn_err
233 * and indicates that we must now make a quorum decision in the future,
234 * though we might be doing so after waiting for holds to drain. Here
235 * we'll be dropping the hold from conn_err. */
236void o2quo_hb_still_up(u8 node)
237{
238 struct o2quo_state *qs = &o2quo_state;
239
240 spin_lock(&qs->qs_lock);
241
242 mlog(0, "node %u\n", node);
243
244 qs->qs_pending = 1;
245 o2quo_clear_hold(qs, node);
246
247 spin_unlock(&qs->qs_lock);
248}
249
250/* This is analogous to hb_up. as a node's connection comes up we delay the
251 * quorum decision until we see it heartbeating. the hold will be dropped in
252 * hb_up or hb_down. it might be perpetuated by conn_err until hb_down. if
253 * it's already heartbeating we might be dropping a hold that conn_up got.
254 */
255void o2quo_conn_up(u8 node)
256{
257 struct o2quo_state *qs = &o2quo_state;
258
259 spin_lock(&qs->qs_lock);
260
261 qs->qs_connected++;
262 mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
263 "node %u\n", node);
264 mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
265 set_bit(node, qs->qs_conn_bm);
266
267 mlog(0, "node %u, %d total\n", node, qs->qs_connected);
268
269 if (!test_bit(node, qs->qs_hb_bm))
270 o2quo_set_hold(qs, node);
271 else
272 o2quo_clear_hold(qs, node);
273
274 spin_unlock(&qs->qs_lock);
275}
276
277/* we've decided that we won't ever be connecting to the node again. if it's
278 * still heartbeating we grab a hold that will delay decisions until either the
279 * node stops heartbeating from hb_down or the caller decides that the node is
280 * still up and calls still_up */
281void o2quo_conn_err(u8 node)
282{
283 struct o2quo_state *qs = &o2quo_state;
284
285 spin_lock(&qs->qs_lock);
286
287 if (test_bit(node, qs->qs_conn_bm)) {
288 qs->qs_connected--;
289 mlog_bug_on_msg(qs->qs_connected < 0,
290 "node %u, connected %d\n",
291 node, qs->qs_connected);
292
293 clear_bit(node, qs->qs_conn_bm);
294 }
295
296 mlog(0, "node %u, %d total\n", node, qs->qs_connected);
297
298 if (test_bit(node, qs->qs_hb_bm))
299 o2quo_set_hold(qs, node);
300
301 spin_unlock(&qs->qs_lock);
302}
303
304void o2quo_init(void)
305{
306 struct o2quo_state *qs = &o2quo_state;
307
308 spin_lock_init(&qs->qs_lock);
309 INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
310}
311
312void o2quo_exit(void)
313{
314 flush_scheduled_work();
315}
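
The two branches in o2quo_make_decision() reduce to one rule, restated standalone below for clarity (illustrative, not for merging). For example, with heartbeating = 4 and connected = 2, a node survives only when the lowest-numbered live node is among the ones it can reach:

/* returns 1 if this node should fence itself; mirrors the logic above */
static int o2quo_should_fence(int heartbeating, int connected,
			      int lowest_reachable)
{
	if (heartbeating & 1)
		/* odd cluster: a strict majority of connections or bust */
		return connected < (heartbeating + 1) / 2;

	/* even cluster: fewer than half is fatal; exactly half survives
	 * only on the side holding the lowest-numbered live node */
	if (connected < heartbeating / 2)
		return 1;
	return connected == heartbeating / 2 && !lowest_reachable;
}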
diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h
new file mode 100644
index 00000000000..6649cc6f67c
--- /dev/null
+++ b/fs/ocfs2/cluster/quorum.h
@@ -0,0 +1,36 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 *
21 */
22
23#ifndef O2CLUSTER_QUORUM_H
24#define O2CLUSTER_QUORUM_H
25
26void o2quo_init(void);
27void o2quo_exit(void);
28
29void o2quo_hb_up(u8 node);
30void o2quo_hb_down(u8 node);
31void o2quo_hb_still_up(u8 node);
32void o2quo_conn_up(u8 node);
33void o2quo_conn_err(u8 node);
34void o2quo_disk_timeout(void);
35
36#endif /* O2CLUSTER_QUORUM_H */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
new file mode 100644
index 00000000000..1d9f6acafa2
--- /dev/null
+++ b/fs/ocfs2/cluster/sys.c
@@ -0,0 +1,124 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sys.c
5 *
6 * OCFS2 cluster sysfs interface
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation,
13 * version 2 of the License.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27#include <linux/kernel.h>
28#include <linux/module.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31
32#include "ocfs2_nodemanager.h"
33#include "masklog.h"
34#include "sys.h"
35
36struct o2cb_attribute {
37 struct attribute attr;
38 ssize_t (*show)(char *buf);
39 ssize_t (*store)(const char *buf, size_t count);
40};
41
42#define O2CB_ATTR(_name, _mode, _show, _store) \
43struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
44
45#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
46#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
47
48static ssize_t o2cb_interface_revision_show(char *buf)
49{
50 return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
51}
52
53static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
54
55static struct attribute *o2cb_attrs[] = {
56 &o2cb_attr_interface_revision.attr,
57 NULL,
58};
59
60static ssize_t
61o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
62static ssize_t
63o2cb_store(struct kobject * kobj, struct attribute * attr,
64 const char * buffer, size_t count);
65static struct sysfs_ops o2cb_sysfs_ops = {
66 .show = o2cb_show,
67 .store = o2cb_store,
68};
69
70static struct kobj_type o2cb_subsys_type = {
71 .default_attrs = o2cb_attrs,
72 .sysfs_ops = &o2cb_sysfs_ops,
73};
74
75/* gives us o2cb_subsys */
76static decl_subsys(o2cb, NULL, NULL);
77
78static ssize_t
79o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
80{
81 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
82 struct subsystem *sbs = to_o2cb_subsys(kobj);
83
84 BUG_ON(sbs != &o2cb_subsys);
85
86 if (o2cb_attr->show)
87 return o2cb_attr->show(buffer);
88 return -EIO;
89}
90
91static ssize_t
92o2cb_store(struct kobject * kobj, struct attribute * attr,
93 const char * buffer, size_t count)
94{
95 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
96 struct subsystem *sbs = to_o2cb_subsys(kobj);
97
98 BUG_ON(sbs != &o2cb_subsys);
99
100 if (o2cb_attr->store)
101 return o2cb_attr->store(buffer, count);
102 return -EIO;
103}
104
105void o2cb_sys_shutdown(void)
106{
107 mlog_sys_shutdown();
108 subsystem_unregister(&o2cb_subsys);
109}
110
111int o2cb_sys_init(void)
112{
113 int ret;
114
115 o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type;
116 ret = subsystem_register(&o2cb_subsys);
117 if (ret)
118 return ret;
119
120 ret = mlog_sys_init(&o2cb_subsys);
121 if (ret)
122 subsystem_unregister(&o2cb_subsys);
123 return ret;
124}
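The O2CB_ATTR/show/store plumbing above generalizes: each new attribute needs
a show (and optionally store) function plus an entry in o2cb_attrs[]. A sketch
under that pattern — the attribute name and value are invented, only O2CB_ATTR
and o2cb_attrs[] come from the code above:

	static ssize_t o2cb_example_show(char *buf)
	{
		/* hypothetical read-only attribute */
		return snprintf(buf, PAGE_SIZE, "example\n");
	}
	static O2CB_ATTR(example, S_IFREG | S_IRUGO, o2cb_example_show, NULL);

	/* ...and &o2cb_attr_example.attr would join o2cb_attrs[] before NULL */

Userspace reads these like the real attribute above, e.g.
'cat /sys/o2cb/interface_revision' returns O2NM_API_VERSION.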
diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h
new file mode 100644
index 00000000000..d66b8ab0045
--- /dev/null
+++ b/fs/ocfs2/cluster/sys.h
@@ -0,0 +1,33 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sys.h
5 *
6 * Function prototypes for o2cb sysfs interface
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation,
13 * version 2 of the License.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_SYS_H
28#define O2CLUSTER_SYS_H
29
30void o2cb_sys_shutdown(void);
31int o2cb_sys_init(void);
32
33#endif /* O2CLUSTER_SYS_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
new file mode 100644
index 00000000000..35d92c01a97
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp.c
@@ -0,0 +1,1829 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 *
3 * vim: noexpandtab sw=8 ts=8 sts=0:
4 *
5 * Copyright (C) 2004 Oracle. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public
18 * License along with this program; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
21 *
22 * ----
23 *
24 * Callers for this were originally written against a very simple synchronous
25 * API. This implementation reflects those simple callers. Some day I'm sure
26 * we'll need to move to a more robust posting/callback mechanism.
27 *
28 * Transmit calls pass in kernel virtual addresses and block while copying them
29 * into the socket's tx buffers via a usual blocking sendmsg. They'll block waiting
30 * for a failed socket to time out. TX callers can also pass in a pointer to an
31 * 'int' which gets filled with an errno off the wire in response to the
32 * message they send.
33 *
34 * Handlers for unsolicited messages are registered. Each socket has a page
35 * that incoming data is copied into. First the header, then the data.
36 * Handlers are called from only one thread with a reference to this per-socket
37 * page. This page is destroyed after the handler call, so it can't be
38 * referenced beyond the call. Handlers may block but are discouraged from
39 * doing so.
40 *
41 * Any framing errors (bad magic, large payload lengths) close a connection.
42 *
43 * Our sock_container holds the state we associate with a socket. Its current
44 * framing state is held there as well as the refcounting we do around when it
45 * is safe to tear down the socket. The socket is only finally torn down from
46 * the container when the container loses all of its references -- so as long
47 * as you hold a ref on the container you can trust that the socket is valid
48 * for use with kernel socket APIs.
49 *
50 * Connections are initiated between a pair of nodes when the node with the
51 * higher node number gets a heartbeat callback which indicates that the lower
52 * numbered node has started heartbeating. The lower numbered node is passive
53 * and only accepts the connection if the higher numbered node is heartbeating.
54 */
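/*
 * A sketch of the caller flow the paragraphs above describe, using the
 * public entry points defined later in this file; the message type, key
 * and payload struct are invented for illustration:
 *
 *	static int example_handler(struct o2net_msg *msg, u32 len, void *data)
 *	{
 *		return 0;	(s32 status returned to the sender)
 *	}
 *
 *	LIST_HEAD(unreg);
 *	o2net_register_handler(EXAMPLE_TYPE, EXAMPLE_KEY,
 *			       sizeof(struct example_payload),
 *			       example_handler, NULL, &unreg);
 *
 *	int status;	(filled with the remote handler's return value)
 *	ret = o2net_send_message(EXAMPLE_TYPE, EXAMPLE_KEY, &payload,
 *				 sizeof(payload), target_node, &status);
 *
 *	o2net_unregister_handler_list(&unreg);
 */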
55
56#include <linux/kernel.h>
57#include <linux/jiffies.h>
58#include <linux/slab.h>
59#include <linux/idr.h>
60#include <linux/kref.h>
61#include <net/tcp.h>
62
63#include <asm/uaccess.h>
64
65#include "heartbeat.h"
66#include "tcp.h"
67#include "nodemanager.h"
68#define MLOG_MASK_PREFIX ML_TCP
69#include "masklog.h"
70#include "quorum.h"
71
72#include "tcp_internal.h"
73
74/*
75 * The linux network stack isn't sparse endian clean.. It has macros like
76 * ntohs() which perform the endian checks and structs like sockaddr_in
77 * which aren't annotated. So __force is found here to get the build
78 * clean. When they emerge from the dark ages and annotate the code
79 * we can remove these.
80 */
81
82#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
83#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
84 NIPQUAD(sc->sc_node->nd_ipv4_address), \
85 ntohs(sc->sc_node->nd_ipv4_port)
86
87/*
88 * In the following two log macros, the whitespace after the ',' just
89 * before ##args is intentional. Otherwise, gcc 2.95 will eat the
90 * previous token if args expands to nothing.
91 */
92#define msglog(hdr, fmt, args...) do { \
93 typeof(hdr) __hdr = (hdr); \
94 mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \
95 "key %08x num %u] " fmt, \
96 be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \
97 be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \
98 be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \
99 be32_to_cpu(__hdr->msg_num) , ##args); \
100} while (0)
101
102#define sclog(sc, fmt, args...) do { \
103 typeof(sc) __sc = (sc); \
104 mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \
105 "pg_off %zu] " fmt, __sc, \
106 atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \
107 __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \
108 ##args); \
109} while (0)
110
111static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
112static struct rb_root o2net_handler_tree = RB_ROOT;
113
114static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
115
116/* XXX someday we'll need better accounting */
117static struct socket *o2net_listen_sock = NULL;
118
119/*
120 * listen work is only queued by the listening socket callbacks on the
121 * o2net_wq. teardown detaches the callbacks before destroying the workqueue.
122 * quorum work is queued as sock containers are shut down.. stop_listening
123 * tears down all the node's sock containers, preventing future shutdowns
124 * and queued quorum work, before canceling delayed quorum work and
125 * destroying the work queue.
126 */
127static struct workqueue_struct *o2net_wq;
128static struct work_struct o2net_listen_work;
129
130static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
131#define O2NET_HB_PRI 0x1
132
133static struct o2net_handshake *o2net_hand;
134static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;
135
136static int o2net_sys_err_translations[O2NET_ERR_MAX] =
137 {[O2NET_ERR_NONE] = 0,
138 [O2NET_ERR_NO_HNDLR] = -ENOPROTOOPT,
139 [O2NET_ERR_OVERFLOW] = -EOVERFLOW,
140 [O2NET_ERR_DIED] = -EHOSTDOWN,};
141
142/* can't quite avoid *all* internal declarations :/ */
143static void o2net_sc_connect_completed(void *arg);
144static void o2net_rx_until_empty(void *arg);
145static void o2net_shutdown_sc(void *arg);
146static void o2net_listen_data_ready(struct sock *sk, int bytes);
147static void o2net_sc_send_keep_req(void *arg);
148static void o2net_idle_timer(unsigned long data);
149static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
150
151static inline int o2net_sys_err_to_errno(enum o2net_system_error err)
152{
153 int trans;
154 BUG_ON(err >= O2NET_ERR_MAX);
155 trans = o2net_sys_err_translations[err];
156
157 /* Just in case we mess up the translation table above */
158 BUG_ON(err != O2NET_ERR_NONE && trans == 0);
159 return trans;
160}
161
162static struct o2net_node * o2net_nn_from_num(u8 node_num)
163{
164 BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes));
165 return &o2net_nodes[node_num];
166}
167
168static u8 o2net_num_from_nn(struct o2net_node *nn)
169{
170 BUG_ON(nn == NULL);
171 return nn - o2net_nodes;
172}
173
174/* ------------------------------------------------------------ */
175
176static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
177{
178 int ret = 0;
179
180 do {
181 if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
182 ret = -EAGAIN;
183 break;
184 }
185 spin_lock(&nn->nn_lock);
186 ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
187 if (ret == 0)
188 list_add_tail(&nsw->ns_node_item,
189 &nn->nn_status_list);
190 spin_unlock(&nn->nn_lock);
191 } while (ret == -EAGAIN);
192
193 if (ret == 0) {
194 init_waitqueue_head(&nsw->ns_wq);
195 nsw->ns_sys_status = O2NET_ERR_NONE;
196 nsw->ns_status = 0;
197 }
198
199 return ret;
200}
201
202static void o2net_complete_nsw_locked(struct o2net_node *nn,
203 struct o2net_status_wait *nsw,
204 enum o2net_system_error sys_status,
205 s32 status)
206{
207 assert_spin_locked(&nn->nn_lock);
208
209 if (!list_empty(&nsw->ns_node_item)) {
210 list_del_init(&nsw->ns_node_item);
211 nsw->ns_sys_status = sys_status;
212 nsw->ns_status = status;
213 idr_remove(&nn->nn_status_idr, nsw->ns_id);
214 wake_up(&nsw->ns_wq);
215 }
216}
217
218static void o2net_complete_nsw(struct o2net_node *nn,
219 struct o2net_status_wait *nsw,
220 u64 id, enum o2net_system_error sys_status,
221 s32 status)
222{
223 spin_lock(&nn->nn_lock);
224 if (nsw == NULL) {
225 if (id > INT_MAX)
226 goto out;
227
228 nsw = idr_find(&nn->nn_status_idr, id);
229 if (nsw == NULL)
230 goto out;
231 }
232
233 o2net_complete_nsw_locked(nn, nsw, sys_status, status);
234
235out:
236 spin_unlock(&nn->nn_lock);
237 return;
238}
239
240static void o2net_complete_nodes_nsw(struct o2net_node *nn)
241{
242 struct list_head *iter, *tmp;
243 unsigned int num_kills = 0;
244 struct o2net_status_wait *nsw;
245
246 assert_spin_locked(&nn->nn_lock);
247
248 list_for_each_safe(iter, tmp, &nn->nn_status_list) {
249 nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
250 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
251 num_kills++;
252 }
253
254 mlog(0, "completed %d messages for node %u\n", num_kills,
255 o2net_num_from_nn(nn));
256}
257
258static int o2net_nsw_completed(struct o2net_node *nn,
259 struct o2net_status_wait *nsw)
260{
261 int completed;
262 spin_lock(&nn->nn_lock);
263 completed = list_empty(&nsw->ns_node_item);
264 spin_unlock(&nn->nn_lock);
265 return completed;
266}
267
268/* ------------------------------------------------------------ */
269
270static void sc_kref_release(struct kref *kref)
271{
272 struct o2net_sock_container *sc = container_of(kref,
273 struct o2net_sock_container, sc_kref);
274 sclog(sc, "releasing\n");
275
276 if (sc->sc_sock) {
277 sock_release(sc->sc_sock);
278 sc->sc_sock = NULL;
279 }
280
281 o2nm_node_put(sc->sc_node);
282 sc->sc_node = NULL;
283
284 kfree(sc);
285}
286
287static void sc_put(struct o2net_sock_container *sc)
288{
289 sclog(sc, "put\n");
290 kref_put(&sc->sc_kref, sc_kref_release);
291}
292static void sc_get(struct o2net_sock_container *sc)
293{
294 sclog(sc, "get\n");
295 kref_get(&sc->sc_kref);
296}
297static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
298{
299 struct o2net_sock_container *sc, *ret = NULL;
300 struct page *page = NULL;
301
302 page = alloc_page(GFP_NOFS);
303 sc = kcalloc(1, sizeof(*sc), GFP_NOFS);
304 if (sc == NULL || page == NULL)
305 goto out;
306
307 kref_init(&sc->sc_kref);
308 o2nm_node_get(node);
309 sc->sc_node = node;
310
311 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc);
312 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc);
313 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc);
314 INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc);
315
316 init_timer(&sc->sc_idle_timeout);
317 sc->sc_idle_timeout.function = o2net_idle_timer;
318 sc->sc_idle_timeout.data = (unsigned long)sc;
319
320 sclog(sc, "alloced\n");
321
322 ret = sc;
323 sc->sc_page = page;
324 sc = NULL;
325 page = NULL;
326
327out:
328 if (page)
329 __free_page(page);
330 kfree(sc);
331
332 return ret;
333}
334
335/* ------------------------------------------------------------ */
336
337static void o2net_sc_queue_work(struct o2net_sock_container *sc,
338 struct work_struct *work)
339{
340 sc_get(sc);
341 if (!queue_work(o2net_wq, work))
342 sc_put(sc);
343}
344static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc,
345 struct work_struct *work,
346 int delay)
347{
348 sc_get(sc);
349 if (!queue_delayed_work(o2net_wq, work, delay))
350 sc_put(sc);
351}
352static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc,
353 struct work_struct *work)
354{
355 if (cancel_delayed_work(work))
356 sc_put(sc);
357}
358
359static void o2net_set_nn_state(struct o2net_node *nn,
360 struct o2net_sock_container *sc,
361 unsigned valid, int err)
362{
363 int was_valid = nn->nn_sc_valid;
364 int was_err = nn->nn_persistent_error;
365 struct o2net_sock_container *old_sc = nn->nn_sc;
366
367 assert_spin_locked(&nn->nn_lock);
368
369 /* the node num comparison and single connect/accept path should stop
370 * a non-null sc from being overwritten with another */
371 BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
372 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
373 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
374
375 /* we won't reconnect after our valid conn goes away for
376 * this hb iteration.. here so it shows up in the logs */
377 if (was_valid && !valid && err == 0)
378 err = -ENOTCONN;
379
380 mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n",
381 o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid,
382 nn->nn_persistent_error, err);
383
384 nn->nn_sc = sc;
385 nn->nn_sc_valid = valid ? 1 : 0;
386 nn->nn_persistent_error = err;
387
388 /* mirrors o2net_tx_can_proceed() */
389 if (nn->nn_persistent_error || nn->nn_sc_valid)
390 wake_up(&nn->nn_sc_wq);
391
392 if (!was_err && nn->nn_persistent_error) {
393 o2quo_conn_err(o2net_num_from_nn(nn));
394 queue_delayed_work(o2net_wq, &nn->nn_still_up,
395 msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
396 }
397
398 if (was_valid && !valid) {
399 mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n",
400 SC_NODEF_ARGS(old_sc));
401 o2net_complete_nodes_nsw(nn);
402 }
403
404 if (!was_valid && valid) {
405 o2quo_conn_up(o2net_num_from_nn(nn));
406 /* this is a bit of a hack. we only try reconnecting
407 * when heartbeating starts until we get a connection.
408 * if that connection then dies we don't try reconnecting.
409 * the only way to start connecting again is to down
410 * heartbeat and bring it back up. */
411 cancel_delayed_work(&nn->nn_connect_expired);
412 mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n",
413 o2nm_this_node() > sc->sc_node->nd_num ?
414 "connected to" : "accepted connection from",
415 SC_NODEF_ARGS(sc));
416 }
417
418 /* trigger the connecting worker func as long as we're not valid,
419 * it will back off if it shouldn't connect. This can be called
420 * from node config teardown and so needs to be careful about
421 * the work queue actually being up. */
422 if (!valid && o2net_wq) {
423 unsigned long delay;
424 /* delay if we're within a RECONNECT_DELAY of the
425 * last attempt */
426 delay = (nn->nn_last_connect_attempt +
427 msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
428 - jiffies;
429 if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
430 delay = 0;
431 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
432 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
433 }
434
435 /* keep track of the nn's sc ref for the caller */
436 if ((old_sc == NULL) && sc)
437 sc_get(sc);
438 if (old_sc && (old_sc != sc)) {
439 o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work);
440 sc_put(old_sc);
441 }
442}
443
444/* see o2net_register_callbacks() */
445static void o2net_data_ready(struct sock *sk, int bytes)
446{
447 void (*ready)(struct sock *sk, int bytes);
448
449 read_lock(&sk->sk_callback_lock);
450 if (sk->sk_user_data) {
451 struct o2net_sock_container *sc = sk->sk_user_data;
452 sclog(sc, "data_ready hit\n");
453 do_gettimeofday(&sc->sc_tv_data_ready);
454 o2net_sc_queue_work(sc, &sc->sc_rx_work);
455 ready = sc->sc_data_ready;
456 } else {
457 ready = sk->sk_data_ready;
458 }
459 read_unlock(&sk->sk_callback_lock);
460
461 ready(sk, bytes);
462}
463
464/* see o2net_register_callbacks() */
465static void o2net_state_change(struct sock *sk)
466{
467 void (*state_change)(struct sock *sk);
468 struct o2net_sock_container *sc;
469
470 read_lock(&sk->sk_callback_lock);
471 sc = sk->sk_user_data;
472 if (sc == NULL) {
473 state_change = sk->sk_state_change;
474 goto out;
475 }
476
477 sclog(sc, "state_change to %d\n", sk->sk_state);
478
479 state_change = sc->sc_state_change;
480
481 switch(sk->sk_state) {
482 /* ignore connecting sockets as they make progress */
483 case TCP_SYN_SENT:
484 case TCP_SYN_RECV:
485 break;
486 case TCP_ESTABLISHED:
487 o2net_sc_queue_work(sc, &sc->sc_connect_work);
488 break;
489 default:
490 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
491 break;
492 }
493out:
494 read_unlock(&sk->sk_callback_lock);
495 state_change(sk);
496}
497
498/*
499 * we register callbacks so we can queue work on events before calling
500 * the original callbacks. our callbacks are careful to test user_data
501 * to discover when they've raced with o2net_unregister_callbacks().
502 */
503static void o2net_register_callbacks(struct sock *sk,
504 struct o2net_sock_container *sc)
505{
506 write_lock_bh(&sk->sk_callback_lock);
507
508 /* accepted sockets inherit the old listen socket data ready */
509 if (sk->sk_data_ready == o2net_listen_data_ready) {
510 sk->sk_data_ready = sk->sk_user_data;
511 sk->sk_user_data = NULL;
512 }
513
514 BUG_ON(sk->sk_user_data != NULL);
515 sk->sk_user_data = sc;
516 sc_get(sc);
517
518 sc->sc_data_ready = sk->sk_data_ready;
519 sc->sc_state_change = sk->sk_state_change;
520 sk->sk_data_ready = o2net_data_ready;
521 sk->sk_state_change = o2net_state_change;
522
523 write_unlock_bh(&sk->sk_callback_lock);
524}
525
526static int o2net_unregister_callbacks(struct sock *sk,
527 struct o2net_sock_container *sc)
528{
529 int ret = 0;
530
531 write_lock_bh(&sk->sk_callback_lock);
532 if (sk->sk_user_data == sc) {
533 ret = 1;
534 sk->sk_user_data = NULL;
535 sk->sk_data_ready = sc->sc_data_ready;
536 sk->sk_state_change = sc->sc_state_change;
537 }
538 write_unlock_bh(&sk->sk_callback_lock);
539
540 return ret;
541}
542
543/*
544 * this is a little helper that is called by callers who have seen a problem
545 * with an sc and want to detach it from the nn if someone hasn't already
546 * beaten them to it. if an error is given then the shutdown will be persistent
547 * and pending transmits will be canceled.
548 */
549static void o2net_ensure_shutdown(struct o2net_node *nn,
550 struct o2net_sock_container *sc,
551 int err)
552{
553 spin_lock(&nn->nn_lock);
554 if (nn->nn_sc == sc)
555 o2net_set_nn_state(nn, NULL, 0, err);
556 spin_unlock(&nn->nn_lock);
557}
558
559/*
560 * This work queue function performs the blocking parts of socket shutdown. A
561 * few paths lead here. set_nn_state will trigger this callback if it sees an
562 * sc detached from the nn. state_change will also trigger this callback
563 * directly when it sees errors. In that case we need to call set_nn_state
564 * ourselves as state_change couldn't get the nn_lock and call set_nn_state
565 * itself.
566 */
567static void o2net_shutdown_sc(void *arg)
568{
569 struct o2net_sock_container *sc = arg;
570 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
571
572 sclog(sc, "shutting down\n");
573
574 /* drop the callbacks ref and call shutdown only once */
575 if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
576 /* we shouldn't flush as we're in the thread, the
577 * races with pending sc work structs are harmless */
578 del_timer_sync(&sc->sc_idle_timeout);
579 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
580 sc_put(sc);
581 sc->sc_sock->ops->shutdown(sc->sc_sock,
582 RCV_SHUTDOWN|SEND_SHUTDOWN);
583 }
584
585 /* not fatal so failed connects before the other guy has our
586 * heartbeat can be retried */
587 o2net_ensure_shutdown(nn, sc, 0);
588 sc_put(sc);
589}
590
591/* ------------------------------------------------------------ */
592
593static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type,
594 u32 key)
595{
596 int ret = memcmp(&nmh->nh_key, &key, sizeof(key));
597
598 if (ret == 0)
599 ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type));
600
601 return ret;
602}
603
604static struct o2net_msg_handler *
605o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
606 struct rb_node **ret_parent)
607{
608 struct rb_node **p = &o2net_handler_tree.rb_node;
609 struct rb_node *parent = NULL;
610 struct o2net_msg_handler *nmh, *ret = NULL;
611 int cmp;
612
613 while (*p) {
614 parent = *p;
615 nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
616 cmp = o2net_handler_cmp(nmh, msg_type, key);
617
618 if (cmp < 0)
619 p = &(*p)->rb_left;
620 else if (cmp > 0)
621 p = &(*p)->rb_right;
622 else {
623 ret = nmh;
624 break;
625 }
626 }
627
628 if (ret_p != NULL)
629 *ret_p = p;
630 if (ret_parent != NULL)
631 *ret_parent = parent;
632
633 return ret;
634}
635
636static void o2net_handler_kref_release(struct kref *kref)
637{
638 struct o2net_msg_handler *nmh;
639 nmh = container_of(kref, struct o2net_msg_handler, nh_kref);
640
641 kfree(nmh);
642}
643
644static void o2net_handler_put(struct o2net_msg_handler *nmh)
645{
646 kref_put(&nmh->nh_kref, o2net_handler_kref_release);
647}
648
649/* max_len is protection for the handler func. incoming messages won't
650 * be given to the handler if their payload is longer than the max. */
651int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
652 o2net_msg_handler_func *func, void *data,
653 struct list_head *unreg_list)
654{
655 struct o2net_msg_handler *nmh = NULL;
656 struct rb_node **p, *parent;
657 int ret = 0;
658
659 if (max_len > O2NET_MAX_PAYLOAD_BYTES) {
660 mlog(0, "max_len for message handler out of range: %u\n",
661 max_len);
662 ret = -EINVAL;
663 goto out;
664 }
665
666 if (!msg_type) {
667 mlog(0, "no message type provided: %u, %p\n", msg_type, func);
668 ret = -EINVAL;
669 goto out;
670
671 }
672 if (!func) {
673 mlog(0, "no message handler provided: %u, %p\n",
674 msg_type, func);
675 ret = -EINVAL;
676 goto out;
677 }
678
679 nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS);
680 if (nmh == NULL) {
681 ret = -ENOMEM;
682 goto out;
683 }
684
685 nmh->nh_func = func;
686 nmh->nh_func_data = data;
687 nmh->nh_msg_type = msg_type;
688 nmh->nh_max_len = max_len;
689 nmh->nh_key = key;
690 /* the tree and list get this ref.. they're both removed in
691 * unregister when this ref is dropped */
692 kref_init(&nmh->nh_kref);
693 INIT_LIST_HEAD(&nmh->nh_unregister_item);
694
695 write_lock(&o2net_handler_lock);
696 if (o2net_handler_tree_lookup(msg_type, key, &p, &parent))
697 ret = -EEXIST;
698 else {
699 rb_link_node(&nmh->nh_node, parent, p);
700 rb_insert_color(&nmh->nh_node, &o2net_handler_tree);
701 list_add_tail(&nmh->nh_unregister_item, unreg_list);
702
703 mlog(ML_TCP, "registered handler func %p type %u key %08x\n",
704 func, msg_type, key);
705 /* we've had some trouble with handlers seemingly vanishing. */
706 mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
707 &parent) == NULL,
708 "couldn't find handler we *just* registerd "
709 "for type %u key %08x\n", msg_type, key);
710 }
711 write_unlock(&o2net_handler_lock);
712 if (ret)
713 goto out;
714
715out:
716 if (ret)
717 kfree(nmh);
718
719 return ret;
720}
721EXPORT_SYMBOL_GPL(o2net_register_handler);
722
723void o2net_unregister_handler_list(struct list_head *list)
724{
725 struct list_head *pos, *n;
726 struct o2net_msg_handler *nmh;
727
728 write_lock(&o2net_handler_lock);
729 list_for_each_safe(pos, n, list) {
730 nmh = list_entry(pos, struct o2net_msg_handler,
731 nh_unregister_item);
732 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
733 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
734 rb_erase(&nmh->nh_node, &o2net_handler_tree);
735 list_del_init(&nmh->nh_unregister_item);
736 kref_put(&nmh->nh_kref, o2net_handler_kref_release);
737 }
738 write_unlock(&o2net_handler_lock);
739}
740EXPORT_SYMBOL_GPL(o2net_unregister_handler_list);
741
742static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
743{
744 struct o2net_msg_handler *nmh;
745
746 read_lock(&o2net_handler_lock);
747 nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL);
748 if (nmh)
749 kref_get(&nmh->nh_kref);
750 read_unlock(&o2net_handler_lock);
751
752 return nmh;
753}
754
755/* ------------------------------------------------------------ */
756
757static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
758{
759 int ret;
760 mm_segment_t oldfs;
761 struct kvec vec = {
762 .iov_len = len,
763 .iov_base = data,
764 };
765 struct msghdr msg = {
766 .msg_iovlen = 1,
767 .msg_iov = (struct iovec *)&vec,
768 .msg_flags = MSG_DONTWAIT,
769 };
770
771 oldfs = get_fs();
772 set_fs(get_ds());
773 ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
774 set_fs(oldfs);
775
776 return ret;
777}
778
779static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
780 size_t veclen, size_t total)
781{
782 int ret;
783 mm_segment_t oldfs;
784 struct msghdr msg = {
785 .msg_iov = (struct iovec *)vec,
786 .msg_iovlen = veclen,
787 };
788
789 if (sock == NULL) {
790 ret = -EINVAL;
791 goto out;
792 }
793
794 oldfs = get_fs();
795 set_fs(get_ds());
796 ret = sock_sendmsg(sock, &msg, total);
797 set_fs(oldfs);
798 if (ret != total) {
799 mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
800 total);
801 if (ret >= 0)
802 ret = -EPIPE; /* should be smarter, I bet */
803 goto out;
804 }
805
806 ret = 0;
807out:
808 if (ret < 0)
809 mlog(0, "returning error: %d\n", ret);
810 return ret;
811}
812
813static void o2net_sendpage(struct o2net_sock_container *sc,
814 void *kmalloced_virt,
815 size_t size)
816{
817 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
818 ssize_t ret;
819
820
821 ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
822 virt_to_page(kmalloced_virt),
823 (long)kmalloced_virt & ~PAGE_MASK,
824 size, MSG_DONTWAIT);
825 if (ret != size) {
826 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
827 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
828 o2net_ensure_shutdown(nn, sc, 0);
829 }
830}
831
832static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key)
833{
834 memset(msg, 0, sizeof(struct o2net_msg));
835 msg->magic = cpu_to_be16(O2NET_MSG_MAGIC);
836 msg->data_len = cpu_to_be16(data_len);
837 msg->msg_type = cpu_to_be16(msg_type);
838 msg->sys_status = cpu_to_be32(O2NET_ERR_NONE);
839 msg->status = 0;
840 msg->key = cpu_to_be32(key);
841}
842
843static int o2net_tx_can_proceed(struct o2net_node *nn,
844 struct o2net_sock_container **sc_ret,
845 int *error)
846{
847 int ret = 0;
848
849 spin_lock(&nn->nn_lock);
850 if (nn->nn_persistent_error) {
851 ret = 1;
852 *sc_ret = NULL;
853 *error = nn->nn_persistent_error;
854 } else if (nn->nn_sc_valid) {
855 kref_get(&nn->nn_sc->sc_kref);
856
857 ret = 1;
858 *sc_ret = nn->nn_sc;
859 *error = 0;
860 }
861 spin_unlock(&nn->nn_lock);
862
863 return ret;
864}
865
866int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
867 size_t caller_veclen, u8 target_node, int *status)
868{
869 int ret, error = 0;
870 struct o2net_msg *msg = NULL;
871 size_t veclen, caller_bytes = 0;
872 struct kvec *vec = NULL;
873 struct o2net_sock_container *sc = NULL;
874 struct o2net_node *nn = o2net_nn_from_num(target_node);
875 struct o2net_status_wait nsw = {
876 .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
877 };
878
879 if (o2net_wq == NULL) {
880 mlog(0, "attempt to tx without o2netd running\n");
881 ret = -ESRCH;
882 goto out;
883 }
884
885 if (caller_veclen == 0) {
886 mlog(0, "bad kvec array length\n");
887 ret = -EINVAL;
888 goto out;
889 }
890
891 caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen);
892 if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) {
893 mlog(0, "total payload len %zu too large\n", caller_bytes);
894 ret = -EINVAL;
895 goto out;
896 }
897
898 if (target_node == o2nm_this_node()) {
899 ret = -ELOOP;
900 goto out;
901 }
902
903 ret = wait_event_interruptible(nn->nn_sc_wq,
904 o2net_tx_can_proceed(nn, &sc, &error));
905 if (!ret && error)
906 ret = error;
907 if (ret)
908 goto out;
909
910 veclen = caller_veclen + 1;
911 vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
912 if (vec == NULL) {
913 mlog(0, "failed to allocate %zu element kvec!\n", veclen);
914 ret = -ENOMEM;
915 goto out;
916 }
917
918 msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC);
919 if (!msg) {
920 mlog(0, "failed to allocate an o2net_msg!\n");
921 ret = -ENOMEM;
922 goto out;
923 }
924
925 o2net_init_msg(msg, caller_bytes, msg_type, key);
926
927 vec[0].iov_len = sizeof(struct o2net_msg);
928 vec[0].iov_base = msg;
929 memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec));
930
931 ret = o2net_prep_nsw(nn, &nsw);
932 if (ret)
933 goto out;
934
935 msg->msg_num = cpu_to_be32(nsw.ns_id);
936
937 /* finally, convert the message header to network byte-order
938 * and send */
939 ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen,
940 sizeof(struct o2net_msg) + caller_bytes);
941 msglog(msg, "sending returned %d\n", ret);
942 if (ret < 0) {
943 mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret);
944 goto out;
945 }
946
947 /* wait on other node's handler */
948 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
949
950 /* Note that we avoid overwriting the caller's status return
951 * variable if a system error was reported on the other
952 * side. Callers beware. */
953 ret = o2net_sys_err_to_errno(nsw.ns_sys_status);
954 if (status && !ret)
955 *status = nsw.ns_status;
956
957 mlog(0, "woken, returning system status %d, user status %d\n",
958 ret, nsw.ns_status);
959out:
960 if (sc)
961 sc_put(sc);
962 if (vec)
963 kfree(vec);
964 if (msg)
965 kfree(msg);
966 o2net_complete_nsw(nn, &nsw, 0, 0, 0);
967 return ret;
968}
969EXPORT_SYMBOL_GPL(o2net_send_message_vec);
970
971int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
972 u8 target_node, int *status)
973{
974 struct kvec vec = {
975 .iov_base = data,
976 .iov_len = len,
977 };
978 return o2net_send_message_vec(msg_type, key, &vec, 1,
979 target_node, status);
980}
981EXPORT_SYMBOL_GPL(o2net_send_message);
982
983static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr,
984 enum o2net_system_error syserr, int err)
985{
986 struct kvec vec = {
987 .iov_base = hdr,
988 .iov_len = sizeof(struct o2net_msg),
989 };
990
991 BUG_ON(syserr >= O2NET_ERR_MAX);
992
993 /* leave other fields intact from the incoming message, msg_num
994 * in particular */
995 hdr->sys_status = cpu_to_be32(syserr);
996 hdr->status = cpu_to_be32(err);
997 hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC); // twiddle the magic
998 hdr->data_len = 0;
999
1000 msglog(hdr, "about to send status magic %d\n", err);
1001 /* hdr has been in host byteorder this whole time */
1002 return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg));
1003}
1004
1005/* this returns -errno if the header was unknown or too large, etc.
1006 * after this is called the buffer is reused for the next message */
1007static int o2net_process_message(struct o2net_sock_container *sc,
1008 struct o2net_msg *hdr)
1009{
1010 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1011 int ret = 0, handler_status;
1012 enum o2net_system_error syserr;
1013 struct o2net_msg_handler *nmh = NULL;
1014
1015 msglog(hdr, "processing message\n");
1016
1017 o2net_sc_postpone_idle(sc);
1018
1019 switch(be16_to_cpu(hdr->magic)) {
1020 case O2NET_MSG_STATUS_MAGIC:
1021 /* special type for returning message status */
1022 o2net_complete_nsw(nn, NULL,
1023 be32_to_cpu(hdr->msg_num),
1024 be32_to_cpu(hdr->sys_status),
1025 be32_to_cpu(hdr->status));
1026 goto out;
1027 case O2NET_MSG_KEEP_REQ_MAGIC:
1028 o2net_sendpage(sc, o2net_keep_resp,
1029 sizeof(*o2net_keep_resp));
1030 goto out;
1031 case O2NET_MSG_KEEP_RESP_MAGIC:
1032 goto out;
1033 case O2NET_MSG_MAGIC:
1034 break;
1035 default:
1036 msglog(hdr, "bad magic\n");
1037 ret = -EINVAL;
1038 goto out;
1039 break;
1040 }
1041
1042 /* find a handler for it */
1043 handler_status = 0;
1044 nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type),
1045 be32_to_cpu(hdr->key));
1046 if (!nmh) {
1047 mlog(ML_TCP, "couldn't find handler for type %u key %08x\n",
1048 be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key));
1049 syserr = O2NET_ERR_NO_HNDLR;
1050 goto out_respond;
1051 }
1052
1053 syserr = O2NET_ERR_NONE;
1054
1055 if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len)
1056 syserr = O2NET_ERR_OVERFLOW;
1057
1058 if (syserr != O2NET_ERR_NONE)
1059 goto out_respond;
1060
1061 do_gettimeofday(&sc->sc_tv_func_start);
1062 sc->sc_msg_key = be32_to_cpu(hdr->key);
1063 sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
1064 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
1065 be16_to_cpu(hdr->data_len),
1066 nmh->nh_func_data);
1067 do_gettimeofday(&sc->sc_tv_func_stop);
1068
1069out_respond:
1070 /* this destroys the hdr, so don't use it after this */
1071 ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
1072 handler_status);
1073 hdr = NULL;
1074 mlog(0, "sending handler status %d, syserr %d returned %d\n",
1075 handler_status, syserr, ret);
1076
1077out:
1078 if (nmh)
1079 o2net_handler_put(nmh);
1080 return ret;
1081}
1082
1083static int o2net_check_handshake(struct o2net_sock_container *sc)
1084{
1085 struct o2net_handshake *hand = page_address(sc->sc_page);
1086 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1087
1088 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
1089 mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
1090 "version %llu but %llu is required, disconnecting\n",
1091 SC_NODEF_ARGS(sc),
1092 (unsigned long long)be64_to_cpu(hand->protocol_version),
1093 O2NET_PROTOCOL_VERSION);
1094
1095 /* don't bother reconnecting if it's the wrong version. */
1096 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1097 return -1;
1098 }
1099
1100 sc->sc_handshake_ok = 1;
1101
1102 spin_lock(&nn->nn_lock);
1103 /* set valid and queue the idle timers only if it hasn't been
1104 * shut down already */
1105 if (nn->nn_sc == sc) {
1106 o2net_sc_postpone_idle(sc);
1107 o2net_set_nn_state(nn, sc, 1, 0);
1108 }
1109 spin_unlock(&nn->nn_lock);
1110
1111 /* shift everything up as though it wasn't there */
1112 sc->sc_page_off -= sizeof(struct o2net_handshake);
1113 if (sc->sc_page_off)
1114 memmove(hand, hand + 1, sc->sc_page_off);
1115
1116 return 0;
1117}
1118
1119/* this demuxes the queued rx bytes into header or payload bits and calls
1120 * handlers as each full message is read off the socket. it returns -error,
1121 * == 0 eof, or > 0 for progress made. */
1122static int o2net_advance_rx(struct o2net_sock_container *sc)
1123{
1124 struct o2net_msg *hdr;
1125 int ret = 0;
1126 void *data;
1127 size_t datalen;
1128
1129 sclog(sc, "receiving\n");
1130 do_gettimeofday(&sc->sc_tv_advance_start);
1131
1132 /* do we need more header? */
1133 if (sc->sc_page_off < sizeof(struct o2net_msg)) {
1134 data = page_address(sc->sc_page) + sc->sc_page_off;
1135 datalen = sizeof(struct o2net_msg) - sc->sc_page_off;
1136 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
1137 if (ret > 0) {
1138 sc->sc_page_off += ret;
1139
1140 /* this working relies on the handshake being
1141 * smaller than the normal message header */
1142 if (sc->sc_page_off >= sizeof(struct o2net_handshake) &&
1143 !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
1144 ret = -EPROTO;
1145 goto out;
1146 }
1147
1148 /* only swab incoming here.. we can
1149 * only get here once as we cross from
1150 * being under to over */
1151 if (sc->sc_page_off == sizeof(struct o2net_msg)) {
1152 hdr = page_address(sc->sc_page);
1153 if (be16_to_cpu(hdr->data_len) >
1154 O2NET_MAX_PAYLOAD_BYTES)
1155 ret = -EOVERFLOW;
1156 }
1157 }
1158 if (ret <= 0)
1159 goto out;
1160 }
1161
1162 if (sc->sc_page_off < sizeof(struct o2net_msg)) {
1163 /* oof, still don't have a header */
1164 goto out;
1165 }
1166
1167 /* this was swabbed above when we first read it */
1168 hdr = page_address(sc->sc_page);
1169
1170 msglog(hdr, "at page_off %zu\n", sc->sc_page_off);
1171
1172 /* do we need more payload? */
1173 if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) {
1174 /* need more payload */
1175 data = page_address(sc->sc_page) + sc->sc_page_off;
1176 datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) -
1177 sc->sc_page_off;
1178 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
1179 if (ret > 0)
1180 sc->sc_page_off += ret;
1181 if (ret <= 0)
1182 goto out;
1183 }
1184
1185 if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) {
1186 /* we can only get here once, the first time we read
1187 * the payload.. so set ret to progress if the handler
1188 * works out. after calling this the message is toast */
1189 ret = o2net_process_message(sc, hdr);
1190 if (ret == 0)
1191 ret = 1;
1192 sc->sc_page_off = 0;
1193 }
1194
1195out:
1196 sclog(sc, "ret = %d\n", ret);
1197 do_gettimeofday(&sc->sc_tv_advance_stop);
1198 return ret;
1199}
1200
1201/* this work func is triggered by data ready. it reads until it can read no
1202 * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing
1203 * our work the work struct will be marked and we'll be called again. */
1204static void o2net_rx_until_empty(void *arg)
1205{
1206 struct o2net_sock_container *sc = arg;
1207 int ret;
1208
1209 do {
1210 ret = o2net_advance_rx(sc);
1211 } while (ret > 0);
1212
1213 if (ret <= 0 && ret != -EAGAIN) {
1214 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1215 sclog(sc, "saw error %d, closing\n", ret);
1216 /* not permanent so a failed handshake read can be retried */
1217 o2net_ensure_shutdown(nn, sc, 0);
1218 }
1219
1220 sc_put(sc);
1221}
1222
1223static int o2net_set_nodelay(struct socket *sock)
1224{
1225 int ret, val = 1;
1226 mm_segment_t oldfs;
1227
1228 oldfs = get_fs();
1229 set_fs(KERNEL_DS);
1230
1231 /*
1232 * Dear unsuspecting programmer,
1233 *
1234 * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level
1235 * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
1236 * silently turn into SO_DEBUG.
1237 *
1238 * Yours,
1239 * Keeper of hilariously fragile interfaces.
1240 */
1241 ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
1242 (char __user *)&val, sizeof(val));
1243
1244 set_fs(oldfs);
1245 return ret;
1246}
1247
1248/* ------------------------------------------------------------ */
1249
1250/* called when a connect completes and after a sock is accepted. the
1251 * rx path will see the response and mark the sc valid */
1252static void o2net_sc_connect_completed(void *arg)
1253{
1254 struct o2net_sock_container *sc = arg;
1255
1256 mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n",
1257 (unsigned long long)O2NET_PROTOCOL_VERSION,
1258 (unsigned long long)be64_to_cpu(o2net_hand->connector_id));
1259
1260 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
1261 sc_put(sc);
1262}
1263
1264/* this is called as a work_struct func. */
1265static void o2net_sc_send_keep_req(void *arg)
1266{
1267 struct o2net_sock_container *sc = arg;
1268
1269 o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req));
1270 sc_put(sc);
1271}
1272
1273/* socket shutdown does a del_timer_sync against this as it tears down.
1274 * we can't start this timer until we've got to the point in sc buildup
1275 * where shutdown is going to be involved */
1276static void o2net_idle_timer(unsigned long data)
1277{
1278 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1279 struct timeval now;
1280
1281 do_gettimeofday(&now);
1282
1283 mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 "
1284 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc));
1285 mlog(ML_NOTICE, "here are some times that might help debug the "
1286 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1287 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
1288 sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec,
1289 now.tv_sec, now.tv_usec,
1290 sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec,
1291 sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec,
1292 sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec,
1293 sc->sc_msg_key, sc->sc_msg_type,
1294 sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec,
1295 sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec);
1296
1297 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
1298}
1299
1300static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
1301{
1302 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1303 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1304 O2NET_KEEPALIVE_DELAY_SECS * HZ);
1305 do_gettimeofday(&sc->sc_tv_timer);
1306 mod_timer(&sc->sc_idle_timeout,
1307 jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ));
1308}
1309
1310/* this work func is kicked whenever a path sets the nn state which doesn't
1311 * have valid set. This includes seeing hb come up, losing a connection,
1312 * having a connect attempt fail, etc. This centralizes the logic which decides
1313 * if a connect attempt should be made or if we should give up and all future
1314 * transmit attempts should fail */
1315static void o2net_start_connect(void *arg)
1316{
1317 struct o2net_node *nn = arg;
1318 struct o2net_sock_container *sc = NULL;
1319 struct o2nm_node *node = NULL;
1320 struct socket *sock = NULL;
1321 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
1322 int ret = 0;
1323
1324 /* if we're greater we initiate tx, otherwise we accept */
1325 if (o2nm_this_node() <= o2net_num_from_nn(nn))
1326 goto out;
1327
1328 /* watch for racing with tearing a node down */
1329 node = o2nm_get_node_by_num(o2net_num_from_nn(nn));
1330 if (node == NULL) {
1331 ret = 0;
1332 goto out;
1333 }
1334
1335 spin_lock(&nn->nn_lock);
1336 /* see if we already have one pending or have given up */
1337 if (nn->nn_sc || nn->nn_persistent_error)
1338 arg = NULL;
1339 spin_unlock(&nn->nn_lock);
1340 if (arg == NULL) /* *shrug*, needed some indicator */
1341 goto out;
1342
1343 nn->nn_last_connect_attempt = jiffies;
1344
1345 sc = sc_alloc(node);
1346 if (sc == NULL) {
1347 mlog(0, "couldn't allocate sc\n");
1348 ret = -ENOMEM;
1349 goto out;
1350 }
1351
1352 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1353 if (ret < 0) {
1354 mlog(0, "can't create socket: %d\n", ret);
1355 goto out;
1356 }
1357 sc->sc_sock = sock; /* freed by sc_kref_release */
1358
1359 sock->sk->sk_allocation = GFP_ATOMIC;
1360
1361 myaddr.sin_family = AF_INET;
1362 myaddr.sin_port = (__force u16)htons(0); /* any port */
1363
1364 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
1365 sizeof(myaddr));
1366 if (ret) {
1367 mlog(0, "bind failed: %d\n", ret);
1368 goto out;
1369 }
1370
1371 ret = o2net_set_nodelay(sc->sc_sock);
1372 if (ret) {
1373 mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
1374 goto out;
1375 }
1376
1377 o2net_register_callbacks(sc->sc_sock->sk, sc);
1378
1379 spin_lock(&nn->nn_lock);
1380 /* handshake completion will set nn->nn_sc_valid */
1381 o2net_set_nn_state(nn, sc, 0, 0);
1382 spin_unlock(&nn->nn_lock);
1383
1384 remoteaddr.sin_family = AF_INET;
1385 remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address;
1386 remoteaddr.sin_port = (__force u16)node->nd_ipv4_port;
1387
1388 ret = sc->sc_sock->ops->connect(sc->sc_sock,
1389 (struct sockaddr *)&remoteaddr,
1390 sizeof(remoteaddr),
1391 O_NONBLOCK);
1392 if (ret == -EINPROGRESS)
1393 ret = 0;
1394
1395out:
1396 if (ret) {
1397 mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
1398 "with errno %d\n", SC_NODEF_ARGS(sc), ret);
1399 /* 0 err so that another will be queued and attempted
1400 * from set_nn_state */
1401 if (sc)
1402 o2net_ensure_shutdown(nn, sc, 0);
1403 }
1404 if (sc)
1405 sc_put(sc);
1406 if (node)
1407 o2nm_node_put(node);
1408
1409 return;
1410}
1411
1412static void o2net_connect_expired(void *arg)
1413{
1414 struct o2net_node *nn = arg;
1415
1416 spin_lock(&nn->nn_lock);
1417 if (!nn->nn_sc_valid) {
1418 mlog(ML_ERROR, "no connection established with node %u after "
1419 "%u seconds, giving up and returning errors.\n",
1420 o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS);
1421
1422 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1423 }
1424 spin_unlock(&nn->nn_lock);
1425}
1426
1427static void o2net_still_up(void *arg)
1428{
1429 struct o2net_node *nn = arg;
1430
1431 o2quo_hb_still_up(o2net_num_from_nn(nn));
1432}
1433
1434/* ------------------------------------------------------------ */
1435
1436void o2net_disconnect_node(struct o2nm_node *node)
1437{
1438 struct o2net_node *nn = o2net_nn_from_num(node->nd_num);
1439
1440 /* don't reconnect until it's heartbeating again */
1441 spin_lock(&nn->nn_lock);
1442 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1443 spin_unlock(&nn->nn_lock);
1444
1445 if (o2net_wq) {
1446 cancel_delayed_work(&nn->nn_connect_expired);
1447 cancel_delayed_work(&nn->nn_connect_work);
1448 cancel_delayed_work(&nn->nn_still_up);
1449 flush_workqueue(o2net_wq);
1450 }
1451}
1452
1453static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
1454 void *data)
1455{
1456 o2quo_hb_down(node_num);
1457
1458 if (node_num != o2nm_this_node())
1459 o2net_disconnect_node(node);
1460}
1461
1462static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1463 void *data)
1464{
1465 struct o2net_node *nn = o2net_nn_from_num(node_num);
1466
1467 o2quo_hb_up(node_num);
1468
1469 /* ensure an immediate connect attempt */
1470 nn->nn_last_connect_attempt = jiffies -
1471 (msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
1472
1473 if (node_num != o2nm_this_node()) {
1474 /* heartbeat doesn't work unless a local node number is
1475 * configured and doing so brings up the o2net_wq, so we can
1476 * use it.. */
1477 queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
1478 O2NET_IDLE_TIMEOUT_SECS * HZ);
1479
1480 /* believe it or not, accept and node heartbeat testing
1481 * can succeed for this node before we get here.. so
1482 * only use set_nn_state to clear the persistent error
1483 * if that hasn't already happened */
1484 spin_lock(&nn->nn_lock);
1485 if (nn->nn_persistent_error)
1486 o2net_set_nn_state(nn, NULL, 0, 0);
1487 spin_unlock(&nn->nn_lock);
1488 }
1489}
1490
1491void o2net_unregister_hb_callbacks(void)
1492{
1493 int ret;
1494
1495 ret = o2hb_unregister_callback(&o2net_hb_up);
1496 if (ret < 0)
1497 mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
1498 "callback!\n", ret);
1499
1500 ret = o2hb_unregister_callback(&o2net_hb_down);
1501 if (ret < 0)
1502 mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
1503 "callback!\n", ret);
1504}
1505
1506int o2net_register_hb_callbacks(void)
1507{
1508 int ret;
1509
1510 o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB,
1511 o2net_hb_node_down_cb, NULL, O2NET_HB_PRI);
1512 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
1513 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
1514
1515 ret = o2hb_register_callback(&o2net_hb_up);
1516 if (ret == 0)
1517 ret = o2hb_register_callback(&o2net_hb_down);
1518
1519 if (ret)
1520 o2net_unregister_hb_callbacks();
1521
1522 return ret;
1523}
1524
1525/* ------------------------------------------------------------ */
1526
1527static int o2net_accept_one(struct socket *sock)
1528{
1529 int ret, slen;
1530 struct sockaddr_in sin;
1531 struct socket *new_sock = NULL;
1532 struct o2nm_node *node = NULL;
1533 struct o2net_sock_container *sc = NULL;
1534 struct o2net_node *nn;
1535
1536 BUG_ON(sock == NULL);
1537 ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
1538 sock->sk->sk_protocol, &new_sock);
1539 if (ret)
1540 goto out;
1541
1542 new_sock->type = sock->type;
1543 new_sock->ops = sock->ops;
1544 ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
1545 if (ret < 0)
1546 goto out;
1547
1548 new_sock->sk->sk_allocation = GFP_ATOMIC;
1549
1550 ret = o2net_set_nodelay(new_sock);
1551 if (ret) {
1552 mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
1553 goto out;
1554 }
1555
1556 slen = sizeof(sin);
1557 ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
1558 &slen, 1);
1559 if (ret < 0)
1560 goto out;
1561
1562 node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr);
1563 if (node == NULL) {
1564 mlog(ML_NOTICE, "attempt to connect from unknown node at "
1565 "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
1566 ntohs((__force __be16)sin.sin_port));
1567 ret = -EINVAL;
1568 goto out;
1569 }
1570
1571 if (o2nm_this_node() > node->nd_num) {
1572 mlog(ML_NOTICE, "unexpected connect attempted from a lower "
1573 "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n",
1574 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1575 ntohs((__force __be16)sin.sin_port), node->nd_num);
1576 ret = -EINVAL;
1577 goto out;
1578 }
1579
1580 /* this happens all the time when the other node sees our heartbeat
1581 * and tries to connect before we see their heartbeat */
1582 if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) {
1583 mlog(ML_CONN, "attempt to connect from node '%s' at "
1584 "%u.%u.%u.%u:%d but it isn't heartbeating\n",
1585 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1586 ntohs((__force __be16)sin.sin_port));
1587 ret = -EINVAL;
1588 goto out;
1589 }
1590
1591 nn = o2net_nn_from_num(node->nd_num);
1592
1593 spin_lock(&nn->nn_lock);
1594 if (nn->nn_sc)
1595 ret = -EBUSY;
1596 else
1597 ret = 0;
1598 spin_unlock(&nn->nn_lock);
1599 if (ret) {
1600 mlog(ML_NOTICE, "attempt to connect from node '%s' at "
1601 "%u.%u.%u.%u:%d but it already has an open connection\n",
1602 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1603 ntohs((__force __be16)sin.sin_port));
1604 goto out;
1605 }
1606
1607 sc = sc_alloc(node);
1608 if (sc == NULL) {
1609 ret = -ENOMEM;
1610 goto out;
1611 }
1612
1613 sc->sc_sock = new_sock;
1614 new_sock = NULL;
1615
1616 spin_lock(&nn->nn_lock);
1617 o2net_set_nn_state(nn, sc, 0, 0);
1618 spin_unlock(&nn->nn_lock);
1619
1620 o2net_register_callbacks(sc->sc_sock->sk, sc);
1621 o2net_sc_queue_work(sc, &sc->sc_rx_work);
1622
1623 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
1624
1625out:
1626 if (new_sock)
1627 sock_release(new_sock);
1628 if (node)
1629 o2nm_node_put(node);
1630 if (sc)
1631 sc_put(sc);
1632 return ret;
1633}
1634
1635static void o2net_accept_many(void *arg)
1636{
1637 struct socket *sock = arg;
1638 while (o2net_accept_one(sock) == 0)
1639 cond_resched();
1640}
1641
1642static void o2net_listen_data_ready(struct sock *sk, int bytes)
1643{
1644 void (*ready)(struct sock *sk, int bytes);
1645
1646 read_lock(&sk->sk_callback_lock);
1647 ready = sk->sk_user_data;
1648 if (ready == NULL) { /* check for teardown race */
1649 ready = sk->sk_data_ready;
1650 goto out;
1651 }
1652
1653 /* ->sk_data_ready is also called for a newly established child socket
1654 * before it has been accepted and the acceptor has set up their
1655 * data_ready.. we only want to queue listen work for our listening
1656 * socket */
1657 if (sk->sk_state == TCP_LISTEN) {
1658 mlog(ML_TCP, "bytes: %d\n", bytes);
1659 queue_work(o2net_wq, &o2net_listen_work);
1660 }
1661
1662out:
1663 read_unlock(&sk->sk_callback_lock);
1664 ready(sk, bytes);
1665}
1666
1667static int o2net_open_listening_sock(__be16 port)
1668{
1669 struct socket *sock = NULL;
1670 int ret;
1671 struct sockaddr_in sin = {
1672 .sin_family = PF_INET,
1673 .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) },
1674 .sin_port = (__force u16)port,
1675 };
1676
1677 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1678 if (ret < 0) {
1679 mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
1680 goto out;
1681 }
1682
1683 sock->sk->sk_allocation = GFP_ATOMIC;
1684
1685 write_lock_bh(&sock->sk->sk_callback_lock);
1686 sock->sk->sk_user_data = sock->sk->sk_data_ready;
1687 sock->sk->sk_data_ready = o2net_listen_data_ready;
1688 write_unlock_bh(&sock->sk->sk_callback_lock);
1689
1690 o2net_listen_sock = sock;
1691 INIT_WORK(&o2net_listen_work, o2net_accept_many, sock);
1692
1693 sock->sk->sk_reuse = 1;
1694 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
1695 if (ret < 0) {
1696 mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n",
1697 ntohs(port), ret);
1698 goto out;
1699 }
1700
1701 ret = sock->ops->listen(sock, 64);
1702 if (ret < 0) {
1703 mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n",
1704 ntohs(port), ret);
1705 }
1706
1707out:
1708 if (ret) {
1709 o2net_listen_sock = NULL;
1710 if (sock)
1711 sock_release(sock);
1712 }
1713 return ret;
1714}
1715
1716/*
1717 * called from node manager when we should bring up our network listening
1718 * socket. node manager handles all the serialization to only call this
1719 * once and to match it with o2net_stop_listening(). note,
1720 * o2nm_this_node() doesn't work yet as we're being called while it
1721 * is being set up.
1722 */
1723int o2net_start_listening(struct o2nm_node *node)
1724{
1725 int ret = 0;
1726
1727 BUG_ON(o2net_wq != NULL);
1728 BUG_ON(o2net_listen_sock != NULL);
1729
1730 mlog(ML_KTHREAD, "starting o2net thread...\n");
1731 o2net_wq = create_singlethread_workqueue("o2net");
1732 if (o2net_wq == NULL) {
1733 mlog(ML_ERROR, "unable to launch o2net thread\n");
1734 return -ENOMEM; /* ? */
1735 }
1736
1737 ret = o2net_open_listening_sock(node->nd_ipv4_port);
1738 if (ret) {
1739 destroy_workqueue(o2net_wq);
1740 o2net_wq = NULL;
1741 } else
1742 o2quo_conn_up(node->nd_num);
1743
1744 return ret;
1745}
1746
1747/* again, o2nm_this_node() doesn't work here as we're involved in
1748 * tearing it down */
1749void o2net_stop_listening(struct o2nm_node *node)
1750{
1751 struct socket *sock = o2net_listen_sock;
1752 size_t i;
1753
1754 BUG_ON(o2net_wq == NULL);
1755 BUG_ON(o2net_listen_sock == NULL);
1756
1757 /* stop the listening socket from generating work */
1758 write_lock_bh(&sock->sk->sk_callback_lock);
1759 sock->sk->sk_data_ready = sock->sk->sk_user_data;
1760 sock->sk->sk_user_data = NULL;
1761 write_unlock_bh(&sock->sk->sk_callback_lock);
1762
1763 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
1764 struct o2nm_node *node = o2nm_get_node_by_num(i);
1765 if (node) {
1766 o2net_disconnect_node(node);
1767 o2nm_node_put(node);
1768 }
1769 }
1770
1771 /* finish all work and tear down the work queue */
1772 mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n");
1773 destroy_workqueue(o2net_wq);
1774 o2net_wq = NULL;
1775
1776 sock_release(o2net_listen_sock);
1777 o2net_listen_sock = NULL;
1778
1779 o2quo_conn_err(node->nd_num);
1780}
1781
1782/* ------------------------------------------------------------ */
1783
1784int o2net_init(void)
1785{
1786 unsigned long i;
1787
1788 o2quo_init();
1789
1790 o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
1791 o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
1792 o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
1793 if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
1794 kfree(o2net_hand);
1795 kfree(o2net_keep_req);
1796 kfree(o2net_keep_resp);
1797 return -ENOMEM;
1798 }
1799
1800 o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
1801 o2net_hand->connector_id = cpu_to_be64(1);
1802
1803 o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC);
1804 o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC);
1805
1806 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
1807 struct o2net_node *nn = o2net_nn_from_num(i);
1808
1809 spin_lock_init(&nn->nn_lock);
1810 INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn);
1811 INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn);
1812 INIT_WORK(&nn->nn_still_up, o2net_still_up, nn);
1813		/* until we see hb from a node we'll return -ENOTCONN */
1814 nn->nn_persistent_error = -ENOTCONN;
1815 init_waitqueue_head(&nn->nn_sc_wq);
1816 idr_init(&nn->nn_status_idr);
1817 INIT_LIST_HEAD(&nn->nn_status_list);
1818 }
1819
1820 return 0;
1821}
1822
1823void o2net_exit(void)
1824{
1825 o2quo_exit();
1826 kfree(o2net_hand);
1827 kfree(o2net_keep_req);
1828 kfree(o2net_keep_resp);
1829}
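
[Editor's note] A side note on the allocation style in o2net_init() above: kcalloc(1, size, GFP_KERNEL) is just a zeroed single-object allocation, and kfree(NULL) is a no-op, which is why the error path can free all three pointers unconditionally. A minimal illustrative sketch (kzalloc(), where available in this era, is equivalent):

	/* the following two lines allocate the same thing: one zeroed
	 * struct o2net_msg, or NULL on failure */
	msg = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
	msg = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
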
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
new file mode 100644
index 00000000000..a6f4585501c
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp.h
@@ -0,0 +1,113 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * tcp.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_TCP_H
28#define O2CLUSTER_TCP_H
29
30#include <linux/socket.h>
31#ifdef __KERNEL__
32#include <net/sock.h>
33#include <linux/tcp.h>
34#else
35#include <sys/socket.h>
36#endif
37#include <linux/inet.h>
38#include <linux/in.h>
39
40struct o2net_msg
41{
42 __be16 magic;
43 __be16 data_len;
44 __be16 msg_type;
45 __be16 pad1;
46 __be32 sys_status;
47 __be32 status;
48 __be32 key;
49 __be32 msg_num;
50 __u8 buf[0];
51};
52
53typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data);
54
55#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg))
56
57/* TODO: figure this out.... */
58static inline int o2net_link_down(int err, struct socket *sock)
59{
60 if (sock) {
61 if (sock->sk->sk_state != TCP_ESTABLISHED &&
62 sock->sk->sk_state != TCP_CLOSE_WAIT)
63 return 1;
64 }
65
66 if (err >= 0)
67 return 0;
68 switch (err) {
69 /* ????????????????????????? */
70 case -ERESTARTSYS:
71 case -EBADF:
72 /* When the server has died, an ICMP port unreachable
73 * message prompts ECONNREFUSED. */
74 case -ECONNREFUSED:
75 case -ENOTCONN:
76 case -ECONNRESET:
77 case -EPIPE:
78 return 1;
79 }
80 return 0;
81}
82
83enum {
84 O2NET_DRIVER_UNINITED,
85 O2NET_DRIVER_READY,
86};
87
88int o2net_init_tcp_sock(struct inode *inode);
89int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
90 u8 target_node, int *status);
91int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
92 size_t veclen, u8 target_node, int *status);
93int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len,
94 struct inode *group);
95
96int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
97 o2net_msg_handler_func *func, void *data,
98 struct list_head *unreg_list);
99void o2net_unregister_handler_list(struct list_head *list);
100
101struct o2nm_node;
102int o2net_register_hb_callbacks(void);
103void o2net_unregister_hb_callbacks(void);
104int o2net_start_listening(struct o2nm_node *node);
105void o2net_stop_listening(struct o2nm_node *node);
106void o2net_disconnect_node(struct o2nm_node *node);
107
108int o2net_init(void);
109void o2net_exit(void);
110int o2net_proc_init(struct proc_dir_entry *parent);
111void o2net_proc_exit(struct proc_dir_entry *parent);
112
113#endif /* O2CLUSTER_TCP_H */
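
[Editor's note] The interface above boils down to handler registration keyed by (msg_type, key) plus synchronous sends that return the remote handler's status. A hedged usage sketch; the message type, key, and payload struct are hypothetical, and max_len is assumed to count payload bytes:

	#define MY_MSG_TYPE	1		/* hypothetical */
	#define MY_MSG_KEY	0x1234		/* hypothetical */

	struct my_payload {
		__be32 value;
	};

	/* runs on the receiving node; the return value comes back to the
	 * sender through o2net_send_message()'s *status */
	static int my_handler(struct o2net_msg *msg, u32 len, void *data)
	{
		struct my_payload *p = (struct my_payload *)msg->buf;
		return be32_to_cpu(p->value) ? 0 : -EINVAL;
	}

	static LIST_HEAD(my_unreg_list);

	static int my_send(u8 target_node)
	{
		struct my_payload p = { .value = cpu_to_be32(42) };
		int status = 0, ret;

		ret = o2net_register_handler(MY_MSG_TYPE, MY_MSG_KEY, sizeof(p),
					     my_handler, NULL, &my_unreg_list);
		if (ret)
			return ret;

		ret = o2net_send_message(MY_MSG_TYPE, MY_MSG_KEY, &p, sizeof(p),
					 target_node, &status);
		o2net_unregister_handler_list(&my_unreg_list);
		return ret ? ret : status;
	}
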
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
new file mode 100644
index 00000000000..ff9e2e2104c
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -0,0 +1,174 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef O2CLUSTER_TCP_INTERNAL_H
23#define O2CLUSTER_TCP_INTERNAL_H
24
25#define O2NET_MSG_MAGIC ((u16)0xfa55)
26#define O2NET_MSG_STATUS_MAGIC ((u16)0xfa56)
27#define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57)
28#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
29
30/* same as hb delay, we're waiting for another node to recognize our hb */
31#define O2NET_RECONNECT_DELAY_MS O2HB_REGION_TIMEOUT_MS
32
33/* we're delaying our quorum decision so that heartbeat will have timed
34 * out truly dead nodes by the time we come around to making decisions
35 * on their number */
36#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
37
38#define O2NET_KEEPALIVE_DELAY_SECS 5
39#define O2NET_IDLE_TIMEOUT_SECS 10
40
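
[Editor's note] To make O2NET_QUORUM_DELAY_MS concrete: both O2HB_REGION_TIMEOUT_MS and o2hb_dead_threshold live in the heartbeat code, so assume, purely for illustration, a 2000ms region timeout and a dead threshold of 7. The delay then works out to (7 + 2) * 2000 = 18000ms, i.e. quorum decisions are postponed a comfortable margin past the point where heartbeat would have declared a truly dead node down.
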
41/*
42 * This version number represents quite a lot, unfortunately. It not
43 * only represents the raw network message protocol on the wire but also
44 * locking semantics of the file system using the protocol. It should
45 * be somewhere else, I'm sure, but right now it isn't.
46 *
47 * New in version 2:
48 * - full 64 bit i_size in the metadata lock lvbs
49 * - introduction of "rw" lock and pushing meta/data locking down
50 */
51#define O2NET_PROTOCOL_VERSION 2ULL
52struct o2net_handshake {
53 __be64 protocol_version;
54 __be64 connector_id;
55};
56
57struct o2net_node {
58 /* this is never called from int/bh */
59 spinlock_t nn_lock;
60
61 /* set the moment an sc is allocated and a connect is started */
62 struct o2net_sock_container *nn_sc;
63 /* _valid is only set after the handshake passes and tx can happen */
64 unsigned nn_sc_valid:1;
65 /* if this is set tx just returns it */
66 int nn_persistent_error;
67
68 /* threads waiting for an sc to arrive wait on the wq for generation
69 * to increase. it is increased when a connecting socket succeeds
70 * or fails or when an accepted socket is attached. */
71 wait_queue_head_t nn_sc_wq;
72
73 struct idr nn_status_idr;
74 struct list_head nn_status_list;
75
76 /* connects are attempted from when heartbeat comes up until either hb
77 * goes down, the node is unconfigured, no connect attempts succeed
78 * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work
79 * is queued from set_nn_state both from hb up and from itself if a
80 * connect attempt fails and so can be self-arming. shutdown is
81 * careful to first mark the nn such that no connects will be attempted
82 * before canceling delayed connect work and flushing the queue. */
83 struct work_struct nn_connect_work;
84 unsigned long nn_last_connect_attempt;
85
86 /* this is queued as nodes come up and is canceled when a connection is
 87	 * established. when it expires we give up on the node and error out
 88	 * transmits */
89 struct work_struct nn_connect_expired;
90
91 /* after we give up on a socket we wait a while before deciding
92 * that it is still heartbeating and that we should do some
93 * quorum work */
94 struct work_struct nn_still_up;
95};
96
97struct o2net_sock_container {
98 struct kref sc_kref;
 99	/* the next two are valid for the lifetime of the sc */
100 struct socket *sc_sock;
101 struct o2nm_node *sc_node;
102
103 /* all of these sc work structs hold refs on the sc while they are
104 * queued. they should not be able to ref a freed sc. the teardown
105 * race is with o2net_wq destruction in o2net_stop_listening() */
106
107 /* rx and connect work are generated from socket callbacks. sc
108 * shutdown removes the callbacks and then flushes the work queue */
109 struct work_struct sc_rx_work;
110 struct work_struct sc_connect_work;
111	/* shutdown work is triggered in two ways. the simple way is
112	 * for a code path to call ensure_shutdown, which takes a lock, removes
113	 * the sc from the nn, and queues the work. in this case the
114	 * work is single-shot. the work is also queued from a sock
115	 * callback, though, and in that case the work will find the sc
116	 * still on the nn and will call ensure_shutdown itself. this
117	 * ends up triggering the shutdown work again, though nothing
118	 * will be done in that second iteration. so work queue teardown
119	 * has to be careful to remove the sc from the nn before waiting
120	 * on the work queue so that the shutdown work doesn't remove the
121	 * sc and rearm itself.
122	 */
123 struct work_struct sc_shutdown_work;
124
125 struct timer_list sc_idle_timeout;
126 struct work_struct sc_keepalive_work;
127
128 unsigned sc_handshake_ok:1;
129
130 struct page *sc_page;
131 size_t sc_page_off;
132
133 /* original handlers for the sockets */
134 void (*sc_state_change)(struct sock *sk);
135 void (*sc_data_ready)(struct sock *sk, int bytes);
136
137 struct timeval sc_tv_timer;
138 struct timeval sc_tv_data_ready;
139 struct timeval sc_tv_advance_start;
140 struct timeval sc_tv_advance_stop;
141 struct timeval sc_tv_func_start;
142 struct timeval sc_tv_func_stop;
143 u32 sc_msg_key;
144 u16 sc_msg_type;
145};
146
147struct o2net_msg_handler {
148 struct rb_node nh_node;
149 u32 nh_max_len;
150 u32 nh_msg_type;
151 u32 nh_key;
152 o2net_msg_handler_func *nh_func;
153 o2net_msg_handler_func *nh_func_data;
154 struct kref nh_kref;
155 struct list_head nh_unregister_item;
156};
157
158enum o2net_system_error {
159 O2NET_ERR_NONE = 0,
160 O2NET_ERR_NO_HNDLR,
161 O2NET_ERR_OVERFLOW,
162 O2NET_ERR_DIED,
163 O2NET_ERR_MAX
164};
165
166struct o2net_status_wait {
167 enum o2net_system_error ns_sys_status;
168 s32 ns_status;
169 int ns_id;
170 wait_queue_head_t ns_wq;
171 struct list_head ns_node_item;
172};
173
174#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
new file mode 100644
index 00000000000..7286c48bb30
--- /dev/null
+++ b/fs/ocfs2/cluster/ver.c
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "ver.h"
30
31#define CLUSTER_BUILD_VERSION "1.3.3"
32
33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
34
35void cluster_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
new file mode 100644
index 00000000000..32554c3382c
--- /dev/null
+++ b/fs/ocfs2/cluster/ver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef O2CLUSTER_VER_H
27#define O2CLUSTER_VER_H
28
29void cluster_print_version(void);
30
31#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
new file mode 100644
index 00000000000..bd85182e97b
--- /dev/null
+++ b/fs/ocfs2/dcache.c
@@ -0,0 +1,91 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dcache.c
5 *
6 * dentry cache handling code
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/namei.h>
30
31#define MLOG_MASK_PREFIX ML_DCACHE
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "alloc.h"
37#include "dcache.h"
38#include "file.h"
39#include "inode.h"
40
41static int ocfs2_dentry_revalidate(struct dentry *dentry,
42 struct nameidata *nd)
43{
44 struct inode *inode = dentry->d_inode;
45 int ret = 0; /* if all else fails, just return false */
46 struct ocfs2_super *osb;
47
48 mlog_entry("(0x%p, '%.*s')\n", dentry,
49 dentry->d_name.len, dentry->d_name.name);
50
51 /* Never trust a negative dentry - force a new lookup. */
52 if (inode == NULL) {
53 mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
54 dentry->d_name.name);
55 goto bail;
56 }
57
58 osb = OCFS2_SB(inode->i_sb);
59
60 BUG_ON(!osb);
61
62 if (inode != osb->root_inode) {
63 spin_lock(&OCFS2_I(inode)->ip_lock);
64 /* did we or someone else delete this inode? */
65 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
66 spin_unlock(&OCFS2_I(inode)->ip_lock);
67 mlog(0, "inode (%"MLFu64") deleted, returning false\n",
68 OCFS2_I(inode)->ip_blkno);
69 goto bail;
70 }
71 spin_unlock(&OCFS2_I(inode)->ip_lock);
72
73 if (!inode->i_nlink) {
74 mlog(0, "Inode %"MLFu64" orphaned, returning false "
75 "dir = %d\n", OCFS2_I(inode)->ip_blkno,
76 S_ISDIR(inode->i_mode));
77 goto bail;
78 }
79 }
80
81 ret = 1;
82
83bail:
84 mlog_exit(ret);
85
86 return ret;
87}
88
89struct dentry_operations ocfs2_dentry_ops = {
90 .d_revalidate = ocfs2_dentry_revalidate,
91};
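
[Editor's note] ocfs2_dentry_ops above only takes effect on dentries that point at it, so the lookup path has to attach it. A hedged sketch of how a 2.6-era lookup routine might wire it up (my_lookup is hypothetical; the real attachment lives in ocfs2's namei code):

	static struct dentry *my_lookup(struct inode *dir, struct dentry *dentry,
					struct nameidata *nd)
	{
		struct inode *inode = NULL;

		dentry->d_op = &ocfs2_dentry_ops;	/* revalidate on every walk */
		/* ... resolve the name on disk, setting inode ... */
		d_add(dentry, inode);
		return NULL;
	}
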
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
new file mode 100644
index 00000000000..90072771114
--- /dev/null
+++ b/fs/ocfs2/dcache.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dcache.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_DCACHE_H
27#define OCFS2_DCACHE_H
28
29extern struct dentry_operations ocfs2_dentry_ops;
30
31#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
new file mode 100644
index 00000000000..57158fa75d9
--- /dev/null
+++ b/fs/ocfs2/dir.c
@@ -0,0 +1,618 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dir.c
5 *
6 * Creates, reads, walks and deletes directory-nodes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * Portions of this code from linux/fs/ext3/dir.c
11 *
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 *
17 * from
18 *
19 * linux/fs/minix/dir.c
20 *
 21 * Copyright (C) 1991, 1992 Linus Torvalds
22 *
23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public
25 * License as published by the Free Software Foundation; either
26 * version 2 of the License, or (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
31 * General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public
34 * License along with this program; if not, write to the
35 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
36 * Boston, MA 021110-1307, USA.
37 */
38
39#include <linux/fs.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/highmem.h>
43
44#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h>
46
47#include "ocfs2.h"
48
49#include "alloc.h"
50#include "dir.h"
51#include "dlmglue.h"
52#include "extent_map.h"
53#include "file.h"
54#include "inode.h"
55#include "journal.h"
56#include "namei.h"
57#include "suballoc.h"
58#include "uptodate.h"
59
60#include "buffer_head_io.h"
61
62static unsigned char ocfs2_filetype_table[] = {
63 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
64};
65
66static int ocfs2_extend_dir(struct ocfs2_super *osb,
67 struct inode *dir,
68 struct buffer_head *parent_fe_bh,
69 struct buffer_head **new_de_bh);
70/*
71 * ocfs2_readdir()
72 *
73 */
74int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
75{
76 int error = 0;
77 unsigned long offset, blk;
78 int i, num, stored;
79 struct buffer_head * bh, * tmp;
80 struct ocfs2_dir_entry * de;
81 int err;
82 struct inode *inode = filp->f_dentry->d_inode;
83 struct super_block * sb = inode->i_sb;
84 int have_disk_lock = 0;
85
86 mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
87
88 stored = 0;
89 bh = NULL;
90
91 error = ocfs2_meta_lock(inode, NULL, NULL, 0);
92 if (error < 0) {
93 if (error != -ENOENT)
94 mlog_errno(error);
95 /* we haven't got any yet, so propagate the error. */
96 stored = error;
97 goto bail;
98 }
99 have_disk_lock = 1;
100
101 offset = filp->f_pos & (sb->s_blocksize - 1);
102
103 while (!error && !stored && filp->f_pos < i_size_read(inode)) {
104 blk = (filp->f_pos) >> sb->s_blocksize_bits;
105 bh = ocfs2_bread(inode, blk, &err, 0);
106 if (!bh) {
107 mlog(ML_ERROR, "directory #%"MLFu64" contains a hole "
108 "at offset %lld\n",
109 OCFS2_I(inode)->ip_blkno,
110 filp->f_pos);
111 filp->f_pos += sb->s_blocksize - offset;
112 continue;
113 }
114
115 /*
116 * Do the readahead (8k)
117 */
118 if (!offset) {
119 for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
120 i > 0; i--) {
121 tmp = ocfs2_bread(inode, ++blk, &err, 1);
122 if (tmp)
123 brelse(tmp);
124 }
125 }
126
127revalidate:
128 /* If the dir block has changed since the last call to
129 * readdir(2), then we might be pointing to an invalid
130 * dirent right now. Scan from the start of the block
131 * to make sure. */
132 if (filp->f_version != inode->i_version) {
133 for (i = 0; i < sb->s_blocksize && i < offset; ) {
134 de = (struct ocfs2_dir_entry *) (bh->b_data + i);
135 /* It's too expensive to do a full
136 * dirent test each time round this
137 * loop, but we do have to test at
138 * least that it is non-zero. A
139 * failure will be detected in the
140 * dirent test below. */
141 if (le16_to_cpu(de->rec_len) <
142 OCFS2_DIR_REC_LEN(1))
143 break;
144 i += le16_to_cpu(de->rec_len);
145 }
146 offset = i;
147 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
148 | offset;
149 filp->f_version = inode->i_version;
150 }
151
152 while (!error && filp->f_pos < i_size_read(inode)
153 && offset < sb->s_blocksize) {
154 de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
155 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
156 /* On error, skip the f_pos to the
157 next block. */
158 filp->f_pos = (filp->f_pos |
159 (sb->s_blocksize - 1)) + 1;
160 brelse(bh);
161 goto bail;
162 }
163 offset += le16_to_cpu(de->rec_len);
164 if (le64_to_cpu(de->inode)) {
165 /* We might block in the next section
166 * if the data destination is
167 * currently swapped out. So, use a
168 * version stamp to detect whether or
169 * not the directory has been modified
170 * during the copy operation.
171 */
172 unsigned long version = filp->f_version;
173 unsigned char d_type = DT_UNKNOWN;
174
175 if (de->file_type < OCFS2_FT_MAX)
176 d_type = ocfs2_filetype_table[de->file_type];
177 error = filldir(dirent, de->name,
178 de->name_len,
179 filp->f_pos,
180 ino_from_blkno(sb, le64_to_cpu(de->inode)),
181 d_type);
182 if (error)
183 break;
184 if (version != filp->f_version)
185 goto revalidate;
186 stored ++;
187 }
188 filp->f_pos += le16_to_cpu(de->rec_len);
189 }
190 offset = 0;
191 brelse(bh);
192 }
193
194 stored = 0;
195bail:
196 if (have_disk_lock)
197 ocfs2_meta_unlock(inode, 0);
198
199 mlog_exit(stored);
200
201 return stored;
202}
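
[Editor's note] ocfs2_readdir() above hands each entry to the filldir_t callback supplied by the VFS; the callback copies one entry to the caller and returns non-zero to stop the walk, which is also why the code re-checks f_version after each call. A hedged sketch of a callback matching the 2.6-era filldir_t signature (the buffer struct and all names here are hypothetical):

	/* 2.6-era filldir_t: (buf, name, namelen, offset, ino, d_type) */
	static int my_filldir(void *__buf, const char *name, int namelen,
			      loff_t offset, ino_t ino, unsigned int d_type)
	{
		struct my_dirent_buf *buf = __buf;	/* hypothetical */

		if (buf->remaining < namelen)
			return -EINVAL;	/* non-zero halts ocfs2_readdir's loop */
		/* ... copy name/ino/d_type into the user buffer ... */
		return 0;		/* keep walking */
	}
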
203
204/*
205 * NOTE: this should always be called with parent dir i_mutex taken.
206 */
207int ocfs2_find_files_on_disk(const char *name,
208 int namelen,
209 u64 *blkno,
210 struct inode *inode,
211 struct buffer_head **dirent_bh,
212 struct ocfs2_dir_entry **dirent)
213{
214 int status = -ENOENT;
215 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
216
217 mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
218 "inode=%p)\n",
219 osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
220
221 *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
222 if (!*dirent_bh || !*dirent) {
223 status = -ENOENT;
224 goto leave;
225 }
226
227 *blkno = le64_to_cpu((*dirent)->inode);
228
229 status = 0;
230leave:
231 if (status < 0) {
232 *dirent = NULL;
233 if (*dirent_bh) {
234 brelse(*dirent_bh);
235 *dirent_bh = NULL;
236 }
237 }
238
239 mlog_exit(status);
240 return status;
241}
242
243/* Check for a name within a directory.
244 *
245 * Return 0 if the name does not exist
246 * Return -EEXIST if the directory contains the name
247 *
248 * Callers should have i_mutex + a cluster lock on dir
249 */
250int ocfs2_check_dir_for_entry(struct inode *dir,
251 const char *name,
252 int namelen)
253{
254 int ret;
255 struct buffer_head *dirent_bh = NULL;
256 struct ocfs2_dir_entry *dirent = NULL;
257
258 mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno,
259 namelen, name);
260
261 ret = -EEXIST;
262 dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
263 if (dirent_bh)
264 goto bail;
265
266 ret = 0;
267bail:
268 if (dirent_bh)
269 brelse(dirent_bh);
270
271 mlog_exit(ret);
272 return ret;
273}
274
275/*
276 * routine to check that the specified directory is empty (for rmdir)
277 */
278int ocfs2_empty_dir(struct inode *inode)
279{
280 unsigned long offset;
281 struct buffer_head * bh;
282 struct ocfs2_dir_entry * de, * de1;
283 struct super_block * sb;
284 int err;
285
286 sb = inode->i_sb;
287 if ((i_size_read(inode) <
288 (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
289 !(bh = ocfs2_bread(inode, 0, &err, 0))) {
290 mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
291 "no data block\n",
292 OCFS2_I(inode)->ip_blkno);
293 return 1;
294 }
295
296 de = (struct ocfs2_dir_entry *) bh->b_data;
297 de1 = (struct ocfs2_dir_entry *)
298 ((char *)de + le16_to_cpu(de->rec_len));
299 if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
300 !le64_to_cpu(de1->inode) ||
301 strcmp(".", de->name) ||
302 strcmp("..", de1->name)) {
303 mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
304 "no `.' or `..'\n",
305 OCFS2_I(inode)->ip_blkno);
306 brelse(bh);
307 return 1;
308 }
309 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
310 de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
311 while (offset < i_size_read(inode) ) {
312 if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
313 brelse(bh);
314 bh = ocfs2_bread(inode,
315 offset >> sb->s_blocksize_bits, &err, 0);
316 if (!bh) {
317 mlog(ML_ERROR, "directory #%"MLFu64" contains "
318 "a hole at offset %lu\n",
319 OCFS2_I(inode)->ip_blkno, offset);
320 offset += sb->s_blocksize;
321 continue;
322 }
323 de = (struct ocfs2_dir_entry *) bh->b_data;
324 }
325 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
326 brelse(bh);
327 return 1;
328 }
329 if (le64_to_cpu(de->inode)) {
330 brelse(bh);
331 return 0;
332 }
333 offset += le16_to_cpu(de->rec_len);
334 de = (struct ocfs2_dir_entry *)
335 ((char *)de + le16_to_cpu(de->rec_len));
336 }
337 brelse(bh);
338 return 1;
339}
340
341/* returns a bh of the 1st new block in the allocation. */
342int ocfs2_do_extend_dir(struct super_block *sb,
343 struct ocfs2_journal_handle *handle,
344 struct inode *dir,
345 struct buffer_head *parent_fe_bh,
346 struct ocfs2_alloc_context *data_ac,
347 struct ocfs2_alloc_context *meta_ac,
348 struct buffer_head **new_bh)
349{
350 int status;
351 int extend;
352 u64 p_blkno;
353
354 spin_lock(&OCFS2_I(dir)->ip_lock);
355 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
356 spin_unlock(&OCFS2_I(dir)->ip_lock);
357
358 if (extend) {
359 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
360 parent_fe_bh, handle,
361 data_ac, meta_ac, NULL);
362 BUG_ON(status == -EAGAIN);
363 if (status < 0) {
364 mlog_errno(status);
365 goto bail;
366 }
367 }
368
369 status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
370 (sb->s_blocksize_bits - 9)),
371 1, &p_blkno, NULL);
372 if (status < 0) {
373 mlog_errno(status);
374 goto bail;
375 }
376
377 *new_bh = sb_getblk(sb, p_blkno);
378 if (!*new_bh) {
379 status = -EIO;
380 mlog_errno(status);
381 goto bail;
382 }
383 status = 0;
384bail:
385 mlog_exit(status);
386 return status;
387}
388
389/* assumes you already have a cluster lock on the directory. */
390static int ocfs2_extend_dir(struct ocfs2_super *osb,
391 struct inode *dir,
392 struct buffer_head *parent_fe_bh,
393 struct buffer_head **new_de_bh)
394{
395 int status = 0;
396 int credits, num_free_extents;
397 loff_t dir_i_size;
398 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
399 struct ocfs2_alloc_context *data_ac = NULL;
400 struct ocfs2_alloc_context *meta_ac = NULL;
401 struct ocfs2_journal_handle *handle = NULL;
402 struct buffer_head *new_bh = NULL;
403 struct ocfs2_dir_entry * de;
404 struct super_block *sb = osb->sb;
405
406 mlog_entry_void();
407
408 dir_i_size = i_size_read(dir);
409 mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n",
410 OCFS2_I(dir)->ip_blkno, dir_i_size);
411
412 handle = ocfs2_alloc_handle(osb);
413 if (handle == NULL) {
414 status = -ENOMEM;
415 mlog_errno(status);
416 goto bail;
417 }
418
419 /* dir->i_size is always block aligned. */
420 spin_lock(&OCFS2_I(dir)->ip_lock);
421 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
422 spin_unlock(&OCFS2_I(dir)->ip_lock);
423 num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
424 if (num_free_extents < 0) {
425 status = num_free_extents;
426 mlog_errno(status);
427 goto bail;
428 }
429
430 if (!num_free_extents) {
431 status = ocfs2_reserve_new_metadata(osb, handle,
432 fe, &meta_ac);
433 if (status < 0) {
434 if (status != -ENOSPC)
435 mlog_errno(status);
436 goto bail;
437 }
438 }
439
440 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
441 if (status < 0) {
442 if (status != -ENOSPC)
443 mlog_errno(status);
444 goto bail;
445 }
446
447 credits = ocfs2_calc_extend_credits(sb, fe, 1);
448 } else {
449 spin_unlock(&OCFS2_I(dir)->ip_lock);
450 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
451 }
452
453 handle = ocfs2_start_trans(osb, handle, credits);
454 if (IS_ERR(handle)) {
455 status = PTR_ERR(handle);
456 handle = NULL;
457 mlog_errno(status);
458 goto bail;
459 }
460
461 status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
462 data_ac, meta_ac, &new_bh);
463 if (status < 0) {
464 mlog_errno(status);
465 goto bail;
466 }
467
468 ocfs2_set_new_buffer_uptodate(dir, new_bh);
469
470 status = ocfs2_journal_access(handle, dir, new_bh,
471 OCFS2_JOURNAL_ACCESS_CREATE);
472 if (status < 0) {
473 mlog_errno(status);
474 goto bail;
475 }
476 memset(new_bh->b_data, 0, sb->s_blocksize);
477 de = (struct ocfs2_dir_entry *) new_bh->b_data;
478 de->inode = 0;
479 de->rec_len = cpu_to_le16(sb->s_blocksize);
480 status = ocfs2_journal_dirty(handle, new_bh);
481 if (status < 0) {
482 mlog_errno(status);
483 goto bail;
484 }
485
486 dir_i_size += dir->i_sb->s_blocksize;
487 i_size_write(dir, dir_i_size);
488 dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
489 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
490 if (status < 0) {
491 mlog_errno(status);
492 goto bail;
493 }
494
495 *new_de_bh = new_bh;
496 get_bh(*new_de_bh);
497bail:
498 if (handle)
499 ocfs2_commit_trans(handle);
500
501 if (data_ac)
502 ocfs2_free_alloc_context(data_ac);
503 if (meta_ac)
504 ocfs2_free_alloc_context(meta_ac);
505
506 if (new_bh)
507 brelse(new_bh);
508
509 mlog_exit(status);
510 return status;
511}
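
[Editor's note] ocfs2_extend_dir() above follows the recurring ocfs2 journaling shape: allocate a handle, reserve any metadata/data allocations while no transaction is open, start the transaction with a credit estimate, declare each buffer with a journal_access call before modifying it, mark it dirty afterwards, and commit. Reduced to a skeleton (error handling and the reservation calls elided; names as in the function above):

	handle = ocfs2_alloc_handle(osb);
	/* ... reserve clusters/metadata against this handle ... */
	handle = ocfs2_start_trans(osb, handle, credits);
	ocfs2_journal_access(handle, dir, bh, OCFS2_JOURNAL_ACCESS_CREATE);
	/* ... modify bh->b_data while the transaction is open ... */
	ocfs2_journal_dirty(handle, bh);
	ocfs2_commit_trans(handle);
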
512
513/*
514 * Search the dir for a good spot, extending it if necessary. The
515 * block containing an appropriate record is returned in ret_de_bh.
516 */
517int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
518 struct inode *dir,
519 struct buffer_head *parent_fe_bh,
520 const char *name,
521 int namelen,
522 struct buffer_head **ret_de_bh)
523{
524 unsigned long offset;
525 struct buffer_head * bh = NULL;
526 unsigned short rec_len;
527 struct ocfs2_dinode *fe;
528 struct ocfs2_dir_entry *de;
529 struct super_block *sb;
530 int status;
531
532 mlog_entry_void();
533
534 mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n",
535 namelen, OCFS2_I(dir)->ip_blkno);
536
537 BUG_ON(!S_ISDIR(dir->i_mode));
538 fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
539 BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
540
541 sb = dir->i_sb;
542
543 if (!namelen) {
544 status = -EINVAL;
545 mlog_errno(status);
546 goto bail;
547 }
548
549 bh = ocfs2_bread(dir, 0, &status, 0);
550 if (!bh) {
551 mlog_errno(status);
552 goto bail;
553 }
554
555 rec_len = OCFS2_DIR_REC_LEN(namelen);
556 offset = 0;
557 de = (struct ocfs2_dir_entry *) bh->b_data;
558 while (1) {
559 if ((char *)de >= sb->s_blocksize + bh->b_data) {
560 brelse(bh);
561 bh = NULL;
562
563 if (i_size_read(dir) <= offset) {
564 status = ocfs2_extend_dir(osb,
565 dir,
566 parent_fe_bh,
567 &bh);
568 if (status < 0) {
569 mlog_errno(status);
570 goto bail;
571 }
572 BUG_ON(!bh);
573 *ret_de_bh = bh;
574 get_bh(*ret_de_bh);
575 goto bail;
576 }
577 bh = ocfs2_bread(dir,
578 offset >> sb->s_blocksize_bits,
579 &status,
580 0);
581 if (!bh) {
582 mlog_errno(status);
583 goto bail;
584 }
585 /* move to next block */
586 de = (struct ocfs2_dir_entry *) bh->b_data;
587 }
588 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
589 status = -ENOENT;
590 goto bail;
591 }
592 if (ocfs2_match(namelen, name, de)) {
593 status = -EEXIST;
594 goto bail;
595 }
596 if (((le64_to_cpu(de->inode) == 0) &&
597 (le16_to_cpu(de->rec_len) >= rec_len)) ||
598 (le16_to_cpu(de->rec_len) >=
599 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
600 /* Ok, we found a spot. Return this bh and let
601 * the caller actually fill it in. */
602 *ret_de_bh = bh;
603 get_bh(*ret_de_bh);
604 status = 0;
605 goto bail;
606 }
607 offset += le16_to_cpu(de->rec_len);
608 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
609 }
610
611 status = 0;
612bail:
613 if (bh)
614 brelse(bh);
615
616 mlog_exit(status);
617 return status;
618}
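
[Editor's note] The slot test in the loop above is the classic ext2-style rule: an empty dirent can be reused if its rec_len covers the new record, and a live dirent can be split if its rec_len has enough slack beyond its own name. A hypothetical helper expressing the same predicate as the if () in the loop:

	static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
						 unsigned short rec_len)
	{
		if (!le64_to_cpu(de->inode) &&
		    le16_to_cpu(de->rec_len) >= rec_len)
			return 1;	/* unused slot, big enough */
		if (le16_to_cpu(de->rec_len) >=
		    OCFS2_DIR_REC_LEN(de->name_len) + rec_len)
			return 1;	/* live slot with slack to split */
		return 0;
	}
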
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
new file mode 100644
index 00000000000..5f614ec9649
--- /dev/null
+++ b/fs/ocfs2/dir.h
@@ -0,0 +1,54 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dir.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_DIR_H
27#define OCFS2_DIR_H
28
29int ocfs2_check_dir_for_entry(struct inode *dir,
30 const char *name,
31 int namelen);
32int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */
33int ocfs2_find_files_on_disk(const char *name,
34 int namelen,
35 u64 *blkno,
36 struct inode *inode,
37 struct buffer_head **dirent_bh,
38 struct ocfs2_dir_entry **dirent);
39int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
40int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
41 struct inode *dir,
42 struct buffer_head *parent_fe_bh,
43 const char *name,
44 int namelen,
45 struct buffer_head **ret_de_bh);
46struct ocfs2_alloc_context;
47int ocfs2_do_extend_dir(struct super_block *sb,
48 struct ocfs2_journal_handle *handle,
49 struct inode *dir,
50 struct buffer_head *parent_fe_bh,
51 struct ocfs2_alloc_context *data_ac,
52 struct ocfs2_alloc_context *meta_ac,
53 struct buffer_head **new_bh);
54#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
new file mode 100644
index 00000000000..ce3f7c29d27
--- /dev/null
+++ b/fs/ocfs2/dlm/Makefile
@@ -0,0 +1,8 @@
1EXTRA_CFLAGS += -Ifs/ocfs2
2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
7
8ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
new file mode 100644
index 00000000000..53652f51c0e
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -0,0 +1,214 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmapi.h
5 *
6 * externally exported dlm interfaces
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27#ifndef DLMAPI_H
28#define DLMAPI_H
29
30struct dlm_lock;
31struct dlm_ctxt;
32
33/* NOTE: changes made to this enum should be reflected in dlmdebug.c */
34enum dlm_status {
35 DLM_NORMAL = 0, /* 0: request in progress */
36 DLM_GRANTED, /* 1: request granted */
37 DLM_DENIED, /* 2: request denied */
38 DLM_DENIED_NOLOCKS, /* 3: request denied, out of system resources */
39 DLM_WORKING, /* 4: async request in progress */
40 DLM_BLOCKED, /* 5: lock request blocked */
 41	DLM_BLOCKED_ORPHAN,       /* 6: lock request blocked by an orphan lock */
42 DLM_DENIED_GRACE_PERIOD, /* 7: topological change in progress */
43 DLM_SYSERR, /* 8: system error */
44 DLM_NOSUPPORT, /* 9: unsupported */
45 DLM_CANCELGRANT, /* 10: can't cancel convert: already granted */
46 DLM_IVLOCKID, /* 11: bad lockid */
47 DLM_SYNC, /* 12: synchronous request granted */
48 DLM_BADTYPE, /* 13: bad resource type */
49 DLM_BADRESOURCE, /* 14: bad resource handle */
50 DLM_MAXHANDLES, /* 15: no more resource handles */
51 DLM_NOCLINFO, /* 16: can't contact cluster manager */
52 DLM_NOLOCKMGR, /* 17: can't contact lock manager */
53 DLM_NOPURGED, /* 18: can't contact purge daemon */
54 DLM_BADARGS, /* 19: bad api args */
55 DLM_VOID, /* 20: no status */
56 DLM_NOTQUEUED, /* 21: NOQUEUE was specified and request failed */
57 DLM_IVBUFLEN, /* 22: invalid resource name length */
58 DLM_CVTUNGRANT, /* 23: attempted to convert ungranted lock */
59 DLM_BADPARAM, /* 24: invalid lock mode specified */
60 DLM_VALNOTVALID, /* 25: value block has been invalidated */
61 DLM_REJECTED, /* 26: request rejected, unrecognized client */
62 DLM_ABORT, /* 27: blocked lock request cancelled */
63 DLM_CANCEL, /* 28: conversion request cancelled */
64 DLM_IVRESHANDLE, /* 29: invalid resource handle */
65 DLM_DEADLOCK, /* 30: deadlock recovery refused this request */
66 DLM_DENIED_NOASTS, /* 31: failed to allocate AST */
67 DLM_FORWARD, /* 32: request must wait for primary's response */
68 DLM_TIMEOUT, /* 33: timeout value for lock has expired */
69 DLM_IVGROUPID, /* 34: invalid group specification */
70 DLM_VERS_CONFLICT, /* 35: version conflicts prevent request handling */
71 DLM_BAD_DEVICE_PATH, /* 36: Locks device does not exist or path wrong */
 72	DLM_NO_DEVICE_PERMISSION,  /* 37: Client has insufficient permission for device */
73 DLM_NO_CONTROL_DEVICE, /* 38: Cannot set options on opened device */
74
75 DLM_RECOVERING, /* 39: extension, allows caller to fail a lock
76 request if it is being recovered */
77 DLM_MIGRATING, /* 40: extension, allows caller to fail a lock
78 request if it is being migrated */
79 DLM_MAXSTATS, /* 41: upper limit for return code validation */
80};
81
82/* for pretty-printing dlm_status error messages */
83const char *dlm_errmsg(enum dlm_status err);
84/* for pretty-printing dlm_status error names */
85const char *dlm_errname(enum dlm_status err);
86
87/* Eventually the DLM will use standard errno values, but in the
88 * meantime this lets us track dlm errors as they bubble up. When we
89 * bring its error reporting into line with the rest of the stack,
90 * these can just be replaced with calls to mlog_errno. */
91#define dlm_error(st) do { \
92 if ((st) != DLM_RECOVERING && \
93 (st) != DLM_MIGRATING && \
94 (st) != DLM_FORWARD) \
95 mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \
96} while (0)
97
98#define DLM_LKSB_UNUSED1 0x01
99#define DLM_LKSB_PUT_LVB 0x02
100#define DLM_LKSB_GET_LVB 0x04
101#define DLM_LKSB_UNUSED2 0x08
102#define DLM_LKSB_UNUSED3 0x10
103#define DLM_LKSB_UNUSED4 0x20
104#define DLM_LKSB_UNUSED5 0x40
105#define DLM_LKSB_UNUSED6 0x80
106
107#define DLM_LVB_LEN 64
108
109/* Callers are only allowed access to the lvb and status members of
110 * this struct. */
111struct dlm_lockstatus {
112 enum dlm_status status;
113 u32 flags;
114 struct dlm_lock *lockid;
115 char lvb[DLM_LVB_LEN];
116};
117
118/* Valid lock modes. */
119#define LKM_IVMODE (-1) /* invalid mode */
120#define LKM_NLMODE 0 /* null lock */
121#define LKM_CRMODE 1 /* concurrent read unsupported */
122#define LKM_CWMODE 2 /* concurrent write unsupported */
123#define LKM_PRMODE 3 /* protected read */
124#define LKM_PWMODE 4 /* protected write unsupported */
125#define LKM_EXMODE 5 /* exclusive */
126#define LKM_MAXMODE 5
127#define LKM_MODEMASK 0xff
128
129/* Flags passed to dlmlock and dlmunlock:
130 * reserved: flags used by the "real" dlm
131 * only a few are supported by this dlm
132 * (U) = unsupported by ocfs2 dlm */
133#define LKM_ORPHAN 0x00000010 /* this lock is orphanable (U) */
134#define LKM_PARENTABLE 0x00000020 /* this lock was orphaned (U) */
135#define LKM_BLOCK 0x00000040 /* blocking lock request (U) */
136#define LKM_LOCAL 0x00000080 /* local lock request */
137#define LKM_VALBLK 0x00000100 /* lock value block request */
138#define LKM_NOQUEUE 0x00000200 /* non blocking request */
139#define LKM_CONVERT 0x00000400 /* conversion request */
140#define LKM_NODLCKWT       0x00000800  /* this lock won't deadlock (U) */
141#define LKM_UNLOCK 0x00001000 /* deallocate this lock */
142#define LKM_CANCEL 0x00002000 /* cancel conversion request */
143#define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */
144#define LKM_INVVALBLK 0x00008000 /* invalidate lock value block */
145#define LKM_SYNCSTS        0x00010000  /* return synchronous status if possible (U) */
146#define LKM_TIMEOUT 0x00020000 /* lock request contains timeout (U) */
147#define LKM_SNGLDLCK 0x00040000 /* request can self-deadlock (U) */
148#define LKM_FINDLOCAL 0x00080000 /* find local lock request (U) */
149#define LKM_PROC_OWNED 0x00100000 /* owned by process, not group (U) */
150#define LKM_XID 0x00200000 /* use transaction id for deadlock (U) */
151#define LKM_XID_CONFLICT 0x00400000 /* do not allow lock inheritance (U) */
152#define LKM_FORCE 0x00800000 /* force unlock flag */
153#define LKM_REVVALBLK 0x01000000 /* temporary solution: re-validate
154 lock value block (U) */
155/* unused */
156#define LKM_UNUSED1 0x00000001 /* unused */
157#define LKM_UNUSED2 0x00000002 /* unused */
158#define LKM_UNUSED3 0x00000004 /* unused */
159#define LKM_UNUSED4 0x00000008 /* unused */
160#define LKM_UNUSED5 0x02000000 /* unused */
161#define LKM_UNUSED6 0x04000000 /* unused */
162#define LKM_UNUSED7 0x08000000 /* unused */
163
164/* ocfs2 extensions: internal only
165 * should never be used by caller */
166#define LKM_MIGRATION 0x10000000 /* extension: lockres is to be migrated
167 to another node */
168#define LKM_PUT_LVB 0x20000000 /* extension: lvb is being passed
169 should be applied to lockres */
170#define LKM_GET_LVB 0x40000000 /* extension: lvb should be copied
171 from lockres when lock is granted */
172#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock
173 used to avoid recovery rwsem */
174
175
176typedef void (dlm_astlockfunc_t)(void *);
177typedef void (dlm_bastlockfunc_t)(void *, int);
178typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status);
179
180enum dlm_status dlmlock(struct dlm_ctxt *dlm,
181 int mode,
182 struct dlm_lockstatus *lksb,
183 int flags,
184 const char *name,
185 dlm_astlockfunc_t *ast,
186 void *data,
187 dlm_bastlockfunc_t *bast);
188
189enum dlm_status dlmunlock(struct dlm_ctxt *dlm,
190 struct dlm_lockstatus *lksb,
191 int flags,
192 dlm_astunlockfunc_t *unlockast,
193 void *data);
194
195struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key);
196
197void dlm_unregister_domain(struct dlm_ctxt *dlm);
198
199void dlm_print_one_lock(struct dlm_lock *lockid);
200
201typedef void (dlm_eviction_func)(int, void *);
202struct dlm_eviction_cb {
203 struct list_head ec_item;
204 dlm_eviction_func *ec_func;
205 void *ec_data;
206};
207void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
208 dlm_eviction_func *f,
209 void *data);
210void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
211 struct dlm_eviction_cb *cb);
212void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb);
213
214#endif /* DLMAPI_H */
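
[Editor's note] Tying this header's pieces together: a user registers a domain, issues an asynchronous dlmlock() with an ast callback, and learns the outcome when the ast fires. A hedged sketch using a completion to wait — every name here is illustrative, the waiting scheme is not how ocfs2 itself consumes the dlm, and real lock names have dlm-imposed length rules:

	static DECLARE_COMPLETION(my_ast_done);		/* hypothetical */

	static void my_ast(void *astdata)
	{
		/* the lksb passed as astdata now holds the result */
		complete(&my_ast_done);
	}

	static int my_take_ex_lock(struct dlm_ctxt *dlm,
				   struct dlm_lockstatus *lksb)
	{
		enum dlm_status st;

		st = dlmlock(dlm, LKM_EXMODE, lksb, LKM_NOQUEUE,
			     "my_lock_name", my_ast, lksb, NULL);
		if (st != DLM_NORMAL)
			return -EINVAL;	/* e.g. DLM_NOTQUEUED, DLM_BADARGS */
		wait_for_completion(&my_ast_done);
		return lksb->status == DLM_NORMAL ? 0 : -EAGAIN;
	}
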
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
new file mode 100644
index 00000000000..8d17d28ef91
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -0,0 +1,466 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmast.c
5 *
6 * AST and BAST functionality for local and remote nodes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41
42
43#include "cluster/heartbeat.h"
44#include "cluster/nodemanager.h"
45#include "cluster/tcp.h"
46#include "cluster/endian.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50
51#define MLOG_MASK_PREFIX ML_DLM
52#include "cluster/masklog.h"
53
54static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
55 struct dlm_lock *lock);
56static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
57
58/* Should be called as an ast gets queued to see if the new
59 * lock level will obsolete a pending bast.
60 * For example, if dlm_thread queued a bast for an EX lock that
61 * was blocking another EX, but before sending the bast the
62 * lock owner downconverted to NL, the bast is now obsolete.
63 * Only the ast should be sent.
64 * This is needed because the lock and convert paths can queue
65 * asts out-of-band (not waiting for dlm_thread) in order to
66 * allow for LKM_NOQUEUE to get immediate responses. */
67static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
68{
69 assert_spin_locked(&dlm->ast_lock);
70 assert_spin_locked(&lock->spinlock);
71
72 if (lock->ml.highest_blocked == LKM_IVMODE)
73 return 0;
74 BUG_ON(lock->ml.highest_blocked == LKM_NLMODE);
75
76 if (lock->bast_pending &&
77 list_empty(&lock->bast_list))
78 /* old bast already sent, ok */
79 return 0;
80
81 if (lock->ml.type == LKM_EXMODE)
82 /* EX blocks anything left, any bast still valid */
83 return 0;
84 else if (lock->ml.type == LKM_NLMODE)
85 /* NL blocks nothing, no reason to send any bast, cancel it */
86 return 1;
87 else if (lock->ml.highest_blocked != LKM_EXMODE)
88 /* PR only blocks EX */
89 return 1;
90
91 return 0;
92}
93
94static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
95{
96 mlog_entry_void();
97
98 BUG_ON(!dlm);
99 BUG_ON(!lock);
100
101 assert_spin_locked(&dlm->ast_lock);
102 if (!list_empty(&lock->ast_list)) {
103 mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n",
104 lock->ast_pending, lock->ml.type);
105 BUG();
106 }
107 BUG_ON(!list_empty(&lock->ast_list));
108 if (lock->ast_pending)
109 mlog(0, "lock has an ast getting flushed right now\n");
110
111 /* putting lock on list, add a ref */
112 dlm_lock_get(lock);
113 spin_lock(&lock->spinlock);
114
115 /* check to see if this ast obsoletes the bast */
116 if (dlm_should_cancel_bast(dlm, lock)) {
117 struct dlm_lock_resource *res = lock->lockres;
118 mlog(0, "%s: cancelling bast for %.*s\n",
119 dlm->name, res->lockname.len, res->lockname.name);
120 lock->bast_pending = 0;
121 list_del_init(&lock->bast_list);
122 lock->ml.highest_blocked = LKM_IVMODE;
123 /* removing lock from list, remove a ref. guaranteed
124 * this won't be the last ref because of the get above,
125 * so res->spinlock will not be taken here */
126 dlm_lock_put(lock);
127 /* free up the reserved bast that we are cancelling.
128 * guaranteed that this will not be the last reserved
129 * ast because *both* an ast and a bast were reserved
130 * to get to this point. the res->spinlock will not be
131 * taken here */
132 dlm_lockres_release_ast(dlm, res);
133 }
134 list_add_tail(&lock->ast_list, &dlm->pending_asts);
135 lock->ast_pending = 1;
136 spin_unlock(&lock->spinlock);
137}
138
139void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
140{
141 mlog_entry_void();
142
143 BUG_ON(!dlm);
144 BUG_ON(!lock);
145
146 spin_lock(&dlm->ast_lock);
147 __dlm_queue_ast(dlm, lock);
148 spin_unlock(&dlm->ast_lock);
149}
150
151
152static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
153{
154 mlog_entry_void();
155
156 BUG_ON(!dlm);
157 BUG_ON(!lock);
158 assert_spin_locked(&dlm->ast_lock);
159
160 BUG_ON(!list_empty(&lock->bast_list));
161 if (lock->bast_pending)
162 mlog(0, "lock has a bast getting flushed right now\n");
163
164 /* putting lock on list, add a ref */
165 dlm_lock_get(lock);
166 spin_lock(&lock->spinlock);
167 list_add_tail(&lock->bast_list, &dlm->pending_basts);
168 lock->bast_pending = 1;
169 spin_unlock(&lock->spinlock);
170}
171
172void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
173{
174 mlog_entry_void();
175
176 BUG_ON(!dlm);
177 BUG_ON(!lock);
178
179 spin_lock(&dlm->ast_lock);
180 __dlm_queue_bast(dlm, lock);
181 spin_unlock(&dlm->ast_lock);
182}
183
184static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
185 struct dlm_lock *lock)
186{
187 struct dlm_lockstatus *lksb = lock->lksb;
188 BUG_ON(!lksb);
189
190 /* only updates if this node masters the lockres */
191 if (res->owner == dlm->node_num) {
192
193 spin_lock(&res->spinlock);
194 /* check the lksb flags for the direction */
195 if (lksb->flags & DLM_LKSB_GET_LVB) {
196 mlog(0, "getting lvb from lockres for %s node\n",
197 lock->ml.node == dlm->node_num ? "master" :
198 "remote");
199 memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
200 } else if (lksb->flags & DLM_LKSB_PUT_LVB) {
201 mlog(0, "setting lvb from lockres for %s node\n",
202 lock->ml.node == dlm->node_num ? "master" :
203 "remote");
204 memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
205 }
206 spin_unlock(&res->spinlock);
207 }
208
209 /* reset any lvb flags on the lksb */
210 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
211}
212
213void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
214 struct dlm_lock *lock)
215{
216 dlm_astlockfunc_t *fn;
217 struct dlm_lockstatus *lksb;
218
219 mlog_entry_void();
220
221 lksb = lock->lksb;
222 fn = lock->ast;
223 BUG_ON(lock->ml.node != dlm->node_num);
224
225 dlm_update_lvb(dlm, res, lock);
226 (*fn)(lock->astdata);
227}
228
229
230int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
231 struct dlm_lock *lock)
232{
233 int ret;
234 struct dlm_lockstatus *lksb;
235 int lksbflags;
236
237 mlog_entry_void();
238
239 lksb = lock->lksb;
240 BUG_ON(lock->ml.node == dlm->node_num);
241
242 lksbflags = lksb->flags;
243 dlm_update_lvb(dlm, res, lock);
244
245 /* lock request came from another node
246 * go do the ast over there */
247 ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags);
248 return ret;
249}
250
251void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
252 struct dlm_lock *lock, int blocked_type)
253{
254 dlm_bastlockfunc_t *fn = lock->bast;
255
256 mlog_entry_void();
257 BUG_ON(lock->ml.node != dlm->node_num);
258
259 (*fn)(lock->astdata, blocked_type);
260}
261
262
263
264int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
265{
266 int ret;
267 unsigned int locklen;
268 struct dlm_ctxt *dlm = data;
269 struct dlm_lock_resource *res = NULL;
270 struct dlm_lock *lock = NULL;
271 struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
272 char *name;
273 struct list_head *iter, *head=NULL;
274 u64 cookie;
275 u32 flags;
276
277 if (!dlm_grab(dlm)) {
278 dlm_error(DLM_REJECTED);
279 return DLM_REJECTED;
280 }
281
282 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
283 "Domain %s not fully joined!\n", dlm->name);
284
285 name = past->name;
286 locklen = past->namelen;
287 cookie = be64_to_cpu(past->cookie);
288 flags = be32_to_cpu(past->flags);
289
290 if (locklen > DLM_LOCKID_NAME_MAX) {
291 ret = DLM_IVBUFLEN;
292 mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
293 goto leave;
294 }
295
296 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
297 (LKM_PUT_LVB|LKM_GET_LVB)) {
298 mlog(ML_ERROR, "both PUT and GET lvb specified\n");
299 ret = DLM_BADARGS;
300 goto leave;
301 }
302
303 mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
304 (flags & LKM_GET_LVB ? "get lvb" : "none"));
305
306 mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);
307
308 if (past->type != DLM_AST &&
309 past->type != DLM_BAST) {
310 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", "
311 "name=%.*s\n", past->type, cookie, locklen, name);
312 ret = DLM_IVLOCKID;
313 goto leave;
314 }
315
316 res = dlm_lookup_lockres(dlm, name, locklen);
317 if (!res) {
318 mlog(ML_ERROR, "got %sast for unknown lockres! "
319 "cookie=%"MLFu64", name=%.*s, namelen=%u\n",
320 past->type == DLM_AST ? "" : "b",
321 cookie, locklen, name, locklen);
322 ret = DLM_IVLOCKID;
323 goto leave;
324 }
325
326 /* cannot get a proxy ast message if this node owns it */
327 BUG_ON(res->owner == dlm->node_num);
328
329 mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
330
331 spin_lock(&res->spinlock);
332 if (res->state & DLM_LOCK_RES_RECOVERING) {
333 mlog(0, "responding with DLM_RECOVERING!\n");
334 ret = DLM_RECOVERING;
335 goto unlock_out;
336 }
337 if (res->state & DLM_LOCK_RES_MIGRATING) {
338 mlog(0, "responding with DLM_MIGRATING!\n");
339 ret = DLM_MIGRATING;
340 goto unlock_out;
341 }
342 /* try convert queue for both ast/bast */
343 head = &res->converting;
344 lock = NULL;
345 list_for_each(iter, head) {
346 lock = list_entry (iter, struct dlm_lock, list);
347 if (be64_to_cpu(lock->ml.cookie) == cookie)
348 goto do_ast;
349 }
350
351 /* if not on convert, try blocked for ast, granted for bast */
352 if (past->type == DLM_AST)
353 head = &res->blocked;
354 else
355 head = &res->granted;
356
357 list_for_each(iter, head) {
358 lock = list_entry (iter, struct dlm_lock, list);
359 if (be64_to_cpu(lock->ml.cookie) == cookie)
360 goto do_ast;
361 }
362
363 mlog(ML_ERROR, "got %sast for unknown lock! cookie=%"MLFu64", "
364 "name=%.*s, namelen=%u\n",
365 past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen);
366
367 ret = DLM_NORMAL;
368unlock_out:
369 spin_unlock(&res->spinlock);
370 goto leave;
371
372do_ast:
373 ret = DLM_NORMAL;
374 if (past->type == DLM_AST) {
375 /* do not alter lock refcount. switching lists. */
376 list_del_init(&lock->list);
377 list_add_tail(&lock->list, &res->granted);
378 mlog(0, "ast: adding to granted list... type=%d, "
379 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
380 if (lock->ml.convert_type != LKM_IVMODE) {
381 lock->ml.type = lock->ml.convert_type;
382 lock->ml.convert_type = LKM_IVMODE;
383 } else {
384 // should already be there....
385 }
386
387 lock->lksb->status = DLM_NORMAL;
388
389 /* if we requested the lvb, fetch it into our lksb now */
390 if (flags & LKM_GET_LVB) {
391 BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB));
392 memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN);
393 }
394 }
395 spin_unlock(&res->spinlock);
396
397 if (past->type == DLM_AST)
398 dlm_do_local_ast(dlm, res, lock);
399 else
400 dlm_do_local_bast(dlm, res, lock, past->blocked_type);
401
402leave:
403
404 if (res)
405 dlm_lockres_put(res);
406
407 dlm_put(dlm);
408 return ret;
409}
410
411
412
413int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
414 struct dlm_lock *lock, int msg_type,
415 int blocked_type, int flags)
416{
417 int ret = 0;
418 struct dlm_proxy_ast past;
419 struct kvec vec[2];
420 size_t veclen = 1;
421 int status;
422
423 mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
424 res->lockname.len, res->lockname.name, lock->ml.node,
425 msg_type, blocked_type);
426
427 memset(&past, 0, sizeof(struct dlm_proxy_ast));
428 past.node_idx = dlm->node_num;
429 past.type = msg_type;
430 past.blocked_type = blocked_type;
431 past.namelen = res->lockname.len;
432 memcpy(past.name, res->lockname.name, past.namelen);
433 past.cookie = lock->ml.cookie;
434
435 vec[0].iov_len = sizeof(struct dlm_proxy_ast);
436 vec[0].iov_base = &past;
437 if (flags & DLM_LKSB_GET_LVB) {
438 mlog(0, "returning requested LVB data\n");
439 be32_add_cpu(&past.flags, LKM_GET_LVB);
440 vec[1].iov_len = DLM_LVB_LEN;
441 vec[1].iov_base = lock->lksb->lvb;
442 veclen++;
443 }
444
445 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
446 lock->ml.node, &status);
447 if (ret < 0)
448 mlog_errno(ret);
449 else {
450 if (status == DLM_RECOVERING) {
451 mlog(ML_ERROR, "sent AST to node %u, it thinks this "
452 "node is dead!\n", lock->ml.node);
453 BUG();
454 } else if (status == DLM_MIGRATING) {
455 mlog(ML_ERROR, "sent AST to node %u, it returned "
456 "DLM_MIGRATING!\n", lock->ml.node);
457 BUG();
458 } else if (status != DLM_NORMAL) {
459 mlog(ML_ERROR, "AST to node %u returned %d!\n",
460 lock->ml.node, status);
461 /* ignore it */
462 }
463 ret = 0;
464 }
465 return ret;
466}
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
new file mode 100644
index 00000000000..3fecba0a602
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -0,0 +1,884 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmcommon.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 21 * Boston, MA 02111-1307, USA.
22 *
23 */
24
25#ifndef DLMCOMMON_H
26#define DLMCOMMON_H
27
28#include <linux/kref.h>
29
30#define DLM_HB_NODE_DOWN_PRI (0xf000000)
31#define DLM_HB_NODE_UP_PRI (0x8000000)
32
33#define DLM_LOCKID_NAME_MAX 32
34
35#define DLM_DOMAIN_NAME_MAX_LEN 255
36#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39
40#define DLM_HASH_BITS 7
41#define DLM_HASH_SIZE (1 << DLM_HASH_BITS)
42#define DLM_HASH_MASK (DLM_HASH_SIZE - 1)
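/* Editor's note (not in the original header): with DLM_HASH_BITS == 7 this
 * yields 128 buckets. A lockres name picks its bucket the same way the
 * dlmdomain.c code later in this patch does:
 *
 *	bucket = &dlm->resources[full_name_hash(name, len) & DLM_HASH_MASK];
 */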
43
44enum dlm_ast_type {
45 DLM_AST = 0,
46 DLM_BAST,
47 DLM_ASTUNLOCK
48};
49
50
51#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
52 LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
53 LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
54
55#define DLM_RECOVERY_LOCK_NAME "$RECOVERY"
56#define DLM_RECOVERY_LOCK_NAME_LEN 9
57
58static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
59{
60 if (name_len == DLM_RECOVERY_LOCK_NAME_LEN &&
61 memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len)==0)
62 return 1;
63 return 0;
64}
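/* Editor's sketch (hypothetical call site, real helper): callers guard the
 * $RECOVERY special case like this:
 *
 *	if (dlm_is_recovery_lock(res->lockname.name, res->lockname.len)) {
 *		... take the recovery-lock path instead of the normal one ...
 *	}
 */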
65
66#define DLM_RECO_STATE_ACTIVE 0x0001
67
68struct dlm_recovery_ctxt
69{
70 struct list_head resources;
71 struct list_head received;
72 struct list_head node_data;
73 u8 new_master;
74 u8 dead_node;
75 u16 state;
76 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
77 wait_queue_head_t event;
78};
79
80enum dlm_ctxt_state {
81 DLM_CTXT_NEW = 0,
82 DLM_CTXT_JOINED,
83 DLM_CTXT_IN_SHUTDOWN,
84 DLM_CTXT_LEAVING,
85};
86
87struct dlm_ctxt
88{
89 struct list_head list;
90 struct list_head *resources;
91 struct list_head dirty_list;
92 struct list_head purge_list;
93 struct list_head pending_asts;
94 struct list_head pending_basts;
95 unsigned int purge_count;
96 spinlock_t spinlock;
97 spinlock_t ast_lock;
98 char *name;
99 u8 node_num;
100 u32 key;
101 u8 joining_node;
102 wait_queue_head_t dlm_join_events;
103 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
104 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
105 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
106 struct dlm_recovery_ctxt reco;
107 spinlock_t master_lock;
108 struct list_head master_list;
109 struct list_head mle_hb_events;
110
111 /* these give a really vague idea of the system load */
112 atomic_t local_resources;
113 atomic_t remote_resources;
114 atomic_t unknown_resources;
115
116 /* NOTE: Next three are protected by dlm_domain_lock */
117 struct kref dlm_refs;
118 enum dlm_ctxt_state dlm_state;
119 unsigned int num_joins;
120
121 struct o2hb_callback_func dlm_hb_up;
122 struct o2hb_callback_func dlm_hb_down;
123 struct task_struct *dlm_thread_task;
124 struct task_struct *dlm_reco_thread_task;
125 wait_queue_head_t dlm_thread_wq;
126 wait_queue_head_t dlm_reco_thread_wq;
127 wait_queue_head_t ast_wq;
128 wait_queue_head_t migration_wq;
129
130 struct work_struct dispatched_work;
131 struct list_head work_list;
132 spinlock_t work_lock;
133 struct list_head dlm_domain_handlers;
134 struct list_head dlm_eviction_callbacks;
135};
136
137/* these keventd work queue items are for less-frequently
138 * called functions that cannot be directly called from the
139 * net message handlers for some reason, usually because
140 * they need to send net messages of their own. */
141void dlm_dispatch_work(void *data);
142
143struct dlm_lock_resource;
144struct dlm_work_item;
145
146typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *);
147
148struct dlm_request_all_locks_priv
149{
150 u8 reco_master;
151 u8 dead_node;
152};
153
154struct dlm_mig_lockres_priv
155{
156 struct dlm_lock_resource *lockres;
157 u8 real_master;
158};
159
160struct dlm_assert_master_priv
161{
162 struct dlm_lock_resource *lockres;
163 u8 request_from;
164 u32 flags;
165 unsigned ignore_higher:1;
166};
167
168
169struct dlm_work_item
170{
171 struct list_head list;
172 dlm_workfunc_t *func;
173 struct dlm_ctxt *dlm;
174 void *data;
175 union {
176 struct dlm_request_all_locks_priv ral;
177 struct dlm_mig_lockres_priv ml;
178 struct dlm_assert_master_priv am;
179 } u;
180};
181
182static inline void dlm_init_work_item(struct dlm_ctxt *dlm,
183 struct dlm_work_item *i,
184 dlm_workfunc_t *f, void *data)
185{
186 memset(i, 0, sizeof(*i));
187 i->func = f;
188 INIT_LIST_HEAD(&i->list);
189 i->data = data;
190 i->dlm = dlm; /* must have already done a dlm_grab on this! */
191}
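/* Editor's sketch: a minimal, hypothetical example of queueing deferred work
 * with the helper above. It assumes the caller already holds a dlm reference
 * (see the comment in dlm_init_work_item) and uses the real work_list,
 * work_lock and dispatched_work fields of struct dlm_ctxt; my_workfunc is an
 * assumed callback, not part of this patch:
 *
 *	struct dlm_work_item *item = kmalloc(sizeof(*item), GFP_NOFS);
 *	if (item) {
 *		dlm_init_work_item(dlm, item, my_workfunc, NULL);
 *		spin_lock(&dlm->work_lock);
 *		list_add_tail(&item->list, &dlm->work_list);
 *		spin_unlock(&dlm->work_lock);
 *		schedule_work(&dlm->dispatched_work);
 *	}
 */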
192
193
194
195static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
196 u8 node)
197{
198 assert_spin_locked(&dlm->spinlock);
199
200 dlm->joining_node = node;
201 wake_up(&dlm->dlm_join_events);
202}
203
204#define DLM_LOCK_RES_UNINITED 0x00000001
205#define DLM_LOCK_RES_RECOVERING 0x00000002
206#define DLM_LOCK_RES_READY 0x00000004
207#define DLM_LOCK_RES_DIRTY 0x00000008
208#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
209#define DLM_LOCK_RES_MIGRATING 0x00000020
210
211#define DLM_PURGE_INTERVAL_MS (8 * 1000)
212
213struct dlm_lock_resource
214{
215 /* WARNING: Please see the comment in dlm_init_lockres before
216 * adding fields here. */
217 struct list_head list;
218 struct kref refs;
219
220 /* please keep these next 3 in this order
221 * some funcs want to iterate over all lists */
222 struct list_head granted;
223 struct list_head converting;
224 struct list_head blocked;
225
226 struct list_head dirty;
227 struct list_head recovering; // dlm_recovery_ctxt.resources list
228
229 /* unused lock resources have their last_used stamped and are
230 * put on a list for the dlm thread to run. */
231 struct list_head purge;
232 unsigned long last_used;
233
234 unsigned migration_pending:1;
235 atomic_t asts_reserved;
236 spinlock_t spinlock;
237 wait_queue_head_t wq;
238 u8 owner; //node which owns the lock resource, or unknown
239 u16 state;
240 struct qstr lockname;
241 char lvb[DLM_LVB_LEN];
242};
243
244struct dlm_migratable_lock
245{
246 __be64 cookie;
247
248 /* these 3 are just padding for the in-memory structure, but
249 * list and flags are actually used when sent over the wire */
250 __be16 pad1;
251 u8 list; // 0=granted, 1=converting, 2=blocked
252 u8 flags;
253
254 s8 type;
255 s8 convert_type;
256 s8 highest_blocked;
257 u8 node;
258}; // 16 bytes
259
260struct dlm_lock
261{
262 struct dlm_migratable_lock ml;
263
264 struct list_head list;
265 struct list_head ast_list;
266 struct list_head bast_list;
267 struct dlm_lock_resource *lockres;
268 spinlock_t spinlock;
269 struct kref lock_refs;
270
271 // ast and bast must be callable while holding a spinlock!
272 dlm_astlockfunc_t *ast;
273 dlm_bastlockfunc_t *bast;
274 void *astdata;
275 struct dlm_lockstatus *lksb;
276 unsigned ast_pending:1,
277 bast_pending:1,
278 convert_pending:1,
279 lock_pending:1,
280 cancel_pending:1,
281 unlock_pending:1,
282 lksb_kernel_allocated:1;
283};
284
285
286#define DLM_LKSB_UNUSED1 0x01
287#define DLM_LKSB_PUT_LVB 0x02
288#define DLM_LKSB_GET_LVB 0x04
289#define DLM_LKSB_UNUSED2 0x08
290#define DLM_LKSB_UNUSED3 0x10
291#define DLM_LKSB_UNUSED4 0x20
292#define DLM_LKSB_UNUSED5 0x40
293#define DLM_LKSB_UNUSED6 0x80
294
295
296enum dlm_lockres_list {
297 DLM_GRANTED_LIST = 0,
298 DLM_CONVERTING_LIST,
299 DLM_BLOCKED_LIST
300};
301
302static inline struct list_head *
303dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
304{
305 struct list_head *ret = NULL;
306 if (idx == DLM_GRANTED_LIST)
307 ret = &res->granted;
308 else if (idx == DLM_CONVERTING_LIST)
309 ret = &res->converting;
310 else if (idx == DLM_BLOCKED_LIST)
311 ret = &res->blocked;
312 else
313 BUG();
314 return ret;
315}
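/* Editor's note: the enum order above matches the on-wire encoding of
 * dlm_migratable_lock.list earlier in this header (0=granted, 1=converting,
 * 2=blocked), so a received list index maps straight to its queue:
 *
 *	struct list_head *q = dlm_list_idx_to_ptr(res, ml->list);
 */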
316
317
318
319
320struct dlm_node_iter
321{
322 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
323 int curnode;
324};
325
326
327enum {
328 DLM_MASTER_REQUEST_MSG = 500,
329 DLM_UNUSED_MSG1, /* 501 */
330 DLM_ASSERT_MASTER_MSG, /* 502 */
331 DLM_CREATE_LOCK_MSG, /* 503 */
332 DLM_CONVERT_LOCK_MSG, /* 504 */
333 DLM_PROXY_AST_MSG, /* 505 */
334 DLM_UNLOCK_LOCK_MSG, /* 506 */
335 DLM_UNUSED_MSG2, /* 507 */
336 DLM_MIGRATE_REQUEST_MSG, /* 508 */
337 DLM_MIG_LOCKRES_MSG, /* 509 */
338 DLM_QUERY_JOIN_MSG, /* 510 */
339 DLM_ASSERT_JOINED_MSG, /* 511 */
340 DLM_CANCEL_JOIN_MSG, /* 512 */
341 DLM_EXIT_DOMAIN_MSG, /* 513 */
342 DLM_MASTER_REQUERY_MSG, /* 514 */
343 DLM_LOCK_REQUEST_MSG, /* 515 */
344 DLM_RECO_DATA_DONE_MSG, /* 516 */
345 DLM_BEGIN_RECO_MSG, /* 517 */
346 DLM_FINALIZE_RECO_MSG /* 518 */
347};
348
349struct dlm_reco_node_data
350{
351 int state;
352 u8 node_num;
353 struct list_head list;
354};
355
356enum {
357 DLM_RECO_NODE_DATA_DEAD = -1,
358 DLM_RECO_NODE_DATA_INIT = 0,
359 DLM_RECO_NODE_DATA_REQUESTING,
360 DLM_RECO_NODE_DATA_REQUESTED,
361 DLM_RECO_NODE_DATA_RECEIVING,
362 DLM_RECO_NODE_DATA_DONE,
363 DLM_RECO_NODE_DATA_FINALIZE_SENT,
364};
365
366
367enum {
368 DLM_MASTER_RESP_NO = 0,
369 DLM_MASTER_RESP_YES,
370 DLM_MASTER_RESP_MAYBE,
371 DLM_MASTER_RESP_ERROR
372};
373
374
375struct dlm_master_request
376{
377 u8 node_idx;
378 u8 namelen;
379 __be16 pad1;
380 __be32 flags;
381
382 u8 name[O2NM_MAX_NAME_LEN];
383};
384
385#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001
386#define DLM_ASSERT_MASTER_REQUERY 0x00000002
387#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
388struct dlm_assert_master
389{
390 u8 node_idx;
391 u8 namelen;
392 __be16 pad1;
393 __be32 flags;
394
395 u8 name[O2NM_MAX_NAME_LEN];
396};
397
398struct dlm_migrate_request
399{
400 u8 master;
401 u8 new_master;
402 u8 namelen;
403 u8 pad1;
404 __be32 pad2;
405 u8 name[O2NM_MAX_NAME_LEN];
406};
407
408struct dlm_master_requery
409{
410 u8 pad1;
411 u8 pad2;
412 u8 node_idx;
413 u8 namelen;
414 __be32 pad3;
415 u8 name[O2NM_MAX_NAME_LEN];
416};
417
418#define DLM_MRES_RECOVERY 0x01
419#define DLM_MRES_MIGRATION 0x02
420#define DLM_MRES_ALL_DONE 0x04
421
422/*
423 * We would like to get one whole lockres into a single network
424 * message whenever possible. Generally speaking, there will be
425 * at most one dlm_lock on a lockres for each node in the cluster,
426 * plus (infrequently) any additional locks coming in from userdlm.
427 *
428 * struct _dlm_lockres_page
429 * {
430 * dlm_migratable_lockres mres;
431 * dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS];
432 * u8 pad[DLM_MIG_LOCKRES_RESERVED];
433 * };
434 *
435 * from ../cluster/tcp.h
436 * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
437 * (roughly 4080 bytes)
438 * and sizeof(dlm_migratable_lockres) = 112 bytes
439 * and sizeof(dlm_migratable_lock) = 16 bytes
440 *
441 * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and
442 * DLM_MIG_LOCKRES_RESERVED=128 means we have this:
443 *
444 * (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) +
445 * sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED =
446 * NET_MAX_PAYLOAD_BYTES
447 * (240 * 16) + 112 + 128 = 4080
448 *
449 * So a lockres would need more than 240 locks before it would
450 * use more than one network packet to recover. Not too bad.
451 */
452#define DLM_MAX_MIGRATABLE_LOCKS 240
453
454struct dlm_migratable_lockres
455{
456 u8 master;
457 u8 lockname_len;
458 u8 num_locks; // locks sent in this structure
459 u8 flags;
460 __be32 total_locks; // locks to be sent for this migration cookie
461 __be64 mig_cookie; // cookie for this lockres migration
462 // or zero if not needed
463 // 16 bytes
464 u8 lockname[DLM_LOCKID_NAME_MAX];
465 // 48 bytes
466 u8 lvb[DLM_LVB_LEN];
467 // 112 bytes
468 struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112
469};
470#define DLM_MIG_LOCKRES_MAX_LEN \
471 (sizeof(struct dlm_migratable_lockres) + \
472 (sizeof(struct dlm_migratable_lock) * \
 473 DLM_MAX_MIGRATABLE_LOCKS))
474
475/* from above, 128 bytes
476 * for some undetermined future use */
477#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \
478 DLM_MIG_LOCKRES_MAX_LEN)
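/* Editor's sketch: the byte math in the sizing comment above can be pinned
 * down at compile time. A hypothetical assertion (not in the original
 * header), assuming NET_MAX_PAYLOAD_BYTES from cluster/tcp.h is visible:
 *
 *	BUILD_BUG_ON(DLM_MIG_LOCKRES_MAX_LEN > NET_MAX_PAYLOAD_BYTES);
 */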
479
480struct dlm_create_lock
481{
482 __be64 cookie;
483
484 __be32 flags;
485 u8 pad1;
486 u8 node_idx;
487 s8 requested_type;
488 u8 namelen;
489
490 u8 name[O2NM_MAX_NAME_LEN];
491};
492
493struct dlm_convert_lock
494{
495 __be64 cookie;
496
497 __be32 flags;
498 u8 pad1;
499 u8 node_idx;
500 s8 requested_type;
501 u8 namelen;
502
503 u8 name[O2NM_MAX_NAME_LEN];
504
505 s8 lvb[0];
506};
507#define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN)
508
509struct dlm_unlock_lock
510{
511 __be64 cookie;
512
513 __be32 flags;
514 __be16 pad1;
515 u8 node_idx;
516 u8 namelen;
517
518 u8 name[O2NM_MAX_NAME_LEN];
519
520 s8 lvb[0];
521};
522#define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN)
523
524struct dlm_proxy_ast
525{
526 __be64 cookie;
527
528 __be32 flags;
529 u8 node_idx;
530 u8 type;
531 u8 blocked_type;
532 u8 namelen;
533
534 u8 name[O2NM_MAX_NAME_LEN];
535
536 s8 lvb[0];
537};
538#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN)
539
540#define DLM_MOD_KEY (0x666c6172)
541enum dlm_query_join_response {
542 JOIN_DISALLOW = 0,
543 JOIN_OK,
544 JOIN_OK_NO_MAP,
545};
546
547struct dlm_lock_request
548{
549 u8 node_idx;
550 u8 dead_node;
551 __be16 pad1;
552 __be32 pad2;
553};
554
555struct dlm_reco_data_done
556{
557 u8 node_idx;
558 u8 dead_node;
559 __be16 pad1;
560 __be32 pad2;
561
562 /* unused for now */
563 /* eventually we can use this to attempt
564 * lvb recovery based on each node's info */
565 u8 reco_lvb[DLM_LVB_LEN];
566};
567
568struct dlm_begin_reco
569{
570 u8 node_idx;
571 u8 dead_node;
572 __be16 pad1;
573 __be32 pad2;
574};
575
576
577struct dlm_query_join_request
578{
579 u8 node_idx;
580 u8 pad1[2];
581 u8 name_len;
582 u8 domain[O2NM_MAX_NAME_LEN];
583};
584
585struct dlm_assert_joined
586{
587 u8 node_idx;
588 u8 pad1[2];
589 u8 name_len;
590 u8 domain[O2NM_MAX_NAME_LEN];
591};
592
593struct dlm_cancel_join
594{
595 u8 node_idx;
596 u8 pad1[2];
597 u8 name_len;
598 u8 domain[O2NM_MAX_NAME_LEN];
599};
600
601struct dlm_exit_domain
602{
603 u8 node_idx;
604 u8 pad1[3];
605};
606
607struct dlm_finalize_reco
608{
609 u8 node_idx;
610 u8 dead_node;
611 __be16 pad1;
612 __be32 pad2;
613};
614
615static inline enum dlm_status
616__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
617{
618 enum dlm_status status = DLM_NORMAL;
619
620 assert_spin_locked(&res->spinlock);
621
622 if (res->state & DLM_LOCK_RES_RECOVERING)
623 status = DLM_RECOVERING;
624 else if (res->state & DLM_LOCK_RES_MIGRATING)
625 status = DLM_MIGRATING;
626 else if (res->state & DLM_LOCK_RES_IN_PROGRESS)
627 status = DLM_FORWARD;
628
629 return status;
630}
631
632struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
633 struct dlm_lockstatus *lksb);
634void dlm_lock_get(struct dlm_lock *lock);
635void dlm_lock_put(struct dlm_lock *lock);
636
637void dlm_lock_attach_lockres(struct dlm_lock *lock,
638 struct dlm_lock_resource *res);
639
640int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data);
641int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data);
642int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data);
643
644void dlm_revert_pending_convert(struct dlm_lock_resource *res,
645 struct dlm_lock *lock);
646void dlm_revert_pending_lock(struct dlm_lock_resource *res,
647 struct dlm_lock *lock);
648
649int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data);
650void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
651 struct dlm_lock *lock);
652void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
653 struct dlm_lock *lock);
654
655int dlm_launch_thread(struct dlm_ctxt *dlm);
656void dlm_complete_thread(struct dlm_ctxt *dlm);
657int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
658void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
659void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
660
661void dlm_put(struct dlm_ctxt *dlm);
662struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
663int dlm_domain_fully_joined(struct dlm_ctxt *dlm);
664
665void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
666 struct dlm_lock_resource *res);
667void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
668 struct dlm_lock_resource *res);
669void dlm_purge_lockres(struct dlm_ctxt *dlm,
670 struct dlm_lock_resource *lockres);
671void dlm_lockres_get(struct dlm_lock_resource *res);
672void dlm_lockres_put(struct dlm_lock_resource *res);
673void __dlm_unhash_lockres(struct dlm_lock_resource *res);
674void __dlm_insert_lockres(struct dlm_ctxt *dlm,
675 struct dlm_lock_resource *res);
676struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
677 const char *name,
678 unsigned int len);
679struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
680 const char *name,
681 unsigned int len);
682
683int dlm_is_host_down(int errno);
684void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
685 struct dlm_lock_resource *res,
686 u8 owner);
687struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
688 const char *lockid,
689 int flags);
690struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
691 const char *name,
692 unsigned int namelen);
693
694void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
695void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
696void dlm_do_local_ast(struct dlm_ctxt *dlm,
697 struct dlm_lock_resource *res,
698 struct dlm_lock *lock);
699int dlm_do_remote_ast(struct dlm_ctxt *dlm,
700 struct dlm_lock_resource *res,
701 struct dlm_lock *lock);
702void dlm_do_local_bast(struct dlm_ctxt *dlm,
703 struct dlm_lock_resource *res,
704 struct dlm_lock *lock,
705 int blocked_type);
706int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm,
707 struct dlm_lock_resource *res,
708 struct dlm_lock *lock,
709 int msg_type,
710 int blocked_type, int flags);
711static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm,
712 struct dlm_lock_resource *res,
713 struct dlm_lock *lock,
714 int blocked_type)
715{
716 return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST,
717 blocked_type, 0);
718}
719
720static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
721 struct dlm_lock_resource *res,
722 struct dlm_lock *lock,
723 int flags)
724{
725 return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST,
726 0, flags);
727}
728
729void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
730void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
731
732u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
733void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
734void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
735
736
737int dlm_nm_init(struct dlm_ctxt *dlm);
738int dlm_heartbeat_init(struct dlm_ctxt *dlm);
739void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
740void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
741
742int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
743int dlm_migrate_lockres(struct dlm_ctxt *dlm,
744 struct dlm_lock_resource *res,
745 u8 target);
746int dlm_finish_migration(struct dlm_ctxt *dlm,
747 struct dlm_lock_resource *res,
748 u8 old_master);
749void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
750 struct dlm_lock_resource *res);
751void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
752
753int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data);
754int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data);
755int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data);
756int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data);
757int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data);
758int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data);
759int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data);
760int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
761int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
762
763int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
764 struct dlm_lock_resource *res,
765 int ignore_higher,
766 u8 request_from,
767 u32 flags);
768
769
770int dlm_send_one_lockres(struct dlm_ctxt *dlm,
771 struct dlm_lock_resource *res,
772 struct dlm_migratable_lockres *mres,
773 u8 send_to,
774 u8 flags);
775void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
776 struct dlm_lock_resource *res);
777
 778/* will exit holding res->spinlock, but may drop it within the function */
779void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
780void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
781
 782/* will exit holding res->spinlock, but may drop it within the function */
783static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
784{
785 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
786 DLM_LOCK_RES_RECOVERING|
787 DLM_LOCK_RES_MIGRATING));
788}
789
790
791int dlm_init_mle_cache(void);
792void dlm_destroy_mle_cache(void);
793void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
794void dlm_clean_master_list(struct dlm_ctxt *dlm,
795 u8 dead_node);
796int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
797
798
799static inline const char * dlm_lock_mode_name(int mode)
800{
801 switch (mode) {
802 case LKM_EXMODE:
803 return "EX";
804 case LKM_PRMODE:
805 return "PR";
806 case LKM_NLMODE:
807 return "NL";
808 }
809 return "UNKNOWN";
810}
811
812
813static inline int dlm_lock_compatible(int existing, int request)
814{
815 /* NO_LOCK compatible with all */
816 if (request == LKM_NLMODE ||
817 existing == LKM_NLMODE)
818 return 1;
819
820 /* EX incompatible with all non-NO_LOCK */
821 if (request == LKM_EXMODE)
822 return 0;
823
824 /* request must be PR, which is compatible with PR */
825 if (existing == LKM_PRMODE)
826 return 1;
827
828 return 0;
829}
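/* Editor's note: the helper above encodes the usual three-mode compatibility
 * matrix (1 = compatible):
 *
 *	                request
 *	               NL  PR  EX
 *	           NL   1   1   1
 *	existing   PR   1   1   0
 *	           EX   1   0   0
 */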
830
831static inline int dlm_lock_on_list(struct list_head *head,
832 struct dlm_lock *lock)
833{
834 struct list_head *iter;
835 struct dlm_lock *tmplock;
836
837 list_for_each(iter, head) {
838 tmplock = list_entry(iter, struct dlm_lock, list);
839 if (tmplock == lock)
840 return 1;
841 }
842 return 0;
843}
844
845
846static inline enum dlm_status dlm_err_to_dlm_status(int err)
847{
848 enum dlm_status ret;
849 if (err == -ENOMEM)
850 ret = DLM_SYSERR;
851 else if (err == -ETIMEDOUT || o2net_link_down(err, NULL))
852 ret = DLM_NOLOCKMGR;
853 else if (err == -EINVAL)
854 ret = DLM_BADPARAM;
855 else if (err == -ENAMETOOLONG)
856 ret = DLM_IVBUFLEN;
857 else
858 ret = DLM_BADARGS;
859 return ret;
860}
861
862
863static inline void dlm_node_iter_init(unsigned long *map,
864 struct dlm_node_iter *iter)
865{
866 memcpy(iter->node_map, map, sizeof(iter->node_map));
867 iter->curnode = -1;
868}
869
870static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
871{
872 int bit;
873 bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1);
874 if (bit >= O2NM_MAX_NODES) {
875 iter->curnode = O2NM_MAX_NODES;
876 return -ENOENT;
877 }
878 iter->curnode = bit;
879 return bit;
880}
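/* Editor's sketch: the canonical loop over a node map using the two helpers
 * above (the loop body is hypothetical, the API is real):
 *
 *	struct dlm_node_iter iter;
 *	int node;
 *
 *	dlm_node_iter_init(dlm->domain_map, &iter);
 *	while ((node = dlm_node_iter_next(&iter)) >= 0) {
 *		... send a message to 'node' ...
 *	}
 */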
881
882
883
884#endif /* DLMCOMMON_H */
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
new file mode 100644
index 00000000000..6001b22a997
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -0,0 +1,530 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmconvert.c
5 *
6 * underlying calls for lock conversion
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41
42
43#include "cluster/heartbeat.h"
44#include "cluster/nodemanager.h"
45#include "cluster/tcp.h"
46
47#include "dlmapi.h"
48#include "dlmcommon.h"
49
50#include "dlmconvert.h"
51
52#define MLOG_MASK_PREFIX ML_DLM
53#include "cluster/masklog.h"
54
55/* NOTE: __dlmconvert_master is the only function in here that
56 * needs a spinlock held on entry (res->spinlock) and it is the
57 * only one that holds a lock on exit (res->spinlock).
58 * All other functions in here need no locks and drop all of
59 * the locks that they acquire. */
60static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
61 struct dlm_lock_resource *res,
62 struct dlm_lock *lock, int flags,
63 int type, int *call_ast,
64 int *kick_thread);
65static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
66 struct dlm_lock_resource *res,
67 struct dlm_lock *lock, int flags, int type);
68
69/*
70 * this is only called directly by dlmlock(), and only when the
71 * local node is the owner of the lockres
72 * locking:
73 * caller needs: none
74 * taken: takes and drops res->spinlock
75 * held on exit: none
76 * returns: see __dlmconvert_master
77 */
78enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
79 struct dlm_lock_resource *res,
80 struct dlm_lock *lock, int flags, int type)
81{
82 int call_ast = 0, kick_thread = 0;
83 enum dlm_status status;
84
85 spin_lock(&res->spinlock);
86 /* we are not in a network handler, this is fine */
87 __dlm_wait_on_lockres(res);
88 __dlm_lockres_reserve_ast(res);
89 res->state |= DLM_LOCK_RES_IN_PROGRESS;
90
91 status = __dlmconvert_master(dlm, res, lock, flags, type,
92 &call_ast, &kick_thread);
93
94 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
95 spin_unlock(&res->spinlock);
96 wake_up(&res->wq);
97 if (status != DLM_NORMAL && status != DLM_NOTQUEUED)
98 dlm_error(status);
99
100 /* either queue the ast or release it */
101 if (call_ast)
102 dlm_queue_ast(dlm, lock);
103 else
104 dlm_lockres_release_ast(dlm, res);
105
106 if (kick_thread)
107 dlm_kick_thread(dlm, res);
108
109 return status;
110}
111
112/* performs lock conversion at the lockres master site
113 * locking:
114 * caller needs: res->spinlock
115 * taken: takes and drops lock->spinlock
116 * held on exit: res->spinlock
117 * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED
118 * call_ast: whether ast should be called for this lock
119 * kick_thread: whether dlm_kick_thread should be called
120 */
121static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
122 struct dlm_lock_resource *res,
123 struct dlm_lock *lock, int flags,
124 int type, int *call_ast,
125 int *kick_thread)
126{
127 enum dlm_status status = DLM_NORMAL;
128 struct list_head *iter;
129 struct dlm_lock *tmplock=NULL;
130
131 assert_spin_locked(&res->spinlock);
132
133 mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
134 lock->ml.type, lock->ml.convert_type, type);
135
136 spin_lock(&lock->spinlock);
137
138 /* already converting? */
139 if (lock->ml.convert_type != LKM_IVMODE) {
140 mlog(ML_ERROR, "attempted to convert a lock with a lock "
141 "conversion pending\n");
142 status = DLM_DENIED;
143 goto unlock_exit;
144 }
145
146 /* must be on grant queue to convert */
147 if (!dlm_lock_on_list(&res->granted, lock)) {
148 mlog(ML_ERROR, "attempted to convert a lock not on grant "
149 "queue\n");
150 status = DLM_DENIED;
151 goto unlock_exit;
152 }
153
154 if (flags & LKM_VALBLK) {
155 switch (lock->ml.type) {
156 case LKM_EXMODE:
157 /* EX + LKM_VALBLK + convert == set lvb */
158 mlog(0, "will set lvb: converting %s->%s\n",
159 dlm_lock_mode_name(lock->ml.type),
160 dlm_lock_mode_name(type));
161 lock->lksb->flags |= DLM_LKSB_PUT_LVB;
162 break;
163 case LKM_PRMODE:
164 case LKM_NLMODE:
165 /* refetch if new level is not NL */
166 if (type > LKM_NLMODE) {
167 mlog(0, "will fetch new value into "
168 "lvb: converting %s->%s\n",
169 dlm_lock_mode_name(lock->ml.type),
170 dlm_lock_mode_name(type));
171 lock->lksb->flags |= DLM_LKSB_GET_LVB;
172 } else {
173 mlog(0, "will NOT fetch new value "
174 "into lvb: converting %s->%s\n",
175 dlm_lock_mode_name(lock->ml.type),
176 dlm_lock_mode_name(type));
177 flags &= ~(LKM_VALBLK);
178 }
179 break;
180 }
181 }
182
183
184 /* in-place downconvert? */
185 if (type <= lock->ml.type)
186 goto grant;
187
188 /* upconvert from here on */
189 status = DLM_NORMAL;
190 list_for_each(iter, &res->granted) {
191 tmplock = list_entry(iter, struct dlm_lock, list);
192 if (tmplock == lock)
193 continue;
194 if (!dlm_lock_compatible(tmplock->ml.type, type))
195 goto switch_queues;
196 }
197
198 list_for_each(iter, &res->converting) {
199 tmplock = list_entry(iter, struct dlm_lock, list);
200 if (!dlm_lock_compatible(tmplock->ml.type, type))
201 goto switch_queues;
202 /* existing conversion requests take precedence */
203 if (!dlm_lock_compatible(tmplock->ml.convert_type, type))
204 goto switch_queues;
205 }
206
207 /* fall thru to grant */
208
209grant:
210 mlog(0, "res %.*s, granting %s lock\n", res->lockname.len,
211 res->lockname.name, dlm_lock_mode_name(type));
212 /* immediately grant the new lock type */
213 lock->lksb->status = DLM_NORMAL;
214 if (lock->ml.node == dlm->node_num)
215 mlog(0, "doing in-place convert for nonlocal lock\n");
216 lock->ml.type = type;
217 status = DLM_NORMAL;
218 *call_ast = 1;
219 goto unlock_exit;
220
221switch_queues:
222 if (flags & LKM_NOQUEUE) {
223 mlog(0, "failed to convert NOQUEUE lock %.*s from "
224 "%d to %d...\n", res->lockname.len, res->lockname.name,
225 lock->ml.type, type);
226 status = DLM_NOTQUEUED;
227 goto unlock_exit;
228 }
229 mlog(0, "res %.*s, queueing...\n", res->lockname.len,
230 res->lockname.name);
231
232 lock->ml.convert_type = type;
233 /* do not alter lock refcount. switching lists. */
234 list_del_init(&lock->list);
235 list_add_tail(&lock->list, &res->converting);
236
237unlock_exit:
238 spin_unlock(&lock->spinlock);
239 if (status == DLM_DENIED) {
240 __dlm_print_one_lock_resource(res);
241 }
242 if (status == DLM_NORMAL)
243 *kick_thread = 1;
244 return status;
245}
246
247void dlm_revert_pending_convert(struct dlm_lock_resource *res,
248 struct dlm_lock *lock)
249{
250 /* do not alter lock refcount. switching lists. */
251 list_del_init(&lock->list);
252 list_add_tail(&lock->list, &res->granted);
253 lock->ml.convert_type = LKM_IVMODE;
254 lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
255}
256
257/* messages the master site to do lock conversion
258 * locking:
259 * caller needs: none
260 * taken: takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS
261 * held on exit: none
262 * returns: DLM_NORMAL, DLM_RECOVERING, status from remote node
263 */
264enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
265 struct dlm_lock_resource *res,
266 struct dlm_lock *lock, int flags, int type)
267{
268 enum dlm_status status;
269
270 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
271 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
272
273 spin_lock(&res->spinlock);
274 if (res->state & DLM_LOCK_RES_RECOVERING) {
275 mlog(0, "bailing out early since res is RECOVERING "
276 "on secondary queue\n");
277 /* __dlm_print_one_lock_resource(res); */
278 status = DLM_RECOVERING;
279 goto bail;
280 }
281 /* will exit this call with spinlock held */
282 __dlm_wait_on_lockres(res);
283
284 if (lock->ml.convert_type != LKM_IVMODE) {
285 __dlm_print_one_lock_resource(res);
286 mlog(ML_ERROR, "converting a remote lock that is already "
287 "converting! (cookie=%"MLFu64", conv=%d)\n",
288 lock->ml.cookie, lock->ml.convert_type);
289 status = DLM_DENIED;
290 goto bail;
291 }
292 res->state |= DLM_LOCK_RES_IN_PROGRESS;
293 /* move lock to local convert queue */
294 /* do not alter lock refcount. switching lists. */
295 list_del_init(&lock->list);
296 list_add_tail(&lock->list, &res->converting);
297 lock->convert_pending = 1;
298 lock->ml.convert_type = type;
299
300 if (flags & LKM_VALBLK) {
301 if (lock->ml.type == LKM_EXMODE) {
302 flags |= LKM_PUT_LVB;
303 lock->lksb->flags |= DLM_LKSB_PUT_LVB;
304 } else {
305 if (lock->ml.convert_type == LKM_NLMODE)
306 flags &= ~LKM_VALBLK;
307 else {
308 flags |= LKM_GET_LVB;
309 lock->lksb->flags |= DLM_LKSB_GET_LVB;
310 }
311 }
312 }
313 spin_unlock(&res->spinlock);
314
315 /* no locks held here.
316 * need to wait for a reply as to whether it got queued or not. */
317 status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
318
319 spin_lock(&res->spinlock);
320 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
321 lock->convert_pending = 0;
322 /* if it failed, move it back to granted queue */
323 if (status != DLM_NORMAL) {
324 if (status != DLM_NOTQUEUED)
325 dlm_error(status);
326 dlm_revert_pending_convert(res, lock);
327 }
328bail:
329 spin_unlock(&res->spinlock);
330
331 /* TODO: should this be a wake_one? */
332 /* wake up any IN_PROGRESS waiters */
333 wake_up(&res->wq);
334
335 return status;
336}
337
338/* sends DLM_CONVERT_LOCK_MSG to master site
339 * locking:
340 * caller needs: none
341 * taken: none
342 * held on exit: none
343 * returns: DLM_NOLOCKMGR, status from remote node
344 */
345static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
346 struct dlm_lock_resource *res,
347 struct dlm_lock *lock, int flags, int type)
348{
349 struct dlm_convert_lock convert;
350 int tmpret;
351 enum dlm_status ret;
352 int status = 0;
353 struct kvec vec[2];
354 size_t veclen = 1;
355
356 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
357
358 memset(&convert, 0, sizeof(struct dlm_convert_lock));
359 convert.node_idx = dlm->node_num;
360 convert.requested_type = type;
361 convert.cookie = lock->ml.cookie;
362 convert.namelen = res->lockname.len;
363 convert.flags = cpu_to_be32(flags);
364 memcpy(convert.name, res->lockname.name, convert.namelen);
365
366 vec[0].iov_len = sizeof(struct dlm_convert_lock);
367 vec[0].iov_base = &convert;
368
369 if (flags & LKM_PUT_LVB) {
370 /* extra data to send if we are updating lvb */
371 vec[1].iov_len = DLM_LVB_LEN;
372 vec[1].iov_base = lock->lksb->lvb;
373 veclen++;
374 }
375
376 tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key,
377 vec, veclen, res->owner, &status);
378 if (tmpret >= 0) {
379 // successfully sent and received
380 ret = status; // this is already a dlm_status
381 if (ret == DLM_RECOVERING) {
382 mlog(0, "node %u returned DLM_RECOVERING from convert "
383 "message!\n", res->owner);
384 } else if (ret == DLM_MIGRATING) {
385 mlog(0, "node %u returned DLM_MIGRATING from convert "
386 "message!\n", res->owner);
387 } else if (ret == DLM_FORWARD) {
388 mlog(0, "node %u returned DLM_FORWARD from convert "
389 "message!\n", res->owner);
390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
391 dlm_error(ret);
392 } else {
393 mlog_errno(tmpret);
394 if (dlm_is_host_down(tmpret)) {
395 ret = DLM_RECOVERING;
396 mlog(0, "node %u died so returning DLM_RECOVERING "
397 "from convert message!\n", res->owner);
398 } else {
399 ret = dlm_err_to_dlm_status(tmpret);
400 }
401 }
402
403 return ret;
404}
405
406/* handler for DLM_CONVERT_LOCK_MSG on master site
407 * locking:
408 * caller needs: none
409 * taken: takes and drop res->spinlock
410 * held on exit: none
411 * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
412 * status from __dlmconvert_master
413 */
414int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
415{
416 struct dlm_ctxt *dlm = data;
417 struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
418 struct dlm_lock_resource *res = NULL;
419 struct list_head *iter;
420 struct dlm_lock *lock = NULL;
421 struct dlm_lockstatus *lksb;
422 enum dlm_status status = DLM_NORMAL;
423 u32 flags;
424 int call_ast = 0, kick_thread = 0;
425
426 if (!dlm_grab(dlm)) {
427 dlm_error(DLM_REJECTED);
428 return DLM_REJECTED;
429 }
430
431 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
432 "Domain %s not fully joined!\n", dlm->name);
433
434 if (cnv->namelen > DLM_LOCKID_NAME_MAX) {
435 status = DLM_IVBUFLEN;
436 dlm_error(status);
437 goto leave;
438 }
439
440 flags = be32_to_cpu(cnv->flags);
441
442 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
443 (LKM_PUT_LVB|LKM_GET_LVB)) {
444 mlog(ML_ERROR, "both PUT and GET lvb specified\n");
445 status = DLM_BADARGS;
446 goto leave;
447 }
448
449 mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
450 (flags & LKM_GET_LVB ? "get lvb" : "none"));
451
452 status = DLM_IVLOCKID;
453 res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen);
454 if (!res) {
455 dlm_error(status);
456 goto leave;
457 }
458
459 spin_lock(&res->spinlock);
460 list_for_each(iter, &res->granted) {
461 lock = list_entry(iter, struct dlm_lock, list);
462 if (lock->ml.cookie == cnv->cookie &&
463 lock->ml.node == cnv->node_idx) {
464 dlm_lock_get(lock);
465 break;
466 }
467 lock = NULL;
468 }
469 spin_unlock(&res->spinlock);
470 if (!lock) {
471 status = DLM_IVLOCKID;
472 dlm_error(status);
473 goto leave;
474 }
475
476 /* found the lock */
477 lksb = lock->lksb;
478
479 /* see if caller needed to get/put lvb */
480 if (flags & LKM_PUT_LVB) {
481 BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
482 lksb->flags |= DLM_LKSB_PUT_LVB;
483 memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN);
484 } else if (flags & LKM_GET_LVB) {
485 BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
486 lksb->flags |= DLM_LKSB_GET_LVB;
487 }
488
489 spin_lock(&res->spinlock);
490 status = __dlm_lockres_state_to_status(res);
491 if (status == DLM_NORMAL) {
492 __dlm_lockres_reserve_ast(res);
493 res->state |= DLM_LOCK_RES_IN_PROGRESS;
494 status = __dlmconvert_master(dlm, res, lock, flags,
495 cnv->requested_type,
496 &call_ast, &kick_thread);
497 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
498 }
499 spin_unlock(&res->spinlock);
500
501 if (status != DLM_NORMAL) {
502 if (status != DLM_NOTQUEUED)
503 dlm_error(status);
504 lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
505 }
506
507leave:
508 if (!lock)
509 mlog(ML_ERROR, "did not find lock to convert on grant queue! "
510 "cookie=%"MLFu64"\n",
511 cnv->cookie);
512 else
513 dlm_lock_put(lock);
514
 515 /* either queue the ast or, if a lockres was found, release it */
 516 if (call_ast)
 517 dlm_queue_ast(dlm, lock);
 518 else if (res)
 519 dlm_lockres_release_ast(dlm, res);
520
521 if (kick_thread)
522 dlm_kick_thread(dlm, res);
523
524 if (res)
525 dlm_lockres_put(res);
526
527 dlm_put(dlm);
528
529 return status;
530}
diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h
new file mode 100644
index 00000000000..b2e3677df87
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmconvert.h
@@ -0,0 +1,35 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmconvert.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 21 * Boston, MA 02111-1307, USA.
22 *
23 */
24
25#ifndef DLMCONVERT_H
26#define DLMCONVERT_H
27
28enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
29 struct dlm_lock_resource *res,
30 struct dlm_lock *lock, int flags, int type);
31enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
32 struct dlm_lock_resource *res,
33 struct dlm_lock *lock, int flags, int type);
34
35#endif
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
new file mode 100644
index 00000000000..f339fe27975
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -0,0 +1,246 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdebug.c
5 *
6 * debug functionality for the dlm
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/utsname.h>
31#include <linux/sysctl.h>
32#include <linux/spinlock.h>
33
34#include "cluster/heartbeat.h"
35#include "cluster/nodemanager.h"
36#include "cluster/tcp.h"
37
38#include "dlmapi.h"
39#include "dlmcommon.h"
40#include "dlmdebug.h"
41
42#include "dlmdomain.h"
43#include "dlmdebug.h"
44
45#define MLOG_MASK_PREFIX ML_DLM
46#include "cluster/masklog.h"
47
48void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
49{
50 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
51 res->lockname.len, res->lockname.name,
52 res->owner, res->state);
53 spin_lock(&res->spinlock);
54 __dlm_print_one_lock_resource(res);
55 spin_unlock(&res->spinlock);
56}
57
58void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
59{
60 struct list_head *iter2;
61 struct dlm_lock *lock;
62
63 assert_spin_locked(&res->spinlock);
64
65 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
66 res->lockname.len, res->lockname.name,
67 res->owner, res->state);
68 mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n",
69 res->last_used, list_empty(&res->purge) ? "no" : "yes");
70 mlog(ML_NOTICE, " granted queue: \n");
71 list_for_each(iter2, &res->granted) {
72 lock = list_entry(iter2, struct dlm_lock, list);
73 spin_lock(&lock->spinlock);
74 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
75 "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
76 lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
77 list_empty(&lock->ast_list) ? 'y' : 'n',
78 lock->ast_pending ? 'y' : 'n',
79 list_empty(&lock->bast_list) ? 'y' : 'n',
80 lock->bast_pending ? 'y' : 'n');
81 spin_unlock(&lock->spinlock);
82 }
83 mlog(ML_NOTICE, " converting queue: \n");
84 list_for_each(iter2, &res->converting) {
85 lock = list_entry(iter2, struct dlm_lock, list);
86 spin_lock(&lock->spinlock);
87 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
88 "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
89 lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
90 list_empty(&lock->ast_list) ? 'y' : 'n',
91 lock->ast_pending ? 'y' : 'n',
92 list_empty(&lock->bast_list) ? 'y' : 'n',
93 lock->bast_pending ? 'y' : 'n');
94 spin_unlock(&lock->spinlock);
95 }
96 mlog(ML_NOTICE, " blocked queue: \n");
97 list_for_each(iter2, &res->blocked) {
98 lock = list_entry(iter2, struct dlm_lock, list);
99 spin_lock(&lock->spinlock);
100 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
101 "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
102 lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
103 list_empty(&lock->ast_list) ? 'y' : 'n',
104 lock->ast_pending ? 'y' : 'n',
105 list_empty(&lock->bast_list) ? 'y' : 'n',
106 lock->bast_pending ? 'y' : 'n');
107 spin_unlock(&lock->spinlock);
108 }
109}
110
111void dlm_print_one_lock(struct dlm_lock *lockid)
112{
113 dlm_print_one_lock_resource(lockid->lockres);
114}
115EXPORT_SYMBOL_GPL(dlm_print_one_lock);
116
117void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
118{
119 struct dlm_lock_resource *res;
120 struct list_head *iter;
121 struct list_head *bucket;
122 int i;
123
 124 if (!dlm || !dlm->name) {
 125 mlog(ML_ERROR, "dlm=%p\n", dlm);
 126 return;
 127 }
 128 mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
 129 dlm->name, dlm->node_num, dlm->key);
130
131 spin_lock(&dlm->spinlock);
132 for (i=0; i<DLM_HASH_SIZE; i++) {
133 bucket = &(dlm->resources[i]);
134 list_for_each(iter, bucket) {
135 res = list_entry(iter, struct dlm_lock_resource, list);
136 dlm_print_one_lock_resource(res);
137 }
138 }
139 spin_unlock(&dlm->spinlock);
140}
141
142static const char *dlm_errnames[] = {
143 [DLM_NORMAL] = "DLM_NORMAL",
144 [DLM_GRANTED] = "DLM_GRANTED",
145 [DLM_DENIED] = "DLM_DENIED",
146 [DLM_DENIED_NOLOCKS] = "DLM_DENIED_NOLOCKS",
147 [DLM_WORKING] = "DLM_WORKING",
148 [DLM_BLOCKED] = "DLM_BLOCKED",
149 [DLM_BLOCKED_ORPHAN] = "DLM_BLOCKED_ORPHAN",
150 [DLM_DENIED_GRACE_PERIOD] = "DLM_DENIED_GRACE_PERIOD",
151 [DLM_SYSERR] = "DLM_SYSERR",
152 [DLM_NOSUPPORT] = "DLM_NOSUPPORT",
153 [DLM_CANCELGRANT] = "DLM_CANCELGRANT",
154 [DLM_IVLOCKID] = "DLM_IVLOCKID",
155 [DLM_SYNC] = "DLM_SYNC",
156 [DLM_BADTYPE] = "DLM_BADTYPE",
157 [DLM_BADRESOURCE] = "DLM_BADRESOURCE",
158 [DLM_MAXHANDLES] = "DLM_MAXHANDLES",
159 [DLM_NOCLINFO] = "DLM_NOCLINFO",
160 [DLM_NOLOCKMGR] = "DLM_NOLOCKMGR",
161 [DLM_NOPURGED] = "DLM_NOPURGED",
162 [DLM_BADARGS] = "DLM_BADARGS",
163 [DLM_VOID] = "DLM_VOID",
164 [DLM_NOTQUEUED] = "DLM_NOTQUEUED",
165 [DLM_IVBUFLEN] = "DLM_IVBUFLEN",
166 [DLM_CVTUNGRANT] = "DLM_CVTUNGRANT",
167 [DLM_BADPARAM] = "DLM_BADPARAM",
168 [DLM_VALNOTVALID] = "DLM_VALNOTVALID",
169 [DLM_REJECTED] = "DLM_REJECTED",
170 [DLM_ABORT] = "DLM_ABORT",
171 [DLM_CANCEL] = "DLM_CANCEL",
172 [DLM_IVRESHANDLE] = "DLM_IVRESHANDLE",
173 [DLM_DEADLOCK] = "DLM_DEADLOCK",
174 [DLM_DENIED_NOASTS] = "DLM_DENIED_NOASTS",
175 [DLM_FORWARD] = "DLM_FORWARD",
176 [DLM_TIMEOUT] = "DLM_TIMEOUT",
177 [DLM_IVGROUPID] = "DLM_IVGROUPID",
178 [DLM_VERS_CONFLICT] = "DLM_VERS_CONFLICT",
179 [DLM_BAD_DEVICE_PATH] = "DLM_BAD_DEVICE_PATH",
180 [DLM_NO_DEVICE_PERMISSION] = "DLM_NO_DEVICE_PERMISSION",
 181 [DLM_NO_CONTROL_DEVICE] = "DLM_NO_CONTROL_DEVICE",
182 [DLM_RECOVERING] = "DLM_RECOVERING",
183 [DLM_MIGRATING] = "DLM_MIGRATING",
184 [DLM_MAXSTATS] = "DLM_MAXSTATS",
185};
186
187static const char *dlm_errmsgs[] = {
188 [DLM_NORMAL] = "request in progress",
189 [DLM_GRANTED] = "request granted",
190 [DLM_DENIED] = "request denied",
191 [DLM_DENIED_NOLOCKS] = "request denied, out of system resources",
192 [DLM_WORKING] = "async request in progress",
193 [DLM_BLOCKED] = "lock request blocked",
 194 [DLM_BLOCKED_ORPHAN] = "lock request blocked by an orphan lock",
195 [DLM_DENIED_GRACE_PERIOD] = "topological change in progress",
196 [DLM_SYSERR] = "system error",
197 [DLM_NOSUPPORT] = "unsupported",
198 [DLM_CANCELGRANT] = "can't cancel convert: already granted",
199 [DLM_IVLOCKID] = "bad lockid",
200 [DLM_SYNC] = "synchronous request granted",
201 [DLM_BADTYPE] = "bad resource type",
202 [DLM_BADRESOURCE] = "bad resource handle",
203 [DLM_MAXHANDLES] = "no more resource handles",
204 [DLM_NOCLINFO] = "can't contact cluster manager",
205 [DLM_NOLOCKMGR] = "can't contact lock manager",
206 [DLM_NOPURGED] = "can't contact purge daemon",
207 [DLM_BADARGS] = "bad api args",
208 [DLM_VOID] = "no status",
209 [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed",
210 [DLM_IVBUFLEN] = "invalid resource name length",
211 [DLM_CVTUNGRANT] = "attempted to convert ungranted lock",
212 [DLM_BADPARAM] = "invalid lock mode specified",
213 [DLM_VALNOTVALID] = "value block has been invalidated",
214 [DLM_REJECTED] = "request rejected, unrecognized client",
215 [DLM_ABORT] = "blocked lock request cancelled",
216 [DLM_CANCEL] = "conversion request cancelled",
217 [DLM_IVRESHANDLE] = "invalid resource handle",
218 [DLM_DEADLOCK] = "deadlock recovery refused this request",
219 [DLM_DENIED_NOASTS] = "failed to allocate AST",
220 [DLM_FORWARD] = "request must wait for primary's response",
221 [DLM_TIMEOUT] = "timeout value for lock has expired",
222 [DLM_IVGROUPID] = "invalid group specification",
223 [DLM_VERS_CONFLICT] = "version conflicts prevent request handling",
224 [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong",
225 [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device",
 226 [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device",
227 [DLM_RECOVERING] = "lock resource being recovered",
228 [DLM_MIGRATING] = "lock resource being migrated",
229 [DLM_MAXSTATS] = "invalid error number",
230};
231
232const char *dlm_errmsg(enum dlm_status err)
233{
234 if (err >= DLM_MAXSTATS || err < 0)
235 return dlm_errmsgs[DLM_MAXSTATS];
236 return dlm_errmsgs[err];
237}
238EXPORT_SYMBOL_GPL(dlm_errmsg);
239
240const char *dlm_errname(enum dlm_status err)
241{
242 if (err >= DLM_MAXSTATS || err < 0)
243 return dlm_errnames[DLM_MAXSTATS];
244 return dlm_errnames[err];
245}
246EXPORT_SYMBOL_GPL(dlm_errname);
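/* Editor's sketch: typical use of the two lookup helpers above at a
 * hypothetical error site:
 *
 *	enum dlm_status st = DLM_IVLOCKID;
 *	mlog(ML_ERROR, "%s: %s\n", dlm_errname(st), dlm_errmsg(st));
 */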
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 00000000000..6858510c3cc
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,30 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdebug.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 21 * Boston, MA 02111-1307, USA.
22 *
23 */
24
25#ifndef DLMDEBUG_H
26#define DLMDEBUG_H
27
28void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
29
30#endif
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
new file mode 100644
index 00000000000..da3c22045f8
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -0,0 +1,1469 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdomain.c
5 *
6 * defines domain join / leave apis
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h>
33#include <linux/spinlock.h>
34#include <linux/delay.h>
35#include <linux/err.h>
36
37#include "cluster/heartbeat.h"
38#include "cluster/nodemanager.h"
39#include "cluster/tcp.h"
40
41#include "dlmapi.h"
42#include "dlmcommon.h"
43
44#include "dlmdebug.h"
45#include "dlmdomain.h"
46
47#include "dlmver.h"
48
49#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
50#include "cluster/masklog.h"
51
52/*
53 *
54 * spinlock lock ordering: if multiple locks are needed, obey this ordering:
55 * dlm_domain_lock
56 * struct dlm_ctxt->spinlock
57 * struct dlm_lock_resource->spinlock
58 * struct dlm_ctxt->master_lock
59 * struct dlm_ctxt->ast_lock
60 * dlm_master_list_entry->spinlock
61 * dlm_lock->spinlock
62 *
63 */
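/*
 * An illustrative nesting of the ordering above (a sketch only --
 * no real caller takes all of these at once; release in the reverse
 * of the acquisition order):
 *
 *	spin_lock(&dlm_domain_lock);
 *	spin_lock(&dlm->spinlock);
 *	spin_lock(&res->spinlock);
 *	spin_lock(&dlm->master_lock);
 *	...
 *	spin_unlock(&dlm->master_lock);
 *	spin_unlock(&res->spinlock);
 *	spin_unlock(&dlm->spinlock);
 *	spin_unlock(&dlm_domain_lock);
 */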
64
65spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
66LIST_HEAD(dlm_domains);
67static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
68
69#define DLM_DOMAIN_BACKOFF_MS 200
70
71static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data);
72static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data);
73static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data);
74static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data);
75
76static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
77
78void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
79{
80 list_del_init(&lockres->list);
81 dlm_lockres_put(lockres);
82}
83
84void __dlm_insert_lockres(struct dlm_ctxt *dlm,
85 struct dlm_lock_resource *res)
86{
87 struct list_head *bucket;
88 struct qstr *q;
89
90 assert_spin_locked(&dlm->spinlock);
91
92 q = &res->lockname;
93 q->hash = full_name_hash(q->name, q->len);
94 bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]);
95
96 /* get a reference for our hashtable */
97 dlm_lockres_get(res);
98
99 list_add_tail(&res->list, bucket);
100}
101
102struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
103 const char *name,
104 unsigned int len)
105{
106 unsigned int hash;
107 struct list_head *iter;
108 struct dlm_lock_resource *tmpres=NULL;
109 struct list_head *bucket;
110
111 mlog_entry("%.*s\n", len, name);
112
113 assert_spin_locked(&dlm->spinlock);
114
115 hash = full_name_hash(name, len);
116
117 bucket = &(dlm->resources[hash & DLM_HASH_MASK]);
118
119 /* check for pre-existing lock */
120 list_for_each(iter, bucket) {
121 tmpres = list_entry(iter, struct dlm_lock_resource, list);
122 if (tmpres->lockname.len == len &&
123 memcmp(tmpres->lockname.name, name, len) == 0) {
124 dlm_lockres_get(tmpres);
125 break;
126 }
127
128 tmpres = NULL;
129 }
130 return tmpres;
131}
132
133struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
134 const char *name,
135 unsigned int len)
136{
137 struct dlm_lock_resource *res;
138
139 spin_lock(&dlm->spinlock);
140 res = __dlm_lookup_lockres(dlm, name, len);
141 spin_unlock(&dlm->spinlock);
142 return res;
143}
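/*
 * Caller sketch for the lookup above (the function name is
 * hypothetical): a hit returns the lockres with an extra reference
 * taken under dlm->spinlock, which the caller must drop with
 * dlm_lockres_put().
 */
static void example_touch_lockres(struct dlm_ctxt *dlm,
				  const char *name, unsigned int len)
{
	struct dlm_lock_resource *res;

	res = dlm_lookup_lockres(dlm, name, len);
	if (!res)
		return;		/* nothing hashed under this name */
	/* ... inspect res, taking res->spinlock as needed ... */
	dlm_lockres_put(res);	/* drop the lookup reference */
}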
144
145static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
146{
147 struct dlm_ctxt *tmp = NULL;
148 struct list_head *iter;
149
150 assert_spin_locked(&dlm_domain_lock);
151
152 /* tmp->name here is always NULL terminated,
153 * but domain may not be! */
154 list_for_each(iter, &dlm_domains) {
155 tmp = list_entry (iter, struct dlm_ctxt, list);
156 if (strlen(tmp->name) == len &&
157 memcmp(tmp->name, domain, len)==0)
158 break;
159 tmp = NULL;
160 }
161
162 return tmp;
163}
164
165/* For null terminated domain strings ONLY */
166static struct dlm_ctxt * __dlm_lookup_domain(const char *domain)
167{
168 assert_spin_locked(&dlm_domain_lock);
169
170 return __dlm_lookup_domain_full(domain, strlen(domain));
171}
172
173
174/* returns true on one of two conditions:
175 * 1) the domain does not exist
176 * 2) the domain exists and its state is "joined" */
177static int dlm_wait_on_domain_helper(const char *domain)
178{
179 int ret = 0;
180 struct dlm_ctxt *tmp = NULL;
181
182 spin_lock(&dlm_domain_lock);
183
184 tmp = __dlm_lookup_domain(domain);
185 if (!tmp)
186 ret = 1;
187 else if (tmp->dlm_state == DLM_CTXT_JOINED)
188 ret = 1;
189
190 spin_unlock(&dlm_domain_lock);
191 return ret;
192}
193
194static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
195{
196 if (dlm->resources)
197 free_page((unsigned long) dlm->resources);
198
199 if (dlm->name)
200 kfree(dlm->name);
201
202 kfree(dlm);
203}
204
205/* A little strange - this function will be called while holding
206 * dlm_domain_lock and is expected to be holding it on the way out. We
207 * will however drop and reacquire it multiple times */
208static void dlm_ctxt_release(struct kref *kref)
209{
210 struct dlm_ctxt *dlm;
211
212 dlm = container_of(kref, struct dlm_ctxt, dlm_refs);
213
214 BUG_ON(dlm->num_joins);
215 BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED);
216
217 /* we may still be in the list if we hit an error during join. */
218 list_del_init(&dlm->list);
219
220 spin_unlock(&dlm_domain_lock);
221
222 mlog(0, "freeing memory from domain %s\n", dlm->name);
223
224 wake_up(&dlm_domain_events);
225
226 dlm_free_ctxt_mem(dlm);
227
228 spin_lock(&dlm_domain_lock);
229}
230
231void dlm_put(struct dlm_ctxt *dlm)
232{
233 spin_lock(&dlm_domain_lock);
234 kref_put(&dlm->dlm_refs, dlm_ctxt_release);
235 spin_unlock(&dlm_domain_lock);
236}
237
238static void __dlm_get(struct dlm_ctxt *dlm)
239{
240 kref_get(&dlm->dlm_refs);
241}
242
243/* given a questionable reference to a dlm object, gets a reference if
244 * it can find it in the list, otherwise returns NULL in which case
245 * you shouldn't trust your pointer. */
246struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
247{
248 struct list_head *iter;
249 struct dlm_ctxt *target = NULL;
250
251 spin_lock(&dlm_domain_lock);
252
253 list_for_each(iter, &dlm_domains) {
254 target = list_entry (iter, struct dlm_ctxt, list);
255
256 if (target == dlm) {
257 __dlm_get(target);
258 break;
259 }
260
261 target = NULL;
262 }
263
264 spin_unlock(&dlm_domain_lock);
265
266 return target;
267}
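/*
 * Typical handler-side use of dlm_grab() (a sketch; see
 * dlm_exit_domain_handler() below for the real pattern): net handlers
 * receive a possibly-stale dlm pointer as private data, so they must
 * revalidate it before touching the context.
 *
 *	if (!dlm_grab(dlm))
 *		return 0;	(domain already gone, drop the message)
 *	... handle the message ...
 *	dlm_put(dlm);
 */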
268
269int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
270{
271 int ret;
272
273 spin_lock(&dlm_domain_lock);
274 ret = (dlm->dlm_state == DLM_CTXT_JOINED) ||
275 (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN);
276 spin_unlock(&dlm_domain_lock);
277
278 return ret;
279}
280
281static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
282{
283 dlm_unregister_domain_handlers(dlm);
284 dlm_complete_thread(dlm);
285 dlm_complete_recovery_thread(dlm);
286
287 /* We've left the domain. Now we can take ourselves out of the
288 * list and allow the kref stuff to help us free the
289 * memory. */
290 spin_lock(&dlm_domain_lock);
291 list_del_init(&dlm->list);
292 spin_unlock(&dlm_domain_lock);
293
294 /* Wake up anyone waiting for us to remove this domain */
295 wake_up(&dlm_domain_events);
296}
297
298static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
299{
300 int i;
301 struct dlm_lock_resource *res;
302
303 mlog(0, "Migrating locks from domain %s\n", dlm->name);
304restart:
305 spin_lock(&dlm->spinlock);
306 for (i=0; i<DLM_HASH_SIZE; i++) {
307 while (!list_empty(&dlm->resources[i])) {
308 res = list_entry(dlm->resources[i].next,
309 struct dlm_lock_resource, list);
310 /* need reference when manually grabbing lockres */
311 dlm_lockres_get(res);
312 /* this should unhash the lockres
313 * and exit with dlm->spinlock */
314 mlog(0, "purging res=%p\n", res);
315 if (dlm_lockres_is_dirty(dlm, res)) {
316 /* HACK! this should absolutely go.
317 * need to figure out why some empty
318 * lockreses are still marked dirty */
319 mlog(ML_ERROR, "lockres %.*s dirty!\n",
320 res->lockname.len, res->lockname.name);
321
322 spin_unlock(&dlm->spinlock);
323 dlm_kick_thread(dlm, res);
324 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
325 dlm_lockres_put(res);
326 goto restart;
327 }
328 dlm_purge_lockres(dlm, res);
329 dlm_lockres_put(res);
330 }
331 }
332 spin_unlock(&dlm->spinlock);
333
334 mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
335}
336
337static int dlm_no_joining_node(struct dlm_ctxt *dlm)
338{
339 int ret;
340
341 spin_lock(&dlm->spinlock);
342 ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN;
343 spin_unlock(&dlm->spinlock);
344
345 return ret;
346}
347
348static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
349{
350 /* Yikes, a double spinlock! I need domain_lock for the dlm
351 * state and the dlm spinlock for join state... Sorry! */
352again:
353 spin_lock(&dlm_domain_lock);
354 spin_lock(&dlm->spinlock);
355
356 if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
357 mlog(0, "Node %d is joining, we wait on it.\n",
358 dlm->joining_node);
359 spin_unlock(&dlm->spinlock);
360 spin_unlock(&dlm_domain_lock);
361
362 wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm));
363 goto again;
364 }
365
366 dlm->dlm_state = DLM_CTXT_LEAVING;
367 spin_unlock(&dlm->spinlock);
368 spin_unlock(&dlm_domain_lock);
369}
370
371static void __dlm_print_nodes(struct dlm_ctxt *dlm)
372{
373 int node = -1;
374
375 assert_spin_locked(&dlm->spinlock);
376
377 mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name);
378
379 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
380 node + 1)) < O2NM_MAX_NODES) {
381 mlog(ML_NOTICE, " node %d\n", node);
382 }
383}
384
385static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
386{
387 struct dlm_ctxt *dlm = data;
388 unsigned int node;
389 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
390
391	mlog_entry("%p %u %p\n", msg, len, data);
392
393 if (!dlm_grab(dlm))
394 return 0;
395
396 node = exit_msg->node_idx;
397
398 mlog(0, "Node %u leaves domain %s\n", node, dlm->name);
399
400 spin_lock(&dlm->spinlock);
401 clear_bit(node, dlm->domain_map);
402 __dlm_print_nodes(dlm);
403
404 /* notify anything attached to the heartbeat events */
405 dlm_hb_event_notify_attached(dlm, node, 0);
406
407 spin_unlock(&dlm->spinlock);
408
409 dlm_put(dlm);
410
411 return 0;
412}
413
414static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
415 unsigned int node)
416{
417 int status;
418 struct dlm_exit_domain leave_msg;
419
420 mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
421 node, dlm->name, dlm->node_num);
422
423 memset(&leave_msg, 0, sizeof(leave_msg));
424 leave_msg.node_idx = dlm->node_num;
425
426 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
427 &leave_msg, sizeof(leave_msg), node,
428 NULL);
429
430 mlog(0, "status return %d from o2net_send_message\n", status);
431
432 return status;
433}
434
435
436static void dlm_leave_domain(struct dlm_ctxt *dlm)
437{
438 int node, clear_node, status;
439
440 /* At this point we've migrated away all our locks and won't
441 * accept mastership of new ones. The dlm is responsible for
442 * almost nothing now. We make sure not to confuse any joining
443 * nodes and then commence shutdown procedure. */
444
445 spin_lock(&dlm->spinlock);
446 /* Clear ourselves from the domain map */
447 clear_bit(dlm->node_num, dlm->domain_map);
448 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
449 0)) < O2NM_MAX_NODES) {
450 /* Drop the dlm spinlock. This is safe wrt the domain_map.
451 * -nodes cannot be added now as the
452	 * query_join handler knows to respond with OK_NO_MAP
453 * -we catch the right network errors if a node is
454 * removed from the map while we're sending him the
455 * exit message. */
456 spin_unlock(&dlm->spinlock);
457
458 clear_node = 1;
459
460 status = dlm_send_one_domain_exit(dlm, node);
461 if (status < 0 &&
462 status != -ENOPROTOOPT &&
463 status != -ENOTCONN) {
464 mlog(ML_NOTICE, "Error %d sending domain exit message "
465 "to node %d\n", status, node);
466
467			/* Not sure what to do here but let's sleep for
468 * a bit in case this was a transient
469 * error... */
470 msleep(DLM_DOMAIN_BACKOFF_MS);
471 clear_node = 0;
472 }
473
474 spin_lock(&dlm->spinlock);
475 /* If we're not clearing the node bit then we intend
476 * to loop back around to try again. */
477 if (clear_node)
478 clear_bit(node, dlm->domain_map);
479 }
480 spin_unlock(&dlm->spinlock);
481}
482
483int dlm_joined(struct dlm_ctxt *dlm)
484{
485 int ret = 0;
486
487 spin_lock(&dlm_domain_lock);
488
489 if (dlm->dlm_state == DLM_CTXT_JOINED)
490 ret = 1;
491
492 spin_unlock(&dlm_domain_lock);
493
494 return ret;
495}
496
497int dlm_shutting_down(struct dlm_ctxt *dlm)
498{
499 int ret = 0;
500
501 spin_lock(&dlm_domain_lock);
502
503 if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
504 ret = 1;
505
506 spin_unlock(&dlm_domain_lock);
507
508 return ret;
509}
510
511void dlm_unregister_domain(struct dlm_ctxt *dlm)
512{
513 int leave = 0;
514
515 spin_lock(&dlm_domain_lock);
516 BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
517 BUG_ON(!dlm->num_joins);
518
519 dlm->num_joins--;
520 if (!dlm->num_joins) {
521 /* We mark it "in shutdown" now so new register
522 * requests wait until we've completely left the
523 * domain. Don't use DLM_CTXT_LEAVING yet as we still
524 * want new domain joins to communicate with us at
525 * least until we've completed migration of our
526 * resources. */
527 dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
528 leave = 1;
529 }
530 spin_unlock(&dlm_domain_lock);
531
532 if (leave) {
533 mlog(0, "shutting down domain %s\n", dlm->name);
534
535 /* We changed dlm state, notify the thread */
536 dlm_kick_thread(dlm, NULL);
537
538 dlm_migrate_all_locks(dlm);
539 dlm_mark_domain_leaving(dlm);
540 dlm_leave_domain(dlm);
541 dlm_complete_dlm_shutdown(dlm);
542 }
543 dlm_put(dlm);
544}
545EXPORT_SYMBOL_GPL(dlm_unregister_domain);
546
547static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
548{
549 struct dlm_query_join_request *query;
550 enum dlm_query_join_response response;
551 struct dlm_ctxt *dlm = NULL;
552
553 query = (struct dlm_query_join_request *) msg->buf;
554
555 mlog(0, "node %u wants to join domain %s\n", query->node_idx,
556 query->domain);
557
558 /*
559 * If heartbeat doesn't consider the node live, tell it
560 * to back off and try again. This gives heartbeat a chance
561 * to catch up.
562 */
563 if (!o2hb_check_node_heartbeating(query->node_idx)) {
564 mlog(0, "node %u is not in our live map yet\n",
565 query->node_idx);
566
567 response = JOIN_DISALLOW;
568 goto respond;
569 }
570
571 response = JOIN_OK_NO_MAP;
572
573 spin_lock(&dlm_domain_lock);
574 dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
575 /* Once the dlm ctxt is marked as leaving then we don't want
576 * to be put in someone's domain map. */
577 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
578 spin_lock(&dlm->spinlock);
579
580 if (dlm->dlm_state == DLM_CTXT_NEW &&
581 dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) {
582			/* If this is a brand new context and we
583 * haven't started our join process yet, then
584 * the other node won the race. */
585 response = JOIN_OK_NO_MAP;
586 } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
587 /* Disallow parallel joins. */
588 response = JOIN_DISALLOW;
589 } else {
590 /* Alright we're fully a part of this domain
591 * so we keep some state as to who's joining
592			 * and indicate to him what needs to be fixed
593 * up. */
594 response = JOIN_OK;
595 __dlm_set_joining_node(dlm, query->node_idx);
596 }
597
598 spin_unlock(&dlm->spinlock);
599 }
600 spin_unlock(&dlm_domain_lock);
601
602respond:
603 mlog(0, "We respond with %u\n", response);
604
605 return response;
606}
607
608static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
609{
610 struct dlm_assert_joined *assert;
611 struct dlm_ctxt *dlm = NULL;
612
613 assert = (struct dlm_assert_joined *) msg->buf;
614
615 mlog(0, "node %u asserts join on domain %s\n", assert->node_idx,
616 assert->domain);
617
618 spin_lock(&dlm_domain_lock);
619 dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len);
620 /* XXX should we consider no dlm ctxt an error? */
621 if (dlm) {
622 spin_lock(&dlm->spinlock);
623
624 /* Alright, this node has officially joined our
625 * domain. Set him in the map and clean up our
626 * leftover join state. */
627 BUG_ON(dlm->joining_node != assert->node_idx);
628 set_bit(assert->node_idx, dlm->domain_map);
629 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
630
631 __dlm_print_nodes(dlm);
632
633 /* notify anything attached to the heartbeat events */
634 dlm_hb_event_notify_attached(dlm, assert->node_idx, 1);
635
636 spin_unlock(&dlm->spinlock);
637 }
638 spin_unlock(&dlm_domain_lock);
639
640 return 0;
641}
642
643static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data)
644{
645 struct dlm_cancel_join *cancel;
646 struct dlm_ctxt *dlm = NULL;
647
648 cancel = (struct dlm_cancel_join *) msg->buf;
649
650 mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx,
651 cancel->domain);
652
653 spin_lock(&dlm_domain_lock);
654 dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len);
655
656 if (dlm) {
657 spin_lock(&dlm->spinlock);
658
659 /* Yikes, this guy wants to cancel his join. No
660		 * problem, we simply clean up our join state. */
661 BUG_ON(dlm->joining_node != cancel->node_idx);
662 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
663
664 spin_unlock(&dlm->spinlock);
665 }
666 spin_unlock(&dlm_domain_lock);
667
668 return 0;
669}
670
671static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
672 unsigned int node)
673{
674 int status;
675 struct dlm_cancel_join cancel_msg;
676
677 memset(&cancel_msg, 0, sizeof(cancel_msg));
678 cancel_msg.node_idx = dlm->node_num;
679 cancel_msg.name_len = strlen(dlm->name);
680 memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len);
681
682 status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
683 &cancel_msg, sizeof(cancel_msg), node,
684 NULL);
685 if (status < 0) {
686 mlog_errno(status);
687 goto bail;
688 }
689
690bail:
691 return status;
692}
693
694/* map_size should be in bytes. */
695static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
696 unsigned long *node_map,
697 unsigned int map_size)
698{
699 int status, tmpstat;
700 unsigned int node;
701
702 if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
703 sizeof(unsigned long))) {
704 mlog(ML_ERROR,
705 "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n",
706 map_size, BITS_TO_LONGS(O2NM_MAX_NODES));
707 return -EINVAL;
708 }
709
710 status = 0;
711 node = -1;
712 while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
713 node + 1)) < O2NM_MAX_NODES) {
714 if (node == dlm->node_num)
715 continue;
716
717 tmpstat = dlm_send_one_join_cancel(dlm, node);
718 if (tmpstat) {
719 mlog(ML_ERROR, "Error return %d cancelling join on "
720 "node %d\n", tmpstat, node);
721 if (!status)
722 status = tmpstat;
723 }
724 }
725
726 if (status)
727 mlog_errno(status);
728 return status;
729}
730
731static int dlm_request_join(struct dlm_ctxt *dlm,
732 int node,
733 enum dlm_query_join_response *response)
734{
735 int status, retval;
736 struct dlm_query_join_request join_msg;
737
738 mlog(0, "querying node %d\n", node);
739
740 memset(&join_msg, 0, sizeof(join_msg));
741 join_msg.node_idx = dlm->node_num;
742 join_msg.name_len = strlen(dlm->name);
743 memcpy(join_msg.domain, dlm->name, join_msg.name_len);
744
745 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
746 sizeof(join_msg), node, &retval);
747 if (status < 0 && status != -ENOPROTOOPT) {
748 mlog_errno(status);
749 goto bail;
750 }
751
752 /* -ENOPROTOOPT from the net code means the other side isn't
753 listening for our message type -- that's fine, it means
754 his dlm isn't up, so we can consider him a 'yes' but not
755 joined into the domain. */
756 if (status == -ENOPROTOOPT) {
757 status = 0;
758 *response = JOIN_OK_NO_MAP;
759 } else if (retval == JOIN_DISALLOW ||
760 retval == JOIN_OK ||
761 retval == JOIN_OK_NO_MAP) {
762 *response = retval;
763 } else {
764 status = -EINVAL;
765 mlog(ML_ERROR, "invalid response %d from node %u\n", retval,
766 node);
767 }
768
769 mlog(0, "status %d, node %d response is %d\n", status, node,
770 *response);
771
772bail:
773 return status;
774}
775
776static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
777 unsigned int node)
778{
779 int status;
780 struct dlm_assert_joined assert_msg;
781
782 mlog(0, "Sending join assert to node %u\n", node);
783
784 memset(&assert_msg, 0, sizeof(assert_msg));
785 assert_msg.node_idx = dlm->node_num;
786 assert_msg.name_len = strlen(dlm->name);
787 memcpy(assert_msg.domain, dlm->name, assert_msg.name_len);
788
789 status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
790 &assert_msg, sizeof(assert_msg), node,
791 NULL);
792 if (status < 0)
793 mlog_errno(status);
794
795 return status;
796}
797
798static void dlm_send_join_asserts(struct dlm_ctxt *dlm,
799 unsigned long *node_map)
800{
801 int status, node, live;
802
803 status = 0;
804 node = -1;
805 while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
806 node + 1)) < O2NM_MAX_NODES) {
807 if (node == dlm->node_num)
808 continue;
809
810 do {
811 /* It is very important that this message be
812 * received so we spin until either the node
813 * has died or it gets the message. */
814 status = dlm_send_one_join_assert(dlm, node);
815
816 spin_lock(&dlm->spinlock);
817 live = test_bit(node, dlm->live_nodes_map);
818 spin_unlock(&dlm->spinlock);
819
820 if (status) {
821 mlog(ML_ERROR, "Error return %d asserting "
822 "join on node %d\n", status, node);
823
824 /* give us some time between errors... */
825 if (live)
826 msleep(DLM_DOMAIN_BACKOFF_MS);
827 }
828 } while (status && live);
829 }
830}
831
832struct domain_join_ctxt {
833 unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
834 unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
835};
836
837static int dlm_should_restart_join(struct dlm_ctxt *dlm,
838 struct domain_join_ctxt *ctxt,
839 enum dlm_query_join_response response)
840{
841 int ret;
842
843 if (response == JOIN_DISALLOW) {
844 mlog(0, "Latest response of disallow -- should restart\n");
845 return 1;
846 }
847
848 spin_lock(&dlm->spinlock);
849 /* For now, we restart the process if the node maps have
850 * changed at all */
851 ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
852 sizeof(dlm->live_nodes_map));
853 spin_unlock(&dlm->spinlock);
854
855 if (ret)
856 mlog(0, "Node maps changed -- should restart\n");
857
858 return ret;
859}
860
861static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
862{
863 int status = 0, tmpstat, node;
864 struct domain_join_ctxt *ctxt;
865 enum dlm_query_join_response response;
866
867 mlog_entry("%p", dlm);
868
869 ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL);
870 if (!ctxt) {
871 status = -ENOMEM;
872 mlog_errno(status);
873 goto bail;
874 }
875
876 /* group sem locking should work for us here -- we're already
877 * registered for heartbeat events so filling this should be
878 * atomic wrt getting those handlers called. */
879 o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
880
881 spin_lock(&dlm->spinlock);
882 memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
883
884 __dlm_set_joining_node(dlm, dlm->node_num);
885
886 spin_unlock(&dlm->spinlock);
887
888 node = -1;
889 while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES,
890 node + 1)) < O2NM_MAX_NODES) {
891 if (node == dlm->node_num)
892 continue;
893
894 status = dlm_request_join(dlm, node, &response);
895 if (status < 0) {
896 mlog_errno(status);
897 goto bail;
898 }
899
900 /* Ok, either we got a response or the node doesn't have a
901 * dlm up. */
902 if (response == JOIN_OK)
903 set_bit(node, ctxt->yes_resp_map);
904
905 if (dlm_should_restart_join(dlm, ctxt, response)) {
906 status = -EAGAIN;
907 goto bail;
908 }
909 }
910
911 mlog(0, "Yay, done querying nodes!\n");
912
913	/* Yay, everyone agrees we can join the domain. My domain is
914 * comprised of all nodes who were put in the
915 * yes_resp_map. Copy that into our domain map and send a join
916	 * assert message to clean up everyone else's state. */
917 spin_lock(&dlm->spinlock);
918 memcpy(dlm->domain_map, ctxt->yes_resp_map,
919 sizeof(ctxt->yes_resp_map));
920 set_bit(dlm->node_num, dlm->domain_map);
921 spin_unlock(&dlm->spinlock);
922
923 dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
924
925 /* Joined state *must* be set before the joining node
926 * information, otherwise the query_join handler may read no
927 * current joiner but a state of NEW and tell joining nodes
928 * we're not in the domain. */
929 spin_lock(&dlm_domain_lock);
930 dlm->dlm_state = DLM_CTXT_JOINED;
931 dlm->num_joins++;
932 spin_unlock(&dlm_domain_lock);
933
934bail:
935 spin_lock(&dlm->spinlock);
936 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
937 if (!status)
938 __dlm_print_nodes(dlm);
939 spin_unlock(&dlm->spinlock);
940
941 if (ctxt) {
942 /* Do we need to send a cancel message to any nodes? */
943 if (status < 0) {
944 tmpstat = dlm_send_join_cancels(dlm,
945 ctxt->yes_resp_map,
946 sizeof(ctxt->yes_resp_map));
947 if (tmpstat < 0)
948 mlog_errno(tmpstat);
949 }
950 kfree(ctxt);
951 }
952
953 mlog(0, "returning %d\n", status);
954 return status;
955}
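/*
 * The join protocol implemented above, condensed (message names are
 * the ones used in this file):
 *
 *	1. DLM_QUERY_JOIN_MSG to every live node; each answers
 *	   JOIN_OK, JOIN_OK_NO_MAP or JOIN_DISALLOW.
 *	2. Any JOIN_DISALLOW, or a change in the live node map,
 *	   restarts the whole attempt via -EAGAIN.
 *	3. On success the yes_resp_map becomes the domain_map and
 *	   DLM_ASSERT_JOINED_MSG goes to every yes responder.
 *	4. On failure DLM_CANCEL_JOIN_MSG undoes the partial joins.
 */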
956
957static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
958{
959 o2hb_unregister_callback(&dlm->dlm_hb_up);
960 o2hb_unregister_callback(&dlm->dlm_hb_down);
961 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
962}
963
964static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
965{
966 int status;
967
968 mlog(0, "registering handlers.\n");
969
970 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
971 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
972 status = o2hb_register_callback(&dlm->dlm_hb_down);
973 if (status)
974 goto bail;
975
976 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
977 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
978 status = o2hb_register_callback(&dlm->dlm_hb_up);
979 if (status)
980 goto bail;
981
982 status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key,
983 sizeof(struct dlm_master_request),
984 dlm_master_request_handler,
985 dlm, &dlm->dlm_domain_handlers);
986 if (status)
987 goto bail;
988
989 status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
990 sizeof(struct dlm_assert_master),
991 dlm_assert_master_handler,
992 dlm, &dlm->dlm_domain_handlers);
993 if (status)
994 goto bail;
995
996 status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key,
997 sizeof(struct dlm_create_lock),
998 dlm_create_lock_handler,
999 dlm, &dlm->dlm_domain_handlers);
1000 if (status)
1001 goto bail;
1002
1003 status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key,
1004 DLM_CONVERT_LOCK_MAX_LEN,
1005 dlm_convert_lock_handler,
1006 dlm, &dlm->dlm_domain_handlers);
1007 if (status)
1008 goto bail;
1009
1010 status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key,
1011 DLM_UNLOCK_LOCK_MAX_LEN,
1012 dlm_unlock_lock_handler,
1013 dlm, &dlm->dlm_domain_handlers);
1014 if (status)
1015 goto bail;
1016
1017 status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key,
1018 DLM_PROXY_AST_MAX_LEN,
1019 dlm_proxy_ast_handler,
1020 dlm, &dlm->dlm_domain_handlers);
1021 if (status)
1022 goto bail;
1023
1024 status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key,
1025 sizeof(struct dlm_exit_domain),
1026 dlm_exit_domain_handler,
1027 dlm, &dlm->dlm_domain_handlers);
1028 if (status)
1029 goto bail;
1030
1031 status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key,
1032 sizeof(struct dlm_migrate_request),
1033 dlm_migrate_request_handler,
1034 dlm, &dlm->dlm_domain_handlers);
1035 if (status)
1036 goto bail;
1037
1038 status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key,
1039 DLM_MIG_LOCKRES_MAX_LEN,
1040 dlm_mig_lockres_handler,
1041 dlm, &dlm->dlm_domain_handlers);
1042 if (status)
1043 goto bail;
1044
1045 status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key,
1046 sizeof(struct dlm_master_requery),
1047 dlm_master_requery_handler,
1048 dlm, &dlm->dlm_domain_handlers);
1049 if (status)
1050 goto bail;
1051
1052 status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key,
1053 sizeof(struct dlm_lock_request),
1054 dlm_request_all_locks_handler,
1055 dlm, &dlm->dlm_domain_handlers);
1056 if (status)
1057 goto bail;
1058
1059 status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key,
1060 sizeof(struct dlm_reco_data_done),
1061 dlm_reco_data_done_handler,
1062 dlm, &dlm->dlm_domain_handlers);
1063 if (status)
1064 goto bail;
1065
1066 status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key,
1067 sizeof(struct dlm_begin_reco),
1068 dlm_begin_reco_handler,
1069 dlm, &dlm->dlm_domain_handlers);
1070 if (status)
1071 goto bail;
1072
1073 status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key,
1074 sizeof(struct dlm_finalize_reco),
1075 dlm_finalize_reco_handler,
1076 dlm, &dlm->dlm_domain_handlers);
1077 if (status)
1078 goto bail;
1079
1080bail:
1081 if (status)
1082 dlm_unregister_domain_handlers(dlm);
1083
1084 return status;
1085}
1086
1087static int dlm_join_domain(struct dlm_ctxt *dlm)
1088{
1089 int status;
1090
1091 BUG_ON(!dlm);
1092
1093 mlog(0, "Join domain %s\n", dlm->name);
1094
1095 status = dlm_register_domain_handlers(dlm);
1096 if (status) {
1097 mlog_errno(status);
1098 goto bail;
1099 }
1100
1101 status = dlm_launch_thread(dlm);
1102 if (status < 0) {
1103 mlog_errno(status);
1104 goto bail;
1105 }
1106
1107 status = dlm_launch_recovery_thread(dlm);
1108 if (status < 0) {
1109 mlog_errno(status);
1110 goto bail;
1111 }
1112
1113 do {
1114 unsigned int backoff;
1115 status = dlm_try_to_join_domain(dlm);
1116
1117 /* If we're racing another node to the join, then we
1118 * need to back off temporarily and let them
1119 * complete. */
1120 if (status == -EAGAIN) {
1121 if (signal_pending(current)) {
1122 status = -ERESTARTSYS;
1123 goto bail;
1124 }
1125
1126 /*
1127 * <chip> After you!
1128 * <dale> No, after you!
1129 * <chip> I insist!
1130 * <dale> But you first!
1131 * ...
1132 */
1133 backoff = (unsigned int)(jiffies & 0x3);
1134 backoff *= DLM_DOMAIN_BACKOFF_MS;
1135 mlog(0, "backoff %d\n", backoff);
1136 msleep(backoff);
1137 }
1138 } while (status == -EAGAIN);
1139
1140 if (status < 0) {
1141 mlog_errno(status);
1142 goto bail;
1143 }
1144
1145 status = 0;
1146bail:
1147 wake_up(&dlm_domain_events);
1148
1149 if (status) {
1150 dlm_unregister_domain_handlers(dlm);
1151 dlm_complete_thread(dlm);
1152 dlm_complete_recovery_thread(dlm);
1153 }
1154
1155 return status;
1156}
1157
1158static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1159 u32 key)
1160{
1161 int i;
1162 struct dlm_ctxt *dlm = NULL;
1163
1164 dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL);
1165 if (!dlm) {
1166 mlog_errno(-ENOMEM);
1167 goto leave;
1168 }
1169
1170 dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
1171 if (dlm->name == NULL) {
1172 mlog_errno(-ENOMEM);
1173 kfree(dlm);
1174 dlm = NULL;
1175 goto leave;
1176 }
1177
1178 dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
1179 if (!dlm->resources) {
1180 mlog_errno(-ENOMEM);
1181 kfree(dlm->name);
1182 kfree(dlm);
1183 dlm = NULL;
1184 goto leave;
1185 }
1186 memset(dlm->resources, 0, PAGE_SIZE);
1187
1188 for (i=0; i<DLM_HASH_SIZE; i++)
1189 INIT_LIST_HEAD(&dlm->resources[i]);
1190
1191 strcpy(dlm->name, domain);
1192 dlm->key = key;
1193 dlm->node_num = o2nm_this_node();
1194
1195 spin_lock_init(&dlm->spinlock);
1196 spin_lock_init(&dlm->master_lock);
1197 spin_lock_init(&dlm->ast_lock);
1198 INIT_LIST_HEAD(&dlm->list);
1199 INIT_LIST_HEAD(&dlm->dirty_list);
1200 INIT_LIST_HEAD(&dlm->reco.resources);
1201 INIT_LIST_HEAD(&dlm->reco.received);
1202 INIT_LIST_HEAD(&dlm->reco.node_data);
1203 INIT_LIST_HEAD(&dlm->purge_list);
1204 INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
1205 dlm->reco.state = 0;
1206
1207 INIT_LIST_HEAD(&dlm->pending_asts);
1208 INIT_LIST_HEAD(&dlm->pending_basts);
1209
1210 mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
1211 dlm->recovery_map, &(dlm->recovery_map[0]));
1212
1213 memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
1214 memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
1215 memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
1216
1217 dlm->dlm_thread_task = NULL;
1218 dlm->dlm_reco_thread_task = NULL;
1219 init_waitqueue_head(&dlm->dlm_thread_wq);
1220 init_waitqueue_head(&dlm->dlm_reco_thread_wq);
1221 init_waitqueue_head(&dlm->reco.event);
1222 init_waitqueue_head(&dlm->ast_wq);
1223 init_waitqueue_head(&dlm->migration_wq);
1224 INIT_LIST_HEAD(&dlm->master_list);
1225 INIT_LIST_HEAD(&dlm->mle_hb_events);
1226
1227 dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
1228 init_waitqueue_head(&dlm->dlm_join_events);
1229
1230 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
1231 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
1232 atomic_set(&dlm->local_resources, 0);
1233 atomic_set(&dlm->remote_resources, 0);
1234 atomic_set(&dlm->unknown_resources, 0);
1235
1236 spin_lock_init(&dlm->work_lock);
1237 INIT_LIST_HEAD(&dlm->work_list);
1238 INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm);
1239
1240 kref_init(&dlm->dlm_refs);
1241 dlm->dlm_state = DLM_CTXT_NEW;
1242
1243 INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
1244
1245 mlog(0, "context init: refcount %u\n",
1246 atomic_read(&dlm->dlm_refs.refcount));
1247
1248leave:
1249 return dlm;
1250}
1251
1252/*
1253 * dlm_register_domain: one-time setup per "domain"
1254 */
1255struct dlm_ctxt * dlm_register_domain(const char *domain,
1256 u32 key)
1257{
1258 int ret;
1259 struct dlm_ctxt *dlm = NULL;
1260 struct dlm_ctxt *new_ctxt = NULL;
1261
1262 if (strlen(domain) > O2NM_MAX_NAME_LEN) {
1263 ret = -ENAMETOOLONG;
1264 mlog(ML_ERROR, "domain name length too long\n");
1265 goto leave;
1266 }
1267
1268 if (!o2hb_check_local_node_heartbeating()) {
1269 mlog(ML_ERROR, "the local node has not been configured, or is "
1270 "not heartbeating\n");
1271 ret = -EPROTO;
1272 goto leave;
1273 }
1274
1275 mlog(0, "register called for domain \"%s\"\n", domain);
1276
1277retry:
1278 dlm = NULL;
1279 if (signal_pending(current)) {
1280 ret = -ERESTARTSYS;
1281 mlog_errno(ret);
1282 goto leave;
1283 }
1284
1285 spin_lock(&dlm_domain_lock);
1286
1287 dlm = __dlm_lookup_domain(domain);
1288 if (dlm) {
1289 if (dlm->dlm_state != DLM_CTXT_JOINED) {
1290 spin_unlock(&dlm_domain_lock);
1291
1292 mlog(0, "This ctxt is not joined yet!\n");
1293 wait_event_interruptible(dlm_domain_events,
1294 dlm_wait_on_domain_helper(
1295 domain));
1296 goto retry;
1297 }
1298
1299 __dlm_get(dlm);
1300 dlm->num_joins++;
1301
1302 spin_unlock(&dlm_domain_lock);
1303
1304 ret = 0;
1305 goto leave;
1306 }
1307
1308 /* doesn't exist */
1309 if (!new_ctxt) {
1310 spin_unlock(&dlm_domain_lock);
1311
1312 new_ctxt = dlm_alloc_ctxt(domain, key);
1313 if (new_ctxt)
1314 goto retry;
1315
1316 ret = -ENOMEM;
1317 mlog_errno(ret);
1318 goto leave;
1319 }
1320
1321 /* a little variable switch-a-roo here... */
1322 dlm = new_ctxt;
1323 new_ctxt = NULL;
1324
1325 /* add the new domain */
1326 list_add_tail(&dlm->list, &dlm_domains);
1327 spin_unlock(&dlm_domain_lock);
1328
1329 ret = dlm_join_domain(dlm);
1330 if (ret) {
1331 mlog_errno(ret);
1332 dlm_put(dlm);
1333 goto leave;
1334 }
1335
1336 ret = 0;
1337leave:
1338 if (new_ctxt)
1339 dlm_free_ctxt_mem(new_ctxt);
1340
1341 if (ret < 0)
1342 dlm = ERR_PTR(ret);
1343
1344 return dlm;
1345}
1346EXPORT_SYMBOL_GPL(dlm_register_domain);
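/*
 * A minimal consumer sketch (hypothetical function; a real caller
 * derives the key from its own identity): dlm_register_domain()
 * blocks until the domain is joined and returns an ERR_PTR() on
 * failure, so it must be checked with IS_ERR().
 */
static struct dlm_ctxt *example_attach_domain(const char *domain, u32 key)
{
	struct dlm_ctxt *dlm = dlm_register_domain(domain, key);

	if (IS_ERR(dlm))
		return NULL;	/* join failed or was interrupted */
	/* ... use the dlm; balance with dlm_unregister_domain(dlm) ... */
	return dlm;
}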
1347
1348static LIST_HEAD(dlm_join_handlers);
1349
1350static void dlm_unregister_net_handlers(void)
1351{
1352 o2net_unregister_handler_list(&dlm_join_handlers);
1353}
1354
1355static int dlm_register_net_handlers(void)
1356{
1357 int status = 0;
1358
1359 status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
1360 sizeof(struct dlm_query_join_request),
1361 dlm_query_join_handler,
1362 NULL, &dlm_join_handlers);
1363 if (status)
1364 goto bail;
1365
1366 status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1367 sizeof(struct dlm_assert_joined),
1368 dlm_assert_joined_handler,
1369 NULL, &dlm_join_handlers);
1370 if (status)
1371 goto bail;
1372
1373 status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
1374 sizeof(struct dlm_cancel_join),
1375 dlm_cancel_join_handler,
1376 NULL, &dlm_join_handlers);
1377
1378bail:
1379 if (status < 0)
1380 dlm_unregister_net_handlers();
1381
1382 return status;
1383}
1384
1385/* Domain eviction callback handling.
1386 *
1387 * The file system requires notification of node death *before* the
1388 * dlm completes its recovery work, otherwise it may be able to
1389 * acquire locks on resources requiring recovery. Since the dlm can
1390 * evict a node from its domain *before* heartbeat fires, a similar
1391 * mechanism is required. */
1392
1393/* Eviction is not expected to happen often, so a per-domain lock is
1394 * not necessary. Eviction callbacks are allowed to sleep for short
1395 * periods of time. */
1396static DECLARE_RWSEM(dlm_callback_sem);
1397
1398void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
1399 int node_num)
1400{
1401 struct list_head *iter;
1402 struct dlm_eviction_cb *cb;
1403
1404 down_read(&dlm_callback_sem);
1405 list_for_each(iter, &dlm->dlm_eviction_callbacks) {
1406 cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
1407
1408 cb->ec_func(node_num, cb->ec_data);
1409 }
1410 up_read(&dlm_callback_sem);
1411}
1412
1413void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
1414 dlm_eviction_func *f,
1415 void *data)
1416{
1417 INIT_LIST_HEAD(&cb->ec_item);
1418 cb->ec_func = f;
1419 cb->ec_data = data;
1420}
1421EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb);
1422
1423void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
1424 struct dlm_eviction_cb *cb)
1425{
1426 down_write(&dlm_callback_sem);
1427 list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks);
1428 up_write(&dlm_callback_sem);
1429}
1430EXPORT_SYMBOL_GPL(dlm_register_eviction_cb);
1431
1432void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb)
1433{
1434 down_write(&dlm_callback_sem);
1435 list_del_init(&cb->ec_item);
1436 up_write(&dlm_callback_sem);
1437}
1438EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb);
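/*
 * Registration sketch for the three exports above (the callback and
 * its data are hypothetical; the dlm_eviction_cb storage must outlive
 * the registration):
 *
 *	static void my_evict(int node_num, void *data)
 *	{
 *		... tear down state for node_num before dlm recovery ...
 *	}
 *
 *	dlm_setup_eviction_cb(&cb, my_evict, my_data);
 *	dlm_register_eviction_cb(dlm, &cb);
 *	...
 *	dlm_unregister_eviction_cb(&cb);
 */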
1439
1440static int __init dlm_init(void)
1441{
1442 int status;
1443
1444 dlm_print_version();
1445
1446 status = dlm_init_mle_cache();
1447 if (status)
1448 return -1;
1449
1450 status = dlm_register_net_handlers();
1451 if (status) {
1452 dlm_destroy_mle_cache();
1453 return -1;
1454 }
1455
1456 return 0;
1457}
1458
1459static void __exit dlm_exit (void)
1460{
1461 dlm_unregister_net_handlers();
1462 dlm_destroy_mle_cache();
1463}
1464
1465MODULE_AUTHOR("Oracle");
1466MODULE_LICENSE("GPL");
1467
1468module_init(dlm_init);
1469module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
new file mode 100644
index 00000000000..2f7f60bfeb3
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -0,0 +1,36 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdomain.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
22 *
23 */
24
25#ifndef DLMDOMAIN_H
26#define DLMDOMAIN_H
27
28extern spinlock_t dlm_domain_lock;
29extern struct list_head dlm_domains;
30
31int dlm_joined(struct dlm_ctxt *dlm);
32int dlm_shutting_down(struct dlm_ctxt *dlm);
33void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
34 int node_num);
35
36#endif
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
new file mode 100644
index 00000000000..dd2d24dc25e
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -0,0 +1,640 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfs.c
5 *
6 * Code which implements the kernel side of a minimal userspace
7 * interface to our DLM. This file handles the virtual file system
8 * used for communication with userspace. Credit should go to ramfs,
9 * which was a template for the fs side of this module.
10 *
11 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public
15 * License as published by the Free Software Foundation; either
16 * version 2 of the License, or (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public
24 * License along with this program; if not, write to the
25 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
26 * Boston, MA 02111-1307, USA.
27 */
28
29/* Simple VFS hooks based on: */
30/*
31 * Resizable simple ram filesystem for Linux.
32 *
33 * Copyright (C) 2000 Linus Torvalds.
34 * 2000 Transmeta Corp.
35 */
36
37#include <linux/module.h>
38#include <linux/fs.h>
39#include <linux/pagemap.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/highmem.h>
43#include <linux/init.h>
44#include <linux/string.h>
45#include <linux/smp_lock.h>
46#include <linux/backing-dev.h>
47
48#include <asm/uaccess.h>
49
50
51#include "cluster/nodemanager.h"
52#include "cluster/heartbeat.h"
53#include "cluster/tcp.h"
54
55#include "dlmapi.h"
56
57#include "userdlm.h"
58
59#include "dlmfsver.h"
60
61#define MLOG_MASK_PREFIX ML_DLMFS
62#include "cluster/masklog.h"
63
64static struct super_operations dlmfs_ops;
65static struct file_operations dlmfs_file_operations;
66static struct inode_operations dlmfs_dir_inode_operations;
67static struct inode_operations dlmfs_root_inode_operations;
68static struct inode_operations dlmfs_file_inode_operations;
69static kmem_cache_t *dlmfs_inode_cache;
70
71struct workqueue_struct *user_dlm_worker;
72
73/*
74 * decodes a set of open flags into a valid lock level and a set of flags.
75 * returns < 0 if we have invalid flags
76 * flags which mean something to us:
77 * O_RDONLY -> PRMODE level
78 * O_WRONLY -> EXMODE level
79 *
80 * O_NONBLOCK -> LKM_NOQUEUE
81 */
82static int dlmfs_decode_open_flags(int open_flags,
83 int *level,
84 int *flags)
85{
86 if (open_flags & (O_WRONLY|O_RDWR))
87 *level = LKM_EXMODE;
88 else
89 *level = LKM_PRMODE;
90
91 *flags = 0;
92 if (open_flags & O_NONBLOCK)
93 *flags |= LKM_NOQUEUE;
94
95 return 0;
96}
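/*
 * Examples of the resulting mapping (illustrative only):
 *
 *	open(path, O_RDONLY)            -> LKM_PRMODE, flags 0
 *	open(path, O_WRONLY)            -> LKM_EXMODE, flags 0
 *	open(path, O_RDWR | O_NONBLOCK) -> LKM_EXMODE, LKM_NOQUEUE
 */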
97
98static int dlmfs_file_open(struct inode *inode,
99 struct file *file)
100{
101 int status, level, flags;
102 struct dlmfs_filp_private *fp = NULL;
103 struct dlmfs_inode_private *ip;
104
105 if (S_ISDIR(inode->i_mode))
106 BUG();
107
108 mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
109 file->f_flags);
110
111 status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
112 if (status < 0)
113 goto bail;
114
115 /* We don't want to honor O_APPEND at read/write time as it
116 * doesn't make sense for LVB writes. */
117 file->f_flags &= ~O_APPEND;
118
119 fp = kmalloc(sizeof(*fp), GFP_KERNEL);
120 if (!fp) {
121 status = -ENOMEM;
122 goto bail;
123 }
124 fp->fp_lock_level = level;
125
126 ip = DLMFS_I(inode);
127
128 status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
129 if (status < 0) {
130 /* this is a strange error to return here but I want
131		 * userspace to be able to distinguish a valid
132		 * lock request from one that simply couldn't be
133 * granted. */
134 if (flags & LKM_NOQUEUE && status == -EAGAIN)
135 status = -ETXTBSY;
136 kfree(fp);
137 goto bail;
138 }
139
140 file->private_data = fp;
141bail:
142 return status;
143}
144
145static int dlmfs_file_release(struct inode *inode,
146 struct file *file)
147{
148 int level, status;
149 struct dlmfs_inode_private *ip = DLMFS_I(inode);
150 struct dlmfs_filp_private *fp =
151 (struct dlmfs_filp_private *) file->private_data;
152
153 if (S_ISDIR(inode->i_mode))
154 BUG();
155
156 mlog(0, "close called on inode %lu\n", inode->i_ino);
157
158 status = 0;
159 if (fp) {
160 level = fp->fp_lock_level;
161 if (level != LKM_IVMODE)
162 user_dlm_cluster_unlock(&ip->ip_lockres, level);
163
164 kfree(fp);
165 file->private_data = NULL;
166 }
167
168 return 0;
169}
170
171static ssize_t dlmfs_file_read(struct file *filp,
172 char __user *buf,
173 size_t count,
174 loff_t *ppos)
175{
176 int bytes_left;
177 ssize_t readlen;
178 char *lvb_buf;
179 struct inode *inode = filp->f_dentry->d_inode;
180
181 mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
182 inode->i_ino, count, *ppos);
183
184 if (*ppos >= i_size_read(inode))
185 return 0;
186
187 if (!count)
188 return 0;
189
190 if (!access_ok(VERIFY_WRITE, buf, count))
191 return -EFAULT;
192
193 /* don't read past the lvb */
194 if ((count + *ppos) > i_size_read(inode))
195 readlen = i_size_read(inode) - *ppos;
196 else
197		readlen = count;
198
199 lvb_buf = kmalloc(readlen, GFP_KERNEL);
200 if (!lvb_buf)
201 return -ENOMEM;
202
203 user_dlm_read_lvb(inode, lvb_buf, readlen);
204 bytes_left = __copy_to_user(buf, lvb_buf, readlen);
205 readlen -= bytes_left;
206
207 kfree(lvb_buf);
208
209 *ppos = *ppos + readlen;
210
211 mlog(0, "read %zd bytes\n", readlen);
212 return readlen;
213}
214
215static ssize_t dlmfs_file_write(struct file *filp,
216 const char __user *buf,
217 size_t count,
218 loff_t *ppos)
219{
220 int bytes_left;
221 ssize_t writelen;
222 char *lvb_buf;
223 struct inode *inode = filp->f_dentry->d_inode;
224
225 mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
226 inode->i_ino, count, *ppos);
227
228 if (*ppos >= i_size_read(inode))
229 return -ENOSPC;
230
231 if (!count)
232 return 0;
233
234 if (!access_ok(VERIFY_READ, buf, count))
235 return -EFAULT;
236
237 /* don't write past the lvb */
238 if ((count + *ppos) > i_size_read(inode))
239 writelen = i_size_read(inode) - *ppos;
240 else
241		writelen = count;
242
243 lvb_buf = kmalloc(writelen, GFP_KERNEL);
244 if (!lvb_buf)
245 return -ENOMEM;
246
247 bytes_left = copy_from_user(lvb_buf, buf, writelen);
248 writelen -= bytes_left;
249 if (writelen)
250 user_dlm_write_lvb(inode, lvb_buf, writelen);
251
252 kfree(lvb_buf);
253
254 *ppos = *ppos + writelen;
255 mlog(0, "wrote %zd bytes\n", writelen);
256 return writelen;
257}
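/*
 * The userspace view of the two methods above (a sketch; the mount
 * point and lock path are hypothetical): taking a cluster lock is
 * open(), the LVB is the file contents, and close() unlocks.
 *
 *	fd = open("/dlm/mydomain/mylock", O_RDWR);   (EXMODE lock)
 *	write(fd, lvb, DLM_LVB_LEN);                 (publish the LVB)
 *	close(fd);                                   (drop the lock)
 *
 * With O_NONBLOCK a busy lock fails with -ETXTBSY instead of waiting.
 */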
258
259static void dlmfs_init_once(void *foo,
260 kmem_cache_t *cachep,
261 unsigned long flags)
262{
263 struct dlmfs_inode_private *ip =
264 (struct dlmfs_inode_private *) foo;
265
266 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
267 SLAB_CTOR_CONSTRUCTOR) {
268 ip->ip_dlm = NULL;
269 ip->ip_parent = NULL;
270
271 inode_init_once(&ip->ip_vfs_inode);
272 }
273}
274
275static struct inode *dlmfs_alloc_inode(struct super_block *sb)
276{
277 struct dlmfs_inode_private *ip;
278
279 ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS);
280 if (!ip)
281 return NULL;
282
283 return &ip->ip_vfs_inode;
284}
285
286static void dlmfs_destroy_inode(struct inode *inode)
287{
288 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
289}
290
291static void dlmfs_clear_inode(struct inode *inode)
292{
293 int status;
294 struct dlmfs_inode_private *ip;
295
296 if (!inode)
297 return;
298
299 mlog(0, "inode %lu\n", inode->i_ino);
300
301 ip = DLMFS_I(inode);
302
303 if (S_ISREG(inode->i_mode)) {
304 status = user_dlm_destroy_lock(&ip->ip_lockres);
305 if (status < 0)
306 mlog_errno(status);
307 iput(ip->ip_parent);
308 goto clear_fields;
309 }
310
311 mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
312	/* we must be a directory. If required, let's unregister the
313 * dlm context now. */
314 if (ip->ip_dlm)
315 user_dlm_unregister_context(ip->ip_dlm);
316clear_fields:
317 ip->ip_parent = NULL;
318 ip->ip_dlm = NULL;
319}
320
321static struct backing_dev_info dlmfs_backing_dev_info = {
322 .ra_pages = 0, /* No readahead */
323 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
324};
325
326static struct inode *dlmfs_get_root_inode(struct super_block *sb)
327{
328 struct inode *inode = new_inode(sb);
329 int mode = S_IFDIR | 0755;
330 struct dlmfs_inode_private *ip;
331
332 if (inode) {
333 ip = DLMFS_I(inode);
334
335 inode->i_mode = mode;
336 inode->i_uid = current->fsuid;
337 inode->i_gid = current->fsgid;
338 inode->i_blksize = PAGE_CACHE_SIZE;
339 inode->i_blocks = 0;
340 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
341 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
342 inode->i_nlink++;
343
344 inode->i_fop = &simple_dir_operations;
345 inode->i_op = &dlmfs_root_inode_operations;
346 }
347
348 return inode;
349}
350
351static struct inode *dlmfs_get_inode(struct inode *parent,
352 struct dentry *dentry,
353 int mode)
354{
355 struct super_block *sb = parent->i_sb;
356 struct inode * inode = new_inode(sb);
357 struct dlmfs_inode_private *ip;
358
359 if (!inode)
360 return NULL;
361
362 inode->i_mode = mode;
363 inode->i_uid = current->fsuid;
364 inode->i_gid = current->fsgid;
365 inode->i_blksize = PAGE_CACHE_SIZE;
366 inode->i_blocks = 0;
367 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
368 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
369
370 ip = DLMFS_I(inode);
371 ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
372
373 switch (mode & S_IFMT) {
374 default:
375 /* for now we don't support anything other than
376 * directories and regular files. */
377 BUG();
378 break;
379 case S_IFREG:
380 inode->i_op = &dlmfs_file_inode_operations;
381 inode->i_fop = &dlmfs_file_operations;
382
383 i_size_write(inode, DLM_LVB_LEN);
384
385 user_dlm_lock_res_init(&ip->ip_lockres, dentry);
386
387		/* released at clear_inode time, this ensures that we
388 * get to drop the dlm reference on each lock *before*
389 * we call the unregister code for releasing parent
390 * directories. */
391 ip->ip_parent = igrab(parent);
392 BUG_ON(!ip->ip_parent);
393 break;
394 case S_IFDIR:
395 inode->i_op = &dlmfs_dir_inode_operations;
396 inode->i_fop = &simple_dir_operations;
397
398 /* directory inodes start off with i_nlink ==
399 * 2 (for "." entry) */
400 inode->i_nlink++;
401 break;
402 }
403
404 if (parent->i_mode & S_ISGID) {
405 inode->i_gid = parent->i_gid;
406 if (S_ISDIR(mode))
407 inode->i_mode |= S_ISGID;
408 }
409
410 return inode;
411}
412
413/*
414 * File creation. Allocate an inode, and we're done..
415 */
416/* SMP-safe */
417static int dlmfs_mkdir(struct inode * dir,
418 struct dentry * dentry,
419 int mode)
420{
421 int status;
422 struct inode *inode = NULL;
423 struct qstr *domain = &dentry->d_name;
424 struct dlmfs_inode_private *ip;
425 struct dlm_ctxt *dlm;
426
427 mlog(0, "mkdir %.*s\n", domain->len, domain->name);
428
429 /* verify that we have a proper domain */
430 if (domain->len >= O2NM_MAX_NAME_LEN) {
431 status = -EINVAL;
432 mlog(ML_ERROR, "invalid domain name for directory.\n");
433 goto bail;
434 }
435
436 inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
437 if (!inode) {
438 status = -ENOMEM;
439 mlog_errno(status);
440 goto bail;
441 }
442
443 ip = DLMFS_I(inode);
444
445 dlm = user_dlm_register_context(domain);
446 if (IS_ERR(dlm)) {
447 status = PTR_ERR(dlm);
448 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
449 status, domain->len, domain->name);
450 goto bail;
451 }
452 ip->ip_dlm = dlm;
453
454 dir->i_nlink++;
455 d_instantiate(dentry, inode);
456 dget(dentry); /* Extra count - pin the dentry in core */
457
458 status = 0;
459bail:
460 if (status < 0)
461 iput(inode);
462 return status;
463}
464
465static int dlmfs_create(struct inode *dir,
466 struct dentry *dentry,
467 int mode,
468 struct nameidata *nd)
469{
470 int status = 0;
471 struct inode *inode;
472 struct qstr *name = &dentry->d_name;
473
474 mlog(0, "create %.*s\n", name->len, name->name);
475
476 /* verify name is valid and doesn't contain any dlm reserved
477 * characters */
478 if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
479 name->name[0] == '$') {
480 status = -EINVAL;
481 mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
482 name->name);
483 goto bail;
484 }
485
486 inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
487 if (!inode) {
488 status = -ENOMEM;
489 mlog_errno(status);
490 goto bail;
491 }
492
493 d_instantiate(dentry, inode);
494 dget(dentry); /* Extra count - pin the dentry in core */
495bail:
496 return status;
497}
498
499static int dlmfs_unlink(struct inode *dir,
500 struct dentry *dentry)
501{
502 int status;
503 struct inode *inode = dentry->d_inode;
504
505 mlog(0, "unlink inode %lu\n", inode->i_ino);
506
507 /* if there are no current holders, or none that are waiting
508 * to acquire a lock, this basically destroys our lockres. */
509 status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
510 if (status < 0) {
511 mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
512 dentry->d_name.len, dentry->d_name.name, status);
513 goto bail;
514 }
515 status = simple_unlink(dir, dentry);
516bail:
517 return status;
518}
519
520static int dlmfs_fill_super(struct super_block * sb,
521 void * data,
522 int silent)
523{
524 struct inode * inode;
525 struct dentry * root;
526
527 sb->s_maxbytes = MAX_LFS_FILESIZE;
528 sb->s_blocksize = PAGE_CACHE_SIZE;
529 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
530 sb->s_magic = DLMFS_MAGIC;
531 sb->s_op = &dlmfs_ops;
532 inode = dlmfs_get_root_inode(sb);
533 if (!inode)
534 return -ENOMEM;
535
536 root = d_alloc_root(inode);
537 if (!root) {
538 iput(inode);
539 return -ENOMEM;
540 }
541 sb->s_root = root;
542 return 0;
543}
544
545static struct file_operations dlmfs_file_operations = {
546 .open = dlmfs_file_open,
547 .release = dlmfs_file_release,
548 .read = dlmfs_file_read,
549 .write = dlmfs_file_write,
550};
551
552static struct inode_operations dlmfs_dir_inode_operations = {
553 .create = dlmfs_create,
554 .lookup = simple_lookup,
555 .unlink = dlmfs_unlink,
556};
557
558/* this way we can restrict mkdir to only the toplevel of the fs. */
559static struct inode_operations dlmfs_root_inode_operations = {
560 .lookup = simple_lookup,
561 .mkdir = dlmfs_mkdir,
562 .rmdir = simple_rmdir,
563};
564
565static struct super_operations dlmfs_ops = {
566 .statfs = simple_statfs,
567 .alloc_inode = dlmfs_alloc_inode,
568 .destroy_inode = dlmfs_destroy_inode,
569 .clear_inode = dlmfs_clear_inode,
570 .drop_inode = generic_delete_inode,
571};
572
573static struct inode_operations dlmfs_file_inode_operations = {
574 .getattr = simple_getattr,
575};
576
577static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
578 int flags, const char *dev_name, void *data)
579{
580 return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
581}
582
583static struct file_system_type dlmfs_fs_type = {
584 .owner = THIS_MODULE,
585 .name = "ocfs2_dlmfs",
586 .get_sb = dlmfs_get_sb,
587 .kill_sb = kill_litter_super,
588};
589
590static int __init init_dlmfs_fs(void)
591{
592 int status;
593 int cleanup_inode = 0, cleanup_worker = 0;
594
595 dlmfs_print_version();
596
597 dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
598 sizeof(struct dlmfs_inode_private),
599 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
600 dlmfs_init_once, NULL);
601 if (!dlmfs_inode_cache)
602 return -ENOMEM;
603 cleanup_inode = 1;
604
605 user_dlm_worker = create_singlethread_workqueue("user_dlm");
606 if (!user_dlm_worker) {
607 status = -ENOMEM;
608 goto bail;
609 }
610 cleanup_worker = 1;
611
612 status = register_filesystem(&dlmfs_fs_type);
613bail:
614 if (status) {
615 if (cleanup_inode)
616 kmem_cache_destroy(dlmfs_inode_cache);
617 if (cleanup_worker)
618 destroy_workqueue(user_dlm_worker);
619 } else
620		printk(KERN_INFO "OCFS2 User DLM kernel interface loaded\n");
621 return status;
622}
623
624static void __exit exit_dlmfs_fs(void)
625{
626 unregister_filesystem(&dlmfs_fs_type);
627
628 flush_workqueue(user_dlm_worker);
629 destroy_workqueue(user_dlm_worker);
630
631 if (kmem_cache_destroy(dlmfs_inode_cache))
632 printk(KERN_INFO "dlmfs_inode_cache: not all structures "
633 "were freed\n");
634}
635
636MODULE_AUTHOR("Oracle");
637MODULE_LICENSE("GPL");
638
639module_init(init_dlmfs_fs)
640module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
new file mode 100644
index 00000000000..d2be3ad841f
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfsver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmfsver.h"
30
31#define DLM_BUILD_VERSION "1.3.3"
32
33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
34
35void dlmfs_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlm/dlmfsver.h
new file mode 100644
index 00000000000..f35eadbed25
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfsver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfsver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef DLMFS_VER_H
27#define DLMFS_VER_H
28
29void dlmfs_print_version(void);
30
31#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
new file mode 100644
index 00000000000..d1a0038557a
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -0,0 +1,676 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmlock.c
5 *
6 * underlying calls for lock creation
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41#include <linux/delay.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50
51#include "dlmconvert.h"
52
53#define MLOG_MASK_PREFIX ML_DLM
54#include "cluster/masklog.h"
55
56static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
57static u64 dlm_next_cookie = 1;
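/* a cookie packs the owning node number into its top byte; the low 56 bits
 * are a per-node counter (see dlm_get_next_cookie below) */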
58
59static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
60 struct dlm_lock_resource *res,
61 struct dlm_lock *lock, int flags);
62static void dlm_init_lock(struct dlm_lock *newlock, int type,
63 u8 node, u64 cookie);
64static void dlm_lock_release(struct kref *kref);
65static void dlm_lock_detach_lockres(struct dlm_lock *lock);
66
67/* Tell us whether we can grant a new lock request.
68 * locking:
69 * caller needs: res->spinlock
70 * taken: none
71 * held on exit: none
72 * returns: 1 if the lock can be granted, 0 otherwise.
73 */
74static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
75 struct dlm_lock *lock)
76{
77 struct list_head *iter;
78 struct dlm_lock *tmplock;
79
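	/* a new lock is grantable only if it is compatible with every
	 * lock already granted or converting */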
80 list_for_each(iter, &res->granted) {
81 tmplock = list_entry(iter, struct dlm_lock, list);
82
83 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
84 return 0;
85 }
86
87 list_for_each(iter, &res->converting) {
88 tmplock = list_entry(iter, struct dlm_lock, list);
89
90 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
91 return 0;
92 }
93
94 return 1;
95}
96
97/* performs lock creation at the lockres master site
98 * locking:
99 * caller needs: none
100 * taken: takes and drops res->spinlock
101 * held on exit: none
102 * returns: DLM_NORMAL, DLM_NOTQUEUED
103 */
104static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
105 struct dlm_lock_resource *res,
106 struct dlm_lock *lock, int flags)
107{
108 int call_ast = 0, kick_thread = 0;
109 enum dlm_status status = DLM_NORMAL;
110
111 mlog_entry("type=%d\n", lock->ml.type);
112
113 spin_lock(&res->spinlock);
114 /* if called from dlm_create_lock_handler, need to
115 * ensure it will not sleep in dlm_wait_on_lockres */
116 status = __dlm_lockres_state_to_status(res);
117 if (status != DLM_NORMAL &&
118 lock->ml.node != dlm->node_num) {
119 /* erf. state changed after lock was dropped. */
120 spin_unlock(&res->spinlock);
121 dlm_error(status);
122 return status;
123 }
124 __dlm_wait_on_lockres(res);
125 __dlm_lockres_reserve_ast(res);
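	/* reserve an ast now; it is either queued or released again
	 * near the bottom of this function */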
126
127 if (dlm_can_grant_new_lock(res, lock)) {
128 mlog(0, "I can grant this lock right away\n");
129 /* got it right away */
130 lock->lksb->status = DLM_NORMAL;
131 status = DLM_NORMAL;
132 dlm_lock_get(lock);
133 list_add_tail(&lock->list, &res->granted);
134
135 /* for the recovery lock, we can't allow the ast
136 * to be queued since the dlmthread is already
137 * frozen. but the recovery lock is always locked
138 * with LKM_NOQUEUE so we do not need the ast in
139 * this special case */
140 if (!dlm_is_recovery_lock(res->lockname.name,
141 res->lockname.len)) {
142 kick_thread = 1;
143 call_ast = 1;
144 }
145 } else {
146 /* for NOQUEUE request, unless we get the
147 * lock right away, return DLM_NOTQUEUED */
148 if (flags & LKM_NOQUEUE)
149 status = DLM_NOTQUEUED;
150 else {
151 dlm_lock_get(lock);
152 list_add_tail(&lock->list, &res->blocked);
153 kick_thread = 1;
154 }
155 }
156
157 spin_unlock(&res->spinlock);
158 wake_up(&res->wq);
159
160 /* either queue the ast or release it */
161 if (call_ast)
162 dlm_queue_ast(dlm, lock);
163 else
164 dlm_lockres_release_ast(dlm, res);
165
166 dlm_lockres_calc_usage(dlm, res);
167 if (kick_thread)
168 dlm_kick_thread(dlm, res);
169
170 return status;
171}
172
173void dlm_revert_pending_lock(struct dlm_lock_resource *res,
174 struct dlm_lock *lock)
175{
176 /* remove from local queue if it failed */
177 list_del_init(&lock->list);
178 lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
179}
180
181
182/*
183 * locking:
184 * caller needs: none
185 * taken: takes and drops res->spinlock
186 * held on exit: none
187 * returns: DLM_DENIED, DLM_RECOVERING, or net status
188 */
189static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
190 struct dlm_lock_resource *res,
191 struct dlm_lock *lock, int flags)
192{
193 enum dlm_status status = DLM_DENIED;
194
195 mlog_entry("type=%d\n", lock->ml.type);
196 mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
197 res->lockname.name, flags);
198
199 spin_lock(&res->spinlock);
200
201 /* will exit this call with spinlock held */
202 __dlm_wait_on_lockres(res);
203 res->state |= DLM_LOCK_RES_IN_PROGRESS;
204
205 /* add lock to local (secondary) queue */
206 dlm_lock_get(lock);
207 list_add_tail(&lock->list, &res->blocked);
208 lock->lock_pending = 1;
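	/* flag the request as in flight until the reply has been
	 * processed below */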
209 spin_unlock(&res->spinlock);
210
211 /* spec seems to say that you will get DLM_NORMAL when the lock
212 * has been queued, meaning we need to wait for a reply here. */
213 status = dlm_send_remote_lock_request(dlm, res, lock, flags);
214
215 spin_lock(&res->spinlock);
216 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
217 lock->lock_pending = 0;
218 if (status != DLM_NORMAL) {
219 if (status != DLM_NOTQUEUED)
220 dlm_error(status);
221 dlm_revert_pending_lock(res, lock);
222 dlm_lock_put(lock);
223 }
224 spin_unlock(&res->spinlock);
225
226 dlm_lockres_calc_usage(dlm, res);
227
228 wake_up(&res->wq);
229 return status;
230}
231
232
233/* for remote lock creation.
234 * locking:
235 * caller needs: none, but res->state must include DLM_LOCK_RES_IN_PROGRESS
236 * taken: none
237 * held on exit: none
238 * returns: DLM_NOLOCKMGR, or net status
239 */
240static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
241 struct dlm_lock_resource *res,
242 struct dlm_lock *lock, int flags)
243{
244 struct dlm_create_lock create;
245 int tmpret, status = 0;
246 enum dlm_status ret;
247
248 mlog_entry_void();
249
250 memset(&create, 0, sizeof(create));
251 create.node_idx = dlm->node_num;
252 create.requested_type = lock->ml.type;
253 create.cookie = lock->ml.cookie;
254 create.namelen = res->lockname.len;
255 create.flags = cpu_to_be32(flags);
256 memcpy(create.name, res->lockname.name, create.namelen);
257
258 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
259 sizeof(create), res->owner, &status);
260 if (tmpret >= 0) {
261		/* successfully sent and received */
262		ret = status;	/* this is already a dlm_status */
263 } else {
264 mlog_errno(tmpret);
265 if (dlm_is_host_down(tmpret)) {
266 ret = DLM_RECOVERING;
267 mlog(0, "node %u died so returning DLM_RECOVERING "
268 "from lock message!\n", res->owner);
269 } else {
270 ret = dlm_err_to_dlm_status(tmpret);
271 }
272 }
273
274 return ret;
275}
276
277void dlm_lock_get(struct dlm_lock *lock)
278{
279 kref_get(&lock->lock_refs);
280}
281
282void dlm_lock_put(struct dlm_lock *lock)
283{
284 kref_put(&lock->lock_refs, dlm_lock_release);
285}
286
287static void dlm_lock_release(struct kref *kref)
288{
289 struct dlm_lock *lock;
290
291 lock = container_of(kref, struct dlm_lock, lock_refs);
292
293 BUG_ON(!list_empty(&lock->list));
294 BUG_ON(!list_empty(&lock->ast_list));
295 BUG_ON(!list_empty(&lock->bast_list));
296 BUG_ON(lock->ast_pending);
297 BUG_ON(lock->bast_pending);
298
299 dlm_lock_detach_lockres(lock);
300
301 if (lock->lksb_kernel_allocated) {
302 mlog(0, "freeing kernel-allocated lksb\n");
303 kfree(lock->lksb);
304 }
305 kfree(lock);
306}
307
308/* associate a lock with its lockres, getting a ref on the lockres */
309void dlm_lock_attach_lockres(struct dlm_lock *lock,
310 struct dlm_lock_resource *res)
311{
312 dlm_lockres_get(res);
313 lock->lockres = res;
314}
315
316/* drop ref on lockres, if there is still one associated with lock */
317static void dlm_lock_detach_lockres(struct dlm_lock *lock)
318{
319 struct dlm_lock_resource *res;
320
321 res = lock->lockres;
322 if (res) {
323 lock->lockres = NULL;
324 mlog(0, "removing lock's lockres reference\n");
325 dlm_lockres_put(res);
326 }
327}
328
329static void dlm_init_lock(struct dlm_lock *newlock, int type,
330 u8 node, u64 cookie)
331{
332 INIT_LIST_HEAD(&newlock->list);
333 INIT_LIST_HEAD(&newlock->ast_list);
334 INIT_LIST_HEAD(&newlock->bast_list);
335 spin_lock_init(&newlock->spinlock);
336 newlock->ml.type = type;
337 newlock->ml.convert_type = LKM_IVMODE;
338 newlock->ml.highest_blocked = LKM_IVMODE;
339 newlock->ml.node = node;
340 newlock->ml.pad1 = 0;
341 newlock->ml.list = 0;
342 newlock->ml.flags = 0;
343 newlock->ast = NULL;
344 newlock->bast = NULL;
345 newlock->astdata = NULL;
346 newlock->ml.cookie = cpu_to_be64(cookie);
347 newlock->ast_pending = 0;
348 newlock->bast_pending = 0;
349 newlock->convert_pending = 0;
350 newlock->lock_pending = 0;
351 newlock->unlock_pending = 0;
352 newlock->cancel_pending = 0;
353 newlock->lksb_kernel_allocated = 0;
354
355 kref_init(&newlock->lock_refs);
356}
357
358struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
359 struct dlm_lockstatus *lksb)
360{
361 struct dlm_lock *lock;
362 int kernel_allocated = 0;
363
364 lock = kcalloc(1, sizeof(*lock), GFP_KERNEL);
365 if (!lock)
366 return NULL;
367
368 if (!lksb) {
369 /* zero memory only if kernel-allocated */
370 lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL);
371 if (!lksb) {
372 kfree(lock);
373 return NULL;
374 }
375 kernel_allocated = 1;
376 }
377
378 dlm_init_lock(lock, type, node, cookie);
379 if (kernel_allocated)
380 lock->lksb_kernel_allocated = 1;
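	/* dlm_lock_release() frees the lksb only when this flag is set;
	 * caller-owned lksbs are left alone */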
381 lock->lksb = lksb;
382 lksb->lockid = lock;
383 return lock;
384}
385
386/* handler for lock creation net message
387 * locking:
388 * caller needs: none
389 * taken: takes and drops res->spinlock
390 * held on exit: none
391 * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
392 */
393int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
394{
395 struct dlm_ctxt *dlm = data;
396 struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
397 struct dlm_lock_resource *res = NULL;
398 struct dlm_lock *newlock = NULL;
399 struct dlm_lockstatus *lksb = NULL;
400 enum dlm_status status = DLM_NORMAL;
401 char *name;
402 unsigned int namelen;
403
404 BUG_ON(!dlm);
405
406 mlog_entry_void();
407
408 if (!dlm_grab(dlm))
409 return DLM_REJECTED;
410
411 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
412 "Domain %s not fully joined!\n", dlm->name);
413
414 name = create->name;
415 namelen = create->namelen;
416
417 status = DLM_IVBUFLEN;
418 if (namelen > DLM_LOCKID_NAME_MAX) {
419 dlm_error(status);
420 goto leave;
421 }
422
423 status = DLM_SYSERR;
424 newlock = dlm_new_lock(create->requested_type,
425 create->node_idx,
426 be64_to_cpu(create->cookie), NULL);
427 if (!newlock) {
428 dlm_error(status);
429 goto leave;
430 }
431
432 lksb = newlock->lksb;
433
434 if (be32_to_cpu(create->flags) & LKM_GET_LVB) {
435 lksb->flags |= DLM_LKSB_GET_LVB;
436 mlog(0, "set DLM_LKSB_GET_LVB flag\n");
437 }
438
439 status = DLM_IVLOCKID;
440 res = dlm_lookup_lockres(dlm, name, namelen);
441 if (!res) {
442 dlm_error(status);
443 goto leave;
444 }
445
446 spin_lock(&res->spinlock);
447 status = __dlm_lockres_state_to_status(res);
448 spin_unlock(&res->spinlock);
449
450 if (status != DLM_NORMAL) {
451 mlog(0, "lockres recovering/migrating/in-progress\n");
452 goto leave;
453 }
454
455 dlm_lock_attach_lockres(newlock, res);
456
457 status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags));
458leave:
459	if (status != DLM_NORMAL && newlock)
460		dlm_lock_put(newlock);
462
463 if (res)
464 dlm_lockres_put(res);
465
466 dlm_put(dlm);
467
468 return status;
469}
470
471
472/* fetch the next node-local cookie: a u8 node number plus a 56-bit counter, packed into a u64 */
473static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie)
474{
475 u64 tmpnode = node_num;
476
477 /* shift single byte of node num into top 8 bits */
478 tmpnode <<= 56;
479
480 spin_lock(&dlm_cookie_lock);
481 *cookie = (dlm_next_cookie | tmpnode);
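	/* e.g. node 0x02 with counter 0x01 yields cookie 0x0200000000000001 */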
482 if (++dlm_next_cookie & 0xff00000000000000ull) {
483 mlog(0, "This node's cookie will now wrap!\n");
484 dlm_next_cookie = 1;
485 }
486 spin_unlock(&dlm_cookie_lock);
487}
488
489enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
490 struct dlm_lockstatus *lksb, int flags,
491 const char *name, dlm_astlockfunc_t *ast, void *data,
492 dlm_bastlockfunc_t *bast)
493{
494 enum dlm_status status;
495 struct dlm_lock_resource *res = NULL;
496 struct dlm_lock *lock = NULL;
497 int convert = 0, recovery = 0;
498
499 /* yes this function is a mess.
500 * TODO: clean this up. lots of common code in the
501 * lock and convert paths, especially in the retry blocks */
502 if (!lksb) {
503 dlm_error(DLM_BADARGS);
504 return DLM_BADARGS;
505 }
506
507 status = DLM_BADPARAM;
508 if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) {
509 dlm_error(status);
510 goto error;
511 }
512
513 if (flags & ~LKM_VALID_FLAGS) {
514 dlm_error(status);
515 goto error;
516 }
517
518 convert = (flags & LKM_CONVERT);
519 recovery = (flags & LKM_RECOVERY);
520
521 if (recovery &&
522 (!dlm_is_recovery_lock(name, strlen(name)) || convert) ) {
523 dlm_error(status);
524 goto error;
525 }
526 if (convert && (flags & LKM_LOCAL)) {
527 mlog(ML_ERROR, "strange LOCAL convert request!\n");
528 goto error;
529 }
530
531 if (convert) {
532 /* CONVERT request */
533
534 /* if converting, must pass in a valid dlm_lock */
535 lock = lksb->lockid;
536 if (!lock) {
537 mlog(ML_ERROR, "NULL lock pointer in convert "
538 "request\n");
539 goto error;
540 }
541
542 res = lock->lockres;
543 if (!res) {
544 mlog(ML_ERROR, "NULL lockres pointer in convert "
545 "request\n");
546 goto error;
547 }
548 dlm_lockres_get(res);
549
550 /* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are
551 * static after the original lock call. convert requests will
552 * ensure that everything is the same, or return DLM_BADARGS.
553 * this means that DLM_DENIED_NOASTS will never be returned.
554 */
555 if (lock->lksb != lksb || lock->ast != ast ||
556 lock->bast != bast || lock->astdata != data) {
557 status = DLM_BADARGS;
558 mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, "
559 "astdata=%p\n", lksb, ast, bast, data);
560 mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, "
561 "astdata=%p\n", lock->lksb, lock->ast,
562 lock->bast, lock->astdata);
563 goto error;
564 }
565retry_convert:
566 dlm_wait_for_recovery(dlm);
567
568 if (res->owner == dlm->node_num)
569 status = dlmconvert_master(dlm, res, lock, flags, mode);
570 else
571 status = dlmconvert_remote(dlm, res, lock, flags, mode);
572 if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
573 status == DLM_FORWARD) {
574 /* for now, see how this works without sleeping
575 * and just retry right away. I suspect the reco
576 * or migration will complete fast enough that
577 * no waiting will be necessary */
578 mlog(0, "retrying convert with migration/recovery/"
579 "in-progress\n");
580 msleep(100);
581 goto retry_convert;
582 }
583 } else {
584 u64 tmpcookie;
585
586 /* LOCK request */
587 status = DLM_BADARGS;
588 if (!name) {
589 dlm_error(status);
590 goto error;
591 }
592
593 status = DLM_IVBUFLEN;
594 if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) {
595 dlm_error(status);
596 goto error;
597 }
598
599 dlm_get_next_cookie(dlm->node_num, &tmpcookie);
600 lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb);
601 if (!lock) {
602 dlm_error(status);
603 goto error;
604 }
605
606 if (!recovery)
607 dlm_wait_for_recovery(dlm);
608
609 /* find or create the lock resource */
610 res = dlm_get_lock_resource(dlm, name, flags);
611 if (!res) {
612 status = DLM_IVLOCKID;
613 dlm_error(status);
614 goto error;
615 }
616
617 mlog(0, "type=%d, flags = 0x%x\n", mode, flags);
618 mlog(0, "creating lock: lock=%p res=%p\n", lock, res);
619
620 dlm_lock_attach_lockres(lock, res);
621 lock->ast = ast;
622 lock->bast = bast;
623 lock->astdata = data;
624
625retry_lock:
626 if (flags & LKM_VALBLK) {
627 mlog(0, "LKM_VALBLK passed by caller\n");
628
629 /* LVB requests for non PR, PW or EX locks are
630 * ignored. */
631 if (mode < LKM_PRMODE)
632 flags &= ~LKM_VALBLK;
633 else {
634 flags |= LKM_GET_LVB;
635 lock->lksb->flags |= DLM_LKSB_GET_LVB;
636 }
637 }
638
639 if (res->owner == dlm->node_num)
640 status = dlmlock_master(dlm, res, lock, flags);
641 else
642 status = dlmlock_remote(dlm, res, lock, flags);
643
644 if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
645 status == DLM_FORWARD) {
646 mlog(0, "retrying lock with migration/"
647 "recovery/in progress\n");
648 msleep(100);
649 dlm_wait_for_recovery(dlm);
650 goto retry_lock;
651 }
652
653 if (status != DLM_NORMAL) {
654 lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
655 if (status != DLM_NOTQUEUED)
656 dlm_error(status);
657 goto error;
658 }
659 }
660
661error:
662 if (status != DLM_NORMAL) {
663 if (lock && !convert)
664 dlm_lock_put(lock);
665		/* this is kind of unnecessary */
666 lksb->status = status;
667 }
668
669 /* put lockres ref from the convert path
670 * or from dlm_get_lock_resource */
671 if (res)
672 dlm_lockres_put(res);
673
674 return status;
675}
676EXPORT_SYMBOL_GPL(dlmlock);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
new file mode 100644
index 00000000000..27e984f7e4c
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -0,0 +1,2664 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmmaster.c
5 *
6 * standalone DLM module
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41#include <linux/delay.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50#include "dlmdebug.h"
51#include "dlmdomain.h"
52
53#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
54#include "cluster/masklog.h"
55
56enum dlm_mle_type {
57 DLM_MLE_BLOCK,
58 DLM_MLE_MASTER,
59 DLM_MLE_MIGRATION
60};
61
62struct dlm_lock_name
63{
64 u8 len;
65 u8 name[DLM_LOCKID_NAME_MAX];
66};
67
68struct dlm_master_list_entry
69{
70 struct list_head list;
71 struct list_head hb_events;
72 struct dlm_ctxt *dlm;
73 spinlock_t spinlock;
74 wait_queue_head_t wq;
75 atomic_t woken;
76 struct kref mle_refs;
77 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
78 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
79 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
80 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
81 u8 master;
82 u8 new_master;
83 enum dlm_mle_type type;
84 struct o2hb_callback_func mle_hb_up;
85 struct o2hb_callback_func mle_hb_down;
86 union {
87 struct dlm_lock_resource *res;
88 struct dlm_lock_name name;
89 } u;
90};
91
92static void dlm_mle_node_down(struct dlm_ctxt *dlm,
93 struct dlm_master_list_entry *mle,
94 struct o2nm_node *node,
95 int idx);
96static void dlm_mle_node_up(struct dlm_ctxt *dlm,
97 struct dlm_master_list_entry *mle,
98 struct o2nm_node *node,
99 int idx);
100
101static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
102static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
103 unsigned int namelen, void *nodemap,
104 u32 flags);
105
106static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
107 struct dlm_master_list_entry *mle,
108 const char *name,
109 unsigned int namelen)
110{
111 struct dlm_lock_resource *res;
112
113 if (dlm != mle->dlm)
114 return 0;
115
116 if (mle->type == DLM_MLE_BLOCK ||
117 mle->type == DLM_MLE_MIGRATION) {
118 if (namelen != mle->u.name.len ||
119 memcmp(name, mle->u.name.name, namelen)!=0)
120 return 0;
121 } else {
122 res = mle->u.res;
123 if (namelen != res->lockname.len ||
124 memcmp(res->lockname.name, name, namelen) != 0)
125 return 0;
126 }
127 return 1;
128}
129
130#if 0
131/* This code is compiled out, but kept in the source because it aids debugging */
132
133void dlm_print_one_mle(struct dlm_master_list_entry *mle)
134{
135 int i = 0, refs;
136 char *type;
137 char attached;
138 u8 master;
139 unsigned int namelen;
140 const char *name;
141 struct kref *k;
142
143 k = &mle->mle_refs;
144 if (mle->type == DLM_MLE_BLOCK)
145 type = "BLK";
146 else if (mle->type == DLM_MLE_MASTER)
147 type = "MAS";
148 else
149 type = "MIG";
150 refs = atomic_read(&k->refcount);
151 master = mle->master;
152 attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
153
154 if (mle->type != DLM_MLE_MASTER) {
155 namelen = mle->u.name.len;
156 name = mle->u.name.name;
157 } else {
158 namelen = mle->u.res->lockname.len;
159 name = mle->u.res->lockname.name;
160 }
161
162 mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n",
163 i, type, refs, master, mle->new_master, attached,
164 namelen, namelen, name);
165}
166
167static void dlm_dump_mles(struct dlm_ctxt *dlm)
168{
169 struct dlm_master_list_entry *mle;
170 struct list_head *iter;
171
172 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
173 mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
174 spin_lock(&dlm->master_lock);
175 list_for_each(iter, &dlm->master_list) {
176 mle = list_entry(iter, struct dlm_master_list_entry, list);
177 dlm_print_one_mle(mle);
178 }
179 spin_unlock(&dlm->master_lock);
180}
181
182int dlm_dump_all_mles(const char __user *data, unsigned int len)
183{
184 struct list_head *iter;
185 struct dlm_ctxt *dlm;
186
187 spin_lock(&dlm_domain_lock);
188 list_for_each(iter, &dlm_domains) {
189 dlm = list_entry (iter, struct dlm_ctxt, list);
190 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
191 dlm_dump_mles(dlm);
192 }
193 spin_unlock(&dlm_domain_lock);
194 return len;
195}
196EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
197
198#endif /* 0 */
199
200
201static kmem_cache_t *dlm_mle_cache = NULL;
202
203
204static void dlm_mle_release(struct kref *kref);
205static void dlm_init_mle(struct dlm_master_list_entry *mle,
206 enum dlm_mle_type type,
207 struct dlm_ctxt *dlm,
208 struct dlm_lock_resource *res,
209 const char *name,
210 unsigned int namelen);
211static void dlm_put_mle(struct dlm_master_list_entry *mle);
212static void __dlm_put_mle(struct dlm_master_list_entry *mle);
213static int dlm_find_mle(struct dlm_ctxt *dlm,
214 struct dlm_master_list_entry **mle,
215 char *name, unsigned int namelen);
216
217static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to);
218
219
220static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
221 struct dlm_lock_resource *res,
222 struct dlm_master_list_entry *mle,
223 int *blocked);
224static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
225 struct dlm_lock_resource *res,
226 struct dlm_master_list_entry *mle,
227 int blocked);
228static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
229 struct dlm_lock_resource *res,
230 struct dlm_master_list_entry *mle,
231 struct dlm_master_list_entry **oldmle,
232 const char *name, unsigned int namelen,
233 u8 new_master, u8 master);
234
235static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
236 struct dlm_lock_resource *res);
237static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
238 struct dlm_lock_resource *res);
239static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
240 struct dlm_lock_resource *res,
241 u8 target);
242
243
244int dlm_is_host_down(int errno)
245{
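	/* errno values that indicate the remote node (or its socket) is
	 * gone, as opposed to a local usage error */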
246 switch (errno) {
247 case -EBADF:
248 case -ECONNREFUSED:
249 case -ENOTCONN:
250 case -ECONNRESET:
251 case -EPIPE:
252 case -EHOSTDOWN:
253 case -EHOSTUNREACH:
254 case -ETIMEDOUT:
255 case -ECONNABORTED:
256 case -ENETDOWN:
257 case -ENETUNREACH:
258 case -ENETRESET:
259 case -ESHUTDOWN:
260 case -ENOPROTOOPT:
261 case -EINVAL: /* if returned from our tcp code,
262 this means there is no socket */
263 return 1;
264 }
265 return 0;
266}
267
268
269/*
270 * MASTER LIST FUNCTIONS
271 */
272
273
274/*
275 * regarding master list entries and heartbeat callbacks:
276 *
277 * in order to avoid sleeping and allocation that occurs in
278 * heartbeat, master list entries are simply attached to the
279 * dlm's established heartbeat callbacks. the mle is attached
280 * when it is created, and since the dlm->spinlock is held at
281 * that time, any heartbeat event will be properly discovered
282 * by the mle. the mle needs to be detached from the
283 * dlm->mle_hb_events list as soon as heartbeat events are no
284 * longer useful to the mle, and before the mle is freed.
285 *
286 * as a general rule, heartbeat events are no longer needed by
287 * the mle once an "answer" regarding the lock master has been
288 * received.
289 */
290static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
291 struct dlm_master_list_entry *mle)
292{
293 assert_spin_locked(&dlm->spinlock);
294
295 list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
296}
297
298
299static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
300 struct dlm_master_list_entry *mle)
301{
302 if (!list_empty(&mle->hb_events))
303 list_del_init(&mle->hb_events);
304}
305
306
307static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
308 struct dlm_master_list_entry *mle)
309{
310 spin_lock(&dlm->spinlock);
311 __dlm_mle_detach_hb_events(dlm, mle);
312 spin_unlock(&dlm->spinlock);
313}
314
315/* remove from list and free */
316static void __dlm_put_mle(struct dlm_master_list_entry *mle)
317{
318 struct dlm_ctxt *dlm;
319 dlm = mle->dlm;
320
321 assert_spin_locked(&dlm->spinlock);
322 assert_spin_locked(&dlm->master_lock);
323 BUG_ON(!atomic_read(&mle->mle_refs.refcount));
324
325 kref_put(&mle->mle_refs, dlm_mle_release);
326}
327
328
329/* must not have any spinlocks coming in */
330static void dlm_put_mle(struct dlm_master_list_entry *mle)
331{
332 struct dlm_ctxt *dlm;
333 dlm = mle->dlm;
334
335 spin_lock(&dlm->spinlock);
336 spin_lock(&dlm->master_lock);
337 __dlm_put_mle(mle);
338 spin_unlock(&dlm->master_lock);
339 spin_unlock(&dlm->spinlock);
340}
341
342static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
343{
344 kref_get(&mle->mle_refs);
345}
346
347static void dlm_init_mle(struct dlm_master_list_entry *mle,
348 enum dlm_mle_type type,
349 struct dlm_ctxt *dlm,
350 struct dlm_lock_resource *res,
351 const char *name,
352 unsigned int namelen)
353{
354 assert_spin_locked(&dlm->spinlock);
355
356 mle->dlm = dlm;
357 mle->type = type;
358 INIT_LIST_HEAD(&mle->list);
359 INIT_LIST_HEAD(&mle->hb_events);
360 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
361 spin_lock_init(&mle->spinlock);
362 init_waitqueue_head(&mle->wq);
363 atomic_set(&mle->woken, 0);
364 kref_init(&mle->mle_refs);
365 memset(mle->response_map, 0, sizeof(mle->response_map));
366 mle->master = O2NM_MAX_NODES;
367 mle->new_master = O2NM_MAX_NODES;
368
369 if (mle->type == DLM_MLE_MASTER) {
370 BUG_ON(!res);
371 mle->u.res = res;
372 } else if (mle->type == DLM_MLE_BLOCK) {
373 BUG_ON(!name);
374 memcpy(mle->u.name.name, name, namelen);
375 mle->u.name.len = namelen;
376 } else /* DLM_MLE_MIGRATION */ {
377 BUG_ON(!name);
378 memcpy(mle->u.name.name, name, namelen);
379 mle->u.name.len = namelen;
380 }
381
382 /* copy off the node_map and register hb callbacks on our copy */
383 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
384 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
385 clear_bit(dlm->node_num, mle->vote_map);
386 clear_bit(dlm->node_num, mle->node_map);
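	/* exclude ourselves: the vote and node maps track only the other
	 * domain members */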
387
388 /* attach the mle to the domain node up/down events */
389 __dlm_mle_attach_hb_events(dlm, mle);
390}
391
392
393/* returns 1 if found, 0 if not */
394static int dlm_find_mle(struct dlm_ctxt *dlm,
395 struct dlm_master_list_entry **mle,
396 char *name, unsigned int namelen)
397{
398 struct dlm_master_list_entry *tmpmle;
399 struct list_head *iter;
400
401 assert_spin_locked(&dlm->master_lock);
402
403 list_for_each(iter, &dlm->master_list) {
404 tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
405 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
406 continue;
407 dlm_get_mle(tmpmle);
408 *mle = tmpmle;
409 return 1;
410 }
411 return 0;
412}
413
414void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
415{
416 struct dlm_master_list_entry *mle;
417 struct list_head *iter;
418
419 assert_spin_locked(&dlm->spinlock);
420
421 list_for_each(iter, &dlm->mle_hb_events) {
422 mle = list_entry(iter, struct dlm_master_list_entry,
423 hb_events);
424 if (node_up)
425 dlm_mle_node_up(dlm, mle, NULL, idx);
426 else
427 dlm_mle_node_down(dlm, mle, NULL, idx);
428 }
429}
430
431static void dlm_mle_node_down(struct dlm_ctxt *dlm,
432 struct dlm_master_list_entry *mle,
433 struct o2nm_node *node, int idx)
434{
435 spin_lock(&mle->spinlock);
436
437 if (!test_bit(idx, mle->node_map))
438 mlog(0, "node %u already removed from nodemap!\n", idx);
439 else
440 clear_bit(idx, mle->node_map);
441
442 spin_unlock(&mle->spinlock);
443}
444
445static void dlm_mle_node_up(struct dlm_ctxt *dlm,
446 struct dlm_master_list_entry *mle,
447 struct o2nm_node *node, int idx)
448{
449 spin_lock(&mle->spinlock);
450
451 if (test_bit(idx, mle->node_map))
452 mlog(0, "node %u already in node map!\n", idx);
453 else
454 set_bit(idx, mle->node_map);
455
456 spin_unlock(&mle->spinlock);
457}
458
459
460int dlm_init_mle_cache(void)
461{
462 dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
463 sizeof(struct dlm_master_list_entry),
464 0, SLAB_HWCACHE_ALIGN,
465 NULL, NULL);
466 if (dlm_mle_cache == NULL)
467 return -ENOMEM;
468 return 0;
469}
470
471void dlm_destroy_mle_cache(void)
472{
473 if (dlm_mle_cache)
474 kmem_cache_destroy(dlm_mle_cache);
475}
476
477static void dlm_mle_release(struct kref *kref)
478{
479 struct dlm_master_list_entry *mle;
480 struct dlm_ctxt *dlm;
481
482 mlog_entry_void();
483
484 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
485 dlm = mle->dlm;
486
487 if (mle->type != DLM_MLE_MASTER) {
488 mlog(0, "calling mle_release for %.*s, type %d\n",
489 mle->u.name.len, mle->u.name.name, mle->type);
490 } else {
491 mlog(0, "calling mle_release for %.*s, type %d\n",
492 mle->u.res->lockname.len,
493 mle->u.res->lockname.name, mle->type);
494 }
495 assert_spin_locked(&dlm->spinlock);
496 assert_spin_locked(&dlm->master_lock);
497
498 /* remove from list if not already */
499 if (!list_empty(&mle->list))
500 list_del_init(&mle->list);
501
502 /* detach the mle from the domain node up/down events */
503 __dlm_mle_detach_hb_events(dlm, mle);
504
505 /* NOTE: kfree under spinlock here.
506 * if this is bad, we can move this to a freelist. */
507 kmem_cache_free(dlm_mle_cache, mle);
508}
509
510
511/*
512 * LOCK RESOURCE FUNCTIONS
513 */
514
515static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
516 struct dlm_lock_resource *res,
517 u8 owner)
518{
519 assert_spin_locked(&res->spinlock);
520
521 mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
522
523 if (owner == dlm->node_num)
524 atomic_inc(&dlm->local_resources);
525 else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
526 atomic_inc(&dlm->unknown_resources);
527 else
528 atomic_inc(&dlm->remote_resources);
529
530 res->owner = owner;
531}
532
533void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
534 struct dlm_lock_resource *res, u8 owner)
535{
536 assert_spin_locked(&res->spinlock);
537
538 if (owner == res->owner)
539 return;
540
541 if (res->owner == dlm->node_num)
542 atomic_dec(&dlm->local_resources);
543 else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
544 atomic_dec(&dlm->unknown_resources);
545 else
546 atomic_dec(&dlm->remote_resources);
547
548 dlm_set_lockres_owner(dlm, res, owner);
549}
550
551
552static void dlm_lockres_release(struct kref *kref)
553{
554 struct dlm_lock_resource *res;
555
556 res = container_of(kref, struct dlm_lock_resource, refs);
557
558	/* This should not happen -- every lockres has a name
559	 * associated with it at init time. */
560 BUG_ON(!res->lockname.name);
561
562 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
563 res->lockname.name);
564
565 /* By the time we're ready to blow this guy away, we shouldn't
566 * be on any lists. */
567 BUG_ON(!list_empty(&res->list));
568 BUG_ON(!list_empty(&res->granted));
569 BUG_ON(!list_empty(&res->converting));
570 BUG_ON(!list_empty(&res->blocked));
571 BUG_ON(!list_empty(&res->dirty));
572 BUG_ON(!list_empty(&res->recovering));
573 BUG_ON(!list_empty(&res->purge));
574
575 kfree(res->lockname.name);
576
577 kfree(res);
578}
579
580void dlm_lockres_get(struct dlm_lock_resource *res)
581{
582 kref_get(&res->refs);
583}
584
585void dlm_lockres_put(struct dlm_lock_resource *res)
586{
587 kref_put(&res->refs, dlm_lockres_release);
588}
589
590static void dlm_init_lockres(struct dlm_ctxt *dlm,
591 struct dlm_lock_resource *res,
592 const char *name, unsigned int namelen)
593{
594 char *qname;
595
596 /* If we memset here, we lose our reference to the kmalloc'd
597 * res->lockname.name, so be sure to init every field
598 * correctly! */
599
600 qname = (char *) res->lockname.name;
601 memcpy(qname, name, namelen);
602
603 res->lockname.len = namelen;
604 res->lockname.hash = full_name_hash(name, namelen);
605
606 init_waitqueue_head(&res->wq);
607 spin_lock_init(&res->spinlock);
608 INIT_LIST_HEAD(&res->list);
609 INIT_LIST_HEAD(&res->granted);
610 INIT_LIST_HEAD(&res->converting);
611 INIT_LIST_HEAD(&res->blocked);
612 INIT_LIST_HEAD(&res->dirty);
613 INIT_LIST_HEAD(&res->recovering);
614 INIT_LIST_HEAD(&res->purge);
615 atomic_set(&res->asts_reserved, 0);
616 res->migration_pending = 0;
617
618 kref_init(&res->refs);
619
620 /* just for consistency */
621 spin_lock(&res->spinlock);
622 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
623 spin_unlock(&res->spinlock);
624
625 res->state = DLM_LOCK_RES_IN_PROGRESS;
626
627 res->last_used = 0;
628
629 memset(res->lvb, 0, DLM_LVB_LEN);
630}
631
632struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
633 const char *name,
634 unsigned int namelen)
635{
636 struct dlm_lock_resource *res;
637
638 res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
639 if (!res)
640 return NULL;
641
642 res->lockname.name = kmalloc(namelen, GFP_KERNEL);
643 if (!res->lockname.name) {
644 kfree(res);
645 return NULL;
646 }
647
648 dlm_init_lockres(dlm, res, name, namelen);
649 return res;
650}
651
652/*
653 * lookup a lock resource by name.
654 * may already exist in the hashtable.
655 * lockid is null terminated
656 *
657 * if not, allocate enough for the lockres and for
658 * the temporary structure used in doing the mastering.
659 *
660 * also, do a lookup in the dlm->master_list to see
661 * if another node has begun mastering the same lock.
662 * if so, there should be a block entry in there
663 * for this name, and we should *not* attempt to master
664 * the lock here. need to wait around for that node
665 * to assert_master (or die).
666 *
667 */
668struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
669 const char *lockid,
670 int flags)
671{
672 struct dlm_lock_resource *tmpres=NULL, *res=NULL;
673 struct dlm_master_list_entry *mle = NULL;
674 struct dlm_master_list_entry *alloc_mle = NULL;
675 int blocked = 0;
676 int ret, nodenum;
677 struct dlm_node_iter iter;
678 unsigned int namelen;
679 int tries = 0;
680
681 BUG_ON(!lockid);
682
683 namelen = strlen(lockid);
684
685 mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
686
687lookup:
688 spin_lock(&dlm->spinlock);
689 tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
690 if (tmpres) {
691 spin_unlock(&dlm->spinlock);
692 mlog(0, "found in hash!\n");
693 if (res)
694 dlm_lockres_put(res);
695 res = tmpres;
696 goto leave;
697 }
698
699 if (!res) {
700 spin_unlock(&dlm->spinlock);
701 mlog(0, "allocating a new resource\n");
702 /* nothing found and we need to allocate one. */
703 alloc_mle = (struct dlm_master_list_entry *)
704 kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
705 if (!alloc_mle)
706 goto leave;
707 res = dlm_new_lockres(dlm, lockid, namelen);
708 if (!res)
709 goto leave;
710 goto lookup;
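	/* redo the hash lookup: another thread may have inserted this
	 * lockres while we were allocating */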
711 }
712
713 mlog(0, "no lockres found, allocated our own: %p\n", res);
714
715 if (flags & LKM_LOCAL) {
716 /* caller knows it's safe to assume it's not mastered elsewhere
717 * DONE! return right away */
718 spin_lock(&res->spinlock);
719 dlm_change_lockres_owner(dlm, res, dlm->node_num);
720 __dlm_insert_lockres(dlm, res);
721 spin_unlock(&res->spinlock);
722 spin_unlock(&dlm->spinlock);
723 /* lockres still marked IN_PROGRESS */
724 goto wake_waiters;
725 }
726
727 /* check master list to see if another node has started mastering it */
728 spin_lock(&dlm->master_lock);
729
730 /* if we found a block, wait for lock to be mastered by another node */
731 blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
732 if (blocked) {
733 if (mle->type == DLM_MLE_MASTER) {
734 mlog(ML_ERROR, "master entry for nonexistent lock!\n");
735 BUG();
736 } else if (mle->type == DLM_MLE_MIGRATION) {
737 /* migration is in progress! */
738 /* the good news is that we now know the
739 * "current" master (mle->master). */
740
741 spin_unlock(&dlm->master_lock);
742 assert_spin_locked(&dlm->spinlock);
743
744 /* set the lockres owner and hash it */
745 spin_lock(&res->spinlock);
746 dlm_set_lockres_owner(dlm, res, mle->master);
747 __dlm_insert_lockres(dlm, res);
748 spin_unlock(&res->spinlock);
749 spin_unlock(&dlm->spinlock);
750
751 /* master is known, detach */
752 dlm_mle_detach_hb_events(dlm, mle);
753 dlm_put_mle(mle);
754 mle = NULL;
755 goto wake_waiters;
756 }
757 } else {
758 /* go ahead and try to master lock on this node */
759 mle = alloc_mle;
760 /* make sure this does not get freed below */
761 alloc_mle = NULL;
762 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
763 set_bit(dlm->node_num, mle->maybe_map);
764 list_add(&mle->list, &dlm->master_list);
765 }
766
767 /* at this point there is either a DLM_MLE_BLOCK or a
768 * DLM_MLE_MASTER on the master list, so it's safe to add the
769 * lockres to the hashtable. anyone who finds the lock will
770 * still have to wait on the IN_PROGRESS. */
771
772 /* finally add the lockres to its hash bucket */
773 __dlm_insert_lockres(dlm, res);
774 /* get an extra ref on the mle in case this is a BLOCK
775 * if so, the creator of the BLOCK may try to put the last
776 * ref at this time in the assert master handler, so we
777 * need an extra one to keep from a bad ptr deref. */
778 dlm_get_mle(mle);
779 spin_unlock(&dlm->master_lock);
780 spin_unlock(&dlm->spinlock);
781
782 /* must wait for lock to be mastered elsewhere */
783 if (blocked)
784 goto wait;
785
786redo_request:
787 ret = -EINVAL;
788 dlm_node_iter_init(mle->vote_map, &iter);
789 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
790 ret = dlm_do_master_request(mle, nodenum);
791 if (ret < 0)
792 mlog_errno(ret);
793 if (mle->master != O2NM_MAX_NODES) {
794 /* found a master ! */
795 break;
796 }
797 }
798
799wait:
800 /* keep going until the response map includes all nodes */
801 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
802 if (ret < 0) {
803 mlog(0, "%s:%.*s: node map changed, redo the "
804 "master request now, blocked=%d\n",
805 dlm->name, res->lockname.len,
806 res->lockname.name, blocked);
807 if (++tries > 20) {
808 mlog(ML_ERROR, "%s:%.*s: spinning on "
809 "dlm_wait_for_lock_mastery, blocked=%d\n",
810 dlm->name, res->lockname.len,
811 res->lockname.name, blocked);
812 dlm_print_one_lock_resource(res);
813 /* dlm_print_one_mle(mle); */
814 tries = 0;
815 }
816 goto redo_request;
817 }
818
819 mlog(0, "lockres mastered by %u\n", res->owner);
820 /* make sure we never continue without this */
821 BUG_ON(res->owner == O2NM_MAX_NODES);
822
823 /* master is known, detach if not already detached */
824 dlm_mle_detach_hb_events(dlm, mle);
825 dlm_put_mle(mle);
826 /* put the extra ref */
827 dlm_put_mle(mle);
828
829wake_waiters:
830 spin_lock(&res->spinlock);
831 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
832 spin_unlock(&res->spinlock);
833 wake_up(&res->wq);
834
835leave:
836 /* need to free the unused mle */
837 if (alloc_mle)
838 kmem_cache_free(dlm_mle_cache, alloc_mle);
839
840 return res;
841}
842
843
844#define DLM_MASTERY_TIMEOUT_MS 5000
845
846static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
847 struct dlm_lock_resource *res,
848 struct dlm_master_list_entry *mle,
849 int *blocked)
850{
851 u8 m;
852 int ret, bit;
853 int map_changed, voting_done;
854 int assert, sleep;
855
856recheck:
857 ret = 0;
858 assert = 0;
859
860 /* check if another node has already become the owner */
861 spin_lock(&res->spinlock);
862 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
863 spin_unlock(&res->spinlock);
864 goto leave;
865 }
866 spin_unlock(&res->spinlock);
867
868 spin_lock(&mle->spinlock);
869 m = mle->master;
870 map_changed = (memcmp(mle->vote_map, mle->node_map,
871 sizeof(mle->vote_map)) != 0);
872 voting_done = (memcmp(mle->vote_map, mle->response_map,
873 sizeof(mle->vote_map)) == 0);
874
875 /* restart if we hit any errors */
876 if (map_changed) {
877 int b;
878 mlog(0, "%s: %.*s: node map changed, restarting\n",
879 dlm->name, res->lockname.len, res->lockname.name);
880 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
881 b = (mle->type == DLM_MLE_BLOCK);
882 if ((*blocked && !b) || (!*blocked && b)) {
883 mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
884 dlm->name, res->lockname.len, res->lockname.name,
885 *blocked, b);
886 *blocked = b;
887 }
888 spin_unlock(&mle->spinlock);
889 if (ret < 0) {
890 mlog_errno(ret);
891 goto leave;
892 }
893 mlog(0, "%s:%.*s: restart lock mastery succeeded, "
894 "rechecking now\n", dlm->name, res->lockname.len,
895 res->lockname.name);
896 goto recheck;
897 }
898
899 if (m != O2NM_MAX_NODES) {
900 /* another node has done an assert!
901 * all done! */
902 sleep = 0;
903 } else {
904 sleep = 1;
905 /* have all nodes responded? */
906 if (voting_done && !*blocked) {
907 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
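	/* the lowest-numbered node in maybe_map claims mastery; our
	 * own bit is set, so equality below means we are lowest */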
908 if (dlm->node_num <= bit) {
909 /* my node number is lowest.
910 * now tell other nodes that I am
911 * mastering this. */
912 mle->master = dlm->node_num;
913 assert = 1;
914 sleep = 0;
915 }
916 /* if voting is done, but we have not received
917 * an assert master yet, we must sleep */
918 }
919 }
920
921 spin_unlock(&mle->spinlock);
922
923 /* sleep if we haven't finished voting yet */
924 if (sleep) {
925 unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
926
927 /*
928 if (atomic_read(&mle->mle_refs.refcount) < 2)
929 mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
930 atomic_read(&mle->mle_refs.refcount),
931 res->lockname.len, res->lockname.name);
932 */
933 atomic_set(&mle->woken, 0);
934 (void)wait_event_timeout(mle->wq,
935 (atomic_read(&mle->woken) == 1),
936 timeo);
937 if (res->owner == O2NM_MAX_NODES) {
938 mlog(0, "waiting again\n");
939 goto recheck;
940 }
941 mlog(0, "done waiting, master is %u\n", res->owner);
942 ret = 0;
943 goto leave;
944 }
945
946 ret = 0; /* done */
947 if (assert) {
948 m = dlm->node_num;
949 mlog(0, "about to master %.*s here, this=%u\n",
950 res->lockname.len, res->lockname.name, m);
951 ret = dlm_do_assert_master(dlm, res->lockname.name,
952 res->lockname.len, mle->vote_map, 0);
953 if (ret) {
954 /* This is a failure in the network path,
955 * not in the response to the assert_master
956 * (any nonzero response is a BUG on this node).
957 * Most likely a socket just got disconnected
958 * due to node death. */
959 mlog_errno(ret);
960 }
961 /* no longer need to restart lock mastery.
962 * all living nodes have been contacted. */
963 ret = 0;
964 }
965
966 /* set the lockres owner */
967 spin_lock(&res->spinlock);
968 dlm_change_lockres_owner(dlm, res, m);
969 spin_unlock(&res->spinlock);
970
971leave:
972 return ret;
973}
974
975struct dlm_bitmap_diff_iter
976{
977 int curnode;
978 unsigned long *orig_bm;
979 unsigned long *cur_bm;
980 unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
981};
982
983enum dlm_node_state_change
984{
985 NODE_DOWN = -1,
986 NODE_NO_CHANGE = 0,
987 NODE_UP
988};
989
990static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
991 unsigned long *orig_bm,
992 unsigned long *cur_bm)
993{
994 unsigned long p1, p2;
995 int i;
996
997 iter->curnode = -1;
998 iter->orig_bm = orig_bm;
999 iter->cur_bm = cur_bm;
1000
1001 for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
1002 p1 = *(iter->orig_bm + i);
1003 p2 = *(iter->cur_bm + i);
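		/* symmetric difference: bits set in exactly one of the two maps */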
1004 iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
1005 }
1006}
1007
1008static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
1009 enum dlm_node_state_change *state)
1010{
1011 int bit;
1012
1013 if (iter->curnode >= O2NM_MAX_NODES)
1014 return -ENOENT;
1015
1016 bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
1017 iter->curnode+1);
1018 if (bit >= O2NM_MAX_NODES) {
1019 iter->curnode = O2NM_MAX_NODES;
1020 return -ENOENT;
1021 }
1022
1023 /* if it was there in the original then this node died */
1024 if (test_bit(bit, iter->orig_bm))
1025 *state = NODE_DOWN;
1026 else
1027 *state = NODE_UP;
1028
1029 iter->curnode = bit;
1030 return bit;
1031}
1032
1033
1034static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1035 struct dlm_lock_resource *res,
1036 struct dlm_master_list_entry *mle,
1037 int blocked)
1038{
1039 struct dlm_bitmap_diff_iter bdi;
1040 enum dlm_node_state_change sc;
1041 int node;
1042 int ret = 0;
1043
1044 mlog(0, "something happened such that the "
1045 "master process may need to be restarted!\n");
1046
1047 assert_spin_locked(&mle->spinlock);
1048
1049 dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
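	/* visit every node whose liveness changed between the vote
	 * snapshot and the current node map */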
1050 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1051 while (node >= 0) {
1052 if (sc == NODE_UP) {
1053 /* a node came up. easy. might not even need
1054 * to talk to it if its node number is higher
1055 * or if we are already blocked. */
1056 mlog(0, "node up! %d\n", node);
1057 if (blocked)
1058 goto next;
1059
1060 if (node > dlm->node_num) {
1061 mlog(0, "node > this node. skipping.\n");
1062 goto next;
1063 }
1064
1065 /* redo the master request, but only for the new node */
1066 mlog(0, "sending request to new node\n");
1067 clear_bit(node, mle->response_map);
1068 set_bit(node, mle->vote_map);
1069 } else {
1070 mlog(ML_ERROR, "node down! %d\n", node);
1071
1072 /* if the node wasn't involved in mastery skip it,
1073 * but clear it out from the maps so that it will
1074 * not affect mastery of this lockres */
1075 clear_bit(node, mle->response_map);
1076 clear_bit(node, mle->vote_map);
1077 if (!test_bit(node, mle->maybe_map))
1078 goto next;
1079
1080 /* if we're already blocked on lock mastery, and the
1081 * dead node wasn't the expected master, or there is
1082 * another node in the maybe_map, keep waiting */
1083 if (blocked) {
1084 int lowest = find_next_bit(mle->maybe_map,
1085 O2NM_MAX_NODES, 0);
1086
1087 /* act like it was never there */
1088 clear_bit(node, mle->maybe_map);
1089
1090 if (node != lowest)
1091 goto next;
1092
1093 mlog(ML_ERROR, "expected master %u died while "
1094 "this node was blocked waiting on it!\n",
1095 node);
1096 lowest = find_next_bit(mle->maybe_map,
1097 O2NM_MAX_NODES,
1098 lowest+1);
1099 if (lowest < O2NM_MAX_NODES) {
1100 mlog(0, "still blocked. waiting "
1101 "on %u now\n", lowest);
1102 goto next;
1103 }
1104
1105 /* mle is an MLE_BLOCK, but there is now
1106 * nothing left to block on. we need to return
1107 * all the way back out and try again with
1108 * an MLE_MASTER. dlm_do_local_recovery_cleanup
1109 * has already run, so the mle refcount is ok */
1110 mlog(0, "no longer blocking. we can "
1111 "try to master this here\n");
1112 mle->type = DLM_MLE_MASTER;
1113 memset(mle->maybe_map, 0,
1114 sizeof(mle->maybe_map));
1115 memset(mle->response_map, 0,
1116			       sizeof(mle->response_map));
1117 memcpy(mle->vote_map, mle->node_map,
1118 sizeof(mle->node_map));
1119 mle->u.res = res;
1120 set_bit(dlm->node_num, mle->maybe_map);
1121
1122 ret = -EAGAIN;
1123 goto next;
1124 }
1125
1126 clear_bit(node, mle->maybe_map);
1127 if (node > dlm->node_num)
1128 goto next;
1129
1130 mlog(0, "dead node in map!\n");
1131 /* yuck. go back and re-contact all nodes
1132 * in the vote_map, removing this node. */
1133 memset(mle->response_map, 0,
1134 sizeof(mle->response_map));
1135 }
1136 ret = -EAGAIN;
1137next:
1138 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1139 }
1140 return ret;
1141}
1142
1143
1144/*
1145 * DLM_MASTER_REQUEST_MSG
1146 *
1147 * returns: 0 on success,
1148 * -errno on a network error
1149 *
1150 * on error, the caller should assume the target node is "dead"
1151 *
1152 */
1153
1154static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to)
1155{
1156 struct dlm_ctxt *dlm = mle->dlm;
1157 struct dlm_master_request request;
1158 int ret, response=0, resend;
1159
1160 memset(&request, 0, sizeof(request));
1161 request.node_idx = dlm->node_num;
1162
1163 BUG_ON(mle->type == DLM_MLE_MIGRATION);
1164
1165 if (mle->type != DLM_MLE_MASTER) {
1166 request.namelen = mle->u.name.len;
1167 memcpy(request.name, mle->u.name.name, request.namelen);
1168 } else {
1169 request.namelen = mle->u.res->lockname.len;
1170 memcpy(request.name, mle->u.res->lockname.name,
1171 request.namelen);
1172 }
1173
1174again:
1175 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
1176 sizeof(request), to, &response);
1177 if (ret < 0) {
1178 if (ret == -ESRCH) {
1179 /* should never happen */
1180 mlog(ML_ERROR, "TCP stack not ready!\n");
1181 BUG();
1182 } else if (ret == -EINVAL) {
1183 mlog(ML_ERROR, "bad args passed to o2net!\n");
1184 BUG();
1185 } else if (ret == -ENOMEM) {
1186 mlog(ML_ERROR, "out of memory while trying to send "
1187 "network message! retrying\n");
1188 /* this is totally crude */
1189 msleep(50);
1190 goto again;
1191 } else if (!dlm_is_host_down(ret)) {
1192 /* not a network error. bad. */
1193 mlog_errno(ret);
1194			mlog(ML_ERROR, "unhandled error!\n");
1195 BUG();
1196 }
1197 /* all other errors should be network errors,
1198 * and likely indicate node death */
1199 mlog(ML_ERROR, "link to %d went down!\n", to);
1200 goto out;
1201 }
1202
1203 ret = 0;
1204 resend = 0;
1205 spin_lock(&mle->spinlock);
1206 switch (response) {
1207 case DLM_MASTER_RESP_YES:
1208 set_bit(to, mle->response_map);
1209 mlog(0, "node %u is the master, response=YES\n", to);
1210 mle->master = to;
1211 break;
1212 case DLM_MASTER_RESP_NO:
1213 mlog(0, "node %u not master, response=NO\n", to);
1214 set_bit(to, mle->response_map);
1215 break;
1216 case DLM_MASTER_RESP_MAYBE:
1217 mlog(0, "node %u not master, response=MAYBE\n", to);
1218 set_bit(to, mle->response_map);
1219 set_bit(to, mle->maybe_map);
1220 break;
1221 case DLM_MASTER_RESP_ERROR:
1222 mlog(0, "node %u hit an error, resending\n", to);
1223 resend = 1;
1224 response = 0;
1225 break;
1226 default:
1227 mlog(ML_ERROR, "bad response! %u\n", response);
1228 BUG();
1229 }
1230 spin_unlock(&mle->spinlock);
1231 if (resend) {
1232 /* this is also totally crude */
1233 msleep(50);
1234 goto again;
1235 }
1236
1237out:
1238 return ret;
1239}
1240
1241/*
1242 * locks that can be taken here:
1243 * dlm->spinlock
1244 * res->spinlock
1245 * mle->spinlock
1246 * dlm->master_list
1247 *
1248 * if possible, TRIM THIS DOWN!!!
1249 */
1250int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
1251{
1252 u8 response = DLM_MASTER_RESP_MAYBE;
1253 struct dlm_ctxt *dlm = data;
1254 struct dlm_lock_resource *res;
1255 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
1256 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
1257 char *name;
1258 unsigned int namelen;
1259 int found, ret;
1260 int set_maybe;
1261
1262 if (!dlm_grab(dlm))
1263 return DLM_MASTER_RESP_NO;
1264
1265 if (!dlm_domain_fully_joined(dlm)) {
1266 response = DLM_MASTER_RESP_NO;
1267 goto send_response;
1268 }
1269
1270 name = request->name;
1271 namelen = request->namelen;
1272
1273 if (namelen > DLM_LOCKID_NAME_MAX) {
1274 response = DLM_IVBUFLEN;
1275 goto send_response;
1276 }
1277
1278way_up_top:
1279 spin_lock(&dlm->spinlock);
1280 res = __dlm_lookup_lockres(dlm, name, namelen);
1281 if (res) {
1282 spin_unlock(&dlm->spinlock);
1283
1284 /* take care of the easy cases up front */
1285 spin_lock(&res->spinlock);
1286 if (res->state & DLM_LOCK_RES_RECOVERING) {
1287 spin_unlock(&res->spinlock);
1288 mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
1289 "being recovered\n");
1290 response = DLM_MASTER_RESP_ERROR;
1291 if (mle)
1292 kmem_cache_free(dlm_mle_cache, mle);
1293 goto send_response;
1294 }
1295
1296 if (res->owner == dlm->node_num) {
1297 u32 flags = DLM_ASSERT_MASTER_MLE_CLEANUP;
1298 spin_unlock(&res->spinlock);
1299 // mlog(0, "this node is the master\n");
1300 response = DLM_MASTER_RESP_YES;
1301 if (mle)
1302 kmem_cache_free(dlm_mle_cache, mle);
1303
1304 /* this node is the owner.
1305 * there is some extra work that needs to
1306 * happen now. the requesting node has
1307 * caused all nodes up to this one to
1308 * create mles. this node now needs to
1309 * go back and clean those up. */
1310 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1311 dlm->node_num, res->lockname.len, res->lockname.name);
1312 ret = dlm_dispatch_assert_master(dlm, res, 1,
1313 request->node_idx,
1314 flags);
1315 if (ret < 0) {
1316 mlog(ML_ERROR, "failed to dispatch assert "
1317 "master work\n");
1318 response = DLM_MASTER_RESP_ERROR;
1319 }
1320 goto send_response;
1321 } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1322 spin_unlock(&res->spinlock);
1323 // mlog(0, "node %u is the master\n", res->owner);
1324 response = DLM_MASTER_RESP_NO;
1325 if (mle)
1326 kmem_cache_free(dlm_mle_cache, mle);
1327 goto send_response;
1328 }
1329
1330 /* ok, there is no owner. either this node is
1331 * being blocked, or it is actively trying to
1332 * master this lock. */
1333 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1334 mlog(ML_ERROR, "lock with no owner should be "
1335 "in-progress!\n");
1336 BUG();
1337 }
1338
1339 // mlog(0, "lockres is in progress...\n");
1340 spin_lock(&dlm->master_lock);
1341 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1342 if (!found) {
1343 mlog(ML_ERROR, "no mle found for this lock!\n");
1344 BUG();
1345 }
1346 set_maybe = 1;
1347 spin_lock(&tmpmle->spinlock);
1348 if (tmpmle->type == DLM_MLE_BLOCK) {
1349 // mlog(0, "this node is waiting for "
1350 // "lockres to be mastered\n");
1351 response = DLM_MASTER_RESP_NO;
1352 } else if (tmpmle->type == DLM_MLE_MIGRATION) {
1353 mlog(0, "node %u is master, but trying to migrate to "
1354 "node %u.\n", tmpmle->master, tmpmle->new_master);
1355 if (tmpmle->master == dlm->node_num) {
1356 response = DLM_MASTER_RESP_YES;
1357 mlog(ML_ERROR, "no owner on lockres, but this "
1358 "node is trying to migrate it to %u?!\n",
1359 tmpmle->new_master);
1360 BUG();
1361 } else {
1362 /* the real master can respond on its own */
1363 response = DLM_MASTER_RESP_NO;
1364 }
1365 } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
1366 set_maybe = 0;
1367 if (tmpmle->master == dlm->node_num)
1368 response = DLM_MASTER_RESP_YES;
1369 else
1370 response = DLM_MASTER_RESP_NO;
1371 } else {
1372 // mlog(0, "this node is attempting to "
1373 // "master lockres\n");
1374 response = DLM_MASTER_RESP_MAYBE;
1375 }
1376 if (set_maybe)
1377 set_bit(request->node_idx, tmpmle->maybe_map);
1378 spin_unlock(&tmpmle->spinlock);
1379
1380 spin_unlock(&dlm->master_lock);
1381 spin_unlock(&res->spinlock);
1382
1383 /* keep the mle attached to heartbeat events */
1384 dlm_put_mle(tmpmle);
1385 if (mle)
1386 kmem_cache_free(dlm_mle_cache, mle);
1387 goto send_response;
1388 }
1389
1390 /*
1391 * lockres doesn't exist on this node
1392 * if there is an MLE_BLOCK, return NO
1393 * if there is an MLE_MASTER, return MAYBE
1394 * otherwise, add an MLE_BLOCK, return NO
1395 */
1396 spin_lock(&dlm->master_lock);
1397 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1398 if (!found) {
1399 /* this lockid has never been seen on this node yet */
1400 // mlog(0, "no mle found\n");
1401 if (!mle) {
1402 spin_unlock(&dlm->master_lock);
1403 spin_unlock(&dlm->spinlock);
1404
1405 mle = (struct dlm_master_list_entry *)
1406 kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
1407 if (!mle) {
1408 // bad bad bad... this sucks.
1409 response = DLM_MASTER_RESP_ERROR;
1410 goto send_response;
1411 }
1412 spin_lock(&dlm->spinlock);
1413 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
1414 name, namelen);
1415 spin_unlock(&dlm->spinlock);
1416 goto way_up_top;
1417 }
1418
1419 // mlog(0, "this is second time thru, already allocated, "
1420 // "add the block.\n");
1421 set_bit(request->node_idx, mle->maybe_map);
1422 list_add(&mle->list, &dlm->master_list);
1423 response = DLM_MASTER_RESP_NO;
1424 } else {
1425 // mlog(0, "mle was found\n");
1426 set_maybe = 1;
1427 spin_lock(&tmpmle->spinlock);
1428 if (tmpmle->type == DLM_MLE_BLOCK)
1429 response = DLM_MASTER_RESP_NO;
1430 else if (tmpmle->type == DLM_MLE_MIGRATION) {
1431 mlog(0, "migration mle was found (%u->%u)\n",
1432 tmpmle->master, tmpmle->new_master);
1433 if (tmpmle->master == dlm->node_num) {
1434 mlog(ML_ERROR, "no lockres, but migration mle "
1435 "says that this node is master!\n");
1436 BUG();
1437 }
1438 /* real master can respond on its own */
1439 response = DLM_MASTER_RESP_NO;
1440 } else {
1441 if (tmpmle->master == dlm->node_num) {
1442 response = DLM_MASTER_RESP_YES;
1443 set_maybe = 0;
1444 } else
1445 response = DLM_MASTER_RESP_MAYBE;
1446 }
1447 if (set_maybe)
1448 set_bit(request->node_idx, tmpmle->maybe_map);
1449 spin_unlock(&tmpmle->spinlock);
1450 }
1451 spin_unlock(&dlm->master_lock);
1452 spin_unlock(&dlm->spinlock);
1453
1454 if (found) {
1455 /* keep the mle attached to heartbeat events */
1456 dlm_put_mle(tmpmle);
1457 }
1458send_response:
1459 dlm_put(dlm);
1460 return response;
1461}
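
The handler above ultimately reduces to a three-way answer based on what this node knows: YES if it owns the lockres, NO if another owner is known (or this node is merely blocked waiting), and MAYBE if it is itself actively bidding for mastery. A minimal user-space condensation of that decision; pick_response and its flags are illustrative names rather than kernel API, and the mle bookkeeping and assert-master dispatch are elided.

#include <stdio.h>

/* hypothetical condensation of the response choice in the handler
 * above; the real handler also dispatches assert-master cleanup and
 * manages mle refcounts, all elided here */
enum resp { RESP_YES, RESP_NO, RESP_MAYBE };

static enum resp pick_response(int i_own_it, int other_owner_known,
                               int i_am_actively_bidding)
{
        if (i_own_it)
                return RESP_YES;        /* plus cleanup of stale mles */
        if (other_owner_known)
                return RESP_NO;         /* real owner answers for itself */
        if (i_am_actively_bidding)
                return RESP_MAYBE;      /* contenders answer MAYBE */
        return RESP_NO;                 /* blocked/waiting nodes say NO */
}

int main(void)
{
        printf("%d %d %d\n",
               (int)pick_response(1, 0, 0),  /* 0: YES */
               (int)pick_response(0, 1, 0),  /* 1: NO */
               (int)pick_response(0, 0, 1)); /* 2: MAYBE */
        return 0;
}
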
1462
1463/*
1464 * DLM_ASSERT_MASTER_MSG
1465 */
1466
1467
1468/*
1469 * NOTE: this can be used for debugging
1470 * can periodically run all locks owned by this node
1471 * and re-assert across the cluster...
1472 */
1473static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
1474 unsigned int namelen, void *nodemap,
1475 u32 flags)
1476{
1477 struct dlm_assert_master assert;
1478 int to, tmpret;
1479 struct dlm_node_iter iter;
1480 int ret = 0;
1481
1482 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
1483
1484 /* note that if this nodemap is empty, it returns 0 */
1485 dlm_node_iter_init(nodemap, &iter);
1486 while ((to = dlm_node_iter_next(&iter)) >= 0) {
1487 int r = 0;
1488 mlog(0, "sending assert master to %d (%.*s)\n", to,
1489 namelen, lockname);
1490 memset(&assert, 0, sizeof(assert));
1491 assert.node_idx = dlm->node_num;
1492 assert.namelen = namelen;
1493 memcpy(assert.name, lockname, namelen);
1494 assert.flags = cpu_to_be32(flags);
1495
1496 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1497 &assert, sizeof(assert), to, &r);
1498 if (tmpret < 0) {
1499 mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
1500 if (!dlm_is_host_down(tmpret)) {
1501 mlog(ML_ERROR, "unhandled error!\n");
1502 BUG();
1503 }
1504 /* a node died. finish out the rest of the nodes. */
1505 mlog(ML_ERROR, "link to %d went down!\n", to);
1506 /* any nonzero status return will do */
1507 ret = tmpret;
1508 } else if (r < 0) {
1509 /* ok, something is horribly messed up. kill thyself. */
1510 mlog(ML_ERROR,"during assert master of %.*s to %u, "
1511 "got %d.\n", namelen, lockname, to, r);
1512 dlm_dump_lock_resources(dlm);
1513 BUG();
1514 }
1515 }
1516
1517 return ret;
1518}
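
dlm_do_assert_master above drives everything off a node bitmap via dlm_node_iter_init/dlm_node_iter_next, so an empty map simply produces zero sends. A self-contained user-space sketch of that iteration pattern follows; node_iter and its helpers are hypothetical stand-ins for the kernel iterator, not its actual API.

#include <stdio.h>

#define MAX_NODES 64

/* hypothetical stand-in for dlm_node_iter: walks the set bits of a
 * 64-bit node map, mirroring dlm_node_iter_init/_next above */
struct node_iter {
        unsigned long long map;
        int cur;
};

static void node_iter_init(unsigned long long map, struct node_iter *it)
{
        it->map = map;
        it->cur = -1;
}

static int node_iter_next(struct node_iter *it)
{
        for (it->cur++; it->cur < MAX_NODES; it->cur++)
                if (it->map & (1ULL << it->cur))
                        return it->cur;
        return -1; /* empty or exhausted: the caller's loop just ends */
}

int main(void)
{
        struct node_iter it;
        int to;

        /* nodes 1, 3 and 5 alive; an empty map would yield no
         * iterations, matching the "if this nodemap is empty" note */
        node_iter_init(0x2AULL, &it);
        while ((to = node_iter_next(&it)) >= 0)
                printf("sending assert master to %d\n", to);
        return 0;
}
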
1519
1520/*
1521 * locks that can be taken here:
1522 * dlm->spinlock
1523 * res->spinlock
1524 * mle->spinlock
1525 * dlm->master_list
1526 *
1527 * if possible, TRIM THIS DOWN!!!
1528 */
1529int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1530{
1531 struct dlm_ctxt *dlm = data;
1532 struct dlm_master_list_entry *mle = NULL;
1533 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
1534 struct dlm_lock_resource *res = NULL;
1535 char *name;
1536 unsigned int namelen;
1537 u32 flags;
1538
1539 if (!dlm_grab(dlm))
1540 return 0;
1541
1542 name = assert->name;
1543 namelen = assert->namelen;
1544 flags = be32_to_cpu(assert->flags);
1545
1546 if (namelen > DLM_LOCKID_NAME_MAX) {
1547 mlog(ML_ERROR, "Invalid name length!\n");
1548 goto done;
1549 }
1550
1551 spin_lock(&dlm->spinlock);
1552
1553 if (flags)
1554 mlog(0, "assert_master with flags: %u\n", flags);
1555
1556 /* find the MLE */
1557 spin_lock(&dlm->master_lock);
1558 if (!dlm_find_mle(dlm, &mle, name, namelen)) {
1559 /* not an error, could be master just re-asserting */
1560 mlog(0, "just got an assert_master from %u, but no "
1561 "MLE for it! (%.*s)\n", assert->node_idx,
1562 namelen, name);
1563 } else {
1564 int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
1565 if (bit >= O2NM_MAX_NODES) {
1566 /* not necessarily an error, though less likely.
1567 * could be master just re-asserting. */
1568 mlog(ML_ERROR, "no bits set in the maybe_map, but %u "
1569 "is asserting! (%.*s)\n", assert->node_idx,
1570 namelen, name);
1571 } else if (bit != assert->node_idx) {
1572 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1573 mlog(0, "master %u was found, %u should "
1574 "back off\n", assert->node_idx, bit);
1575 } else {
1576 /* with the fix for bug 569, a higher node
1577 * number winning the mastery will respond
1578 * YES to mastery requests, but this node
1579 * had no way of knowing. let it pass. */
1580 mlog(ML_ERROR, "%u is the lowest node, "
1581 "%u is asserting. (%.*s) %u must "
1582 "have begun after %u won.\n", bit,
1583 assert->node_idx, namelen, name, bit,
1584 assert->node_idx);
1585 }
1586 }
1587 }
1588 spin_unlock(&dlm->master_lock);
1589
1590 /* ok everything checks out with the MLE
1591 * now check to see if there is a lockres */
1592 res = __dlm_lookup_lockres(dlm, name, namelen);
1593 if (res) {
1594 spin_lock(&res->spinlock);
1595 if (res->state & DLM_LOCK_RES_RECOVERING) {
1596 mlog(ML_ERROR, "%u asserting but %.*s is "
1597 "RECOVERING!\n", assert->node_idx, namelen, name);
1598 goto kill;
1599 }
1600 if (!mle) {
1601 if (res->owner != assert->node_idx) {
1602 mlog(ML_ERROR, "assert_master from "
1603 "%u, but current owner is "
1604 "%u! (%.*s)\n",
1605 assert->node_idx, res->owner,
1606 namelen, name);
1607 goto kill;
1608 }
1609 } else if (mle->type != DLM_MLE_MIGRATION) {
1610 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1611 /* owner is just re-asserting */
1612 if (res->owner == assert->node_idx) {
1613 mlog(0, "owner %u re-asserting on "
1614 "lock %.*s\n", assert->node_idx,
1615 namelen, name);
1616 goto ok;
1617 }
1618 mlog(ML_ERROR, "got assert_master from "
1619 "node %u, but %u is the owner! "
1620 "(%.*s)\n", assert->node_idx,
1621 res->owner, namelen, name);
1622 goto kill;
1623 }
1624 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1625 mlog(ML_ERROR, "got assert from %u, but lock "
1626 "with no owner should be "
1627 "in-progress! (%.*s)\n",
1628 assert->node_idx,
1629 namelen, name);
1630 goto kill;
1631 }
1632 } else /* mle->type == DLM_MLE_MIGRATION */ {
1633 /* should only be getting an assert from new master */
1634 if (assert->node_idx != mle->new_master) {
1635 mlog(ML_ERROR, "got assert from %u, but "
1636 "new master is %u, and old master "
1637 "was %u (%.*s)\n",
1638 assert->node_idx, mle->new_master,
1639 mle->master, namelen, name);
1640 goto kill;
1641 }
1642
1643 }
1644ok:
1645 spin_unlock(&res->spinlock);
1646 }
1647 spin_unlock(&dlm->spinlock);
1648
1649 // mlog(0, "woo! got an assert_master from node %u!\n",
1650 // assert->node_idx);
1651 if (mle) {
1652 int extra_ref;
1653
1654 spin_lock(&mle->spinlock);
1655 extra_ref = !!(mle->type == DLM_MLE_BLOCK
1656 || mle->type == DLM_MLE_MIGRATION);
1657 mle->master = assert->node_idx;
1658 atomic_set(&mle->woken, 1);
1659 wake_up(&mle->wq);
1660 spin_unlock(&mle->spinlock);
1661
1662 if (mle->type == DLM_MLE_MIGRATION && res) {
1663 mlog(0, "finishing off migration of lockres %.*s, "
1664 "from %u to %u\n",
1665 res->lockname.len, res->lockname.name,
1666 dlm->node_num, mle->new_master);
1667 spin_lock(&res->spinlock);
1668 res->state &= ~DLM_LOCK_RES_MIGRATING;
1669 dlm_change_lockres_owner(dlm, res, mle->new_master);
1670 BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
1671 spin_unlock(&res->spinlock);
1672 }
1673 /* master is known, detach if not already detached */
1674 dlm_mle_detach_hb_events(dlm, mle);
1675 dlm_put_mle(mle);
1676
1677 if (extra_ref) {
1678 /* the assert master message now balances the extra
1679 * ref given by the master / migration request message.
1680 * if this is the last put, it will be removed
1681 * from the list. */
1682 dlm_put_mle(mle);
1683 }
1684 }
1685
1686done:
1687 if (res)
1688 dlm_lockres_put(res);
1689 dlm_put(dlm);
1690 return 0;
1691
1692kill:
1693 /* kill the caller! */
1694 spin_unlock(&res->spinlock);
1695 spin_unlock(&dlm->spinlock);
1696 dlm_lockres_put(res);
1697 mlog(ML_ERROR, "Bad message received from another node. Dumping state "
1698 "and killing the other node now! This node is OK and can continue.\n");
1699 dlm_dump_lock_resources(dlm);
1700 dlm_put(dlm);
1701 return -EINVAL;
1702}
1703
1704int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
1705 struct dlm_lock_resource *res,
1706 int ignore_higher, u8 request_from, u32 flags)
1707{
1708 struct dlm_work_item *item;
1709 item = kcalloc(1, sizeof(*item), GFP_KERNEL);
1710 if (!item)
1711 return -ENOMEM;
1712
1713
1714 /* queue up work for dlm_assert_master_worker */
1715 dlm_grab(dlm); /* get an extra ref for the work item */
1716 dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
1717 item->u.am.lockres = res; /* already have a ref */
1718 /* can optionally ignore node numbers higher than this node */
1719 item->u.am.ignore_higher = ignore_higher;
1720 item->u.am.request_from = request_from;
1721 item->u.am.flags = flags;
1722
1723 spin_lock(&dlm->work_lock);
1724 list_add_tail(&item->list, &dlm->work_list);
1725 spin_unlock(&dlm->work_lock);
1726
1727 schedule_work(&dlm->dispatched_work);
1728 return 0;
1729}
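
dlm_dispatch_assert_master shows the deferred-work pattern used throughout this file: allocate an item, pin the dlm with an extra reference, fill in the worker function and its arguments, splice the item onto dlm->work_list under work_lock, then schedule_work() so dlm_dispatch_work drains it later. A hedged user-space model of that enqueue/drain cycle; all names are illustrative, locking is elided, and the kernel's FIFO list_add_tail is simplified here to a LIFO push.

#include <stdio.h>
#include <stdlib.h>

/* hypothetical model of the dispatch pattern above: callers enqueue an
 * item carrying a function pointer, a worker later drains the list */
struct work_item {
        void (*func)(struct work_item *, void *);
        void *data;
        struct work_item *next;
};

static struct work_item *work_list; /* dlm->work_list analogue */

static int dispatch(void (*func)(struct work_item *, void *), void *data)
{
        struct work_item *item = calloc(1, sizeof(*item));
        if (!item)
                return -1;        /* -ENOMEM in the kernel version */
        item->func = func;
        item->data = data;
        item->next = work_list;   /* kernel takes dlm->work_lock here */
        work_list = item;         /* and list_add_tail keeps FIFO order */
        return 0;                 /* then schedule_work() wakes a worker */
}

static void run_work(void)        /* dlm_dispatch_work analogue */
{
        while (work_list) {
                struct work_item *item = work_list;
                work_list = item->next;
                item->func(item, item->data);
                free(item);
        }
}

static void say(struct work_item *item, void *data)
{
        printf("assert master for %s\n", (const char *)data);
}

int main(void)
{
        dispatch(say, "lockres A");
        run_work();
        return 0;
}
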
1730
1731static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
1732{
1733 struct dlm_ctxt *dlm = data;
1734 int ret = 0;
1735 struct dlm_lock_resource *res;
1736 unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
1737 int ignore_higher;
1738 int bit;
1739 u8 request_from;
1740 u32 flags;
1741
1742 dlm = item->dlm;
1743 res = item->u.am.lockres;
1744 ignore_higher = item->u.am.ignore_higher;
1745 request_from = item->u.am.request_from;
1746 flags = item->u.am.flags;
1747
1748 spin_lock(&dlm->spinlock);
1749 memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
1750 spin_unlock(&dlm->spinlock);
1751
1752 clear_bit(dlm->node_num, nodemap);
1753 if (ignore_higher) {
1754 /* if this is just to clear up mles for nodes below
1755 * this node, do not send the message to the original
1756 * caller or any node number higher than this */
1757 clear_bit(request_from, nodemap);
1758 bit = dlm->node_num;
1759 while (1) {
1760 bit = find_next_bit(nodemap, O2NM_MAX_NODES,
1761 bit+1);
1762 if (bit >= O2NM_MAX_NODES)
1763 break;
1764 clear_bit(bit, nodemap);
1765 }
1766 }
1767
1768 /* this call now finishes out the nodemap
1769 * even if one or more nodes die */
1770 mlog(0, "worker about to master %.*s here, this=%u\n",
1771 res->lockname.len, res->lockname.name, dlm->node_num);
1772 ret = dlm_do_assert_master(dlm, res->lockname.name,
1773 res->lockname.len,
1774 nodemap, flags);
1775 if (ret < 0) {
1776 /* no need to restart, we are done */
1777 mlog_errno(ret);
1778 }
1779
1780 dlm_lockres_put(res);
1781
1782 mlog(0, "finished with dlm_assert_master_worker\n");
1783}
1784
1785
1786/*
1787 * DLM_MIGRATE_LOCKRES
1788 */
1789
1790
1791int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1792 u8 target)
1793{
1794 struct dlm_master_list_entry *mle = NULL;
1795 struct dlm_master_list_entry *oldmle = NULL;
1796 struct dlm_migratable_lockres *mres = NULL;
1797 int ret = -EINVAL;
1798 const char *name;
1799 unsigned int namelen;
1800 int mle_added = 0;
1801 struct list_head *queue, *iter;
1802 int i;
1803 struct dlm_lock *lock;
1804 int empty = 1;
1805
1806 if (!dlm_grab(dlm))
1807 return -EINVAL;
1808
1809 name = res->lockname.name;
1810 namelen = res->lockname.len;
1811
1812 mlog(0, "migrating %.*s to %u\n", namelen, name, target);
1813
1814 /*
1815 * ensure this lockres is a proper candidate for migration
1816 */
1817 spin_lock(&res->spinlock);
1818 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
1819 mlog(0, "cannot migrate lockres with unknown owner!\n");
1820 spin_unlock(&res->spinlock);
1821 goto leave;
1822 }
1823 if (res->owner != dlm->node_num) {
1824 mlog(0, "cannot migrate lockres this node doesn't own!\n");
1825 spin_unlock(&res->spinlock);
1826 goto leave;
1827 }
1828 mlog(0, "checking queues...\n");
1829 queue = &res->granted;
1830 for (i=0; i<3; i++) {
1831 list_for_each(iter, queue) {
1832 lock = list_entry (iter, struct dlm_lock, list);
1833 empty = 0;
1834 if (lock->ml.node == dlm->node_num) {
1835 mlog(0, "found a lock owned by this node "
1836 "still on the %s queue! will not "
1837 "migrate this lockres\n",
1838 i==0 ? "granted" :
1839 (i==1 ? "converting" : "blocked"));
1840 spin_unlock(&res->spinlock);
1841 ret = -ENOTEMPTY;
1842 goto leave;
1843 }
1844 }
1845 queue++;
1846 }
1847 mlog(0, "all locks on this lockres are nonlocal. continuing\n");
1848 spin_unlock(&res->spinlock);
1849
1850 /* no work to do */
1851 if (empty) {
1852 mlog(0, "no locks were found on this lockres! done!\n");
1853 ret = 0;
1854 goto leave;
1855 }
1856
1857 /*
1858 * preallocate up front
1859 * if this fails, abort
1860 */
1861
1862 ret = -ENOMEM;
1863 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
1864 if (!mres) {
1865 mlog_errno(ret);
1866 goto leave;
1867 }
1868
1869 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
1870 GFP_KERNEL);
1871 if (!mle) {
1872 mlog_errno(ret);
1873 goto leave;
1874 }
1875 ret = 0;
1876
1877 /*
1878 * find a node to migrate the lockres to
1879 */
1880
1881 mlog(0, "picking a migration node\n");
1882 spin_lock(&dlm->spinlock);
1883 /* pick a new node */
1884 if (!test_bit(target, dlm->domain_map) ||
1885 target >= O2NM_MAX_NODES) {
1886 target = dlm_pick_migration_target(dlm, res);
1887 }
1888 mlog(0, "node %u chosen for migration\n", target);
1889
1890 if (target >= O2NM_MAX_NODES ||
1891 !test_bit(target, dlm->domain_map)) {
1892 /* target chosen is not alive */
1893 ret = -EINVAL;
1894 }
1895
1896 if (ret) {
1897 spin_unlock(&dlm->spinlock);
1898 goto fail;
1899 }
1900
1901 mlog(0, "continuing with target = %u\n", target);
1902
1903 /*
1904 * clear any existing master requests and
1905 * add the migration mle to the list
1906 */
1907 spin_lock(&dlm->master_lock);
1908 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
1909 namelen, target, dlm->node_num);
1910 spin_unlock(&dlm->master_lock);
1911 spin_unlock(&dlm->spinlock);
1912
1913 if (ret == -EEXIST) {
1914 mlog(0, "another process is already migrating it\n");
1915 goto fail;
1916 }
1917 mle_added = 1;
1918
1919 /*
1920 * set the MIGRATING flag and flush asts
1921 * if we fail after this we need to re-dirty the lockres
1922 */
1923 if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
1924 mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
1925 "the target went down.\n", res->lockname.len,
1926 res->lockname.name, target);
1927 spin_lock(&res->spinlock);
1928 res->state &= ~DLM_LOCK_RES_MIGRATING;
1929 spin_unlock(&res->spinlock);
1930 ret = -EINVAL;
1931 }
1932
1933fail:
1934 if (oldmle) {
1935 /* master is known, detach if not already detached */
1936 dlm_mle_detach_hb_events(dlm, oldmle);
1937 dlm_put_mle(oldmle);
1938 }
1939
1940 if (ret < 0) {
1941 if (mle_added) {
1942 dlm_mle_detach_hb_events(dlm, mle);
1943 dlm_put_mle(mle);
1944 } else if (mle) {
1945 kmem_cache_free(dlm_mle_cache, mle);
1946 }
1947 goto leave;
1948 }
1949
1950 /*
1951 * at this point, we have a migration target, an mle
1952 * in the master list, and the MIGRATING flag set on
1953 * the lockres
1954 */
1955
1956
1957 /* get an extra reference on the mle.
1958 * otherwise the assert_master from the new
1959 * master will destroy this.
1960 * also, make sure that all callers of dlm_get_mle
1961 * take both dlm->spinlock and dlm->master_lock */
1962 spin_lock(&dlm->spinlock);
1963 spin_lock(&dlm->master_lock);
1964 dlm_get_mle(mle);
1965 spin_unlock(&dlm->master_lock);
1966 spin_unlock(&dlm->spinlock);
1967
1968 /* notify new node and send all lock state */
1969 /* call send_one_lockres with migration flag.
1970 * this serves as notice to the target node that a
1971 * migration is starting. */
1972 ret = dlm_send_one_lockres(dlm, res, mres, target,
1973 DLM_MRES_MIGRATION);
1974
1975 if (ret < 0) {
1976 mlog(0, "migration to node %u failed with %d\n",
1977 target, ret);
1978 /* migration failed, detach and clean up mle */
1979 dlm_mle_detach_hb_events(dlm, mle);
1980 dlm_put_mle(mle);
1981 dlm_put_mle(mle);
1982 goto leave;
1983 }
1984
1985 /* at this point, the target sends a message to all nodes,
1986 * (using dlm_do_migrate_request). this node is skipped since
1987 * we had to put an mle in the list to begin the process. this
1988 * node now waits for target to do an assert master. this node
1989 * will be the last one notified, ensuring that the migration
1990 * is complete everywhere. if the target dies while this is
1991 * going on, some nodes could potentially see the target as the
1992 * master, so it is important that my recovery finds the migration
1993 * mle and sets the master to UNKNOWN. */
1994
1995
1996 /* wait for new node to assert master */
1997 while (1) {
1998 ret = wait_event_interruptible_timeout(mle->wq,
1999 (atomic_read(&mle->woken) == 1),
2000 msecs_to_jiffies(5000));
2001
2002 if (ret >= 0) {
2003 if (atomic_read(&mle->woken) == 1 ||
2004 res->owner == target)
2005 break;
2006
2007 mlog(0, "timed out during migration\n");
2008 }
2009 if (ret == -ERESTARTSYS) {
2010 /* migration failed, detach and clean up mle */
2011 dlm_mle_detach_hb_events(dlm, mle);
2012 dlm_put_mle(mle);
2013 dlm_put_mle(mle);
2014 goto leave;
2015 }
2016 /* TODO: if node died: stop, clean up, return error */
2017 }
2018
2019 /* all done, set the owner, clear the flag */
2020 spin_lock(&res->spinlock);
2021 dlm_set_lockres_owner(dlm, res, target);
2022 res->state &= ~DLM_LOCK_RES_MIGRATING;
2023 dlm_remove_nonlocal_locks(dlm, res);
2024 spin_unlock(&res->spinlock);
2025 wake_up(&res->wq);
2026
2027 /* master is known, detach if not already detached */
2028 dlm_mle_detach_hb_events(dlm, mle);
2029 dlm_put_mle(mle);
2030 ret = 0;
2031
2032 dlm_lockres_calc_usage(dlm, res);
2033
2034leave:
2035 /* re-dirty the lockres if we failed */
2036 if (ret < 0)
2037 dlm_kick_thread(dlm, res);
2038
2039 /* TODO: cleanup */
2040 if (mres)
2041 free_page((unsigned long)mres);
2042
2043 dlm_put(dlm);
2044
2045 mlog(0, "returning %d\n", ret);
2046 return ret;
2047}
2048EXPORT_SYMBOL_GPL(dlm_migrate_lockres);
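
The tail of dlm_migrate_lockres waits in a loop: a 5-second interruptible timed wait, ended early when the mle is woken or the owner is observed to be the target, retried on timeout, and aborted on -ERESTARTSYS. A simplified single-threaded model of just that control flow; timed_wait is a fake stand-in for wait_event_interruptible_timeout, and the wake conditions are plain booleans here.

#include <stdio.h>

/* hypothetical model of the "wait for new node to assert master" loop
 * above: wake on the mle, or observe the owner change, or time out and
 * re-check; an interrupted sleep aborts the migration */
enum wait_result { WOKEN, TIMEOUT, INTERRUPTED };

static enum wait_result timed_wait(int iteration)
{
        /* stand-in for wait_event_interruptible_timeout(..., 5000ms):
         * pretend the target asserts mastery on the third pass */
        return iteration < 2 ? TIMEOUT : WOKEN;
}

int main(void)
{
        int woken = 0, owner_is_target = 0;

        for (int i = 0; ; i++) {
                enum wait_result r = timed_wait(i);
                if (r != INTERRUPTED) {
                        if (r == WOKEN)
                                woken = 1;
                        if (woken || owner_is_target)
                                break;
                        printf("timed out during migration, retrying\n");
                        continue;
                }
                printf("interrupted: detach mle, abort migration\n");
                return 1;
        }
        printf("target asserted mastery; finish migration\n");
        return 0;
}
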
2049
2050int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
2051{
2052 int ret;
2053 spin_lock(&dlm->ast_lock);
2054 spin_lock(&lock->spinlock);
2055 ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
2056 spin_unlock(&lock->spinlock);
2057 spin_unlock(&dlm->ast_lock);
2058 return ret;
2059}
2060
2061static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2062 struct dlm_lock_resource *res,
2063 u8 mig_target)
2064{
2065 int can_proceed;
2066 spin_lock(&res->spinlock);
2067 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2068 spin_unlock(&res->spinlock);
2069
2070 /* target has died, so make the caller break out of the
2071 * wait_event, but caller must recheck the domain_map */
2072 spin_lock(&dlm->spinlock);
2073 if (!test_bit(mig_target, dlm->domain_map))
2074 can_proceed = 1;
2075 spin_unlock(&dlm->spinlock);
2076 return can_proceed;
2077}
2078
2079int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2080{
2081 int ret;
2082 spin_lock(&res->spinlock);
2083 ret = !!(res->state & DLM_LOCK_RES_DIRTY);
2084 spin_unlock(&res->spinlock);
2085 return ret;
2086}
2087
2088
2089static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
2090 struct dlm_lock_resource *res,
2091 u8 target)
2092{
2093 int ret = 0;
2094
2095 mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
2096 res->lockname.len, res->lockname.name, dlm->node_num,
2097 target);
2098 /* need to set MIGRATING flag on lockres. this is done by
2099 * ensuring that all asts have been flushed for this lockres. */
2100 spin_lock(&res->spinlock);
2101 BUG_ON(res->migration_pending);
2102 res->migration_pending = 1;
2103 /* strategy is to reserve an extra ast then release
2104 * it below, letting the release do all of the work */
2105 __dlm_lockres_reserve_ast(res);
2106 spin_unlock(&res->spinlock);
2107
2108 /* now flush all the pending asts.. hang out for a bit */
2109 dlm_kick_thread(dlm, res);
2110 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
2111 dlm_lockres_release_ast(dlm, res);
2112
2113 mlog(0, "about to wait on migration_wq, dirty=%s\n",
2114 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
2115 /* if the extra ref we just put was the final one, this
2116 * will pass thru immediately. otherwise, we need to wait
2117 * for the last ast to finish. */
2118again:
2119 ret = wait_event_interruptible_timeout(dlm->migration_wq,
2120 dlm_migration_can_proceed(dlm, res, target),
2121 msecs_to_jiffies(1000));
2122 if (ret < 0) {
2123 mlog(0, "woken again: migrating? %s, dead? %s\n",
2124 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2125 test_bit(target, dlm->domain_map) ? "no":"yes");
2126 } else {
2127 mlog(0, "all is well: migrating? %s, dead? %s\n",
2128 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2129 test_bit(target, dlm->domain_map) ? "no":"yes");
2130 }
2131 if (!dlm_migration_can_proceed(dlm, res, target)) {
2132 mlog(0, "trying again...\n");
2133 goto again;
2134 }
2135
2136 /* did the target go down or die? */
2137 spin_lock(&dlm->spinlock);
2138 if (!test_bit(target, dlm->domain_map)) {
2139 mlog(ML_ERROR, "aha. migration target %u just went down\n",
2140 target);
2141 ret = -EHOSTDOWN;
2142 }
2143 spin_unlock(&dlm->spinlock);
2144
2145 /*
2146 * at this point:
2147 *
2148 * o the DLM_LOCK_RES_MIGRATING flag is set
2149 * o there are no pending asts on this lockres
2150 * o all processes trying to reserve an ast on this
2151 * lockres must wait for the MIGRATING flag to clear
2152 */
2153 return ret;
2154}
2155
2156/* last step in the migration process.
2157 * original master calls this to free all of the dlm_lock
2158 * structures that used to be for other nodes. */
2159static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2160 struct dlm_lock_resource *res)
2161{
2162 struct list_head *iter, *iter2;
2163 struct list_head *queue = &res->granted;
2164 int i;
2165 struct dlm_lock *lock;
2166
2167 assert_spin_locked(&res->spinlock);
2168
2169 BUG_ON(res->owner == dlm->node_num);
2170
2171 for (i=0; i<3; i++) {
2172 list_for_each_safe(iter, iter2, queue) {
2173 lock = list_entry (iter, struct dlm_lock, list);
2174 if (lock->ml.node != dlm->node_num) {
2175 mlog(0, "putting lock for node %u\n",
2176 lock->ml.node);
2177 /* be extra careful */
2178 BUG_ON(!list_empty(&lock->ast_list));
2179 BUG_ON(!list_empty(&lock->bast_list));
2180 BUG_ON(lock->ast_pending);
2181 BUG_ON(lock->bast_pending);
2182 list_del_init(&lock->list);
2183 dlm_lock_put(lock);
2184 }
2185 }
2186 queue++;
2187 }
2188}
2189
2190/* for now this is not too intelligent. we will
2191 * need stats to make this do the right thing.
2192 * this just finds the first lock on one of the
2193 * queues and uses that node as the target. */
2194static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2195 struct dlm_lock_resource *res)
2196{
2197 int i;
2198 struct list_head *queue = &res->granted;
2199 struct list_head *iter;
2200 struct dlm_lock *lock;
2201 int nodenum;
2202
2203 assert_spin_locked(&dlm->spinlock);
2204
2205 spin_lock(&res->spinlock);
2206 for (i=0; i<3; i++) {
2207 list_for_each(iter, queue) {
2208 /* up to the caller to make sure this node
2209 * is alive */
2210 lock = list_entry (iter, struct dlm_lock, list);
2211 if (lock->ml.node != dlm->node_num) {
2212 spin_unlock(&res->spinlock);
2213 return lock->ml.node;
2214 }
2215 }
2216 queue++;
2217 }
2218 spin_unlock(&res->spinlock);
2219 mlog(0, "have not found a suitable target yet! checking domain map\n");
2220
2221 /* ok now we're getting desperate. pick anyone alive. */
2222 nodenum = -1;
2223 while (1) {
2224 nodenum = find_next_bit(dlm->domain_map,
2225 O2NM_MAX_NODES, nodenum+1);
2226 mlog(0, "found %d in domain map\n", nodenum);
2227 if (nodenum >= O2NM_MAX_NODES)
2228 break;
2229 if (nodenum != dlm->node_num) {
2230 mlog(0, "picking %d\n", nodenum);
2231 return nodenum;
2232 }
2233 }
2234
2235 mlog(0, "giving up. no master to migrate to\n");
2236 return DLM_LOCK_RES_OWNER_UNKNOWN;
2237}
2238
2239
2240
2241/* this is called by the new master once all lockres
2242 * data has been received */
2243static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2244 struct dlm_lock_resource *res,
2245 u8 master, u8 new_master,
2246 struct dlm_node_iter *iter)
2247{
2248 struct dlm_migrate_request migrate;
2249 int ret, status = 0;
2250 int nodenum;
2251
2252 memset(&migrate, 0, sizeof(migrate));
2253 migrate.namelen = res->lockname.len;
2254 memcpy(migrate.name, res->lockname.name, migrate.namelen);
2255 migrate.new_master = new_master;
2256 migrate.master = master;
2257
2258 ret = 0;
2259
2260 /* send message to all nodes, except the master and myself */
2261 while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
2262 if (nodenum == master ||
2263 nodenum == new_master)
2264 continue;
2265
2266 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
2267 &migrate, sizeof(migrate), nodenum,
2268 &status);
2269 if (ret < 0)
2270 mlog_errno(ret);
2271 else if (status < 0) {
2272 mlog(0, "migrate request (node %u) returned %d!\n",
2273 nodenum, status);
2274 ret = status;
2275 }
2276 }
2277
2278 if (ret < 0)
2279 mlog_errno(ret);
2280
2281 mlog(0, "returning ret=%d\n", ret);
2282 return ret;
2283}
2284
2285
2286/* if there is an existing mle for this lockres, we now know who the master is.
2287 * (the one who sent us *this* message) we can clear it up right away.
2288 * since the process that put the mle on the list still has a reference to it,
2289 * we can unhash it now, set the master and wake the process. as a result,
2290 * we will have no mle in the list to start with. now we can add an mle for
2291 * the migration and this should be the only one found for those scanning the
2292 * list. */
2293int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
2294{
2295 struct dlm_ctxt *dlm = data;
2296 struct dlm_lock_resource *res = NULL;
2297 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
2298 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
2299 const char *name;
2300 unsigned int namelen;
2301 int ret = 0;
2302
2303 if (!dlm_grab(dlm))
2304 return -EINVAL;
2305
2306 name = migrate->name;
2307 namelen = migrate->namelen;
2308
2309 /* preallocate.. if this fails, abort */
2310 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
2311 GFP_KERNEL);
2312
2313 if (!mle) {
2314 ret = -ENOMEM;
2315 goto leave;
2316 }
2317
2318 /* check for pre-existing lock */
2319 spin_lock(&dlm->spinlock);
2320 res = __dlm_lookup_lockres(dlm, name, namelen);
2321 spin_lock(&dlm->master_lock);
2322
2323 if (res) {
2324 spin_lock(&res->spinlock);
2325 if (res->state & DLM_LOCK_RES_RECOVERING) {
2326 /* if all is working ok, this can only mean that we got
2327 * a migrate request from a node that we now see as
2328 * dead. what can we do here? drop it to the floor? */
2329 spin_unlock(&res->spinlock);
2330 mlog(ML_ERROR, "Got a migrate request, but the "
2331 "lockres is marked as recovering!");
2332 kmem_cache_free(dlm_mle_cache, mle);
2333 ret = -EINVAL; /* need a better solution */
2334 goto unlock;
2335 }
2336 res->state |= DLM_LOCK_RES_MIGRATING;
2337 spin_unlock(&res->spinlock);
2338 }
2339
2340 /* ignore status. only nonzero status would BUG. */
2341 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
2342 name, namelen,
2343 migrate->new_master,
2344 migrate->master);
2345
2346unlock:
2347 spin_unlock(&dlm->master_lock);
2348 spin_unlock(&dlm->spinlock);
2349
2350 if (oldmle) {
2351 /* master is known, detach if not already detached */
2352 dlm_mle_detach_hb_events(dlm, oldmle);
2353 dlm_put_mle(oldmle);
2354 }
2355
2356 if (res)
2357 dlm_lockres_put(res);
2358leave:
2359 dlm_put(dlm);
2360 return ret;
2361}
2362
2363/* must be holding dlm->spinlock and dlm->master_lock
2364 * when adding a migration mle, we can clear any other mles
2365 * in the master list because we know with certainty that
2366 * the master is "master". so we remove any old mle from
2367 * the list after setting its master field, and then add
2368 * the new migration mle. this way we can hold to the rule
2369 * of having only one mle for a given lock name at all times. */
2370static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
2371 struct dlm_lock_resource *res,
2372 struct dlm_master_list_entry *mle,
2373 struct dlm_master_list_entry **oldmle,
2374 const char *name, unsigned int namelen,
2375 u8 new_master, u8 master)
2376{
2377 int found;
2378 int ret = 0;
2379
2380 *oldmle = NULL;
2381
2382 mlog_entry_void();
2383
2384 assert_spin_locked(&dlm->spinlock);
2385 assert_spin_locked(&dlm->master_lock);
2386
2387 /* caller is responsible for any ref taken here on oldmle */
2388 found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
2389 if (found) {
2390 struct dlm_master_list_entry *tmp = *oldmle;
2391 spin_lock(&tmp->spinlock);
2392 if (tmp->type == DLM_MLE_MIGRATION) {
2393 if (master == dlm->node_num) {
2394 /* ah another process raced me to it */
2395 mlog(0, "tried to migrate %.*s, but some "
2396 "process beat me to it\n",
2397 namelen, name);
2398 ret = -EEXIST;
2399 } else {
2400 /* bad. 2 NODES are trying to migrate! */
2401 mlog(ML_ERROR, "migration error mle: "
2402 "master=%u new_master=%u // request: "
2403 "master=%u new_master=%u // "
2404 "lockres=%.*s\n",
2405 tmp->master, tmp->new_master,
2406 master, new_master,
2407 namelen, name);
2408 BUG();
2409 }
2410 } else {
2411 /* this is essentially what assert_master does */
2412 tmp->master = master;
2413 atomic_set(&tmp->woken, 1);
2414 wake_up(&tmp->wq);
2415 /* remove it from the list so that only one
2416 * mle will be found */
2417 list_del_init(&tmp->list);
2418 }
2419 spin_unlock(&tmp->spinlock);
2420 }
2421
2422 /* now add a migration mle to the tail of the list */
2423 dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
2424 mle->new_master = new_master;
2425 mle->master = master;
2426 /* do this for consistency with other mle types */
2427 set_bit(new_master, mle->maybe_map);
2428 list_add(&mle->list, &dlm->master_list);
2429
2430 return ret;
2431}
2432
2433
2434void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
2435{
2436 struct list_head *iter, *iter2;
2437 struct dlm_master_list_entry *mle;
2438 struct dlm_lock_resource *res;
2439
2440 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
2441top:
2442 assert_spin_locked(&dlm->spinlock);
2443
2444 /* clean the master list */
2445 spin_lock(&dlm->master_lock);
2446 list_for_each_safe(iter, iter2, &dlm->master_list) {
2447 mle = list_entry(iter, struct dlm_master_list_entry, list);
2448
2449 BUG_ON(mle->type != DLM_MLE_BLOCK &&
2450 mle->type != DLM_MLE_MASTER &&
2451 mle->type != DLM_MLE_MIGRATION);
2452
2453 /* MASTER mles are initiated locally. the waiting
2454 * process will notice the node map change
2455 * shortly. let that happen as normal. */
2456 if (mle->type == DLM_MLE_MASTER)
2457 continue;
2458
2459
2460 /* BLOCK mles are initiated by other nodes.
2461 * need to clean up if the dead node would have
2462 * been the master. */
2463 if (mle->type == DLM_MLE_BLOCK) {
2464 int bit;
2465
2466 spin_lock(&mle->spinlock);
2467 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
2468 if (bit != dead_node) {
2469 mlog(0, "mle found, but dead node %u would "
2470 "not have been master\n", dead_node);
2471 spin_unlock(&mle->spinlock);
2472 } else {
2473 /* must drop the refcount by one since the
2474 * assert_master will never arrive. this
2475 * may result in the mle being unlinked and
2476 * freed, but there may still be a process
2477 * waiting in the dlmlock path which is fine. */
2478 mlog(ML_ERROR, "node %u was expected master\n",
2479 dead_node);
2480 atomic_set(&mle->woken, 1);
2481 spin_unlock(&mle->spinlock);
2482 wake_up(&mle->wq);
2483 /* final put will take care of list removal */
2484 __dlm_put_mle(mle);
2485 }
2486 continue;
2487 }
2488
2489 /* everything else is a MIGRATION mle */
2490
2491 /* the rule for MIGRATION mles is that the master
2492 * becomes UNKNOWN if *either* the original or
2493 * the new master dies. all UNKNOWN lockreses
2494 * are sent to whichever node becomes the recovery
2495 * master. the new master is responsible for
2496 * determining if there is still a master for
2497 * this lockres, or if it needs to take over
2498 * mastery. either way, this node should expect
2499 * another message to resolve this. */
2500 if (mle->master != dead_node &&
2501 mle->new_master != dead_node)
2502 continue;
2503
2504 /* if we have reached this point, this mle needs to
2505 * be removed from the list and freed. */
2506
2507 /* remove from the list early. NOTE: unlinking
2508 * list_head while in list_for_each_safe */
2509 spin_lock(&mle->spinlock);
2510 list_del_init(&mle->list);
2511 atomic_set(&mle->woken, 1);
2512 spin_unlock(&mle->spinlock);
2513 wake_up(&mle->wq);
2514
2515 mlog(0, "node %u died during migration from "
2516 "%u to %u!\n", dead_node,
2517 mle->master, mle->new_master);
2518 /* if there is a lockres associated with this
2519 * mle, find it and set its owner to UNKNOWN */
2520 res = __dlm_lookup_lockres(dlm, mle->u.name.name,
2521 mle->u.name.len);
2522 if (res) {
2523 /* unfortunately if we hit this rare case, our
2524 * lock ordering is messed up. we need to drop
2525 * the master lock so that we can take the
2526 * lockres lock, meaning that we will have to
2527 * restart from the head of list. */
2528 spin_unlock(&dlm->master_lock);
2529
2530 /* move lockres onto recovery list */
2531 spin_lock(&res->spinlock);
2532 dlm_set_lockres_owner(dlm, res,
2533 DLM_LOCK_RES_OWNER_UNKNOWN);
2534 dlm_move_lockres_to_recovery_list(dlm, res);
2535 spin_unlock(&res->spinlock);
2536 dlm_lockres_put(res);
2537
2538 /* dump the mle */
2539 spin_lock(&dlm->master_lock);
2540 __dlm_put_mle(mle);
2541 spin_unlock(&dlm->master_lock);
2542
2543 /* restart */
2544 goto top;
2545 }
2546
2547 /* this may be the last reference */
2548 __dlm_put_mle(mle);
2549 }
2550 spin_unlock(&dlm->master_lock);
2551}
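
dlm_clean_master_list illustrates a classic restart pattern: when a migration mle has a live lockres, the code must drop dlm->master_lock before taking res->spinlock, and since the list may change while unlocked it jumps back to the top and rescans. A minimal user-space sketch of that drop-and-restart idiom, assuming already-handled entries are marked and skipped; the entry structure and flags are illustrative.

#include <stdio.h>

/* hypothetical model of the restart pattern in dlm_clean_master_list:
 * whenever handling an entry requires dropping the list lock, the scan
 * restarts, since the list may have changed underneath us */
struct entry { int needs_slow_path; int cleaned; };

static void clean_all(struct entry *e, int n)
{
top:
        /* lock(list) */
        for (int i = 0; i < n; i++) {
                if (e[i].cleaned)
                        continue;       /* skip work already done */
                if (e[i].needs_slow_path) {
                        /* unlock(list); slow work; relock not shown */
                        e[i].cleaned = 1;
                        goto top;       /* list may have changed */
                }
                e[i].cleaned = 1;
        }
        /* unlock(list) */
}

int main(void)
{
        struct entry e[3] = { {0, 0}, {1, 0}, {0, 0} };
        clean_all(e, 3);
        printf("%d %d %d\n", e[0].cleaned, e[1].cleaned, e[2].cleaned);
        return 0;
}
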
2552
2553
2554int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
2555 u8 old_master)
2556{
2557 struct dlm_node_iter iter;
2558 int ret = 0;
2559
2560 spin_lock(&dlm->spinlock);
2561 dlm_node_iter_init(dlm->domain_map, &iter);
2562 clear_bit(old_master, iter.node_map);
2563 clear_bit(dlm->node_num, iter.node_map);
2564 spin_unlock(&dlm->spinlock);
2565
2566 mlog(0, "now time to do a migrate request to other nodes\n");
2567 ret = dlm_do_migrate_request(dlm, res, old_master,
2568 dlm->node_num, &iter);
2569 if (ret < 0) {
2570 mlog_errno(ret);
2571 goto leave;
2572 }
2573
2574 mlog(0, "doing assert master of %.*s to all except the original node\n",
2575 res->lockname.len, res->lockname.name);
2576 /* this call now finishes out the nodemap
2577 * even if one or more nodes die */
2578 ret = dlm_do_assert_master(dlm, res->lockname.name,
2579 res->lockname.len, iter.node_map,
2580 DLM_ASSERT_MASTER_FINISH_MIGRATION);
2581 if (ret < 0) {
2582 /* no longer need to retry. all living nodes contacted. */
2583 mlog_errno(ret);
2584 ret = 0;
2585 }
2586
2587 memset(iter.node_map, 0, sizeof(iter.node_map));
2588 set_bit(old_master, iter.node_map);
2589 mlog(0, "doing assert master of %.*s back to %u\n",
2590 res->lockname.len, res->lockname.name, old_master);
2591 ret = dlm_do_assert_master(dlm, res->lockname.name,
2592 res->lockname.len, iter.node_map,
2593 DLM_ASSERT_MASTER_FINISH_MIGRATION);
2594 if (ret < 0) {
2595 mlog(0, "assert master to original master failed "
2596 "with %d.\n", ret);
2597 /* the only nonzero status here would be because of
2598 * a dead original node. we're done. */
2599 ret = 0;
2600 }
2601
2602 /* all done, set the owner, clear the flag */
2603 spin_lock(&res->spinlock);
2604 dlm_set_lockres_owner(dlm, res, dlm->node_num);
2605 res->state &= ~DLM_LOCK_RES_MIGRATING;
2606 spin_unlock(&res->spinlock);
2607 /* re-dirty it on the new master */
2608 dlm_kick_thread(dlm, res);
2609 wake_up(&res->wq);
2610leave:
2611 return ret;
2612}
2613
2614/*
2615 * LOCKRES AST REFCOUNT
2616 * this is integral to migration
2617 */
2618
2619/* for future intent to call an ast, reserve one ahead of time.
2620 * this should be called only after waiting on the lockres
2621 * with dlm_wait_on_lockres, and while still holding the
2622 * spinlock after the call. */
2623void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
2624{
2625 assert_spin_locked(&res->spinlock);
2626 if (res->state & DLM_LOCK_RES_MIGRATING) {
2627 __dlm_print_one_lock_resource(res);
2628 }
2629 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
2630
2631 atomic_inc(&res->asts_reserved);
2632}
2633
2634/*
2635 * used to drop the reserved ast, either because it went unused,
2636 * or because the ast/bast was actually called.
2637 *
2638 * also, if there is a pending migration on this lockres,
2639 * and this was the last pending ast on the lockres,
2640 * atomically set the MIGRATING flag before we drop the lock.
2641 * this is how we ensure that migration can proceed with no
2642 * asts in progress. note that it is ok if the state of the
2643 * queues is such that a lock should be granted in the future
2644 * or that a bast should be fired, because the new master will
2645 * shuffle the lists on this lockres as soon as it is migrated.
2646 */
2647void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
2648 struct dlm_lock_resource *res)
2649{
2650 if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
2651 return;
2652
2653 if (!res->migration_pending) {
2654 spin_unlock(&res->spinlock);
2655 return;
2656 }
2657
2658 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
2659 res->migration_pending = 0;
2660 res->state |= DLM_LOCK_RES_MIGRATING;
2661 spin_unlock(&res->spinlock);
2662 wake_up(&res->wq);
2663 wake_up(&dlm->migration_wq);
2664}
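
The reserve/release pair above gates migration on the last outstanding ast reservation: atomic_dec_and_lock takes res->spinlock only when asts_reserved hits zero, and only then, with migration_pending set, does the MIGRATING flag flip. A single-threaded user-space model of that gating; locks and wakeups are elided and all names are illustrative.

#include <stdio.h>

/* hypothetical single-threaded model of the asts_reserved gating above:
 * the last release of a reserved ast, with migration pending, is what
 * flips the lockres into MIGRATING */
struct res_model {
        int asts_reserved;
        int migration_pending;
        int migrating;
};

static void reserve_ast(struct res_model *r)
{
        r->asts_reserved++;
}

static void release_ast(struct res_model *r)
{
        if (--r->asts_reserved != 0)
                return;               /* not the last one: nothing to do */
        if (!r->migration_pending)
                return;
        r->migration_pending = 0;     /* last reservation dropped: */
        r->migrating = 1;             /* migration may now proceed */
}

int main(void)
{
        struct res_model r = { 0, 0, 0 };

        reserve_ast(&r);              /* the "extra" migration reservation */
        reserve_ast(&r);              /* a pending ast */
        r.migration_pending = 1;
        release_ast(&r);              /* ast fires: not last, keep waiting */
        release_ast(&r);              /* extra ref dropped: flag flips */
        printf("migrating=%d\n", r.migrating); /* prints migrating=1 */
        return 0;
}
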
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
new file mode 100644
index 00000000000..0c8eb1093f0
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -0,0 +1,2132 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmrecovery.c
5 *
6 * recovery stuff
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/timer.h>
41#include <linux/kthread.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50#include "dlmdomain.h"
51
52#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
53#include "cluster/masklog.h"
54
55static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
56
57static int dlm_recovery_thread(void *data);
58void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
59int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
60static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
61static int dlm_do_recovery(struct dlm_ctxt *dlm);
62
63static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
64static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
65static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
66static int dlm_request_all_locks(struct dlm_ctxt *dlm,
67 u8 request_from, u8 dead_node);
68static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
69
70static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
71static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
72 const char *lockname, int namelen,
73 int total_locks, u64 cookie,
74 u8 flags, u8 master);
75static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
76 struct dlm_migratable_lockres *mres,
77 u8 send_to,
78 struct dlm_lock_resource *res,
79 int total_locks);
80static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
81 struct dlm_lock_resource *res,
82 u8 *real_master);
83static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
84 struct dlm_lock_resource *res,
85 struct dlm_migratable_lockres *mres);
86static int dlm_do_master_requery(struct dlm_ctxt *dlm,
87 struct dlm_lock_resource *res,
88 u8 nodenum, u8 *real_master);
89static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
90static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
91 u8 dead_node, u8 send_to);
92static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
93static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
94 struct list_head *list, u8 dead_node);
95static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
96 u8 dead_node, u8 new_master);
97static void dlm_reco_ast(void *astdata);
98static void dlm_reco_bast(void *astdata, int blocked_type);
99static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
100static void dlm_request_all_locks_worker(struct dlm_work_item *item,
101 void *data);
102static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
103
104static u64 dlm_get_next_mig_cookie(void);
105
106static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
107static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
108static u64 dlm_mig_cookie = 1;
109
110static u64 dlm_get_next_mig_cookie(void)
111{
112 u64 c;
113 spin_lock(&dlm_mig_cookie_lock);
114 c = dlm_mig_cookie;
115 if (dlm_mig_cookie == (~0ULL))
116 dlm_mig_cookie = 1;
117 else
118 dlm_mig_cookie++;
119 spin_unlock(&dlm_mig_cookie_lock);
120 return c;
121}
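
dlm_get_next_mig_cookie hands out strictly nonzero 64-bit cookies, wrapping from ~0ULL back to 1 rather than passing through 0. A tiny user-space model of the same rule, with the spinlock elided:

#include <stdio.h>

/* minimal model of the cookie counter above: never returns 0,
 * wraps from ~0ULL back to 1; the spinlock is elided */
static unsigned long long next_cookie(void)
{
        static unsigned long long cookie = 1;
        unsigned long long c = cookie;
        cookie = (cookie == ~0ULL) ? 1 : cookie + 1;
        return c;
}

int main(void)
{
        unsigned long long a = next_cookie();
        unsigned long long b = next_cookie();
        printf("%llu %llu\n", a, b); /* prints: 1 2 */
        return 0;
}
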
122
123static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
124{
125 spin_lock(&dlm->spinlock);
126 clear_bit(dlm->reco.dead_node, dlm->recovery_map);
127 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
128 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
129 spin_unlock(&dlm->spinlock);
130}
131
132/* Worker function used during recovery. */
133void dlm_dispatch_work(void *data)
134{
135 struct dlm_ctxt *dlm = (struct dlm_ctxt *)data;
136 LIST_HEAD(tmp_list);
137 struct list_head *iter, *iter2;
138 struct dlm_work_item *item;
139 dlm_workfunc_t *workfunc;
140
141 spin_lock(&dlm->work_lock);
142 list_splice_init(&dlm->work_list, &tmp_list);
143 spin_unlock(&dlm->work_lock);
144
145 list_for_each_safe(iter, iter2, &tmp_list) {
146 item = list_entry(iter, struct dlm_work_item, list);
147 workfunc = item->func;
148 list_del_init(&item->list);
149
150 /* already have ref on dlm to avoid having
151 * it disappear. just double-check. */
152 BUG_ON(item->dlm != dlm);
153
154 /* this is allowed to sleep and
155 * call network stuff */
156 workfunc(item, item->data);
157
158 dlm_put(dlm);
159 kfree(item);
160 }
161}
162
163/*
164 * RECOVERY THREAD
165 */
166
167static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
168{
169 /* wake the recovery thread
170 * this will wake the reco thread in one of three places
171 * 1) sleeping with no recovery happening
172 * 2) sleeping with recovery mastered elsewhere
173 * 3) recovery mastered here, waiting on reco data */
174
175 wake_up(&dlm->dlm_reco_thread_wq);
176}
177
178/* Launch the recovery thread */
179int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
180{
181 mlog(0, "starting dlm recovery thread...\n");
182
183 dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
184 "dlm_reco_thread");
185 if (IS_ERR(dlm->dlm_reco_thread_task)) {
186 mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
187 dlm->dlm_reco_thread_task = NULL;
188 return -EINVAL;
189 }
190
191 return 0;
192}
193
194void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
195{
196 if (dlm->dlm_reco_thread_task) {
197 mlog(0, "waiting for dlm recovery thread to exit\n");
198 kthread_stop(dlm->dlm_reco_thread_task);
199 dlm->dlm_reco_thread_task = NULL;
200 }
201}
202
203
204
205/*
206 * this is lame, but here's how recovery works...
207 * 1) all recovery threads cluster wide will work on recovering
208 * ONE node at a time
209 * 2) negotiate who will take over all the locks for the dead node.
210 * that's right... ALL the locks.
211 * 3) once a new master is chosen, everyone scans all locks
212 * and moves aside those mastered by the dead guy
213 * 4) each of these locks should be locked until recovery is done
214 * 5) the new master collects up all of the secondary lock queue info
215 * one lock at a time, forcing each node to communicate back
216 * before continuing
217 * 6) each secondary lock queue responds with the full known lock info
218 * 7) once the new master has run all its locks, it sends an ALLDONE!
219 * message to everyone
220 * 8) upon receiving this message, the secondary queue node unlocks
221 * and responds to the ALLDONE
222 * 9) once the new master gets responses from everyone, it unlocks
223 * everything and recovery for this dead node is done
224 *10) go back to 2) while there are still dead nodes
225 *
226 */
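
The ten steps above amount to a per-node state machine (the DLM_RECO_NODE_DATA_* states used by dlm_remaster_locks below): the recovery master polls every node's state until none is still REQUESTED or RECEIVING, then sends the finalize message. A condensed user-space sketch of that "all nodes done" scan; the enum and helper are illustrative, and the DEAD (restart recovery) and INIT/REQUESTING (BUG) cases are omitted.

#include <stdio.h>

/* hypothetical condensation of the per-node recovery states: the
 * master's polling loop is "done" once no node is mid-transfer */
enum reco_state { INIT, REQUESTING, REQUESTED, RECEIVING, DONE,
                  FINALIZE_SENT, DEAD };

static int all_nodes_done(const enum reco_state *s, int n)
{
        for (int i = 0; i < n; i++)
                if (s[i] == REQUESTED || s[i] == RECEIVING)
                        return 0; /* still waiting on this node */
        return 1;                 /* safe to send the finalize message */
}

int main(void)
{
        enum reco_state nodes[3] = { DONE, RECEIVING, DONE };
        printf("done? %d\n", all_nodes_done(nodes, 3)); /* 0: node 1 busy */
        nodes[1] = DONE;
        printf("done? %d\n", all_nodes_done(nodes, 3)); /* 1: finalize */
        return 0;
}
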
227
228
229#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
230
231static int dlm_recovery_thread(void *data)
232{
233 int status;
234 struct dlm_ctxt *dlm = data;
235 unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);
236
237 mlog(0, "dlm thread running for %s...\n", dlm->name);
238
239 while (!kthread_should_stop()) {
240 if (dlm_joined(dlm)) {
241 status = dlm_do_recovery(dlm);
242 if (status == -EAGAIN) {
243 /* do not sleep, recheck immediately. */
244 continue;
245 }
246 if (status < 0)
247 mlog_errno(status);
248 }
249
250 wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
251 kthread_should_stop(),
252 timeout);
253 }
254
255 mlog(0, "quitting DLM recovery thread\n");
256 return 0;
257}
258
259/* callers of the top-level api calls (dlmlock/dlmunlock) should
260 * block on the dlm->reco.event when recovery is in progress.
261 * the dlm recovery thread will set this state when it begins
262 * recovering a dead node (as the new master or not) and clear
263 * the state and wake as soon as all affected lock resources have
264 * been marked with the RECOVERY flag */
265static int dlm_in_recovery(struct dlm_ctxt *dlm)
266{
267 int in_recovery;
268 spin_lock(&dlm->spinlock);
269 in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
270 spin_unlock(&dlm->spinlock);
271 return in_recovery;
272}
273
274
275void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
276{
277 wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
278}
279
280static void dlm_begin_recovery(struct dlm_ctxt *dlm)
281{
282 spin_lock(&dlm->spinlock);
283 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
284 dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
285 spin_unlock(&dlm->spinlock);
286}
287
288static void dlm_end_recovery(struct dlm_ctxt *dlm)
289{
290 spin_lock(&dlm->spinlock);
291 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
292 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
293 spin_unlock(&dlm->spinlock);
294 wake_up(&dlm->reco.event);
295}
296
297static int dlm_do_recovery(struct dlm_ctxt *dlm)
298{
299 int status = 0;
300
301 spin_lock(&dlm->spinlock);
302
303 /* check to see if the new master has died */
304 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
305 test_bit(dlm->reco.new_master, dlm->recovery_map)) {
306 mlog(0, "new master %u died while recovering %u!\n",
307 dlm->reco.new_master, dlm->reco.dead_node);
308 /* unset the new_master, leave dead_node */
309 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
310 }
311
312 /* select a target to recover */
313 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
314 int bit;
315
316 bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
317 if (bit >= O2NM_MAX_NODES || bit < 0)
318 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
319 else
320 dlm->reco.dead_node = bit;
321 } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
322 /* BUG? */
323 mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
324 dlm->reco.dead_node);
325 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
326 }
327
328 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
329 // mlog(0, "nothing to recover! sleeping now!\n");
330 spin_unlock(&dlm->spinlock);
331 /* return to main thread loop and sleep. */
332 return 0;
333 }
334 mlog(0, "recovery thread found node %u in the recovery map!\n",
335 dlm->reco.dead_node);
336 spin_unlock(&dlm->spinlock);
337
338 /* take write barrier */
339 /* (stops the list reshuffling thread, proxy ast handling) */
340 dlm_begin_recovery(dlm);
341
342 if (dlm->reco.new_master == dlm->node_num)
343 goto master_here;
344
345 if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
346 /* choose a new master */
347 if (!dlm_pick_recovery_master(dlm)) {
348 /* already notified everyone. go. */
349 dlm->reco.new_master = dlm->node_num;
350 goto master_here;
351 }
352 mlog(0, "another node will master this recovery session.\n");
353 }
354 mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
355 dlm->name, dlm->reco.new_master,
356 dlm->node_num, dlm->reco.dead_node);
357
358 /* it is safe to start everything back up here
359 * because all of the dead node's lock resources
360 * have been marked as in-recovery */
361 dlm_end_recovery(dlm);
362
363 /* sleep out in main dlm_recovery_thread loop. */
364 return 0;
365
366master_here:
367 mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
368 dlm->name, dlm->reco.dead_node, dlm->node_num);
369
370 status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
371 if (status < 0) {
372 mlog(ML_ERROR, "error %d remastering locks for node %u, "
373 "retrying.\n", status, dlm->reco.dead_node);
374 } else {
375 /* success! see if any other nodes need recovery */
376 dlm_reset_recovery(dlm);
377 }
378 dlm_end_recovery(dlm);
379
380 /* continue and look for another dead node */
381 return -EAGAIN;
382}
383
384static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
385{
386 int status = 0;
387 struct dlm_reco_node_data *ndata;
388 struct list_head *iter;
389 int all_nodes_done;
390 int destroy = 0;
391 int pass = 0;
392
393 status = dlm_init_recovery_area(dlm, dead_node);
394 if (status < 0)
395 goto leave;
396
397 /* safe to access the node data list without a lock, since this
398 * process is the only one to change the list */
399 list_for_each(iter, &dlm->reco.node_data) {
400 ndata = list_entry (iter, struct dlm_reco_node_data, list);
401 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
402 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
403
404 mlog(0, "requesting lock info from node %u\n",
405 ndata->node_num);
406
407 if (ndata->node_num == dlm->node_num) {
408 ndata->state = DLM_RECO_NODE_DATA_DONE;
409 continue;
410 }
411
412 status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
413 if (status < 0) {
414 mlog_errno(status);
415 if (dlm_is_host_down(status))
416 ndata->state = DLM_RECO_NODE_DATA_DEAD;
417 else {
418 destroy = 1;
419 goto leave;
420 }
421 }
422
423 switch (ndata->state) {
424 case DLM_RECO_NODE_DATA_INIT:
425 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
426 case DLM_RECO_NODE_DATA_REQUESTED:
427 BUG();
428 break;
429 case DLM_RECO_NODE_DATA_DEAD:
430 mlog(0, "node %u died after requesting "
431 "recovery info for node %u\n",
432 ndata->node_num, dead_node);
433 // start all over
434 destroy = 1;
435 status = -EAGAIN;
436 goto leave;
437 case DLM_RECO_NODE_DATA_REQUESTING:
438 ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
439 mlog(0, "now receiving recovery data from "
440 "node %u for dead node %u\n",
441 ndata->node_num, dead_node);
442 break;
443 case DLM_RECO_NODE_DATA_RECEIVING:
444 mlog(0, "already receiving recovery data from "
445 "node %u for dead node %u\n",
446 ndata->node_num, dead_node);
447 break;
448 case DLM_RECO_NODE_DATA_DONE:
449 mlog(0, "already DONE receiving recovery data "
450 "from node %u for dead node %u\n",
451 ndata->node_num, dead_node);
452 break;
453 }
454 }
455
456 mlog(0, "done requesting all lock info\n");
457
458 /* nodes should be sending reco data now
459 * just need to wait */
460
461 while (1) {
462 /* check all the nodes now to see if we are
463 * done, or if anyone died */
464 all_nodes_done = 1;
465 spin_lock(&dlm_reco_state_lock);
466 list_for_each(iter, &dlm->reco.node_data) {
467 ndata = list_entry (iter, struct dlm_reco_node_data, list);
468
469 mlog(0, "checking recovery state of node %u\n",
470 ndata->node_num);
471 switch (ndata->state) {
472 case DLM_RECO_NODE_DATA_INIT:
473 case DLM_RECO_NODE_DATA_REQUESTING:
474 mlog(ML_ERROR, "bad ndata state for "
475 "node %u: state=%d\n",
476 ndata->node_num, ndata->state);
477 BUG();
478 break;
479 case DLM_RECO_NODE_DATA_DEAD:
480 mlog(0, "node %u died after "
481 "requesting recovery info for "
482 "node %u\n", ndata->node_num,
483 dead_node);
484 spin_unlock(&dlm_reco_state_lock);
485				/* start all over */
486 destroy = 1;
487 status = -EAGAIN;
488 goto leave;
489 case DLM_RECO_NODE_DATA_RECEIVING:
490 case DLM_RECO_NODE_DATA_REQUESTED:
491 all_nodes_done = 0;
492 break;
493 case DLM_RECO_NODE_DATA_DONE:
494 break;
495 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
496 break;
497 }
498 }
499 spin_unlock(&dlm_reco_state_lock);
500
501 mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
502 all_nodes_done?"yes":"no");
503 if (all_nodes_done) {
504 int ret;
505
506 /* all nodes are now in DLM_RECO_NODE_DATA_DONE state
507 * just send a finalize message to everyone and
508 * clean up */
509 mlog(0, "all nodes are done! send finalize\n");
510 ret = dlm_send_finalize_reco_message(dlm);
511 if (ret < 0)
512 mlog_errno(ret);
513
514 spin_lock(&dlm->spinlock);
515 dlm_finish_local_lockres_recovery(dlm, dead_node,
516 dlm->node_num);
517 spin_unlock(&dlm->spinlock);
518 mlog(0, "should be done with recovery!\n");
519
520 mlog(0, "finishing recovery of %s at %lu, "
521 "dead=%u, this=%u, new=%u\n", dlm->name,
522 jiffies, dlm->reco.dead_node,
523 dlm->node_num, dlm->reco.new_master);
524 destroy = 1;
525 status = ret;
526 /* rescan everything marked dirty along the way */
527 dlm_kick_thread(dlm, NULL);
528 break;
529 }
530 /* wait to be signalled, with periodic timeout
531 * to check for node death */
532 wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
533 kthread_should_stop(),
534 msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));
535
536 }
537
538leave:
539 if (destroy)
540 dlm_destroy_recovery_area(dlm, dead_node);
541
542 mlog_exit(status);
543 return status;
544}
545
546static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
547{
548 int num=0;
549 struct dlm_reco_node_data *ndata;
550
551 spin_lock(&dlm->spinlock);
552 memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
553 /* nodes can only be removed (by dying) after dropping
554 * this lock, and death will be trapped later, so this should do */
555 spin_unlock(&dlm->spinlock);
556
557 while (1) {
558 num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num);
559 if (num >= O2NM_MAX_NODES) {
560 break;
561 }
562 BUG_ON(num == dead_node);
563
564		ndata = kzalloc(sizeof(*ndata), GFP_KERNEL);
565 if (!ndata) {
566 dlm_destroy_recovery_area(dlm, dead_node);
567 return -ENOMEM;
568 }
569 ndata->node_num = num;
570 ndata->state = DLM_RECO_NODE_DATA_INIT;
571 spin_lock(&dlm_reco_state_lock);
572 list_add_tail(&ndata->list, &dlm->reco.node_data);
573 spin_unlock(&dlm_reco_state_lock);
574 num++;
575 }
576
577 return 0;
578}
579
580static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
581{
582 struct list_head *iter, *iter2;
583 struct dlm_reco_node_data *ndata;
584 LIST_HEAD(tmplist);
585
586 spin_lock(&dlm_reco_state_lock);
587 list_splice_init(&dlm->reco.node_data, &tmplist);
588 spin_unlock(&dlm_reco_state_lock);
589
590 list_for_each_safe(iter, iter2, &tmplist) {
591 ndata = list_entry (iter, struct dlm_reco_node_data, list);
592 list_del_init(&ndata->list);
593 kfree(ndata);
594 }
595}
596
597static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
598 u8 dead_node)
599{
600 struct dlm_lock_request lr;
601 enum dlm_status ret;
602
603 mlog(0, "\n");
604
605
606 mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
607 "to %u\n", dead_node, request_from);
608
609 memset(&lr, 0, sizeof(lr));
610 lr.node_idx = dlm->node_num;
611 lr.dead_node = dead_node;
612
613	/* send message */
614 ret = DLM_NOLOCKMGR;
615 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
616 &lr, sizeof(lr), request_from, NULL);
617
618 /* negative status is handled by caller */
619 if (ret < 0)
620 mlog_errno(ret);
621
622	/* return from here, then
623	 * sleep until all received or error */
624 return ret;
625
626}
627
628int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
629{
630 struct dlm_ctxt *dlm = data;
631 struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
632 char *buf = NULL;
633 struct dlm_work_item *item = NULL;
634
635 if (!dlm_grab(dlm))
636 return -EINVAL;
637
638 BUG_ON(lr->dead_node != dlm->reco.dead_node);
639
640	item = kzalloc(sizeof(*item), GFP_KERNEL);
641 if (!item) {
642 dlm_put(dlm);
643 return -ENOMEM;
644 }
645
646 /* this will get freed by dlm_request_all_locks_worker */
647 buf = (char *) __get_free_page(GFP_KERNEL);
648 if (!buf) {
649 kfree(item);
650 dlm_put(dlm);
651 return -ENOMEM;
652 }
653
654 /* queue up work for dlm_request_all_locks_worker */
655 dlm_grab(dlm); /* get an extra ref for the work item */
656 dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
657 item->u.ral.reco_master = lr->node_idx;
658 item->u.ral.dead_node = lr->dead_node;
659 spin_lock(&dlm->work_lock);
660 list_add_tail(&item->list, &dlm->work_list);
661 spin_unlock(&dlm->work_lock);
662 schedule_work(&dlm->dispatched_work);
663
664 dlm_put(dlm);
665 return 0;
666}
667
668static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
669{
670 struct dlm_migratable_lockres *mres;
671 struct dlm_lock_resource *res;
672 struct dlm_ctxt *dlm;
673 LIST_HEAD(resources);
674 struct list_head *iter;
675 int ret;
676 u8 dead_node, reco_master;
677
678 dlm = item->dlm;
679 dead_node = item->u.ral.dead_node;
680 reco_master = item->u.ral.reco_master;
681 BUG_ON(dead_node != dlm->reco.dead_node);
682 BUG_ON(reco_master != dlm->reco.new_master);
683
684 mres = (struct dlm_migratable_lockres *)data;
685
686 /* lock resources should have already been moved to the
687 * dlm->reco.resources list. now move items from that list
688 * to a temp list if the dead owner matches. note that the
689 * whole cluster recovers only one node at a time, so we
690 * can safely move UNKNOWN lock resources for each recovery
691 * session. */
692 dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
693
694 /* now we can begin blasting lockreses without the dlm lock */
695 list_for_each(iter, &resources) {
696 res = list_entry (iter, struct dlm_lock_resource, recovering);
697 ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
698 DLM_MRES_RECOVERY);
699 if (ret < 0)
700 mlog_errno(ret);
701 }
702
703 /* move the resources back to the list */
704 spin_lock(&dlm->spinlock);
705 list_splice_init(&resources, &dlm->reco.resources);
706 spin_unlock(&dlm->spinlock);
707
708 ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
709 if (ret < 0)
710 mlog_errno(ret);
711
712 free_page((unsigned long)data);
713}
714
715
716static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
717{
718 int ret, tmpret;
719 struct dlm_reco_data_done done_msg;
720
721 memset(&done_msg, 0, sizeof(done_msg));
722 done_msg.node_idx = dlm->node_num;
723 done_msg.dead_node = dead_node;
724 mlog(0, "sending DATA DONE message to %u, "
725 "my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
726 done_msg.dead_node);
727
728 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
729 sizeof(done_msg), send_to, &tmpret);
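	/* note: o2net_send_message() itself returns the transport status,
	 * while the remote handler's own return code comes back in tmpret */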
730 /* negative status is ignored by the caller */
731 if (ret >= 0)
732 ret = tmpret;
733 return ret;
734}
735
736
737int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
738{
739 struct dlm_ctxt *dlm = data;
740 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
741 struct list_head *iter;
742 struct dlm_reco_node_data *ndata = NULL;
743 int ret = -EINVAL;
744
745 if (!dlm_grab(dlm))
746 return -EINVAL;
747
748 mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
749 "node_idx=%u, this node=%u\n", done->dead_node,
750 dlm->reco.dead_node, done->node_idx, dlm->node_num);
751 BUG_ON(done->dead_node != dlm->reco.dead_node);
752
753 spin_lock(&dlm_reco_state_lock);
754 list_for_each(iter, &dlm->reco.node_data) {
755 ndata = list_entry (iter, struct dlm_reco_node_data, list);
756 if (ndata->node_num != done->node_idx)
757 continue;
758
759 switch (ndata->state) {
760 case DLM_RECO_NODE_DATA_INIT:
761 case DLM_RECO_NODE_DATA_DEAD:
762 case DLM_RECO_NODE_DATA_DONE:
763 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
764 mlog(ML_ERROR, "bad ndata state for node %u:"
765 " state=%d\n", ndata->node_num,
766 ndata->state);
767 BUG();
768 break;
769 case DLM_RECO_NODE_DATA_RECEIVING:
770 case DLM_RECO_NODE_DATA_REQUESTED:
771 case DLM_RECO_NODE_DATA_REQUESTING:
772 mlog(0, "node %u is DONE sending "
773 "recovery data!\n",
774 ndata->node_num);
775
776 ndata->state = DLM_RECO_NODE_DATA_DONE;
777 ret = 0;
778 break;
779 }
780 }
781 spin_unlock(&dlm_reco_state_lock);
782
783 /* wake the recovery thread, some node is done */
784 if (!ret)
785 dlm_kick_recovery_thread(dlm);
786
787 if (ret < 0)
788 mlog(ML_ERROR, "failed to find recovery node data for node "
789 "%u\n", done->node_idx);
790 dlm_put(dlm);
791
792 mlog(0, "leaving reco data done handler, ret=%d\n", ret);
793 return ret;
794}
795
796static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
797 struct list_head *list,
798 u8 dead_node)
799{
800 struct dlm_lock_resource *res;
801 struct list_head *iter, *iter2;
802
803 spin_lock(&dlm->spinlock);
804 list_for_each_safe(iter, iter2, &dlm->reco.resources) {
805 res = list_entry (iter, struct dlm_lock_resource, recovering);
806 if (dlm_is_recovery_lock(res->lockname.name,
807 res->lockname.len))
808 continue;
809 if (res->owner == dead_node) {
810 mlog(0, "found lockres owned by dead node while "
811 "doing recovery for node %u. sending it.\n",
812 dead_node);
813 list_del_init(&res->recovering);
814 list_add_tail(&res->recovering, list);
815 } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
816 mlog(0, "found UNKNOWN owner while doing recovery "
817 "for node %u. sending it.\n", dead_node);
818 list_del_init(&res->recovering);
819 list_add_tail(&res->recovering, list);
820 }
821 }
822 spin_unlock(&dlm->spinlock);
823}
824
825static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
826{
827 int total_locks = 0;
828 struct list_head *iter, *queue = &res->granted;
829 int i;
830
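	/* this relies on granted, converting and blocked being adjacent
	 * list_heads in struct dlm_lock_resource, so queue++ walks all
	 * three queues in order */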
831 for (i=0; i<3; i++) {
832 list_for_each(iter, queue)
833 total_locks++;
834 queue++;
835 }
836 return total_locks;
837}
838
839
840static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
841 struct dlm_migratable_lockres *mres,
842 u8 send_to,
843 struct dlm_lock_resource *res,
844 int total_locks)
845{
846 u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
847 int mres_total_locks = be32_to_cpu(mres->total_locks);
848 int sz, ret = 0, status = 0;
849 u8 orig_flags = mres->flags,
850 orig_master = mres->master;
851
852 BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
853 if (!mres->num_locks)
854 return 0;
855
856 sz = sizeof(struct dlm_migratable_lockres) +
857 (mres->num_locks * sizeof(struct dlm_migratable_lock));
858
859 /* add an all-done flag if we reached the last lock */
860 orig_flags = mres->flags;
861 BUG_ON(total_locks > mres_total_locks);
862 if (total_locks == mres_total_locks)
863 mres->flags |= DLM_MRES_ALL_DONE;
864
865 /* send it */
866 ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
867 sz, send_to, &status);
868 if (ret < 0) {
869 /* XXX: negative status is not handled.
870 * this will end up killing this node. */
871 mlog_errno(ret);
872 } else {
873 /* might get an -ENOMEM back here */
874 ret = status;
875 if (ret < 0) {
876 mlog_errno(ret);
877
878 if (ret == -EFAULT) {
879 mlog(ML_ERROR, "node %u told me to kill "
880 "myself!\n", send_to);
881 BUG();
882 }
883 }
884 }
885
886 /* zero and reinit the message buffer */
887 dlm_init_migratable_lockres(mres, res->lockname.name,
888 res->lockname.len, mres_total_locks,
889 mig_cookie, orig_flags, orig_master);
890 return ret;
891}
892
893static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
894 const char *lockname, int namelen,
895 int total_locks, u64 cookie,
896 u8 flags, u8 master)
897{
898 /* mres here is one full page */
899 memset(mres, 0, PAGE_SIZE);
900 mres->lockname_len = namelen;
901 memcpy(mres->lockname, lockname, namelen);
902 mres->num_locks = 0;
903 mres->total_locks = cpu_to_be32(total_locks);
904 mres->mig_cookie = cpu_to_be64(cookie);
905 mres->flags = flags;
906 mres->master = master;
907}
908
909
910/* returns 1 if this lock fills the network structure,
911 * 0 otherwise */
912static int dlm_add_lock_to_array(struct dlm_lock *lock,
913 struct dlm_migratable_lockres *mres, int queue)
914{
915 struct dlm_migratable_lock *ml;
916 int lock_num = mres->num_locks;
917
918 ml = &(mres->ml[lock_num]);
919 ml->cookie = lock->ml.cookie;
920 ml->type = lock->ml.type;
921 ml->convert_type = lock->ml.convert_type;
922 ml->highest_blocked = lock->ml.highest_blocked;
923 ml->list = queue;
924 if (lock->lksb) {
925 ml->flags = lock->lksb->flags;
926 /* send our current lvb */
927 if (ml->type == LKM_EXMODE ||
928 ml->type == LKM_PRMODE) {
929 /* if it is already set, this had better be a PR
930 * and it has to match */
931 if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
932 memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
933 mlog(ML_ERROR, "mismatched lvbs!\n");
934 __dlm_print_one_lock_resource(lock->lockres);
935 BUG();
936 }
937 memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
938 }
939 }
940 ml->node = lock->ml.node;
941 mres->num_locks++;
942 /* we reached the max, send this network message */
943 if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
944 return 1;
945 return 0;
946}
947
948
949int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
950 struct dlm_migratable_lockres *mres,
951 u8 send_to, u8 flags)
952{
953 struct list_head *queue, *iter;
954 int total_locks, i;
955 u64 mig_cookie = 0;
956 struct dlm_lock *lock;
957 int ret = 0;
958
959 BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
960
961 mlog(0, "sending to %u\n", send_to);
962
963 total_locks = dlm_num_locks_in_lockres(res);
964 if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
965 /* rare, but possible */
966 mlog(0, "argh. lockres has %d locks. this will "
967 "require more than one network packet to "
968 "migrate\n", total_locks);
969 mig_cookie = dlm_get_next_mig_cookie();
970 }
971
972 dlm_init_migratable_lockres(mres, res->lockname.name,
973 res->lockname.len, total_locks,
974 mig_cookie, flags, res->owner);
975
976 total_locks = 0;
977 for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
978 queue = dlm_list_idx_to_ptr(res, i);
979 list_for_each(iter, queue) {
980 lock = list_entry (iter, struct dlm_lock, list);
981
982 /* add another lock. */
983 total_locks++;
984 if (!dlm_add_lock_to_array(lock, mres, i))
985 continue;
986
987 /* this filled the lock message,
988 * we must send it immediately. */
989 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
990 res, total_locks);
991 if (ret < 0) {
992				/* TODO */
993 mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
994 "returned %d, TODO\n", ret);
995 BUG();
996 }
997 }
998 }
999 /* flush any remaining locks */
1000 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
1001 if (ret < 0) {
1002		/* TODO */
1003 mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
1004 "TODO\n", ret);
1005 BUG();
1006 }
1007 return ret;
1008}
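/* note that a lockres with more than DLM_MAX_MIGRATABLE_LOCKS locks is
 * sent as several DLM_MIG_LOCKRES_MSG packets sharing one mig_cookie;
 * only the last packet carries the DLM_MRES_ALL_DONE flag */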
1009
1010
1011
1012/*
1013 * this message will contain no more than one page worth of
1014 * recovery data, and it will work on only one lockres.
1015 * there may be many locks in this page, and we may need to wait
1016 * for additional packets to complete all the locks (rare, but
1017 * possible).
1018 */
1019/*
1020 * NOTE: the allocation error cases here are scary.
1021 * we really cannot afford to fail an alloc in recovery,
1022 * so do we spin? returning an error only delays the problem.
1023 */
1024
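/* one possible answer to "do we spin?" -- sketched purely for
 * illustration, this is not what the code below actually does:
 *
 *	do {
 *		buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
 *		if (!buf)
 *			msleep(100);
 *	} while (!buf);
 *
 * that trades a possible -ENOMEM failure for a stall until memory
 * frees up, which may be the lesser evil during recovery */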
1025int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
1026{
1027 struct dlm_ctxt *dlm = data;
1028 struct dlm_migratable_lockres *mres =
1029 (struct dlm_migratable_lockres *)msg->buf;
1030 int ret = 0;
1031 u8 real_master;
1032 char *buf = NULL;
1033 struct dlm_work_item *item = NULL;
1034 struct dlm_lock_resource *res = NULL;
1035
1036 if (!dlm_grab(dlm))
1037 return -EINVAL;
1038
1039 BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
1040
1041 real_master = mres->master;
1042 if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1043 /* cannot migrate a lockres with no master */
1044 BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
1045 }
1046
1047 mlog(0, "%s message received from node %u\n",
1048 (mres->flags & DLM_MRES_RECOVERY) ?
1049 "recovery" : "migration", mres->master);
1050 if (mres->flags & DLM_MRES_ALL_DONE)
1051 mlog(0, "all done flag. all lockres data received!\n");
1052
1053 ret = -ENOMEM;
1054 buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
1055	item = kzalloc(sizeof(*item), GFP_KERNEL);
1056 if (!buf || !item)
1057 goto leave;
1058
1059 /* lookup the lock to see if we have a secondary queue for this
1060 * already... just add the locks in and this will have its owner
1061 * and RECOVERY flag changed when it completes. */
1062 res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
1063 if (res) {
1064 /* this will get a ref on res */
1065 /* mark it as recovering/migrating and hash it */
1066 spin_lock(&res->spinlock);
1067 if (mres->flags & DLM_MRES_RECOVERY) {
1068 res->state |= DLM_LOCK_RES_RECOVERING;
1069 } else {
1070 if (res->state & DLM_LOCK_RES_MIGRATING) {
1071 /* this is at least the second
1072 * lockres message */
1073 mlog(0, "lock %.*s is already migrating\n",
1074 mres->lockname_len,
1075 mres->lockname);
1076 } else if (res->state & DLM_LOCK_RES_RECOVERING) {
1077 /* caller should BUG */
1078 mlog(ML_ERROR, "node is attempting to migrate "
1079 "lock %.*s, but marked as recovering!\n",
1080 mres->lockname_len, mres->lockname);
1081 ret = -EFAULT;
1082 spin_unlock(&res->spinlock);
1083 goto leave;
1084 }
1085 res->state |= DLM_LOCK_RES_MIGRATING;
1086 }
1087 spin_unlock(&res->spinlock);
1088 } else {
1089 /* need to allocate, just like if it was
1090 * mastered here normally */
1091 res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
1092 if (!res)
1093 goto leave;
1094
1095 /* to match the ref that we would have gotten if
1096 * dlm_lookup_lockres had succeeded */
1097 dlm_lockres_get(res);
1098
1099 /* mark it as recovering/migrating and hash it */
1100 if (mres->flags & DLM_MRES_RECOVERY)
1101 res->state |= DLM_LOCK_RES_RECOVERING;
1102 else
1103 res->state |= DLM_LOCK_RES_MIGRATING;
1104
1105 spin_lock(&dlm->spinlock);
1106 __dlm_insert_lockres(dlm, res);
1107 spin_unlock(&dlm->spinlock);
1108
1109 /* now that the new lockres is inserted,
1110 * make it usable by other processes */
1111 spin_lock(&res->spinlock);
1112 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
1113 spin_unlock(&res->spinlock);
1114
1115 /* add an extra ref for just-allocated lockres
1116 * otherwise the lockres will be purged immediately */
1117 dlm_lockres_get(res);
1118
1119 }
1120
1121 /* at this point we have allocated everything we need,
1122 * and we have a hashed lockres with an extra ref and
1123 * the proper res->state flags. */
1124 ret = 0;
1125 if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1126 /* migration cannot have an unknown master */
1127 BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
1128 mlog(0, "recovery has passed me a lockres with an "
1129 "unknown owner.. will need to requery: "
1130 "%.*s\n", mres->lockname_len, mres->lockname);
1131 } else {
1132 spin_lock(&res->spinlock);
1133 dlm_change_lockres_owner(dlm, res, dlm->node_num);
1134 spin_unlock(&res->spinlock);
1135 }
1136
1137 /* queue up work for dlm_mig_lockres_worker */
1138 dlm_grab(dlm); /* get an extra ref for the work item */
1139 memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */
1140 dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
1141 item->u.ml.lockres = res; /* already have a ref */
1142 item->u.ml.real_master = real_master;
1143 spin_lock(&dlm->work_lock);
1144 list_add_tail(&item->list, &dlm->work_list);
1145 spin_unlock(&dlm->work_lock);
1146 schedule_work(&dlm->dispatched_work);
1147
1148leave:
1149 dlm_put(dlm);
1150 if (ret < 0) {
1151		/* kfree handles NULL, so no need to
1152		 * check buf and item here */
1153		kfree(buf);
1154		kfree(item);
1155 }
1156
1157 mlog_exit(ret);
1158 return ret;
1159}
1160
1161
1162static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
1163{
1164 struct dlm_ctxt *dlm = data;
1165 struct dlm_migratable_lockres *mres;
1166 int ret = 0;
1167 struct dlm_lock_resource *res;
1168 u8 real_master;
1169
1170 dlm = item->dlm;
1171 mres = (struct dlm_migratable_lockres *)data;
1172
1173 res = item->u.ml.lockres;
1174 real_master = item->u.ml.real_master;
1175
1176 if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1177 /* this case is super-rare. only occurs if
1178 * node death happens during migration. */
1179again:
1180 ret = dlm_lockres_master_requery(dlm, res, &real_master);
1181 if (ret < 0) {
1182 mlog(0, "dlm_lockres_master_requery failure: %d\n",
1183 ret);
1184 goto again;
1185 }
1186 if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1187 mlog(0, "lockres %.*s not claimed. "
1188 "this node will take it.\n",
1189 res->lockname.len, res->lockname.name);
1190 } else {
1191 mlog(0, "master needs to respond to sender "
1192 "that node %u still owns %.*s\n",
1193 real_master, res->lockname.len,
1194 res->lockname.name);
1195 /* cannot touch this lockres */
1196 goto leave;
1197 }
1198 }
1199
1200 ret = dlm_process_recovery_data(dlm, res, mres);
1201 if (ret < 0)
1202 mlog(0, "dlm_process_recovery_data returned %d\n", ret);
1203 else
1204 mlog(0, "dlm_process_recovery_data succeeded\n");
1205
1206 if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) ==
1207 (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) {
1208 ret = dlm_finish_migration(dlm, res, mres->master);
1209 if (ret < 0)
1210 mlog_errno(ret);
1211 }
1212
1213leave:
1214 kfree(data);
1215 mlog_exit(ret);
1216}
1217
1218
1219
1220static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
1221 struct dlm_lock_resource *res,
1222 u8 *real_master)
1223{
1224 struct dlm_node_iter iter;
1225 int nodenum;
1226 int ret = 0;
1227
1228 *real_master = DLM_LOCK_RES_OWNER_UNKNOWN;
1229
1230 /* we only reach here if one of the two nodes in a
1231 * migration died while the migration was in progress.
1232 * at this point we need to requery the master. we
1233 * know that the new_master got as far as creating
1234 * an mle on at least one node, but we do not know
1235 * if any nodes had actually cleared the mle and set
1236 * the master to the new_master. the old master
1237 * is supposed to set the owner to UNKNOWN in the
1238 * event of a new_master death, so the only possible
1239 * responses that we can get from nodes here are
1240 * that the master is new_master, or that the master
1241 * is UNKNOWN.
1242 * if all nodes come back with UNKNOWN then we know
1243 * the lock needs remastering here.
1244 * if any node comes back with a valid master, check
1245 * to see if that master is the one that we are
1246 * recovering. if so, then the new_master died and
1247 * we need to remaster this lock. if not, then the
1248 * new_master survived and that node will respond to
1249 * other nodes about the owner.
1250 * if there is an owner, this node needs to dump this
1251 * lockres and alert the sender that this lockres
1252 * was rejected. */
1253 spin_lock(&dlm->spinlock);
1254 dlm_node_iter_init(dlm->domain_map, &iter);
1255 spin_unlock(&dlm->spinlock);
1256
1257 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
1258 /* do not send to self */
1259 if (nodenum == dlm->node_num)
1260 continue;
1261 ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
1262 if (ret < 0) {
1263 mlog_errno(ret);
1264 BUG();
1265 /* TODO: need to figure a way to restart this */
1266 }
1267 if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
1268 mlog(0, "lock master is %u\n", *real_master);
1269 break;
1270 }
1271 }
1272 return ret;
1273}
1274
1275
1276static int dlm_do_master_requery(struct dlm_ctxt *dlm,
1277 struct dlm_lock_resource *res,
1278 u8 nodenum, u8 *real_master)
1279{
1280 int ret = -EINVAL;
1281 struct dlm_master_requery req;
1282 int status = DLM_LOCK_RES_OWNER_UNKNOWN;
1283
1284 memset(&req, 0, sizeof(req));
1285 req.node_idx = dlm->node_num;
1286 req.namelen = res->lockname.len;
1287 memcpy(req.name, res->lockname.name, res->lockname.len);
1288
1289 ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
1290 &req, sizeof(req), nodenum, &status);
1291 /* XXX: negative status not handled properly here. */
1292 if (ret < 0)
1293 mlog_errno(ret);
1294 else {
1295 BUG_ON(status < 0);
1296 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
1297 *real_master = (u8) (status & 0xff);
1298 mlog(0, "node %u responded to master requery with %u\n",
1299 nodenum, *real_master);
1300 ret = 0;
1301 }
1302 return ret;
1303}
1304
1305
1306/* this function cannot error, so unless the sending
1307 * or receiving of the message failed, the owner can
1308 * be trusted */
1309int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
1310{
1311 struct dlm_ctxt *dlm = data;
1312 struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
1313 struct dlm_lock_resource *res = NULL;
1314 int master = DLM_LOCK_RES_OWNER_UNKNOWN;
1315 u32 flags = DLM_ASSERT_MASTER_REQUERY;
1316
1317 if (!dlm_grab(dlm)) {
1318 /* since the domain has gone away on this
1319 * node, the proper response is UNKNOWN */
1320 return master;
1321 }
1322
1323 spin_lock(&dlm->spinlock);
1324 res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
1325 if (res) {
1326 spin_lock(&res->spinlock);
1327 master = res->owner;
1328 if (master == dlm->node_num) {
1329 int ret = dlm_dispatch_assert_master(dlm, res,
1330 0, 0, flags);
1331 if (ret < 0) {
1332 mlog_errno(-ENOMEM);
1333 /* retry!? */
1334 BUG();
1335 }
1336 }
1337 spin_unlock(&res->spinlock);
1338 }
1339 spin_unlock(&dlm->spinlock);
1340
1341 dlm_put(dlm);
1342 return master;
1343}
1344
1345static inline struct list_head *
1346dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
1347{
1348 struct list_head *ret;
1349 BUG_ON(list_num < 0);
1350 BUG_ON(list_num > 2);
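	/* maps the queue index shipped over the wire in ml->list back to
	 * the local granted/converting/blocked queue (0, 1 and 2) */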
1351 ret = &(res->granted);
1352 ret += list_num;
1353 return ret;
1354}
1355/* TODO: do ast flush business
1356 * TODO: do MIGRATING and RECOVERING spinning
1357 */
1358
1359/*
1360* NOTE about in-flight requests during migration:
1361*
1362* Before attempting the migrate, the master has marked the lockres as
1363* MIGRATING and then flushed all of its pending ASTs. So any in-flight
1364* requests either got queued before the MIGRATING flag got set, in which
1365* case the lock data will reflect the change and a return message is on
1366* the way, or the request failed to get in before MIGRATING got set. In
1367* this case, the caller will be told to spin and wait for the MIGRATING
1368* flag to be dropped, then recheck the master.
1369* This holds true for the convert, cancel and unlock cases, and since lvb
1370* updates are tied to these same messages, it applies to lvb updates as
1371* well. For the lock case, there is no way a lock can be on the master
1372* queue and not be on the secondary queue since the lock is always added
1373* locally first. This means that the new target node will never be sent
1374* a lock that he doesn't already have on the list.
1375* In total, this means that the local lock is correct and should not be
1376* updated to match the one sent by the master. Any messages sent back
1377* from the master before the MIGRATING flag will bring the lock properly
1378* up-to-date, and the change will be ordered properly for the waiter.
1379* We will *not* attempt to modify the lock underneath the waiter.
1380*/
1381
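/* as a sketch, the waiter side of the above looks roughly like this,
 * using __dlm_wait_on_lockres_flags() from dlmthread.c further down:
 *
 *	spin_lock(&res->spinlock);
 *	__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_MIGRATING);
 *	spin_unlock(&res->spinlock);
 *
 * once MIGRATING drops, the caller rechecks the master and resends */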
1382static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1383 struct dlm_lock_resource *res,
1384 struct dlm_migratable_lockres *mres)
1385{
1386 struct dlm_migratable_lock *ml;
1387 struct list_head *queue;
1388 struct dlm_lock *newlock = NULL;
1389 struct dlm_lockstatus *lksb = NULL;
1390 int ret = 0;
1391 int i;
1392 struct list_head *iter;
1393 struct dlm_lock *lock = NULL;
1394
1395 mlog(0, "running %d locks for this lockres\n", mres->num_locks);
1396 for (i=0; i<mres->num_locks; i++) {
1397 ml = &(mres->ml[i]);
1398 BUG_ON(ml->highest_blocked != LKM_IVMODE);
1399 newlock = NULL;
1400 lksb = NULL;
1401
1402 queue = dlm_list_num_to_pointer(res, ml->list);
1403
1404 /* if the lock is for the local node it needs to
1405 * be moved to the proper location within the queue.
1406 * do not allocate a new lock structure. */
1407 if (ml->node == dlm->node_num) {
1408 /* MIGRATION ONLY! */
1409 BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
1410
1411 spin_lock(&res->spinlock);
1412 list_for_each(iter, queue) {
1413 lock = list_entry (iter, struct dlm_lock, list);
1414 if (lock->ml.cookie != ml->cookie)
1415 lock = NULL;
1416 else
1417 break;
1418 }
1419
1420 /* lock is always created locally first, and
1421 * destroyed locally last. it must be on the list */
1422 if (!lock) {
1423 mlog(ML_ERROR, "could not find local lock "
1424 "with cookie %"MLFu64"!\n",
1425 ml->cookie);
1426 BUG();
1427 }
1428 BUG_ON(lock->ml.node != ml->node);
1429
1430 /* see NOTE above about why we do not update
1431 * to match the master here */
1432
1433 /* move the lock to its proper place */
1434 /* do not alter lock refcount. switching lists. */
1435 list_del_init(&lock->list);
1436 list_add_tail(&lock->list, queue);
1437 spin_unlock(&res->spinlock);
1438
1439 mlog(0, "just reordered a local lock!\n");
1440 continue;
1441 }
1442
1443 /* lock is for another node. */
1444 newlock = dlm_new_lock(ml->type, ml->node,
1445 be64_to_cpu(ml->cookie), NULL);
1446 if (!newlock) {
1447 ret = -ENOMEM;
1448 goto leave;
1449 }
1450 lksb = newlock->lksb;
1451 dlm_lock_attach_lockres(newlock, res);
1452
1453 if (ml->convert_type != LKM_IVMODE) {
1454 BUG_ON(queue != &res->converting);
1455 newlock->ml.convert_type = ml->convert_type;
1456 }
1457 lksb->flags |= (ml->flags &
1458 (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
1459
1460 if (mres->lvb[0]) {
1461 if (lksb->flags & DLM_LKSB_PUT_LVB) {
1462 /* other node was trying to update
1463 * lvb when node died. recreate the
1464 * lksb with the updated lvb. */
1465 memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
1466 } else {
1467 /* otherwise, the node is sending its
1468 * most recent valid lvb info */
1469 BUG_ON(ml->type != LKM_EXMODE &&
1470 ml->type != LKM_PRMODE);
1471 if (res->lvb[0] && (ml->type == LKM_EXMODE ||
1472 memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
1473 mlog(ML_ERROR, "received bad lvb!\n");
1474 __dlm_print_one_lock_resource(res);
1475 BUG();
1476 }
1477 memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
1478 }
1479 }
1480
1481
1482 /* NOTE:
1483 * wrt lock queue ordering and recovery:
1484 * 1. order of locks on granted queue is
1485 * meaningless.
1486 * 2. order of locks on converting queue is
1487 * LOST with the node death. sorry charlie.
1488 * 3. order of locks on the blocked queue is
1489 * also LOST.
1490 * order of locks does not affect integrity, it
1491 * just means that a lock request may get pushed
1492 * back in line as a result of the node death.
1493 * also note that for a given node the lock order
1494 * for its secondary queue locks is preserved
1495 * relative to each other, but clearly *not*
1496 * preserved relative to locks from other nodes.
1497 */
1498 spin_lock(&res->spinlock);
1499 dlm_lock_get(newlock);
1500 list_add_tail(&newlock->list, queue);
1501 spin_unlock(&res->spinlock);
1502 }
1503 mlog(0, "done running all the locks\n");
1504
1505leave:
1506 if (ret < 0) {
1507 mlog_errno(ret);
1508 if (newlock)
1509 dlm_lock_put(newlock);
1510 }
1511
1512 mlog_exit(ret);
1513 return ret;
1514}
1515
1516void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1517 struct dlm_lock_resource *res)
1518{
1519 int i;
1520 struct list_head *queue, *iter, *iter2;
1521 struct dlm_lock *lock;
1522
1523 res->state |= DLM_LOCK_RES_RECOVERING;
1524 if (!list_empty(&res->recovering))
1525 list_del_init(&res->recovering);
1526 list_add_tail(&res->recovering, &dlm->reco.resources);
1527
1528 /* find any pending locks and put them back on proper list */
1529 for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
1530 queue = dlm_list_idx_to_ptr(res, i);
1531 list_for_each_safe(iter, iter2, queue) {
1532 lock = list_entry (iter, struct dlm_lock, list);
1533 dlm_lock_get(lock);
1534 if (lock->convert_pending) {
1535 /* move converting lock back to granted */
1536 BUG_ON(i != DLM_CONVERTING_LIST);
1537 mlog(0, "node died with convert pending "
1538 "on %.*s. move back to granted list.\n",
1539 res->lockname.len, res->lockname.name);
1540 dlm_revert_pending_convert(res, lock);
1541 lock->convert_pending = 0;
1542 } else if (lock->lock_pending) {
1543 /* remove pending lock requests completely */
1544 BUG_ON(i != DLM_BLOCKED_LIST);
1545 mlog(0, "node died with lock pending "
1546 "on %.*s. remove from blocked list and skip.\n",
1547 res->lockname.len, res->lockname.name);
1548 /* lock will be floating until ref in
1549 * dlmlock_remote is freed after the network
1550 * call returns. ok for it to not be on any
1551 * list since no ast can be called
1552 * (the master is dead). */
1553 dlm_revert_pending_lock(res, lock);
1554 lock->lock_pending = 0;
1555 } else if (lock->unlock_pending) {
1556 /* if an unlock was in progress, treat as
1557 * if this had completed successfully
1558 * before sending this lock state to the
1559 * new master. note that the dlm_unlock
1560 * call is still responsible for calling
1561 * the unlockast. that will happen after
1562 * the network call times out. for now,
1563 * just move lists to prepare the new
1564 * recovery master. */
1565 BUG_ON(i != DLM_GRANTED_LIST);
1566 mlog(0, "node died with unlock pending "
1567 "on %.*s. remove from blocked list and skip.\n",
1568 res->lockname.len, res->lockname.name);
1569 dlm_commit_pending_unlock(res, lock);
1570 lock->unlock_pending = 0;
1571 } else if (lock->cancel_pending) {
1572 /* if a cancel was in progress, treat as
1573 * if this had completed successfully
1574 * before sending this lock state to the
1575 * new master */
1576 BUG_ON(i != DLM_CONVERTING_LIST);
1577 mlog(0, "node died with cancel pending "
1578 "on %.*s. move back to granted list.\n",
1579 res->lockname.len, res->lockname.name);
1580 dlm_commit_pending_cancel(res, lock);
1581 lock->cancel_pending = 0;
1582 }
1583 dlm_lock_put(lock);
1584 }
1585 }
1586}
1587
1588
1589
1590/* removes all recovered locks from the recovery list.
1591 * sets the res->owner to the new master.
1592 * unsets the RECOVERY flag and wakes waiters. */
1593static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
1594 u8 dead_node, u8 new_master)
1595{
1596 int i;
1597 struct list_head *iter, *iter2, *bucket;
1598 struct dlm_lock_resource *res;
1599
1600 mlog_entry_void();
1601
1602 assert_spin_locked(&dlm->spinlock);
1603
1604 list_for_each_safe(iter, iter2, &dlm->reco.resources) {
1605 res = list_entry (iter, struct dlm_lock_resource, recovering);
1606 if (res->owner == dead_node) {
1607 list_del_init(&res->recovering);
1608 spin_lock(&res->spinlock);
1609 dlm_change_lockres_owner(dlm, res, new_master);
1610 res->state &= ~DLM_LOCK_RES_RECOVERING;
1611 __dlm_dirty_lockres(dlm, res);
1612 spin_unlock(&res->spinlock);
1613 wake_up(&res->wq);
1614 }
1615 }
1616
1617 /* this will become unnecessary eventually, but
1618 * for now we need to run the whole hash, clear
1619 * the RECOVERING state and set the owner
1620 * if necessary */
1621 for (i=0; i<DLM_HASH_SIZE; i++) {
1622 bucket = &(dlm->resources[i]);
1623 list_for_each(iter, bucket) {
1624 res = list_entry (iter, struct dlm_lock_resource, list);
1625 if (res->state & DLM_LOCK_RES_RECOVERING) {
1626 if (res->owner == dead_node) {
1627 mlog(0, "(this=%u) res %.*s owner=%u "
1628 "was not on recovering list, but "
1629 "clearing state anyway\n",
1630 dlm->node_num, res->lockname.len,
1631 res->lockname.name, new_master);
1632 } else if (res->owner == dlm->node_num) {
1633 mlog(0, "(this=%u) res %.*s owner=%u "
1634 "was not on recovering list, "
1635 "owner is THIS node, clearing\n",
1636 dlm->node_num, res->lockname.len,
1637 res->lockname.name, new_master);
1638 } else
1639 continue;
1640
1641 spin_lock(&res->spinlock);
1642 dlm_change_lockres_owner(dlm, res, new_master);
1643 res->state &= ~DLM_LOCK_RES_RECOVERING;
1644 __dlm_dirty_lockres(dlm, res);
1645 spin_unlock(&res->spinlock);
1646 wake_up(&res->wq);
1647 }
1648 }
1649 }
1650}
1651
1652static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
1653{
1654 if (local) {
1655 if (lock->ml.type != LKM_EXMODE &&
1656 lock->ml.type != LKM_PRMODE)
1657 return 1;
1658 } else if (lock->ml.type == LKM_EXMODE)
1659 return 1;
1660 return 0;
1661}
1662
1663static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
1664 struct dlm_lock_resource *res, u8 dead_node)
1665{
1666 struct list_head *iter, *queue;
1667 struct dlm_lock *lock;
1668 int blank_lvb = 0, local = 0;
1669 int i;
1670 u8 search_node;
1671
1672 assert_spin_locked(&dlm->spinlock);
1673 assert_spin_locked(&res->spinlock);
1674
1675 if (res->owner == dlm->node_num)
1676 /* if this node owned the lockres, and if the dead node
1677 * had an EX when he died, blank out the lvb */
1678 search_node = dead_node;
1679 else {
1680 /* if this is a secondary lockres, and we had no EX or PR
1681 * locks granted, we can no longer trust the lvb */
1682 search_node = dlm->node_num;
1683 local = 1; /* check local state for valid lvb */
1684 }
1685
1686 for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
1687 queue = dlm_list_idx_to_ptr(res, i);
1688 list_for_each(iter, queue) {
1689 lock = list_entry (iter, struct dlm_lock, list);
1690 if (lock->ml.node == search_node) {
1691 if (dlm_lvb_needs_invalidation(lock, local)) {
1692 /* zero the lksb lvb and lockres lvb */
1693 blank_lvb = 1;
1694 memset(lock->lksb->lvb, 0, DLM_LVB_LEN);
1695 }
1696 }
1697 }
1698 }
1699
1700 if (blank_lvb) {
1701 mlog(0, "clearing %.*s lvb, dead node %u had EX\n",
1702 res->lockname.len, res->lockname.name, dead_node);
1703 memset(res->lvb, 0, DLM_LVB_LEN);
1704 }
1705}
1706
1707static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
1708 struct dlm_lock_resource *res, u8 dead_node)
1709{
1710 struct list_head *iter, *tmpiter;
1711 struct dlm_lock *lock;
1712
1713 /* this node is the lockres master:
1714 * 1) remove any stale locks for the dead node
1715 * 2) if the dead node had an EX when he died, blank out the lvb
1716 */
1717 assert_spin_locked(&dlm->spinlock);
1718 assert_spin_locked(&res->spinlock);
1719
1720 /* TODO: check pending_asts, pending_basts here */
1721 list_for_each_safe(iter, tmpiter, &res->granted) {
1722 lock = list_entry (iter, struct dlm_lock, list);
1723 if (lock->ml.node == dead_node) {
1724 list_del_init(&lock->list);
1725 dlm_lock_put(lock);
1726 }
1727 }
1728 list_for_each_safe(iter, tmpiter, &res->converting) {
1729 lock = list_entry (iter, struct dlm_lock, list);
1730 if (lock->ml.node == dead_node) {
1731 list_del_init(&lock->list);
1732 dlm_lock_put(lock);
1733 }
1734 }
1735 list_for_each_safe(iter, tmpiter, &res->blocked) {
1736 lock = list_entry (iter, struct dlm_lock, list);
1737 if (lock->ml.node == dead_node) {
1738 list_del_init(&lock->list);
1739 dlm_lock_put(lock);
1740 }
1741 }
1742
1743 /* do not kick thread yet */
1744 __dlm_dirty_lockres(dlm, res);
1745}
1746
1747/* if this node is the recovery master, and there are no
1748 * locks for a given lockres owned by this node that are in
1749 * either PR or EX mode, zero out the lvb before requesting.
1750 *
1751 */
1752
1753
1754static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
1755{
1756 struct list_head *iter;
1757 struct dlm_lock_resource *res;
1758 int i;
1759 struct list_head *bucket;
1760
1761
1762 /* purge any stale mles */
1763 dlm_clean_master_list(dlm, dead_node);
1764
1765 /*
1766 * now clean up all lock resources. there are two rules:
1767 *
1768 * 1) if the dead node was the master, move the lockres
1769 * to the recovering list. set the RECOVERING flag.
1770 * this lockres needs to be cleaned up before it can
1771 * be used further.
1772 *
1773 * 2) if this node was the master, remove all locks from
1774 * each of the lockres queues that were owned by the
1775 * dead node. once recovery finishes, the dlm thread
1776 * can be kicked again to see if any ASTs or BASTs
1777 * need to be fired as a result.
1778 */
1779 for (i=0; i<DLM_HASH_SIZE; i++) {
1780 bucket = &(dlm->resources[i]);
1781 list_for_each(iter, bucket) {
1782 res = list_entry (iter, struct dlm_lock_resource, list);
1783 if (dlm_is_recovery_lock(res->lockname.name,
1784 res->lockname.len))
1785 continue;
1786
1787 spin_lock(&res->spinlock);
1788 /* zero the lvb if necessary */
1789 dlm_revalidate_lvb(dlm, res, dead_node);
1790 if (res->owner == dead_node)
1791 dlm_move_lockres_to_recovery_list(dlm, res);
1792 else if (res->owner == dlm->node_num) {
1793 dlm_free_dead_locks(dlm, res, dead_node);
1794 __dlm_lockres_calc_usage(dlm, res);
1795 }
1796 spin_unlock(&res->spinlock);
1797 }
1798 }
1799
1800}
1801
1802static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
1803{
1804 assert_spin_locked(&dlm->spinlock);
1805
1806 /* check to see if the node is already considered dead */
1807 if (!test_bit(idx, dlm->live_nodes_map)) {
1808 mlog(0, "for domain %s, node %d is already dead. "
1809 "another node likely did recovery already.\n",
1810 dlm->name, idx);
1811 return;
1812 }
1813
1814 /* check to see if we do not care about this node */
1815 if (!test_bit(idx, dlm->domain_map)) {
1816 /* This also catches the case that we get a node down
1817 * but haven't joined the domain yet. */
1818 mlog(0, "node %u already removed from domain!\n", idx);
1819 return;
1820 }
1821
1822 clear_bit(idx, dlm->live_nodes_map);
1823
1824 /* Clean up join state on node death. */
1825 if (dlm->joining_node == idx) {
1826 mlog(0, "Clearing join state for node %u\n", idx);
1827 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
1828 }
1829
1830 /* make sure local cleanup occurs before the heartbeat events */
1831 if (!test_bit(idx, dlm->recovery_map))
1832 dlm_do_local_recovery_cleanup(dlm, idx);
1833
1834 /* notify anything attached to the heartbeat events */
1835 dlm_hb_event_notify_attached(dlm, idx, 0);
1836
1837 mlog(0, "node %u being removed from domain map!\n", idx);
1838 clear_bit(idx, dlm->domain_map);
1839 /* wake up migration waiters if a node goes down.
1840 * perhaps later we can genericize this for other waiters. */
1841 wake_up(&dlm->migration_wq);
1842
1843 if (test_bit(idx, dlm->recovery_map))
1844 mlog(0, "domain %s, node %u already added "
1845 "to recovery map!\n", dlm->name, idx);
1846 else
1847 set_bit(idx, dlm->recovery_map);
1848}
1849
1850void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
1851{
1852 struct dlm_ctxt *dlm = data;
1853
1854 if (!dlm_grab(dlm))
1855 return;
1856
1857 spin_lock(&dlm->spinlock);
1858 __dlm_hb_node_down(dlm, idx);
1859 spin_unlock(&dlm->spinlock);
1860
1861 dlm_put(dlm);
1862}
1863
1864void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
1865{
1866 struct dlm_ctxt *dlm = data;
1867
1868 if (!dlm_grab(dlm))
1869 return;
1870
1871 spin_lock(&dlm->spinlock);
1872
1873 set_bit(idx, dlm->live_nodes_map);
1874
1875 /* notify any mles attached to the heartbeat events */
1876 dlm_hb_event_notify_attached(dlm, idx, 1);
1877
1878 spin_unlock(&dlm->spinlock);
1879
1880 dlm_put(dlm);
1881}
1882
1883static void dlm_reco_ast(void *astdata)
1884{
1885 struct dlm_ctxt *dlm = astdata;
1886	mlog(0, "ast for recovery lock fired! this=%u, dlm=%s\n",
1887 dlm->node_num, dlm->name);
1888}
1889static void dlm_reco_bast(void *astdata, int blocked_type)
1890{
1891 struct dlm_ctxt *dlm = astdata;
1892	mlog(0, "bast for recovery lock fired! this=%u, dlm=%s\n",
1893 dlm->node_num, dlm->name);
1894}
1895static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
1896{
1897 mlog(0, "unlockast for recovery lock fired!\n");
1898}
1899
1900
1901static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
1902{
1903 enum dlm_status ret;
1904 struct dlm_lockstatus lksb;
1905 int status = -EINVAL;
1906
1907 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
1908 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
1909retry:
1910 memset(&lksb, 0, sizeof(lksb));
1911
1912 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
1913 DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
1914
1915 if (ret == DLM_NORMAL) {
1916 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
1917 dlm->name, dlm->node_num);
1918 /* I am master, send message to all nodes saying
1919 * that I am beginning a recovery session */
1920 status = dlm_send_begin_reco_message(dlm,
1921 dlm->reco.dead_node);
1922
1923 /* recovery lock is a special case. ast will not get fired,
1924 * so just go ahead and unlock it. */
1925 ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
1926 if (ret != DLM_NORMAL) {
1927 /* this would really suck. this could only happen
1928 * if there was a network error during the unlock
1929 * because of node death. this means the unlock
1930 * is actually "done" and the lock structure is
1931 * even freed. we can continue, but only
1932 * because this specific lock name is special. */
1933 mlog(0, "dlmunlock returned %d\n", ret);
1934 }
1935
1936 if (status < 0) {
1937 mlog(0, "failed to send recovery message. "
1938 "must retry with new node map.\n");
1939 goto retry;
1940 }
1941 } else if (ret == DLM_NOTQUEUED) {
1942 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
1943 dlm->name, dlm->node_num);
1944 /* another node is master. wait on
1945 * reco.new_master != O2NM_INVALID_NODE_NUM */
1946 status = -EEXIST;
1947 }
1948
1949 return status;
1950}
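/* note: since the recovery lock is taken LKM_NOQUEUE, exactly one node
 * wins this race; everyone else gets DLM_NOTQUEUED above and waits for
 * reco.new_master to be filled in by the begin-reco message */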
1951
1952static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
1953{
1954 struct dlm_begin_reco br;
1955 int ret = 0;
1956 struct dlm_node_iter iter;
1957 int nodenum;
1958 int status;
1959
1960 mlog_entry("%u\n", dead_node);
1961
1962 mlog(0, "dead node is %u\n", dead_node);
1963
1964 spin_lock(&dlm->spinlock);
1965 dlm_node_iter_init(dlm->domain_map, &iter);
1966 spin_unlock(&dlm->spinlock);
1967
1968 clear_bit(dead_node, iter.node_map);
1969
1970 memset(&br, 0, sizeof(br));
1971 br.node_idx = dlm->node_num;
1972 br.dead_node = dead_node;
1973
1974 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
1975 ret = 0;
1976 if (nodenum == dead_node) {
1977 mlog(0, "not sending begin reco to dead node "
1978 "%u\n", dead_node);
1979 continue;
1980 }
1981 if (nodenum == dlm->node_num) {
1982 mlog(0, "not sending begin reco to self\n");
1983 continue;
1984 }
1985
1986 ret = -EINVAL;
1987 mlog(0, "attempting to send begin reco msg to %d\n",
1988 nodenum);
1989 ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
1990 &br, sizeof(br), nodenum, &status);
1991 /* negative status is handled ok by caller here */
1992 if (ret >= 0)
1993 ret = status;
1994 if (ret < 0) {
1995 struct dlm_lock_resource *res;
1996 mlog_errno(ret);
1997 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
1998			     "returned %d\n", dlm->name, nodenum, ret);
1999 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2000 DLM_RECOVERY_LOCK_NAME_LEN);
2001 if (res) {
2002 dlm_print_one_lock_resource(res);
2003 dlm_lockres_put(res);
2004 } else {
2005 mlog(ML_ERROR, "recovery lock not found\n");
2006 }
2007 break;
2008 }
2009 }
2010
2011 return ret;
2012}
2013
2014int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2015{
2016 struct dlm_ctxt *dlm = data;
2017 struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
2018
2019 /* ok to return 0, domain has gone away */
2020 if (!dlm_grab(dlm))
2021 return 0;
2022
2023 mlog(0, "node %u wants to recover node %u\n",
2024 br->node_idx, br->dead_node);
2025
2026 dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
2027
2028 spin_lock(&dlm->spinlock);
2029 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2030 mlog(0, "new_master already set to %u!\n",
2031 dlm->reco.new_master);
2032 }
2033 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
2034 mlog(0, "dead_node already set to %u!\n",
2035 dlm->reco.dead_node);
2036 }
2037 dlm->reco.new_master = br->node_idx;
2038 dlm->reco.dead_node = br->dead_node;
2039 if (!test_bit(br->dead_node, dlm->recovery_map)) {
2040 mlog(ML_ERROR, "recovery master %u sees %u as dead, but this "
2041 "node has not yet. marking %u as dead\n",
2042 br->node_idx, br->dead_node, br->dead_node);
2043 __dlm_hb_node_down(dlm, br->dead_node);
2044 }
2045 spin_unlock(&dlm->spinlock);
2046
2047 dlm_kick_recovery_thread(dlm);
2048 dlm_put(dlm);
2049 return 0;
2050}
2051
2052static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
2053{
2054 int ret = 0;
2055 struct dlm_finalize_reco fr;
2056 struct dlm_node_iter iter;
2057 int nodenum;
2058 int status;
2059
2060 mlog(0, "finishing recovery for node %s:%u\n",
2061 dlm->name, dlm->reco.dead_node);
2062
2063 spin_lock(&dlm->spinlock);
2064 dlm_node_iter_init(dlm->domain_map, &iter);
2065 spin_unlock(&dlm->spinlock);
2066
2067 memset(&fr, 0, sizeof(fr));
2068 fr.node_idx = dlm->node_num;
2069 fr.dead_node = dlm->reco.dead_node;
2070
2071 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
2072 if (nodenum == dlm->node_num)
2073 continue;
2074 ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
2075 &fr, sizeof(fr), nodenum, &status);
2076 if (ret >= 0) {
2077 ret = status;
2078 if (dlm_is_host_down(ret)) {
2079 /* this has no effect on this recovery
2080 * session, so set the status to zero to
2081 * finish out the last recovery */
2082 mlog(ML_ERROR, "node %u went down after this "
2083 "node finished recovery.\n", nodenum);
2084 ret = 0;
2085 }
2086 }
2087 if (ret < 0) {
2088 mlog_errno(ret);
2089 break;
2090 }
2091 }
2092
2093 return ret;
2094}
2095
2096int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2097{
2098 struct dlm_ctxt *dlm = data;
2099 struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
2100
2101 /* ok to return 0, domain has gone away */
2102 if (!dlm_grab(dlm))
2103 return 0;
2104
2105 mlog(0, "node %u finalizing recovery of node %u\n",
2106 fr->node_idx, fr->dead_node);
2107
2108 spin_lock(&dlm->spinlock);
2109
2110 if (dlm->reco.new_master != fr->node_idx) {
2111 mlog(ML_ERROR, "node %u sent recovery finalize msg, but node "
2112 "%u is supposed to be the new master, dead=%u\n",
2113 fr->node_idx, dlm->reco.new_master, fr->dead_node);
2114 BUG();
2115 }
2116 if (dlm->reco.dead_node != fr->dead_node) {
2117 mlog(ML_ERROR, "node %u sent recovery finalize msg for dead "
2118 "node %u, but node %u is supposed to be dead\n",
2119 fr->node_idx, fr->dead_node, dlm->reco.dead_node);
2120 BUG();
2121 }
2122
2123 dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
2124
2125 spin_unlock(&dlm->spinlock);
2126
2127 dlm_reset_recovery(dlm);
2128
2129 dlm_kick_recovery_thread(dlm);
2130 dlm_put(dlm);
2131 return 0;
2132}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
new file mode 100644
index 00000000000..5be9d14f12c
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -0,0 +1,692 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmthread.c
5 *
6 * standalone DLM module
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/timer.h>
41#include <linux/kthread.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50#include "dlmdomain.h"
51
52#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
53#include "cluster/masklog.h"
54
55static int dlm_thread(void *data);
56
57static void dlm_flush_asts(struct dlm_ctxt *dlm);
58
59#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num)
60
61/* will exit holding res->spinlock, but may drop in function */
62/* waits until flags are cleared on res->state */
63void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
64{
65 DECLARE_WAITQUEUE(wait, current);
66
67 assert_spin_locked(&res->spinlock);
68
69 add_wait_queue(&res->wq, &wait);
70repeat:
71 set_current_state(TASK_UNINTERRUPTIBLE);
72 if (res->state & flags) {
73 spin_unlock(&res->spinlock);
74 schedule();
75 spin_lock(&res->spinlock);
76 goto repeat;
77 }
78 remove_wait_queue(&res->wq, &wait);
79	__set_current_state(TASK_RUNNING);
80}
81
82
83static int __dlm_lockres_unused(struct dlm_lock_resource *res)
84{
85 if (list_empty(&res->granted) &&
86 list_empty(&res->converting) &&
87 list_empty(&res->blocked) &&
88 list_empty(&res->dirty))
89 return 1;
90 return 0;
91}
92
93
94/* Call whenever you may have added or deleted something from one of
95 * the lockres queues. This will figure out whether it belongs on the
96 * unused list or not and does the appropriate thing. */
97void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
98 struct dlm_lock_resource *res)
99{
100 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
101
102 assert_spin_locked(&dlm->spinlock);
103 assert_spin_locked(&res->spinlock);
104
105	if (__dlm_lockres_unused(res)) {
106 if (list_empty(&res->purge)) {
107			mlog(0, "putting lockres %.*s on purge list\n",
108 res->lockname.len, res->lockname.name);
109
110 res->last_used = jiffies;
111 list_add_tail(&res->purge, &dlm->purge_list);
112 dlm->purge_count++;
113 }
114 } else if (!list_empty(&res->purge)) {
115 mlog(0, "removing lockres %.*s from purge list\n",
116 res->lockname.len, res->lockname.name);
117
118 list_del_init(&res->purge);
119 dlm->purge_count--;
120 }
121}
122
123void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
124 struct dlm_lock_resource *res)
125{
126 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
127 spin_lock(&dlm->spinlock);
128 spin_lock(&res->spinlock);
129
130 __dlm_lockres_calc_usage(dlm, res);
131
132 spin_unlock(&res->spinlock);
133 spin_unlock(&dlm->spinlock);
134}
135
136/* TODO: Eventual API: Called with the dlm spinlock held, may drop it
137 * to do migration, but will re-acquire before exit. */
138void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
139{
140 int master;
141 int ret;
142
143 spin_lock(&lockres->spinlock);
144 master = lockres->owner == dlm->node_num;
145 spin_unlock(&lockres->spinlock);
146
147 mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len,
148 lockres->lockname.name, master);
149
150 /* Non master is the easy case -- no migration required, just
151 * quit. */
152 if (!master)
153 goto finish;
154
155 /* Wheee! Migrate lockres here! */
156 spin_unlock(&dlm->spinlock);
157again:
158
159 ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
160 if (ret == -ENOTEMPTY) {
161 mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
162 lockres->lockname.len, lockres->lockname.name);
163
164 BUG();
165 } else if (ret < 0) {
166 mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
167 lockres->lockname.len, lockres->lockname.name);
168 goto again;
169 }
170
171 spin_lock(&dlm->spinlock);
172
173finish:
174 if (!list_empty(&lockres->purge)) {
175 list_del_init(&lockres->purge);
176 dlm->purge_count--;
177 }
178 __dlm_unhash_lockres(lockres);
179}
180
181static void dlm_run_purge_list(struct dlm_ctxt *dlm,
182 int purge_now)
183{
184 unsigned int run_max, unused;
185 unsigned long purge_jiffies;
186 struct dlm_lock_resource *lockres;
187
188 spin_lock(&dlm->spinlock);
189 run_max = dlm->purge_count;
190
191 while(run_max && !list_empty(&dlm->purge_list)) {
192 run_max--;
193
194 lockres = list_entry(dlm->purge_list.next,
195 struct dlm_lock_resource, purge);
196
197 /* Status of the lockres *might* change so double
198 * check. If the lockres is unused, holding the dlm
199 * spinlock will prevent people from getting any more
200 * refs on it -- there's no need to keep the lockres
201 * spinlock. */
202 spin_lock(&lockres->spinlock);
203 unused = __dlm_lockres_unused(lockres);
204 spin_unlock(&lockres->spinlock);
205
206 if (!unused)
207 continue;
208
209 purge_jiffies = lockres->last_used +
210 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
211
212 /* Make sure that we want to be processing this guy at
213 * this time. */
214 if (!purge_now && time_after(purge_jiffies, jiffies)) {
215 /* Since resources are added to the purge list
216 * in tail order, we can stop at the first
217 * unpurgeable resource -- anything added after
218 * it will have a greater last_used value */
219 break;
220 }
221
222 list_del_init(&lockres->purge);
223 dlm->purge_count--;
224
225 /* This may drop and reacquire the dlm spinlock if it
226 * has to do migration. */
227 mlog(0, "calling dlm_purge_lockres!\n");
228 dlm_purge_lockres(dlm, lockres);
229 mlog(0, "DONE calling dlm_purge_lockres!\n");
230
231 /* Avoid adding any scheduling latencies */
232 cond_resched_lock(&dlm->spinlock);
233 }
234
235 spin_unlock(&dlm->spinlock);
236}
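/* Purge aging in the loop above: a lockres must sit unused for at least
 * DLM_PURGE_INTERVAL_MS past its last_used stamp before it is purged,
 * unless purge_now is set (e.g. at shutdown). Because entries are added
 * at the tail in last_used order, the scan can stop at the first entry
 * that is still too young. */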
237
238static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
239 struct dlm_lock_resource *res)
240{
241 struct dlm_lock *lock, *target;
242 struct list_head *iter;
243 struct list_head *head;
244 int can_grant = 1;
245
246 //mlog(0, "res->lockname.len=%d\n", res->lockname.len);
247 //mlog(0, "res->lockname.name=%p\n", res->lockname.name);
248 //mlog(0, "shuffle res %.*s\n", res->lockname.len,
249 // res->lockname.name);
250
251 /* because this function is called with the lockres
252 * spinlock, and because we know that it is not migrating/
253 * recovering/in-progress, it is fine to reserve asts and
254 * basts right before queueing them all throughout */
255 assert_spin_locked(&res->spinlock);
256 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
257 DLM_LOCK_RES_RECOVERING|
258 DLM_LOCK_RES_IN_PROGRESS)));
259
260converting:
261 if (list_empty(&res->converting))
262 goto blocked;
263 mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
264 res->lockname.name);
265
266 target = list_entry(res->converting.next, struct dlm_lock, list);
267 if (target->ml.convert_type == LKM_IVMODE) {
268 mlog(ML_ERROR, "%.*s: converting a lock with no "
269 "convert_type!\n", res->lockname.len, res->lockname.name);
270 BUG();
271 }
272 head = &res->granted;
273 list_for_each(iter, head) {
274 lock = list_entry(iter, struct dlm_lock, list);
275 if (lock == target)
276 continue;
277 if (!dlm_lock_compatible(lock->ml.type,
278 target->ml.convert_type)) {
279 can_grant = 0;
280 /* queue the BAST if not already */
281 if (lock->ml.highest_blocked == LKM_IVMODE) {
282 __dlm_lockres_reserve_ast(res);
283 dlm_queue_bast(dlm, lock);
284 }
285 /* update the highest_blocked if needed */
286 if (lock->ml.highest_blocked < target->ml.convert_type)
287 lock->ml.highest_blocked =
288 target->ml.convert_type;
289 }
290 }
291 head = &res->converting;
292 list_for_each(iter, head) {
293 lock = list_entry(iter, struct dlm_lock, list);
294 if (lock == target)
295 continue;
296 if (!dlm_lock_compatible(lock->ml.type,
297 target->ml.convert_type)) {
298 can_grant = 0;
299 if (lock->ml.highest_blocked == LKM_IVMODE) {
300 __dlm_lockres_reserve_ast(res);
301 dlm_queue_bast(dlm, lock);
302 }
303 if (lock->ml.highest_blocked < target->ml.convert_type)
304 lock->ml.highest_blocked =
305 target->ml.convert_type;
306 }
307 }
308
309 /* we can convert the lock */
310 if (can_grant) {
311 spin_lock(&target->spinlock);
312 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
313
314 mlog(0, "calling ast for converting lock: %.*s, have: %d, "
315 "granting: %d, node: %u\n", res->lockname.len,
316 res->lockname.name, target->ml.type,
317 target->ml.convert_type, target->ml.node);
318
319 target->ml.type = target->ml.convert_type;
320 target->ml.convert_type = LKM_IVMODE;
321 list_del_init(&target->list);
322 list_add_tail(&target->list, &res->granted);
323
324 BUG_ON(!target->lksb);
325 target->lksb->status = DLM_NORMAL;
326
327 spin_unlock(&target->spinlock);
328
329 __dlm_lockres_reserve_ast(res);
330 dlm_queue_ast(dlm, target);
331 /* go back and check for more */
332 goto converting;
333 }
334
335blocked:
336 if (list_empty(&res->blocked))
337 goto leave;
338 target = list_entry(res->blocked.next, struct dlm_lock, list);
339
340 head = &res->granted;
341 list_for_each(iter, head) {
342 lock = list_entry(iter, struct dlm_lock, list);
343 if (lock == target)
344 continue;
345 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
346 can_grant = 0;
347 if (lock->ml.highest_blocked == LKM_IVMODE) {
348 __dlm_lockres_reserve_ast(res);
349 dlm_queue_bast(dlm, lock);
350 }
351 if (lock->ml.highest_blocked < target->ml.type)
352 lock->ml.highest_blocked = target->ml.type;
353 }
354 }
355
356 head = &res->converting;
357 list_for_each(iter, head) {
358 lock = list_entry(iter, struct dlm_lock, list);
359 if (lock == target)
360 continue;
361 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
362 can_grant = 0;
363 if (lock->ml.highest_blocked == LKM_IVMODE) {
364 __dlm_lockres_reserve_ast(res);
365 dlm_queue_bast(dlm, lock);
366 }
367 if (lock->ml.highest_blocked < target->ml.type)
368 lock->ml.highest_blocked = target->ml.type;
369 }
370 }
371
372 /* we can grant the blocked lock (only
373 * possible if converting list empty) */
374 if (can_grant) {
375 spin_lock(&target->spinlock);
376 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
377
378 mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
379 "node: %u\n", res->lockname.len, res->lockname.name,
380 target->ml.type, target->ml.node);
381
382 // target->ml.type is already correct
383 list_del_init(&target->list);
384 list_add_tail(&target->list, &res->granted);
385
386 BUG_ON(!target->lksb);
387 target->lksb->status = DLM_NORMAL;
388
389 spin_unlock(&target->spinlock);
390
391 __dlm_lockres_reserve_ast(res);
392 dlm_queue_ast(dlm, target);
393 /* go back and check for more */
394 goto converting;
395 }
396
397leave:
398 return;
399}
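/* Summary of the shuffle above: the converting queue is always drained
 * ahead of the blocked queue, a lock is granted only if it is compatible
 * with every other lock on the granted and converting queues, and each
 * incompatibility instead reserves an ast and queues a bast against the
 * blocking holder. */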
400
401/* must have NO locks held when calling this with res != NULL */
402void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
403{
404 mlog_entry("dlm=%p, res=%p\n", dlm, res);
405 if (res) {
406 spin_lock(&dlm->spinlock);
407 spin_lock(&res->spinlock);
408 __dlm_dirty_lockres(dlm, res);
409 spin_unlock(&res->spinlock);
410 spin_unlock(&dlm->spinlock);
411 }
412 wake_up(&dlm->dlm_thread_wq);
413}
414
415void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
416{
417 mlog_entry("dlm=%p, res=%p\n", dlm, res);
418
419 assert_spin_locked(&dlm->spinlock);
420 assert_spin_locked(&res->spinlock);
421
422 /* don't shuffle secondary queues */
423 if ((res->owner == dlm->node_num) &&
424 !(res->state & DLM_LOCK_RES_DIRTY)) {
425 list_add_tail(&res->dirty, &dlm->dirty_list);
426 res->state |= DLM_LOCK_RES_DIRTY;
427 }
428}
429
430
431/* Launch the dlm thread for this domain */
432int dlm_launch_thread(struct dlm_ctxt *dlm)
433{
434 mlog(0, "starting dlm thread...\n");
435
436 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
437 if (IS_ERR(dlm->dlm_thread_task)) {
438 mlog_errno(PTR_ERR(dlm->dlm_thread_task));
439 dlm->dlm_thread_task = NULL;
440 return -EINVAL;
441 }
442
443 return 0;
444}
445
446void dlm_complete_thread(struct dlm_ctxt *dlm)
447{
448 if (dlm->dlm_thread_task) {
449 mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
450 kthread_stop(dlm->dlm_thread_task);
451 dlm->dlm_thread_task = NULL;
452 }
453}
454
455static int dlm_dirty_list_empty(struct dlm_ctxt *dlm)
456{
457 int empty;
458
459 spin_lock(&dlm->spinlock);
460 empty = list_empty(&dlm->dirty_list);
461 spin_unlock(&dlm->spinlock);
462
463 return empty;
464}
465
466static void dlm_flush_asts(struct dlm_ctxt *dlm)
467{
468 int ret;
469 struct dlm_lock *lock;
470 struct dlm_lock_resource *res;
471 u8 hi;
472
473 spin_lock(&dlm->ast_lock);
474 while (!list_empty(&dlm->pending_asts)) {
475 lock = list_entry(dlm->pending_asts.next,
476 struct dlm_lock, ast_list);
477 /* get an extra ref on lock */
478 dlm_lock_get(lock);
479 res = lock->lockres;
480 mlog(0, "delivering an ast for this lockres\n");
481
482 BUG_ON(!lock->ast_pending);
483
484 /* remove from list (including ref) */
485 list_del_init(&lock->ast_list);
486 dlm_lock_put(lock);
487 spin_unlock(&dlm->ast_lock);
488
489 if (lock->ml.node != dlm->node_num) {
490 ret = dlm_do_remote_ast(dlm, res, lock);
491 if (ret < 0)
492 mlog_errno(ret);
493 } else
494 dlm_do_local_ast(dlm, res, lock);
495
496 spin_lock(&dlm->ast_lock);
497
498 /* possible that another ast was queued while
499 * we were delivering the last one */
500 if (!list_empty(&lock->ast_list)) {
501 mlog(0, "aha another ast got queued while "
502 "we were finishing the last one. will "
503 "keep the ast_pending flag set.\n");
504 } else
505 lock->ast_pending = 0;
506
507 /* drop the extra ref.
508 * this may drop it completely. */
509 dlm_lock_put(lock);
510 dlm_lockres_release_ast(dlm, res);
511 }
512
513 while (!list_empty(&dlm->pending_basts)) {
514 lock = list_entry(dlm->pending_basts.next,
515 struct dlm_lock, bast_list);
516 /* get an extra ref on lock */
517 dlm_lock_get(lock);
518 res = lock->lockres;
519
520 BUG_ON(!lock->bast_pending);
521
522 /* get the highest blocked lock, and reset */
523 spin_lock(&lock->spinlock);
524 BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE);
525 hi = lock->ml.highest_blocked;
526 lock->ml.highest_blocked = LKM_IVMODE;
527 spin_unlock(&lock->spinlock);
528
529 /* remove from list (including ref) */
530 list_del_init(&lock->bast_list);
531 dlm_lock_put(lock);
532 spin_unlock(&dlm->ast_lock);
533
534 mlog(0, "delivering a bast for this lockres "
535 "(blocked = %d\n", hi);
536
537 if (lock->ml.node != dlm->node_num) {
538 ret = dlm_send_proxy_bast(dlm, res, lock, hi);
539 if (ret < 0)
540 mlog_errno(ret);
541 } else
542 dlm_do_local_bast(dlm, res, lock, hi);
543
544 spin_lock(&dlm->ast_lock);
545
546 /* possible that another bast was queued while
547 * we were delivering the last one */
548 if (!list_empty(&lock->bast_list)) {
549 mlog(0, "aha another bast got queued while "
550 "we were finishing the last one. will "
551 "keep the bast_pending flag set.\n");
552 } else
553 lock->bast_pending = 0;
554
555 /* drop the extra ref.
556 * this may drop it completely. */
557 dlm_lock_put(lock);
558 dlm_lockres_release_ast(dlm, res);
559 }
560 wake_up(&dlm->ast_wq);
561 spin_unlock(&dlm->ast_lock);
562}
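/* Both delivery loops above take an extra reference on the lock before
 * dropping dlm->ast_lock, so the lock cannot be freed while the
 * (possibly remote) callback is delivered; the dlm_lock_put() that
 * follows delivery may be the final reference drop. */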
563
564
565#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
566#define DLM_THREAD_MAX_DIRTY 100
567#define DLM_THREAD_MAX_ASTS 10
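/* The thread below wakes at least once per DLM_THREAD_TIMEOUT_MS and
 * yields after shuffling DLM_THREAD_MAX_DIRTY lockreses in one pass so
 * other tasks are not starved; DLM_THREAD_MAX_ASTS is defined here but
 * not referenced anywhere below. */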
568
569static int dlm_thread(void *data)
570{
571 struct dlm_lock_resource *res;
572 struct dlm_ctxt *dlm = data;
573 unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS);
574
575 mlog(0, "dlm thread running for %s...\n", dlm->name);
576
577 while (!kthread_should_stop()) {
578 int n = DLM_THREAD_MAX_DIRTY;
579
580 /* dlm_shutting_down is very point-in-time, but that
581 * doesn't matter as we'll just loop back around if we
582 * get false on the leading edge of a state
583 * transition. */
584 dlm_run_purge_list(dlm, dlm_shutting_down(dlm));
585
586 /* We really don't want to hold dlm->spinlock while
587 * calling dlm_shuffle_lists on each lockres that
588 * needs to have its queues adjusted and AST/BASTs
589 * run. So let's pull each entry off the dirty_list
590 * and drop dlm->spinlock ASAP. Once off the list,
591 * res->spinlock needs to be taken again to protect
592 * the queues while calling dlm_shuffle_lists. */
593 spin_lock(&dlm->spinlock);
594 while (!list_empty(&dlm->dirty_list)) {
595 int delay = 0;
596 res = list_entry(dlm->dirty_list.next,
597 struct dlm_lock_resource, dirty);
598
599 /* peel a lockres off, remove it from the list,
600 * unset the dirty flag and drop the dlm lock */
601 BUG_ON(!res);
602 dlm_lockres_get(res);
603
604 spin_lock(&res->spinlock);
605 res->state &= ~DLM_LOCK_RES_DIRTY;
606 list_del_init(&res->dirty);
607 spin_unlock(&res->spinlock);
608 spin_unlock(&dlm->spinlock);
609
610 /* lockres can be re-dirtied/re-added to the
611 * dirty_list in this gap, but that is ok */
612
613 spin_lock(&res->spinlock);
614 if (res->owner != dlm->node_num) {
615 __dlm_print_one_lock_resource(res);
616 mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
617 res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
618 res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
619 res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
620 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
621 }
622 BUG_ON(res->owner != dlm->node_num);
623
624 /* it is now ok to move lockreses in these states
625 * to the dirty list, assuming that they will only be
626 * dirty for a short while. */
627 if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
628 DLM_LOCK_RES_MIGRATING |
629 DLM_LOCK_RES_RECOVERING)) {
630 /* move it to the tail and keep going */
631 spin_unlock(&res->spinlock);
632 mlog(0, "delaying list shuffling for in-"
633 "progress lockres %.*s, state=%d\n",
634 res->lockname.len, res->lockname.name,
635 res->state);
636 delay = 1;
637 goto in_progress;
638 }
639
640 /* at this point the lockres is not migrating/
641 * recovering/in-progress. we have the lockres
642 * spinlock and do NOT have the dlm lock.
643 * safe to reserve/queue asts and run the lists. */
644
645 mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
646 "res=%p\n", dlm, res);
647
648 /* called while holding lockres lock */
649 dlm_shuffle_lists(dlm, res);
650 spin_unlock(&res->spinlock);
651
652 dlm_lockres_calc_usage(dlm, res);
653
654in_progress:
655
656 spin_lock(&dlm->spinlock);
657 /* if the lockres was in-progress, stick
658 * it on the back of the list */
659 if (delay) {
660 spin_lock(&res->spinlock);
661 list_add_tail(&res->dirty, &dlm->dirty_list);
662 res->state |= DLM_LOCK_RES_DIRTY;
663 spin_unlock(&res->spinlock);
664 }
665 dlm_lockres_put(res);
666
667 /* unlikely, but we may need to give time to
668 * other tasks */
669 if (!--n) {
670 mlog(0, "throttling dlm_thread\n");
671 break;
672 }
673 }
674
675 spin_unlock(&dlm->spinlock);
676 dlm_flush_asts(dlm);
677
678 /* yield and continue right away if there is more work to do */
679 if (!n) {
680 yield();
681 continue;
682 }
683
684 wait_event_interruptible_timeout(dlm->dlm_thread_wq,
685 !dlm_dirty_list_empty(dlm) ||
686 kthread_should_stop(),
687 timeout);
688 }
689
690 mlog(0, "quitting DLM thread\n");
691 return 0;
692}
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
new file mode 100644
index 00000000000..cec2ce1cd31
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -0,0 +1,672 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmunlock.c
5 *
6 * underlying calls for unlocking locks
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41#include <linux/delay.h>
42
43#include "cluster/heartbeat.h"
44#include "cluster/nodemanager.h"
45#include "cluster/tcp.h"
46
47#include "dlmapi.h"
48#include "dlmcommon.h"
49
50#define MLOG_MASK_PREFIX ML_DLM
51#include "cluster/masklog.h"
52
53#define DLM_UNLOCK_FREE_LOCK 0x00000001
54#define DLM_UNLOCK_CALL_AST 0x00000002
55#define DLM_UNLOCK_REMOVE_LOCK 0x00000004
56#define DLM_UNLOCK_REGRANT_LOCK 0x00000008
57#define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010
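/* These action bits are computed by dlm_get_cancel_actions() and
 * dlm_get_unlock_actions() below, then applied in one place by
 * dlmunlock_common(), keeping the queue-manipulation and refcount
 * decisions centralized. */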
58
59
60static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
61 struct dlm_lock_resource *res,
62 struct dlm_lock *lock,
63 struct dlm_lockstatus *lksb,
64 int *actions);
65static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
66 struct dlm_lock_resource *res,
67 struct dlm_lock *lock,
68 struct dlm_lockstatus *lksb,
69 int *actions);
70
71static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
72 struct dlm_lock_resource *res,
73 struct dlm_lock *lock,
74 struct dlm_lockstatus *lksb,
75 int flags,
76 u8 owner);
77
78
79/*
80 * according to the spec:
81 * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
82 *
83 * flags & LKM_CANCEL != 0: must be converting or blocked
84 * flags & LKM_CANCEL == 0: must be granted
85 *
86 * So to unlock a converting lock, you must first cancel the
87 * convert (passing LKM_CANCEL in flags), then call the unlock
88 * again (with no LKM_CANCEL in flags).
89 */
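/* A hypothetical caller sequence illustrating the two-step rule above
 * ("lksb", "my_unlock_ast" and "priv" are made-up names, not defined in
 * this file):
 *
 *	status = dlmunlock(dlm, lksb, LKM_CANCEL, my_unlock_ast, priv);
 *	if (status == DLM_NORMAL)
 *		status = dlmunlock(dlm, lksb, 0, my_unlock_ast, priv);
 */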
90
91
92/*
93 * locking:
94 * caller needs: none
95 * taken: res->spinlock and lock->spinlock taken and dropped
96 * held on exit: none
97 * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
98 * all callers should have taken an extra ref on lock coming in
99 */
100static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
101 struct dlm_lock_resource *res,
102 struct dlm_lock *lock,
103 struct dlm_lockstatus *lksb,
104 int flags, int *call_ast,
105 int master_node)
106{
107 enum dlm_status status;
108 int actions = 0;
109 int in_use;
110 u8 owner;
111
112 mlog(0, "master_node = %d, valblk = %d\n", master_node,
113 flags & LKM_VALBLK);
114
115 if (master_node)
116 BUG_ON(res->owner != dlm->node_num);
117 else
118 BUG_ON(res->owner == dlm->node_num);
119
120 spin_lock(&dlm->spinlock);
121 /* We want to be sure that we're not freeing a lock
122 * that still has ASTs pending... */
123 in_use = !list_empty(&lock->ast_list);
124 spin_unlock(&dlm->spinlock);
125 if (in_use) {
126 mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
127 "while waiting for an ast!", res->lockname.len,
128 res->lockname.name);
129 return DLM_BADPARAM;
130 }
131
132 spin_lock(&res->spinlock);
133 if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
134 if (master_node) {
135 mlog(ML_ERROR, "lockres in progress!\n");
136 spin_unlock(&res->spinlock);
137 return DLM_FORWARD;
138 }
139 /* ok for this to sleep if not in a network handler */
140 __dlm_wait_on_lockres(res);
141 res->state |= DLM_LOCK_RES_IN_PROGRESS;
142 }
143 spin_lock(&lock->spinlock);
144
145 if (res->state & DLM_LOCK_RES_RECOVERING) {
146 status = DLM_RECOVERING;
147 goto leave;
148 }
149
150
151 /* see above for what the spec says about
152 * LKM_CANCEL and the lock queue state */
153 if (flags & LKM_CANCEL)
154 status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions);
155 else
156 status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions);
157
158 if (status != DLM_NORMAL)
159 goto leave;
160
161 /* By now this has been masked out of cancel requests. */
162 if (flags & LKM_VALBLK) {
163 /* make the final update to the lvb */
164 if (master_node)
165 memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
166 else
167 flags |= LKM_PUT_LVB; /* let the send function
168 * handle it. */
169 }
170
171 if (!master_node) {
172 owner = res->owner;
173 /* drop locks and send message */
174 if (flags & LKM_CANCEL)
175 lock->cancel_pending = 1;
176 else
177 lock->unlock_pending = 1;
178 spin_unlock(&lock->spinlock);
179 spin_unlock(&res->spinlock);
180 status = dlm_send_remote_unlock_request(dlm, res, lock, lksb,
181 flags, owner);
182 spin_lock(&res->spinlock);
183 spin_lock(&lock->spinlock);
184 /* if the master told us the lock was already granted,
185 * let the ast handle all of these actions */
186 if (status == DLM_NORMAL &&
187 lksb->status == DLM_CANCELGRANT) {
188 actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
189 DLM_UNLOCK_REGRANT_LOCK|
190 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
191 }
192 if (flags & LKM_CANCEL)
193 lock->cancel_pending = 0;
194 else
195 lock->unlock_pending = 0;
196
197 }
198
199 /* get an extra ref on lock. if we are just switching
200 * lists here, we don't want the lock to go away. */
201 dlm_lock_get(lock);
202
203 if (actions & DLM_UNLOCK_REMOVE_LOCK) {
204 list_del_init(&lock->list);
205 dlm_lock_put(lock);
206 }
207 if (actions & DLM_UNLOCK_REGRANT_LOCK) {
208 dlm_lock_get(lock);
209 list_add_tail(&lock->list, &res->granted);
210 }
211 if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) {
212 mlog(0, "clearing convert_type at %smaster node\n",
213 master_node ? "" : "non-");
214 lock->ml.convert_type = LKM_IVMODE;
215 }
216
217 /* remove the extra ref on lock */
218 dlm_lock_put(lock);
219
220leave:
221 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
222 if (!dlm_lock_on_list(&res->converting, lock))
223 BUG_ON(lock->ml.convert_type != LKM_IVMODE);
224 else
225 BUG_ON(lock->ml.convert_type == LKM_IVMODE);
226 spin_unlock(&lock->spinlock);
227 spin_unlock(&res->spinlock);
228 wake_up(&res->wq);
229
230 /* let the caller's final dlm_lock_put handle the actual kfree */
231 if (actions & DLM_UNLOCK_FREE_LOCK) {
232 /* this should always be coupled with list removal */
233 BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
234 mlog(0, "lock %"MLFu64" should be gone now! refs=%d\n",
235 lock->ml.cookie, atomic_read(&lock->lock_refs.refcount)-1);
236 dlm_lock_put(lock);
237 }
238 if (actions & DLM_UNLOCK_CALL_AST)
239 *call_ast = 1;
240
241 /* if cancel or unlock succeeded, lvb work is done */
242 if (status == DLM_NORMAL)
243 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
244
245 return status;
246}
247
248void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
249 struct dlm_lock *lock)
250{
251 /* leave DLM_LKSB_PUT_LVB on the lksb so any final
252 * update of the lvb will be sent to the new master */
253 list_del_init(&lock->list);
254}
255
256void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
257 struct dlm_lock *lock)
258{
259 list_del_init(&lock->list);
260 list_add_tail(&lock->list, &res->granted);
261 lock->ml.convert_type = LKM_IVMODE;
262}
263
264
265static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm,
266 struct dlm_lock_resource *res,
267 struct dlm_lock *lock,
268 struct dlm_lockstatus *lksb,
269 int flags,
270 int *call_ast)
271{
272 return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1);
273}
274
275static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm,
276 struct dlm_lock_resource *res,
277 struct dlm_lock *lock,
278 struct dlm_lockstatus *lksb,
279 int flags, int *call_ast)
280{
281 return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0);
282}
283
284/*
285 * locking:
286 * caller needs: none
287 * taken: none
288 * held on exit: none
289 * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
290 */
291static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
292 struct dlm_lock_resource *res,
293 struct dlm_lock *lock,
294 struct dlm_lockstatus *lksb,
295 int flags,
296 u8 owner)
297{
298 struct dlm_unlock_lock unlock;
299 int tmpret;
300 enum dlm_status ret;
301 int status = 0;
302 struct kvec vec[2];
303 size_t veclen = 1;
304
305 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
306
307 memset(&unlock, 0, sizeof(unlock));
308 unlock.node_idx = dlm->node_num;
309 unlock.flags = cpu_to_be32(flags);
310 unlock.cookie = lock->ml.cookie;
311 unlock.namelen = res->lockname.len;
312 memcpy(unlock.name, res->lockname.name, unlock.namelen);
313
314 vec[0].iov_len = sizeof(struct dlm_unlock_lock);
315 vec[0].iov_base = &unlock;
316
317 if (flags & LKM_PUT_LVB) {
318 /* extra data to send if we are updating lvb */
319 vec[1].iov_len = DLM_LVB_LEN;
320 vec[1].iov_base = lock->lksb->lvb;
321 veclen++;
322 }
323
324 tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key,
325 vec, veclen, owner, &status);
326 if (tmpret >= 0) {
327 // successfully sent and received
328 if (status == DLM_CANCELGRANT)
329 ret = DLM_NORMAL;
330 else if (status == DLM_FORWARD) {
331 mlog(0, "master was in-progress. retry\n");
332 ret = DLM_FORWARD;
333 } else
334 ret = status;
335 lksb->status = status;
336 } else {
337 mlog_errno(tmpret);
338 if (dlm_is_host_down(tmpret)) {
339 /* NOTE: this seems strange, but it is what we want.
340 * when the master goes down during a cancel or
341 * unlock, the recovery code completes the operation
342 * as if the master had not died, then passes the
343 * updated state to the recovery master. this thread
344 * just needs to finish out the operation and call
345 * the unlockast. */
346 ret = DLM_NORMAL;
347 } else {
348 /* something bad. this will BUG in ocfs2 */
349 ret = dlm_err_to_dlm_status(tmpret);
350 }
351 lksb->status = ret;
352 }
353
354 return ret;
355}
356
357/*
358 * locking:
359 * caller needs: none
360 * taken: takes and drops res->spinlock
361 * held on exit: none
362 * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
363 * return value from dlmunlock_master
364 */
365int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
366{
367 struct dlm_ctxt *dlm = data;
368 struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
369 struct dlm_lock_resource *res = NULL;
370 struct list_head *iter;
371 struct dlm_lock *lock = NULL;
372 enum dlm_status status = DLM_NORMAL;
373 int found = 0, i;
374 struct dlm_lockstatus *lksb = NULL;
375 int ignore;
376 u32 flags;
377 struct list_head *queue;
378
379 flags = be32_to_cpu(unlock->flags);
380
381 if (flags & LKM_GET_LVB) {
382 mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n");
383 return DLM_BADARGS;
384 }
385
386 if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) {
387 mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL "
388 "request!\n");
389 return DLM_BADARGS;
390 }
391
392 if (unlock->namelen > DLM_LOCKID_NAME_MAX) {
393 mlog(ML_ERROR, "Invalid name length in unlock handler!\n");
394 return DLM_IVBUFLEN;
395 }
396
397 if (!dlm_grab(dlm))
398 return DLM_REJECTED;
399
400 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
401 "Domain %s not fully joined!\n", dlm->name);
402
403 mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none");
404
405 res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen);
406 if (!res) {
407 /* We assume here that a missing lock resource simply means
408 * it was migrated away and destroyed before the other
409 * node could detect it. */
410 mlog(0, "returning DLM_FORWARD -- res no longer exists\n");
411 status = DLM_FORWARD;
412 goto not_found;
413 }
414
415 queue=&res->granted;
416 found = 0;
417 spin_lock(&res->spinlock);
418 if (res->state & DLM_LOCK_RES_RECOVERING) {
419 spin_unlock(&res->spinlock);
420 mlog(0, "returning DLM_RECOVERING\n");
421 status = DLM_RECOVERING;
422 goto leave;
423 }
424
425 if (res->state & DLM_LOCK_RES_MIGRATING) {
426 spin_unlock(&res->spinlock);
427 mlog(0, "returning DLM_MIGRATING\n");
428 status = DLM_MIGRATING;
429 goto leave;
430 }
431
432 if (res->owner != dlm->node_num) {
433 spin_unlock(&res->spinlock);
434 mlog(0, "returning DLM_FORWARD -- not master\n");
435 status = DLM_FORWARD;
436 goto leave;
437 }
438
439 for (i = 0; i < 3; i++) {
440 list_for_each(iter, queue) {
441 lock = list_entry(iter, struct dlm_lock, list);
442 if (lock->ml.cookie == unlock->cookie &&
443 lock->ml.node == unlock->node_idx) {
444 dlm_lock_get(lock);
445 found = 1;
446 break;
447 }
448 }
449 if (found)
450 break;
451 /* scan granted -> converting -> blocked queues */
452 queue++;
453 }
454 spin_unlock(&res->spinlock);
455 if (!found) {
456 status = DLM_IVLOCKID;
457 goto not_found;
458 }
459
460 /* lock was found on queue */
461 lksb = lock->lksb;
462 /* unlockast only called on originating node */
463 if (flags & LKM_PUT_LVB) {
464 lksb->flags |= DLM_LKSB_PUT_LVB;
465 memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN);
466 }
467
468 /* if this is in-progress, propagate the DLM_FORWARD
469 * all the way back out */
470 status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore);
471 if (status == DLM_FORWARD)
472 mlog(0, "lockres is in progress\n");
473
474 if (flags & LKM_PUT_LVB)
475 lksb->flags &= ~DLM_LKSB_PUT_LVB;
476
477 dlm_lockres_calc_usage(dlm, res);
478 dlm_kick_thread(dlm, res);
479
480not_found:
481 if (!found)
482 mlog(ML_ERROR, "failed to find lock to unlock! "
483 "cookie=%"MLFu64"\n",
484 unlock->cookie);
485 else {
486 /* send the lksb->status back to the other node */
487 status = lksb->status;
488 dlm_lock_put(lock);
489 }
490
491leave:
492 if (res)
493 dlm_lockres_put(res);
494
495 dlm_put(dlm);
496
497 return status;
498}
499
500
501static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
502 struct dlm_lock_resource *res,
503 struct dlm_lock *lock,
504 struct dlm_lockstatus *lksb,
505 int *actions)
506{
507 enum dlm_status status;
508
509 if (dlm_lock_on_list(&res->blocked, lock)) {
510 /* cancel this outright */
511 lksb->status = DLM_NORMAL;
512 status = DLM_NORMAL;
513 *actions = (DLM_UNLOCK_CALL_AST |
514 DLM_UNLOCK_REMOVE_LOCK);
515 } else if (dlm_lock_on_list(&res->converting, lock)) {
516 /* cancel the request, put back on granted */
517 lksb->status = DLM_NORMAL;
518 status = DLM_NORMAL;
519 *actions = (DLM_UNLOCK_CALL_AST |
520 DLM_UNLOCK_REMOVE_LOCK |
521 DLM_UNLOCK_REGRANT_LOCK |
522 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
523 } else if (dlm_lock_on_list(&res->granted, lock)) {
524 /* too late, already granted. DLM_CANCELGRANT */
525 lksb->status = DLM_CANCELGRANT;
526 status = DLM_NORMAL;
527 *actions = DLM_UNLOCK_CALL_AST;
528 } else {
529 mlog(ML_ERROR, "lock to cancel is not on any list!\n");
530 lksb->status = DLM_IVLOCKID;
531 status = DLM_IVLOCKID;
532 *actions = 0;
533 }
534 return status;
535}
536
537static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
538 struct dlm_lock_resource *res,
539 struct dlm_lock *lock,
540 struct dlm_lockstatus *lksb,
541 int *actions)
542{
543 enum dlm_status status;
544
545 /* unlock request */
546 if (!dlm_lock_on_list(&res->granted, lock)) {
547 lksb->status = DLM_DENIED;
548 status = DLM_DENIED;
549 dlm_error(status);
550 *actions = 0;
551 } else {
552 /* unlock granted lock */
553 lksb->status = DLM_NORMAL;
554 status = DLM_NORMAL;
555 *actions = (DLM_UNLOCK_FREE_LOCK |
556 DLM_UNLOCK_CALL_AST |
557 DLM_UNLOCK_REMOVE_LOCK);
558 }
559 return status;
560}
561
562/* there seems to be no point in doing this async
563 * since (even for the remote case) there is really
564 * no work to queue up... so just do it and fire the
565 * unlockast by hand when done... */
566enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
567 int flags, dlm_astunlockfunc_t *unlockast, void *data)
568{
569 enum dlm_status status;
570 struct dlm_lock_resource *res;
571 struct dlm_lock *lock = NULL;
572 int call_ast, is_master;
573
574 mlog_entry_void();
575
576 if (!lksb) {
577 dlm_error(DLM_BADARGS);
578 return DLM_BADARGS;
579 }
580
581 if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) {
582 dlm_error(DLM_BADPARAM);
583 return DLM_BADPARAM;
584 }
585
586 if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
587 mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n");
588 flags &= ~LKM_VALBLK;
589 }
590
591 if (!lksb->lockid || !lksb->lockid->lockres) {
592 dlm_error(DLM_BADPARAM);
593 return DLM_BADPARAM;
594 }
595
596 lock = lksb->lockid;
597 BUG_ON(!lock);
598 dlm_lock_get(lock);
599
600 res = lock->lockres;
601 BUG_ON(!res);
602 dlm_lockres_get(res);
603retry:
604 call_ast = 0;
605 /* need to retry up here because owner may have changed */
606 mlog(0, "lock=%p res=%p\n", lock, res);
607
608 spin_lock(&res->spinlock);
609 is_master = (res->owner == dlm->node_num);
610 spin_unlock(&res->spinlock);
611
612 if (is_master) {
613 status = dlmunlock_master(dlm, res, lock, lksb, flags,
614 &call_ast);
615 mlog(0, "done calling dlmunlock_master: returned %d, "
616 "call_ast is %d\n", status, call_ast);
617 } else {
618 status = dlmunlock_remote(dlm, res, lock, lksb, flags,
619 &call_ast);
620 mlog(0, "done calling dlmunlock_remote: returned %d, "
621 "call_ast is %d\n", status, call_ast);
622 }
623
624 if (status == DLM_RECOVERING ||
625 status == DLM_MIGRATING ||
626 status == DLM_FORWARD) {
627 /* We want to go away for a tiny bit to allow recovery
628 * / migration to complete on this resource. I don't
629 * know of any wait queue we could sleep on as this
630 * may be happening on another node. Perhaps the
631 * proper solution is to queue up requests on the
632 * other end? */
633
634 /* do we want to yield(); ?? */
635 msleep(50);
636
637 mlog(0, "retrying unlock due to pending recovery/"
638 "migration/in-progress\n");
639 goto retry;
640 }
641
642 if (call_ast) {
643 mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status);
644 if (is_master) {
645 /* it is possible that there is one last bast
646 * pending. make sure it is flushed, then
647 * call the unlockast.
648 * not an issue if this lock is mastered remotely,
649 * since this lock has been removed from the
650 * lockres queues and cannot be found. */
651 dlm_kick_thread(dlm, NULL);
652 wait_event(dlm->ast_wq,
653 dlm_lock_basts_flushed(dlm, lock));
654 }
655 (*unlockast)(data, lksb->status);
656 }
657
658 if (status == DLM_NORMAL) {
659 mlog(0, "kicking the thread\n");
660 dlm_kick_thread(dlm, res);
661 } else
662 dlm_error(status);
663
664 dlm_lockres_calc_usage(dlm, res);
665 dlm_lockres_put(res);
666 dlm_lock_put(lock);
667
668 mlog(0, "returning status=%d!\n", status);
669 return status;
670}
671EXPORT_SYMBOL_GPL(dlmunlock);
672
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
new file mode 100644
index 00000000000..7ef2653f8f4
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmver.h"
30
31#define DLM_BUILD_VERSION "1.3.3"
32
33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
34
35void dlm_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
new file mode 100644
index 00000000000..f674aee77a1
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef DLM_VER_H
27#define DLM_VER_H
28
29void dlm_print_version(void);
30
31#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
new file mode 100644
index 00000000000..e1fdd288796
--- /dev/null
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -0,0 +1,658 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * userdlm.c
5 *
6 * Code which implements the kernel side of a minimal userspace
7 * interface to our DLM.
8 *
9 * Many of the functions here are pared down versions of dlmglue.c
10 * functions.
11 *
12 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public
16 * License as published by the Free Software Foundation; either
17 * version 2 of the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 * General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public
25 * License along with this program; if not, write to the
26 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 * Boston, MA 021110-1307, USA.
28 */
29
30#include <asm/signal.h>
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/types.h>
35#include <linux/crc32.h>
36
37
38#include "cluster/nodemanager.h"
39#include "cluster/heartbeat.h"
40#include "cluster/tcp.h"
41
42#include "dlmapi.h"
43
44#include "userdlm.h"
45
46#define MLOG_MASK_PREFIX ML_DLMFS
47#include "cluster/masklog.h"
48
49static inline int user_check_wait_flag(struct user_lock_res *lockres,
50 int flag)
51{
52 int ret;
53
54 spin_lock(&lockres->l_lock);
55 ret = lockres->l_flags & flag;
56 spin_unlock(&lockres->l_lock);
57
58 return ret;
59}
60
61static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
62
63{
64 wait_event(lockres->l_event,
65 !user_check_wait_flag(lockres, USER_LOCK_BUSY));
66}
67
68static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
69
70{
71 wait_event(lockres->l_event,
72 !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
73}
74
75/* I heart container_of... */
76static inline struct dlm_ctxt *
77dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
78{
79 struct dlmfs_inode_private *ip;
80
81 ip = container_of(lockres,
82 struct dlmfs_inode_private,
83 ip_lockres);
84 return ip->ip_dlm;
85}
86
87static struct inode *
88user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
89{
90 struct dlmfs_inode_private *ip;
91
92 ip = container_of(lockres,
93 struct dlmfs_inode_private,
94 ip_lockres);
95 return &ip->ip_vfs_inode;
96}
97
98static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
99{
100 spin_lock(&lockres->l_lock);
101 lockres->l_flags &= ~USER_LOCK_BUSY;
102 spin_unlock(&lockres->l_lock);
103}
104
105#define user_log_dlm_error(_func, _stat, _lockres) do { \
106 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
107 "resource %s: %s\n", dlm_errname(_stat), _func, \
108 _lockres->l_name, dlm_errmsg(_stat)); \
109} while (0)
110
111/* WARNING: This function lives in a world where the only three lock
112 * levels are EX, PR, and NL. It *will* have to be adjusted when more
113 * lock types are added. */
114static inline int user_highest_compat_lock_level(int level)
115{
116 int new_level = LKM_EXMODE;
117
118 if (level == LKM_EXMODE)
119 new_level = LKM_NLMODE;
120 else if (level == LKM_PRMODE)
121 new_level = LKM_PRMODE;
122 return new_level;
123}
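/* i.e. a blocking EX request forces a drop to NL, a blocking PR request
 * still permits PR, and a blocking NL request leaves EX compatible. */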
124
125static void user_ast(void *opaque)
126{
127 struct user_lock_res *lockres = opaque;
128 struct dlm_lockstatus *lksb;
129
130 mlog(0, "AST fired for lockres %s\n", lockres->l_name);
131
132 spin_lock(&lockres->l_lock);
133
134 lksb = &(lockres->l_lksb);
135 if (lksb->status != DLM_NORMAL) {
136 mlog(ML_ERROR, "lksb status value of %u on lockres %s\n",
137 lksb->status, lockres->l_name);
138 spin_unlock(&lockres->l_lock);
139 return;
140 }
141
142 /* we're downconverting. */
143 if (lockres->l_requested < lockres->l_level) {
144 if (lockres->l_requested <=
145 user_highest_compat_lock_level(lockres->l_blocking)) {
146 lockres->l_blocking = LKM_NLMODE;
147 lockres->l_flags &= ~USER_LOCK_BLOCKED;
148 }
149 }
150
151 lockres->l_level = lockres->l_requested;
152 lockres->l_requested = LKM_IVMODE;
153 lockres->l_flags |= USER_LOCK_ATTACHED;
154 lockres->l_flags &= ~USER_LOCK_BUSY;
155
156 spin_unlock(&lockres->l_lock);
157
158 wake_up(&lockres->l_event);
159}
160
161static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
162{
163 struct inode *inode;
164 inode = user_dlm_inode_from_user_lockres(lockres);
165 if (!igrab(inode))
166 BUG();
167}
168
169static void user_dlm_unblock_lock(void *opaque);
170
171static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
172{
173 if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
174 user_dlm_grab_inode_ref(lockres);
175
176 INIT_WORK(&lockres->l_work, user_dlm_unblock_lock,
177 lockres);
178
179 queue_work(user_dlm_worker, &lockres->l_work);
180 lockres->l_flags |= USER_LOCK_QUEUED;
181 }
182}
183
184static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
185{
186 int queue = 0;
187
188 if (!(lockres->l_flags & USER_LOCK_BLOCKED))
189 return;
190
191 switch (lockres->l_blocking) {
192 case LKM_EXMODE:
193 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
194 queue = 1;
195 break;
196 case LKM_PRMODE:
197 if (!lockres->l_ex_holders)
198 queue = 1;
199 break;
200 default:
201 BUG();
202 }
203
204 if (queue)
205 __user_dlm_queue_lockres(lockres);
206}
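/* The conditional queueing above fires only once the last incompatible
 * holder drops: a blocking EX waits for both the ro and ex holder counts
 * to reach zero, while a blocking PR only needs the ex count to drain. */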
207
208static void user_bast(void *opaque, int level)
209{
210 struct user_lock_res *lockres = opaque;
211
212 mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n",
213 lockres->l_name, level);
214
215 spin_lock(&lockres->l_lock);
216 lockres->l_flags |= USER_LOCK_BLOCKED;
217 if (level > lockres->l_blocking)
218 lockres->l_blocking = level;
219
220 __user_dlm_queue_lockres(lockres);
221 spin_unlock(&lockres->l_lock);
222
223 wake_up(&lockres->l_event);
224}
225
226static void user_unlock_ast(void *opaque, enum dlm_status status)
227{
228 struct user_lock_res *lockres = opaque;
229
230 mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name);
231
232 if (status != DLM_NORMAL)
233 mlog(ML_ERROR, "Dlm returns status %d\n", status);
234
235 spin_lock(&lockres->l_lock);
236 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN)
237 lockres->l_level = LKM_IVMODE;
238 else {
239 lockres->l_requested = LKM_IVMODE; /* cancel an
240 * upconvert
241 * request. */
242 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
243 /* we want the unblock thread to look at it again
244 * now. */
245 __user_dlm_queue_lockres(lockres);
246 }
247
248 lockres->l_flags &= ~USER_LOCK_BUSY;
249 spin_unlock(&lockres->l_lock);
250
251 wake_up(&lockres->l_event);
252}
253
254static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
255{
256 struct inode *inode;
257 inode = user_dlm_inode_from_user_lockres(lockres);
258 iput(inode);
259}
260
261static void user_dlm_unblock_lock(void *opaque)
262{
263 int new_level, status;
264 struct user_lock_res *lockres = (struct user_lock_res *) opaque;
265 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
266
267 mlog(0, "processing lockres %s\n", lockres->l_name);
268
269 spin_lock(&lockres->l_lock);
270
271 BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
272 BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED));
273
274 /* notice that we don't clear USER_LOCK_BLOCKED here. That's
275 * for user_ast to do. */
276 lockres->l_flags &= ~USER_LOCK_QUEUED;
277
278 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
279 mlog(0, "lock is in teardown so we do nothing\n");
280 spin_unlock(&lockres->l_lock);
281 goto drop_ref;
282 }
283
284 if (lockres->l_flags & USER_LOCK_BUSY) {
285 mlog(0, "BUSY flag detected...\n");
286 if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
287 spin_unlock(&lockres->l_lock);
288 goto drop_ref;
289 }
290
291 lockres->l_flags |= USER_LOCK_IN_CANCEL;
292 spin_unlock(&lockres->l_lock);
293
294 status = dlmunlock(dlm,
295 &lockres->l_lksb,
296 LKM_CANCEL,
297 user_unlock_ast,
298 lockres);
299 if (status == DLM_CANCELGRANT) {
300 /* If we got this, then the ast was fired
301 * before we could cancel. We cleanup our
302 * state, and restart the function. */
303 spin_lock(&lockres->l_lock);
304 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
305 spin_unlock(&lockres->l_lock);
306 } else if (status != DLM_NORMAL)
307 user_log_dlm_error("dlmunlock", status, lockres);
308 goto drop_ref;
309 }
310
311 /* If there are still incompat holders, we can exit safely
312 * without worrying about re-queueing this lock as that will
313 * happen on the last call to user_cluster_unlock. */
314 if ((lockres->l_blocking == LKM_EXMODE)
315 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
316 spin_unlock(&lockres->l_lock);
317 mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
318 lockres->l_ro_holders, lockres->l_ex_holders);
319 goto drop_ref;
320 }
321
322 if ((lockres->l_blocking == LKM_PRMODE)
323 && lockres->l_ex_holders) {
324 spin_unlock(&lockres->l_lock);
325 mlog(0, "can't downconvert for pr: ex = %u\n",
326 lockres->l_ex_holders);
327 goto drop_ref;
328 }
329
330 /* yay, we can downconvert now. */
331 new_level = user_highest_compat_lock_level(lockres->l_blocking);
332 lockres->l_requested = new_level;
333 lockres->l_flags |= USER_LOCK_BUSY;
334 mlog(0, "Downconvert lock from %d to %d\n",
335 lockres->l_level, new_level);
336 spin_unlock(&lockres->l_lock);
337
338 /* need lock downconvert request now... */
339 status = dlmlock(dlm,
340 new_level,
341 &lockres->l_lksb,
342 LKM_CONVERT|LKM_VALBLK,
343 lockres->l_name,
344 user_ast,
345 lockres,
346 user_bast);
347 if (status != DLM_NORMAL) {
348 user_log_dlm_error("dlmlock", status, lockres);
349 user_recover_from_dlm_error(lockres);
350 }
351
352drop_ref:
353 user_dlm_drop_inode_ref(lockres);
354}
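/* The worker above does one of three things: cancels an in-flight
 * request, bails out while incompatible holders remain (relying on
 * user_dlm_cluster_unlock() to requeue it later), or issues the actual
 * downconvert via dlmlock(). */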
355
356static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
357 int level)
358{
359 switch(level) {
360 case LKM_EXMODE:
361 lockres->l_ex_holders++;
362 break;
363 case LKM_PRMODE:
364 lockres->l_ro_holders++;
365 break;
366 default:
367 BUG();
368 }
369}
370
371/* predict what lock level we'll be dropping down to on behalf
372 * of another node, and return true if the currently wanted
373 * level will be compatible with it. */
374static inline int
375user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
376 int wanted)
377{
378 BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
379
380 return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
381}
382
383int user_dlm_cluster_lock(struct user_lock_res *lockres,
384 int level,
385 int lkm_flags)
386{
387 int status, local_flags;
388 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
389
390 if (level != LKM_EXMODE &&
391 level != LKM_PRMODE) {
392 mlog(ML_ERROR, "lockres %s: invalid request!\n",
393 lockres->l_name);
394 status = -EINVAL;
395 goto bail;
396 }
397
398 mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n",
399 lockres->l_name,
400 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
401 lkm_flags);
402
403again:
404 if (signal_pending(current)) {
405 status = -ERESTARTSYS;
406 goto bail;
407 }
408
409 spin_lock(&lockres->l_lock);
410
411 /* We only compare against the currently granted level
412 * here. If the lock is blocked waiting on a downconvert,
413 * we'll get caught below. */
414 if ((lockres->l_flags & USER_LOCK_BUSY) &&
415 (level > lockres->l_level)) {
416 /* is someone sitting in dlm_lock? If so, wait on
417 * them. */
418 spin_unlock(&lockres->l_lock);
419
420 user_wait_on_busy_lock(lockres);
421 goto again;
422 }
423
424 if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
425 (!user_may_continue_on_blocked_lock(lockres, level))) {
426 /* the lock is currently blocked on behalf
427 * of another node */
428 spin_unlock(&lockres->l_lock);
429
430 user_wait_on_blocked_lock(lockres);
431 goto again;
432 }
433
434 if (level > lockres->l_level) {
435 local_flags = lkm_flags | LKM_VALBLK;
436 if (lockres->l_level != LKM_IVMODE)
437 local_flags |= LKM_CONVERT;
438
439 lockres->l_requested = level;
440 lockres->l_flags |= USER_LOCK_BUSY;
441 spin_unlock(&lockres->l_lock);
442
443 BUG_ON(level == LKM_IVMODE);
444 BUG_ON(level == LKM_NLMODE);
445
446 mlog(0, "lock %s, get lock from %d to level = %d\n",
447 lockres->l_name, lockres->l_level, level);
448
449 /* call dlm_lock to upgrade lock now */
450 status = dlmlock(dlm,
451 level,
452 &lockres->l_lksb,
453 local_flags,
454 lockres->l_name,
455 user_ast,
456 lockres,
457 user_bast);
458 if (status != DLM_NORMAL) {
459 if ((lkm_flags & LKM_NOQUEUE) &&
460 (status == DLM_NOTQUEUED))
461 status = -EAGAIN;
462 else {
463 user_log_dlm_error("dlmlock", status, lockres);
464 status = -EINVAL;
465 }
466 user_recover_from_dlm_error(lockres);
467 goto bail;
468 }
469
470 mlog(0, "lock %s, successfull return from dlmlock\n",
471 lockres->l_name);
472
473 user_wait_on_busy_lock(lockres);
474 goto again;
475 }
476
477 user_dlm_inc_holders(lockres, level);
478 spin_unlock(&lockres->l_lock);
479
480 mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name,
481 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
482
483 status = 0;
484bail:
485 return status;
486}
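/* A minimal, hypothetical usage sketch (error handling elided; "lockres"
 * is assumed to be an initialized struct user_lock_res):
 *
 *	if (!user_dlm_cluster_lock(lockres, LKM_PRMODE, 0)) {
 *		... read shared state under the PR lock ...
 *		user_dlm_cluster_unlock(lockres, LKM_PRMODE);
 *	}
 */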
487
488static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
489 int level)
490{
491 switch(level) {
492 case LKM_EXMODE:
493 BUG_ON(!lockres->l_ex_holders);
494 lockres->l_ex_holders--;
495 break;
496 case LKM_PRMODE:
497 BUG_ON(!lockres->l_ro_holders);
498 lockres->l_ro_holders--;
499 break;
500 default:
501 BUG();
502 }
503}
504
505void user_dlm_cluster_unlock(struct user_lock_res *lockres,
506 int level)
507{
508 if (level != LKM_EXMODE &&
509 level != LKM_PRMODE) {
510 mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name);
511 return;
512 }
513
514 mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name,
515 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
516
517 spin_lock(&lockres->l_lock);
518 user_dlm_dec_holders(lockres, level);
519 __user_dlm_cond_queue_lockres(lockres);
520 spin_unlock(&lockres->l_lock);
521}
522
523void user_dlm_write_lvb(struct inode *inode,
524 const char *val,
525 unsigned int len)
526{
527 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
528 char *lvb = lockres->l_lksb.lvb;
529
530 BUG_ON(len > DLM_LVB_LEN);
531
532 spin_lock(&lockres->l_lock);
533
534 BUG_ON(lockres->l_level < LKM_EXMODE);
535 memcpy(lvb, val, len);
536
537 spin_unlock(&lockres->l_lock);
538}
539
540void user_dlm_read_lvb(struct inode *inode,
541 char *val,
542 unsigned int len)
543{
544 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
545 char *lvb = lockres->l_lksb.lvb;
546
547 BUG_ON(len > DLM_LVB_LEN);
548
549 spin_lock(&lockres->l_lock);
550
551 BUG_ON(lockres->l_level < LKM_PRMODE);
552 memcpy(val, lvb, len);
553
554 spin_unlock(&lockres->l_lock);
555}
556
557void user_dlm_lock_res_init(struct user_lock_res *lockres,
558 struct dentry *dentry)
559{
560 memset(lockres, 0, sizeof(*lockres));
561
562 spin_lock_init(&lockres->l_lock);
563 init_waitqueue_head(&lockres->l_event);
564 lockres->l_level = LKM_IVMODE;
565 lockres->l_requested = LKM_IVMODE;
566 lockres->l_blocking = LKM_IVMODE;
567
568 /* should have been checked before getting here. */
569 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
570
571 memcpy(lockres->l_name,
572 dentry->d_name.name,
573 dentry->d_name.len);
574}
575
576int user_dlm_destroy_lock(struct user_lock_res *lockres)
577{
578 int status = -EBUSY;
579 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
580
581 mlog(0, "asked to destroy %s\n", lockres->l_name);
582
583 spin_lock(&lockres->l_lock);
584 while (lockres->l_flags & USER_LOCK_BUSY) {
585 spin_unlock(&lockres->l_lock);
586
587 mlog(0, "lock %s is busy\n", lockres->l_name);
588
589 user_wait_on_busy_lock(lockres);
590
591 spin_lock(&lockres->l_lock);
592 }
593
594 if (lockres->l_ro_holders || lockres->l_ex_holders) {
595 spin_unlock(&lockres->l_lock);
596 mlog(0, "lock %s has holders\n", lockres->l_name);
597 goto bail;
598 }
599
600 status = 0;
601 if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
602 spin_unlock(&lockres->l_lock);
603 mlog(0, "lock %s is not attached\n", lockres->l_name);
604 goto bail;
605 }
606
607 lockres->l_flags &= ~USER_LOCK_ATTACHED;
608 lockres->l_flags |= USER_LOCK_BUSY;
609 lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
610 spin_unlock(&lockres->l_lock);
611
612 mlog(0, "unlocking lockres %s\n", lockres->l_name);
613 status = dlmunlock(dlm,
614 &lockres->l_lksb,
615 LKM_VALBLK,
616 user_unlock_ast,
617 lockres);
618 if (status != DLM_NORMAL) {
619 user_log_dlm_error("dlmunlock", status, lockres);
620 status = -EINVAL;
621 goto bail;
622 }
623
624 user_wait_on_busy_lock(lockres);
625
626 status = 0;
627bail:
628 return status;
629}
630
631struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
632{
633 struct dlm_ctxt *dlm;
634 u32 dlm_key;
635 char *domain;
636
637 domain = kmalloc(name->len + 1, GFP_KERNEL);
638 if (!domain) {
639 mlog_errno(-ENOMEM);
640 return ERR_PTR(-ENOMEM);
641 }
642
643 dlm_key = crc32_le(0, name->name, name->len);
644
645 snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
646
647 dlm = dlm_register_domain(domain, dlm_key);
648 if (IS_ERR(dlm))
649 mlog_errno(PTR_ERR(dlm));
650
651 kfree(domain);
652 return dlm;
653}
654
655void user_dlm_unregister_context(struct dlm_ctxt *dlm)
656{
657 dlm_unregister_domain(dlm);
658}
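/* The expected lifecycle, sketched with a hypothetical caller ("name" is
 * assumed to point at a populated struct qstr):
 *
 *	dlm = user_dlm_register_context(name);
 *	if (IS_ERR(dlm))
 *		return PTR_ERR(dlm);
 *	...use the domain, e.g. via user_dlm_cluster_lock()...
 *	user_dlm_unregister_context(dlm);
 */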
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h
new file mode 100644
index 00000000000..04178bc40b7
--- /dev/null
+++ b/fs/ocfs2/dlm/userdlm.h
@@ -0,0 +1,111 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * userdlm.h
5 *
6 * Userspace dlm defines
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26
27#ifndef USERDLM_H
28#define USERDLM_H
29
30#include <linux/module.h>
31#include <linux/fs.h>
32#include <linux/types.h>
33#include <linux/workqueue.h>
34
35/* user_lock_res->l_flags flags. */
36#define USER_LOCK_ATTACHED (0x00000001) /* have we initialized
37 * the lvb */
38#define USER_LOCK_BUSY (0x00000002) /* we are currently in
39 * dlm_lock */
40#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to
41 * downconvert*/
42#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently
43 * destroying this
44 * lock. */
45#define USER_LOCK_QUEUED (0x00000010) /* lock is on the
46 * workqueue */
47#define USER_LOCK_IN_CANCEL (0x00000020)
48
49struct user_lock_res {
50 spinlock_t l_lock;
51
52 int l_flags;
53
54#define USER_DLM_LOCK_ID_MAX_LEN 32
55 char l_name[USER_DLM_LOCK_ID_MAX_LEN];
56 int l_level;
57 unsigned int l_ro_holders;
58 unsigned int l_ex_holders;
59 struct dlm_lockstatus l_lksb;
60
61 int l_requested;
62 int l_blocking;
63
64 wait_queue_head_t l_event;
65
66 struct work_struct l_work;
67};
68
69extern struct workqueue_struct *user_dlm_worker;
70
71void user_dlm_lock_res_init(struct user_lock_res *lockres,
72 struct dentry *dentry);
73int user_dlm_destroy_lock(struct user_lock_res *lockres);
74int user_dlm_cluster_lock(struct user_lock_res *lockres,
75 int level,
76 int lkm_flags);
77void user_dlm_cluster_unlock(struct user_lock_res *lockres,
78 int level);
79void user_dlm_write_lvb(struct inode *inode,
80 const char *val,
81 unsigned int len);
82void user_dlm_read_lvb(struct inode *inode,
83 char *val,
84 unsigned int len);
85struct dlm_ctxt *user_dlm_register_context(struct qstr *name);
86void user_dlm_unregister_context(struct dlm_ctxt *dlm);
87
88struct dlmfs_inode_private {
89 struct dlm_ctxt *ip_dlm;
90
91 struct user_lock_res ip_lockres; /* unused for directories. */
92 struct inode *ip_parent;
93
94 struct inode ip_vfs_inode;
95};
96
97static inline struct dlmfs_inode_private *
98DLMFS_I(struct inode *inode)
99{
100 return container_of(inode,
101 struct dlmfs_inode_private,
102 ip_vfs_inode);
103}
104
105struct dlmfs_filp_private {
106 int fp_lock_level;
107};
108
109#define DLMFS_MAGIC 0x76a9f425
110
111#endif /* USERDLM_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
new file mode 100644
index 00000000000..e971ec2f840
--- /dev/null
+++ b/fs/ocfs2/dlmglue.c
@@ -0,0 +1,2904 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmglue.c
5 *
6 * Code which implements an OCFS2 specific interface to our DLM.
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/mm.h>
30#include <linux/smp_lock.h>
31#include <linux/crc32.h>
32#include <linux/kthread.h>
33#include <linux/pagemap.h>
34#include <linux/debugfs.h>
35#include <linux/seq_file.h>
36
37#include <cluster/heartbeat.h>
38#include <cluster/nodemanager.h>
39#include <cluster/tcp.h>
40
41#include <dlm/dlmapi.h>
42
43#define MLOG_MASK_PREFIX ML_DLM_GLUE
44#include <cluster/masklog.h>
45
46#include "ocfs2.h"
47
48#include "alloc.h"
49#include "dlmglue.h"
50#include "extent_map.h"
51#include "heartbeat.h"
52#include "inode.h"
53#include "journal.h"
54#include "slot_map.h"
55#include "super.h"
56#include "uptodate.h"
57#include "vote.h"
58
59#include "buffer_head_io.h"
60
61struct ocfs2_mask_waiter {
62 struct list_head mw_item;
63 int mw_status;
64 struct completion mw_complete;
65 unsigned long mw_mask;
66 unsigned long mw_goal;
67};
68
69static void ocfs2_inode_ast_func(void *opaque);
70static void ocfs2_inode_bast_func(void *opaque,
71 int level);
72static void ocfs2_super_ast_func(void *opaque);
73static void ocfs2_super_bast_func(void *opaque,
74 int level);
75static void ocfs2_rename_ast_func(void *opaque);
76static void ocfs2_rename_bast_func(void *opaque,
77 int level);
78
79/* so far, all locks have gotten along with the same unlock ast */
80static void ocfs2_unlock_ast_func(void *opaque,
81 enum dlm_status status);
82static int ocfs2_do_unblock_meta(struct inode *inode,
83 int *requeue);
84static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
85 int *requeue);
86static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
87 int *requeue);
88static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
89 int *requeue);
90static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
91 int *requeue);
92typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
93static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
94 struct ocfs2_lock_res *lockres,
95 int *requeue,
96 ocfs2_convert_worker_t *worker);
97
98struct ocfs2_lock_res_ops {
99 void (*ast)(void *);
100 void (*bast)(void *, int);
101 void (*unlock_ast)(void *, enum dlm_status);
102 int (*unblock)(struct ocfs2_lock_res *, int *);
103};
104
105static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
106 .ast = ocfs2_inode_ast_func,
107 .bast = ocfs2_inode_bast_func,
108 .unlock_ast = ocfs2_unlock_ast_func,
109 .unblock = ocfs2_unblock_inode_lock,
110};
111
112static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
113 .ast = ocfs2_inode_ast_func,
114 .bast = ocfs2_inode_bast_func,
115 .unlock_ast = ocfs2_unlock_ast_func,
116 .unblock = ocfs2_unblock_meta,
117};
118
119static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
120 int blocking);
121
122static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
123 .ast = ocfs2_inode_ast_func,
124 .bast = ocfs2_inode_bast_func,
125 .unlock_ast = ocfs2_unlock_ast_func,
126 .unblock = ocfs2_unblock_data,
127};
128
129static struct ocfs2_lock_res_ops ocfs2_super_lops = {
130 .ast = ocfs2_super_ast_func,
131 .bast = ocfs2_super_bast_func,
132 .unlock_ast = ocfs2_unlock_ast_func,
133 .unblock = ocfs2_unblock_osb_lock,
134};
135
136static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
137 .ast = ocfs2_rename_ast_func,
138 .bast = ocfs2_rename_bast_func,
139 .unlock_ast = ocfs2_unlock_ast_func,
140 .unblock = ocfs2_unblock_osb_lock,
141};
142
143static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
144{
145 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
146 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
147 lockres->l_type == OCFS2_LOCK_TYPE_RW;
148}
149
150static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
151{
152 return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
153}
154
155static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
156{
157 return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
158}
159
160static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
161{
162 BUG_ON(!ocfs2_is_super_lock(lockres)
163 && !ocfs2_is_rename_lock(lockres));
164
165 return (struct ocfs2_super *) lockres->l_priv;
166}
167
168static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
169{
170 BUG_ON(!ocfs2_is_inode_lock(lockres));
171
172 return (struct inode *) lockres->l_priv;
173}
174
175static int ocfs2_lock_create(struct ocfs2_super *osb,
176 struct ocfs2_lock_res *lockres,
177 int level,
178 int dlm_flags);
179static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
180 int wanted);
181static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
182 struct ocfs2_lock_res *lockres,
183 int level);
184static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
185static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
186static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
187static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
188static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
189 struct ocfs2_lock_res *lockres);
190static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
191 int convert);
192#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \
193 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
194 "resource %s: %s\n", dlm_errname(_stat), _func, \
195 _lockres->l_name, dlm_errmsg(_stat)); \
196} while (0)
197static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
198 struct ocfs2_lock_res *lockres);
199static int ocfs2_meta_lock_update(struct inode *inode,
200 struct buffer_head **bh);
201static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
202static inline int ocfs2_highest_compat_lock_level(int level);
203static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
204 struct ocfs2_lock_res *lockres,
205 int new_level);
206
207static char *ocfs2_lock_type_strings[] = {
208 [OCFS2_LOCK_TYPE_META] = "Meta",
209 [OCFS2_LOCK_TYPE_DATA] = "Data",
210 [OCFS2_LOCK_TYPE_SUPER] = "Super",
211 [OCFS2_LOCK_TYPE_RENAME] = "Rename",
212 /* Need to differentiate from [R]ename.. serializing writes is the
213 * important job it does, anyway. */
214 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
215};
216
217static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
218{
219 mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
220 return ocfs2_lock_type_strings[type];
221}
222
223static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
224 u64 blkno,
225 u32 generation,
226 char *name)
227{
228 int len;
229
230 mlog_entry_void();
231
232 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
233
234 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x",
235 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno,
236 generation);
237
238 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
239
240 mlog(0, "built lock resource with name: %s\n", name);
241
242 mlog_exit_void();
243}
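
On the name layout: the BUG_ON above pins len at OCFS2_LOCK_ID_MAX_LEN - 1 characters, which decompose as one type character, the OCFS2_LOCK_ID_PAD string, sixteen hex digits of block number and eight hex digits of generation. Assuming OCFS2_LOCK_ID_MAX_LEN is 32 (matching USER_DLM_LOCK_ID_MAX_LEN in userdlm.h above), the pad must be six characters wide, so a meta lock on block 0x1234 with generation 0x5678 would render along the lines of:

	/* 'M' + <6-char pad> + "0000000000001234" + "00005678" */
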
244
245static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
246
247static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
248 struct ocfs2_dlm_debug *dlm_debug)
249{
250 mlog(0, "Add tracking for lockres %s\n", res->l_name);
251
252 spin_lock(&ocfs2_dlm_tracking_lock);
253 list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
254 spin_unlock(&ocfs2_dlm_tracking_lock);
255}
256
257static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
258{
259 spin_lock(&ocfs2_dlm_tracking_lock);
260 if (!list_empty(&res->l_debug_list))
261 list_del_init(&res->l_debug_list);
262 spin_unlock(&ocfs2_dlm_tracking_lock);
263}
264
265static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
266 struct ocfs2_lock_res *res,
267 enum ocfs2_lock_type type,
268 u64 blkno,
269 u32 generation,
270 struct ocfs2_lock_res_ops *ops,
271 void *priv)
272{
273 ocfs2_build_lock_name(type, blkno, generation, res->l_name);
274
275 res->l_type = type;
276 res->l_ops = ops;
277 res->l_priv = priv;
278
279 res->l_level = LKM_IVMODE;
280 res->l_requested = LKM_IVMODE;
281 res->l_blocking = LKM_IVMODE;
282 res->l_action = OCFS2_AST_INVALID;
283 res->l_unlock_action = OCFS2_UNLOCK_INVALID;
284
285 res->l_flags = OCFS2_LOCK_INITIALIZED;
286
287 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
288}
289
290void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
291{
292 /* This also clears out the lock status block */
293 memset(res, 0, sizeof(struct ocfs2_lock_res));
294 spin_lock_init(&res->l_lock);
295 init_waitqueue_head(&res->l_event);
296 INIT_LIST_HEAD(&res->l_blocked_list);
297 INIT_LIST_HEAD(&res->l_mask_waiters);
298}
299
300void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
301 enum ocfs2_lock_type type,
302 struct inode *inode)
303{
304 struct ocfs2_lock_res_ops *ops;
305
306 switch(type) {
307 case OCFS2_LOCK_TYPE_RW:
308 ops = &ocfs2_inode_rw_lops;
309 break;
310 case OCFS2_LOCK_TYPE_META:
311 ops = &ocfs2_inode_meta_lops;
312 break;
313 case OCFS2_LOCK_TYPE_DATA:
314 ops = &ocfs2_inode_data_lops;
315 break;
316 default:
317 mlog_bug_on_msg(1, "type: %d\n", type);
318 ops = NULL; /* thanks, gcc */
319 break;
320 }
321
322 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
323 OCFS2_I(inode)->ip_blkno,
324 inode->i_generation, ops, inode);
325}
326
327static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
328 struct ocfs2_super *osb)
329{
330 /* Superblock lockres doesn't come from a slab so we call init
331 * once on it manually. */
332 ocfs2_lock_res_init_once(res);
333 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
334 OCFS2_SUPER_BLOCK_BLKNO, 0,
335 &ocfs2_super_lops, osb);
336}
337
338static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
339 struct ocfs2_super *osb)
340{
341 /* Rename lockres doesn't come from a slab so we call init
342 * once on it manually. */
343 ocfs2_lock_res_init_once(res);
344 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0,
345 &ocfs2_rename_lops, osb);
346}
347
348void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
349{
350 mlog_entry_void();
351
352 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
353 return;
354
355 ocfs2_remove_lockres_tracking(res);
356
357 mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
358 "Lockres %s is on the blocked list\n",
359 res->l_name);
360 mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
361 "Lockres %s has mask waiters pending\n",
362 res->l_name);
363 mlog_bug_on_msg(spin_is_locked(&res->l_lock),
364 "Lockres %s is locked\n",
365 res->l_name);
366 mlog_bug_on_msg(res->l_ro_holders,
367 "Lockres %s has %u ro holders\n",
368 res->l_name, res->l_ro_holders);
369 mlog_bug_on_msg(res->l_ex_holders,
370 "Lockres %s has %u ex holders\n",
371 res->l_name, res->l_ex_holders);
372
373 /* Need to clear out the lock status block for the dlm */
374 memset(&res->l_lksb, 0, sizeof(res->l_lksb));
375
376 res->l_flags = 0UL;
377 mlog_exit_void();
378}
379
380static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
381 int level)
382{
383 mlog_entry_void();
384
385 BUG_ON(!lockres);
386
387 switch(level) {
388 case LKM_EXMODE:
389 lockres->l_ex_holders++;
390 break;
391 case LKM_PRMODE:
392 lockres->l_ro_holders++;
393 break;
394 default:
395 BUG();
396 }
397
398 mlog_exit_void();
399}
400
401static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
402 int level)
403{
404 mlog_entry_void();
405
406 BUG_ON(!lockres);
407
408 switch(level) {
409 case LKM_EXMODE:
410 BUG_ON(!lockres->l_ex_holders);
411 lockres->l_ex_holders--;
412 break;
413 case LKM_PRMODE:
414 BUG_ON(!lockres->l_ro_holders);
415 lockres->l_ro_holders--;
416 break;
417 default:
418 BUG();
419 }
420 mlog_exit_void();
421}
422
423/* WARNING: This function lives in a world where the only three lock
424 * levels are EX, PR, and NL. It *will* have to be adjusted when more
425 * lock types are added. */
426static inline int ocfs2_highest_compat_lock_level(int level)
427{
428 int new_level = LKM_EXMODE;
429
430 if (level == LKM_EXMODE)
431 new_level = LKM_NLMODE;
432 else if (level == LKM_PRMODE)
433 new_level = LKM_PRMODE;
434 return new_level;
435}
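
The three-level mapping above, read directly off the code: a holder at EX forces other nodes down to NL, a holder at PR tolerates other readers, and NL tolerates anything up to EX.

	/* level held      highest level compatible on other nodes
	 * LKM_EXMODE  ->  LKM_NLMODE
	 * LKM_PRMODE  ->  LKM_PRMODE
	 * LKM_NLMODE  ->  LKM_EXMODE
	 */
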
436
437static void lockres_set_flags(struct ocfs2_lock_res *lockres,
438 unsigned long newflags)
439{
440 struct list_head *pos, *tmp;
441 struct ocfs2_mask_waiter *mw;
442
443 assert_spin_locked(&lockres->l_lock);
444
445 lockres->l_flags = newflags;
446
447 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
448 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
449 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
450 continue;
451
452 list_del_init(&mw->mw_item);
453 mw->mw_status = 0;
454 complete(&mw->mw_complete);
455 }
456}
457static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
458{
459 lockres_set_flags(lockres, lockres->l_flags | or);
460}
461static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
462 unsigned long clear)
463{
464 lockres_set_flags(lockres, lockres->l_flags & ~clear);
465}
466
467static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
468{
469 mlog_entry_void();
470
471 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
472 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
473 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
474 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
475
476 lockres->l_level = lockres->l_requested;
477 if (lockres->l_level <=
478 ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
479 lockres->l_blocking = LKM_NLMODE;
480 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
481 }
482 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
483
484 mlog_exit_void();
485}
486
487static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
488{
489 mlog_entry_void();
490
491 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
492 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
493
494 /* Convert from RO to EX doesn't really need anything as our
495 * information is already up to date. Convert from NL to
496 * *anything* however should mark ourselves as needing an
497 * update */
498 if (lockres->l_level == LKM_NLMODE)
499 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
500
501 lockres->l_level = lockres->l_requested;
502 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
503
504 mlog_exit_void();
505}
506
507static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
508{
509 mlog_entry_void();
510
511 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
512 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
513
514 if (lockres->l_requested > LKM_NLMODE &&
515 !(lockres->l_flags & OCFS2_LOCK_LOCAL))
516 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
517
518 lockres->l_level = lockres->l_requested;
519 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
520 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
521
522 mlog_exit_void();
523}
524
525static void ocfs2_inode_ast_func(void *opaque)
526{
527 struct ocfs2_lock_res *lockres = opaque;
528 struct inode *inode;
529 struct dlm_lockstatus *lksb;
530 unsigned long flags;
531
532 mlog_entry_void();
533
534 inode = ocfs2_lock_res_inode(lockres);
535
536 mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n",
537 OCFS2_I(inode)->ip_blkno, lockres->l_action,
538 ocfs2_lock_type_string(lockres->l_type));
539
540 BUG_ON(!ocfs2_is_inode_lock(lockres));
541
542 spin_lock_irqsave(&lockres->l_lock, flags);
543
544 lksb = &(lockres->l_lksb);
545 if (lksb->status != DLM_NORMAL) {
546 mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
547 "on inode %"MLFu64"\n", lksb->status,
548 OCFS2_I(inode)->ip_blkno);
549 spin_unlock_irqrestore(&lockres->l_lock, flags);
550 mlog_exit_void();
551 return;
552 }
553
554 switch(lockres->l_action) {
555 case OCFS2_AST_ATTACH:
556 ocfs2_generic_handle_attach_action(lockres);
557 lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
558 break;
559 case OCFS2_AST_CONVERT:
560 ocfs2_generic_handle_convert_action(lockres);
561 break;
562 case OCFS2_AST_DOWNCONVERT:
563 ocfs2_generic_handle_downconvert_action(lockres);
564 break;
565 default:
566 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
567 "lockres flags = 0x%lx, unlock action: %u\n",
568 lockres->l_name, lockres->l_action, lockres->l_flags,
569 lockres->l_unlock_action);
570
571 BUG();
572 }
573
574 /* data and rw locking ignores refresh flag for now. */
575 if (lockres->l_type != OCFS2_LOCK_TYPE_META)
576 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
577
578 /* set it to something invalid so if we get called again we
579 * can catch it. */
580 lockres->l_action = OCFS2_AST_INVALID;
581 spin_unlock_irqrestore(&lockres->l_lock, flags);
582 wake_up(&lockres->l_event);
583
584 mlog_exit_void();
585}
586
587static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
588 int level)
589{
590 int needs_downconvert = 0;
591 mlog_entry_void();
592
593 assert_spin_locked(&lockres->l_lock);
594
595 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
596
597 if (level > lockres->l_blocking) {
598 /* only schedule a downconvert if we haven't already scheduled
599 * one that goes low enough to satisfy the level we're
600 * blocking. This also catches the case where we get
601 * duplicate BASTs */
602 if (ocfs2_highest_compat_lock_level(level) <
603 ocfs2_highest_compat_lock_level(lockres->l_blocking))
604 needs_downconvert = 1;
605
606 lockres->l_blocking = level;
607 }
608
609 mlog_exit(needs_downconvert);
610 return needs_downconvert;
611}
612
613static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
614 struct ocfs2_lock_res *lockres,
615 int level)
616{
617 int needs_downconvert;
618 unsigned long flags;
619
620 mlog_entry_void();
621
622 BUG_ON(level <= LKM_NLMODE);
623
624 spin_lock_irqsave(&lockres->l_lock, flags);
625 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
626 if (needs_downconvert)
627 ocfs2_schedule_blocked_lock(osb, lockres);
628 spin_unlock_irqrestore(&lockres->l_lock, flags);
629
630 ocfs2_kick_vote_thread(osb);
631
632 wake_up(&lockres->l_event);
633 mlog_exit_void();
634}
635
636static void ocfs2_inode_bast_func(void *opaque, int level)
637{
638 struct ocfs2_lock_res *lockres = opaque;
639 struct inode *inode;
640 struct ocfs2_super *osb;
641
642 mlog_entry_void();
643
644 BUG_ON(!ocfs2_is_inode_lock(lockres));
645
646 inode = ocfs2_lock_res_inode(lockres);
647 osb = OCFS2_SB(inode->i_sb);
648
649 mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d "
650 "type = %s\n", OCFS2_I(inode)->ip_blkno, level,
651 lockres->l_level,
652 ocfs2_lock_type_string(lockres->l_type));
653
654 ocfs2_generic_bast_func(osb, lockres, level);
655
656 mlog_exit_void();
657}
658
659static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
660 int ignore_refresh)
661{
662 struct dlm_lockstatus *lksb = &lockres->l_lksb;
663 unsigned long flags;
664
665 spin_lock_irqsave(&lockres->l_lock, flags);
666
667 if (lksb->status != DLM_NORMAL) {
668 mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
669 lockres->l_name, lksb->status);
670 spin_unlock_irqrestore(&lockres->l_lock, flags);
671 return;
672 }
673
674 switch(lockres->l_action) {
675 case OCFS2_AST_ATTACH:
676 ocfs2_generic_handle_attach_action(lockres);
677 break;
678 case OCFS2_AST_CONVERT:
679 ocfs2_generic_handle_convert_action(lockres);
680 break;
681 case OCFS2_AST_DOWNCONVERT:
682 ocfs2_generic_handle_downconvert_action(lockres);
683 break;
684 default:
685 BUG();
686 }
687
688 if (ignore_refresh)
689 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
690
691 /* set it to something invalid so if we get called again we
692 * can catch it. */
693 lockres->l_action = OCFS2_AST_INVALID;
694 spin_unlock_irqrestore(&lockres->l_lock, flags);
695
696 wake_up(&lockres->l_event);
697}
698
699static void ocfs2_super_ast_func(void *opaque)
700{
701 struct ocfs2_lock_res *lockres = opaque;
702
703 mlog_entry_void();
704 mlog(0, "Superblock AST fired\n");
705
706 BUG_ON(!ocfs2_is_super_lock(lockres));
707 ocfs2_generic_ast_func(lockres, 0);
708
709 mlog_exit_void();
710}
711
712static void ocfs2_super_bast_func(void *opaque,
713 int level)
714{
715 struct ocfs2_lock_res *lockres = opaque;
716 struct ocfs2_super *osb;
717
718 mlog_entry_void();
719 mlog(0, "Superblock BAST fired\n");
720
721 BUG_ON(!ocfs2_is_super_lock(lockres));
722 osb = ocfs2_lock_res_super(lockres);
723 ocfs2_generic_bast_func(osb, lockres, level);
724
725 mlog_exit_void();
726}
727
728static void ocfs2_rename_ast_func(void *opaque)
729{
730 struct ocfs2_lock_res *lockres = opaque;
731
732 mlog_entry_void();
733
734 mlog(0, "Rename AST fired\n");
735
736 BUG_ON(!ocfs2_is_rename_lock(lockres));
737
738 ocfs2_generic_ast_func(lockres, 1);
739
740 mlog_exit_void();
741}
742
743static void ocfs2_rename_bast_func(void *opaque,
744 int level)
745{
746 struct ocfs2_lock_res *lockres = opaque;
747 struct ocfs2_super *osb;
748
749 mlog_entry_void();
750
751 mlog(0, "Rename BAST fired\n");
752
753 BUG_ON(!ocfs2_is_rename_lock(lockres));
754
755 osb = ocfs2_lock_res_super(lockres);
756 ocfs2_generic_bast_func(osb, lockres, level);
757
758 mlog_exit_void();
759}
760
761static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
762 int convert)
763{
764 unsigned long flags;
765
766 mlog_entry_void();
767 spin_lock_irqsave(&lockres->l_lock, flags);
768 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
769 if (convert)
770 lockres->l_action = OCFS2_AST_INVALID;
771 else
772 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
773 spin_unlock_irqrestore(&lockres->l_lock, flags);
774
775 wake_up(&lockres->l_event);
776 mlog_exit_void();
777}
778
779/* Note: If we detect another process working on the lock (i.e.,
780 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
781 * to do the right thing in that case.
782 */
783static int ocfs2_lock_create(struct ocfs2_super *osb,
784 struct ocfs2_lock_res *lockres,
785 int level,
786 int dlm_flags)
787{
788 int ret = 0;
789 enum dlm_status status;
790 unsigned long flags;
791
792 mlog_entry_void();
793
794 mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
795 dlm_flags);
796
797 spin_lock_irqsave(&lockres->l_lock, flags);
798 if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
799 (lockres->l_flags & OCFS2_LOCK_BUSY)) {
800 spin_unlock_irqrestore(&lockres->l_lock, flags);
801 goto bail;
802 }
803
804 lockres->l_action = OCFS2_AST_ATTACH;
805 lockres->l_requested = level;
806 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
807 spin_unlock_irqrestore(&lockres->l_lock, flags);
808
809 status = dlmlock(osb->dlm,
810 level,
811 &lockres->l_lksb,
812 dlm_flags,
813 lockres->l_name,
814 lockres->l_ops->ast,
815 lockres,
816 lockres->l_ops->bast);
817 if (status != DLM_NORMAL) {
818 ocfs2_log_dlm_error("dlmlock", status, lockres);
819 ret = -EINVAL;
820 ocfs2_recover_from_dlm_error(lockres, 1);
821 }
822
823 mlog(0, "lock %s, successful return from dlmlock\n", lockres->l_name);
824
825bail:
826 mlog_exit(ret);
827 return ret;
828}
829
830static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
831 int flag)
832{
833 unsigned long flags;
834 int ret;
835
836 spin_lock_irqsave(&lockres->l_lock, flags);
837 ret = lockres->l_flags & flag;
838 spin_unlock_irqrestore(&lockres->l_lock, flags);
839
840 return ret;
841}
842
843static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
844
845{
846 wait_event(lockres->l_event,
847 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
848}
849
850static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
851
852{
853 wait_event(lockres->l_event,
854 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
855}
856
857/* predict what lock level we'll be dropping down to on behalf
858 * of another node, and return true if the currently wanted
859 * level will be compatible with it. */
860static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
861 int wanted)
862{
863 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
864
865 return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
866}
867
868static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
869{
870 INIT_LIST_HEAD(&mw->mw_item);
871 init_completion(&mw->mw_complete);
872}
873
874static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
875{
876 wait_for_completion(&mw->mw_complete);
877 /* Re-arm the completion in case we want to wait on it again */
878 INIT_COMPLETION(mw->mw_complete);
879 return mw->mw_status;
880}
881
882static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
883 struct ocfs2_mask_waiter *mw,
884 unsigned long mask,
885 unsigned long goal)
886{
887 BUG_ON(!list_empty(&mw->mw_item));
888
889 assert_spin_locked(&lockres->l_lock);
890
891 list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
892 mw->mw_mask = mask;
893 mw->mw_goal = goal;
894}
895
896/* returns 0 if the mw that was removed was already satisfied, -EBUSY
897 * if the mask still hadn't reached its goal */
898static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
899 struct ocfs2_mask_waiter *mw)
900{
901 unsigned long flags;
902 int ret = 0;
903
904 spin_lock_irqsave(&lockres->l_lock, flags);
905 if (!list_empty(&mw->mw_item)) {
906 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
907 ret = -EBUSY;
908
909 list_del_init(&mw->mw_item);
910 init_completion(&mw->mw_complete);
911 }
912 spin_unlock_irqrestore(&lockres->l_lock, flags);
913
914 return ret;
915
916}
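
Taken together, ocfs2_init_mask_waiter(), lockres_add_mask_waiter(), ocfs2_wait_for_mask() and lockres_remove_mask_waiter() implement a "sleep until (l_flags & mask) == goal" primitive, with lockres_set_flags() doing the wakeups. A condensed sketch of the pattern as ocfs2_cluster_lock() below actually uses it (error paths omitted):

	struct ocfs2_mask_waiter mw;
	unsigned long flags;
	int ret;

	ocfs2_init_mask_waiter(&mw);

	spin_lock_irqsave(&lockres->l_lock, flags);
	/* sleep until OCFS2_LOCK_BUSY clears: mask = BUSY, goal = 0 */
	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ret = ocfs2_wait_for_mask(&mw);	/* woken by lockres_set_flags() */
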
917
918static int ocfs2_cluster_lock(struct ocfs2_super *osb,
919 struct ocfs2_lock_res *lockres,
920 int level,
921 int lkm_flags,
922 int arg_flags)
923{
924 struct ocfs2_mask_waiter mw;
925 enum dlm_status status;
926 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
927 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
928 unsigned long flags;
929
930 mlog_entry_void();
931
932 ocfs2_init_mask_waiter(&mw);
933
934again:
935 wait = 0;
936
937 if (catch_signals && signal_pending(current)) {
938 ret = -ERESTARTSYS;
939 goto out;
940 }
941
942 spin_lock_irqsave(&lockres->l_lock, flags);
943
944 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
945 "Cluster lock called on freeing lockres %s! flags "
946 "0x%lx\n", lockres->l_name, lockres->l_flags);
947
948 /* We only compare against the currently granted level
949 * here. If the lock is blocked waiting on a downconvert,
950 * we'll get caught below. */
951 if (lockres->l_flags & OCFS2_LOCK_BUSY &&
952 level > lockres->l_level) {
953 /* is someone sitting in dlm_lock? If so, wait on
954 * them. */
955 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
956 wait = 1;
957 goto unlock;
958 }
959
960 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
961 /* lock has not been created yet. */
962 spin_unlock_irqrestore(&lockres->l_lock, flags);
963
964 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
965 if (ret < 0) {
966 mlog_errno(ret);
967 goto out;
968 }
969 goto again;
970 }
971
972 if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
973 !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
974 /* the lock is currently blocked on behalf of
975 * another node */
976 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
977 wait = 1;
978 goto unlock;
979 }
980
981 if (level > lockres->l_level) {
982 if (lockres->l_action != OCFS2_AST_INVALID)
983 mlog(ML_ERROR, "lockres %s has action %u pending\n",
984 lockres->l_name, lockres->l_action);
985
986 lockres->l_action = OCFS2_AST_CONVERT;
987 lockres->l_requested = level;
988 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
989 spin_unlock_irqrestore(&lockres->l_lock, flags);
990
991 BUG_ON(level == LKM_IVMODE);
992 BUG_ON(level == LKM_NLMODE);
993
994 mlog(0, "lock %s, convert from %d to level = %d\n",
995 lockres->l_name, lockres->l_level, level);
996
997 /* call dlm_lock to upgrade lock now */
998 status = dlmlock(osb->dlm,
999 level,
1000 &lockres->l_lksb,
1001 lkm_flags|LKM_CONVERT|LKM_VALBLK,
1002 lockres->l_name,
1003 lockres->l_ops->ast,
1004 lockres,
1005 lockres->l_ops->bast);
1006 if (status != DLM_NORMAL) {
1007 if ((lkm_flags & LKM_NOQUEUE) &&
1008 (status == DLM_NOTQUEUED))
1009 ret = -EAGAIN;
1010 else {
1011 ocfs2_log_dlm_error("dlmlock", status,
1012 lockres);
1013 ret = -EINVAL;
1014 }
1015 ocfs2_recover_from_dlm_error(lockres, 1);
1016 goto out;
1017 }
1018
1019 mlog(0, "lock %s, successful return from dlmlock\n",
1020 lockres->l_name);
1021
1022 /* At this point we've gone inside the dlm and need to
1023 * complete our work regardless. */
1024 catch_signals = 0;
1025
1026 /* wait for busy to clear and carry on */
1027 goto again;
1028 }
1029
1030 /* Ok, if we get here then we're good to go. */
1031 ocfs2_inc_holders(lockres, level);
1032
1033 ret = 0;
1034unlock:
1035 spin_unlock_irqrestore(&lockres->l_lock, flags);
1036out:
1037 /*
1038 * This is helping work around a lock inversion between the page lock
1039 * and dlm locks. One path holds the page lock while calling aops
1040 * which block acquiring dlm locks. The voting thread holds dlm
1041 * locks while acquiring page locks while down converting data locks.
1042 * This block is helping an aop path notice the inversion and back
1043 * off to unlock its page lock before trying the dlm lock again.
1044 */
1045 if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
1046 mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
1047 wait = 0;
1048 if (lockres_remove_mask_waiter(lockres, &mw))
1049 ret = -EAGAIN;
1050 else
1051 goto again;
1052 }
1053 if (wait) {
1054 ret = ocfs2_wait_for_mask(&mw);
1055 if (ret == 0)
1056 goto again;
1057 mlog_errno(ret);
1058 }
1059
1060 mlog_exit(ret);
1061 return ret;
1062}
1063
1064static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1065 struct ocfs2_lock_res *lockres,
1066 int level)
1067{
1068 unsigned long flags;
1069
1070 mlog_entry_void();
1071 spin_lock_irqsave(&lockres->l_lock, flags);
1072 ocfs2_dec_holders(lockres, level);
1073 ocfs2_vote_on_unlock(osb, lockres);
1074 spin_unlock_irqrestore(&lockres->l_lock, flags);
1075 mlog_exit_void();
1076}
1077
1078static int ocfs2_create_new_inode_lock(struct inode *inode,
1079 struct ocfs2_lock_res *lockres)
1080{
1081 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1082 unsigned long flags;
1083
1084 spin_lock_irqsave(&lockres->l_lock, flags);
1085 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
1086 lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
1087 spin_unlock_irqrestore(&lockres->l_lock, flags);
1088
1089 return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
1090}
1091
1092/* Grants us an EX lock on the data and metadata resources, skipping
1093 * the normal cluster directory lookup. Use this ONLY on newly created
1094 * inodes which other nodes can't possibly see, and which haven't been
1095 * hashed in the inode hash yet. This can give us a good performance
1096 * increase as it'll skip the network broadcast normally associated
1097 * with creating a new lock resource. */
1098int ocfs2_create_new_inode_locks(struct inode *inode)
1099{
1100 int ret;
1101
1102 BUG_ON(!inode);
1103 BUG_ON(!ocfs2_inode_is_new(inode));
1104
1105 mlog_entry_void();
1106
1107 mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
1108
1109 /* Note that we don't increment any of the holder counts, nor
1110 * do we add anything to a journal handle. Since this is
1111 * supposed to be a new inode which the cluster doesn't know
1112 * about yet, there is no need to. As far as the LVB handling
1113 * is concerned, this is basically like acquiring an EX lock
1114 * on a resource which has an invalid one -- we'll set it
1115 * valid when we release the EX. */
1116
1117 ret = ocfs2_create_new_inode_lock(inode,
1118 &OCFS2_I(inode)->ip_rw_lockres);
1119 if (ret) {
1120 mlog_errno(ret);
1121 goto bail;
1122 }
1123
1124 ret = ocfs2_create_new_inode_lock(inode,
1125 &OCFS2_I(inode)->ip_meta_lockres);
1126 if (ret) {
1127 mlog_errno(ret);
1128 goto bail;
1129 }
1130
1131 ret = ocfs2_create_new_inode_lock(inode,
1132 &OCFS2_I(inode)->ip_data_lockres);
1133 if (ret) {
1134 mlog_errno(ret);
1135 goto bail;
1136 }
1137
1138bail:
1139 mlog_exit(ret);
1140 return ret;
1141}
1142
1143int ocfs2_rw_lock(struct inode *inode, int write)
1144{
1145 int status, level;
1146 struct ocfs2_lock_res *lockres;
1147
1148 BUG_ON(!inode);
1149
1150 mlog_entry_void();
1151
1152 mlog(0, "inode %"MLFu64" take %s RW lock\n",
1153 OCFS2_I(inode)->ip_blkno,
1154 write ? "EXMODE" : "PRMODE");
1155
1156 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1157
1158 level = write ? LKM_EXMODE : LKM_PRMODE;
1159
1160 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1161 0);
1162 if (status < 0)
1163 mlog_errno(status);
1164
1165 mlog_exit(status);
1166 return status;
1167}
1168
1169void ocfs2_rw_unlock(struct inode *inode, int write)
1170{
1171 int level = write ? LKM_EXMODE : LKM_PRMODE;
1172 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1173
1174 mlog_entry_void();
1175
1176 mlog(0, "inode %"MLFu64" drop %s RW lock\n",
1177 OCFS2_I(inode)->ip_blkno,
1178 write ? "EXMODE" : "PRMODE");
1179
1180 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1181
1182 mlog_exit_void();
1183}
1184
1185int ocfs2_data_lock_full(struct inode *inode,
1186 int write,
1187 int arg_flags)
1188{
1189 int status = 0, level;
1190 struct ocfs2_lock_res *lockres;
1191
1192 BUG_ON(!inode);
1193
1194 mlog_entry_void();
1195
1196 mlog(0, "inode %"MLFu64" take %s DATA lock\n",
1197 OCFS2_I(inode)->ip_blkno,
1198 write ? "EXMODE" : "PRMODE");
1199
1200 /* We'll allow faking a readonly data lock for
1201 * rodevices. */
1202 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
1203 if (write) {
1204 status = -EROFS;
1205 mlog_errno(status);
1206 }
1207 goto out;
1208 }
1209
1210 lockres = &OCFS2_I(inode)->ip_data_lockres;
1211
1212 level = write ? LKM_EXMODE : LKM_PRMODE;
1213
1214 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
1215 0, arg_flags);
1216 if (status < 0 && status != -EAGAIN)
1217 mlog_errno(status);
1218
1219out:
1220 mlog_exit(status);
1221 return status;
1222}
1223
1224/* see ocfs2_meta_lock_with_page() */
1225int ocfs2_data_lock_with_page(struct inode *inode,
1226 int write,
1227 struct page *page)
1228{
1229 int ret;
1230
1231 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
1232 if (ret == -EAGAIN) {
1233 unlock_page(page);
1234 if (ocfs2_data_lock(inode, write) == 0)
1235 ocfs2_data_unlock(inode, write);
1236 ret = AOP_TRUNCATED_PAGE;
1237 }
1238
1239 return ret;
1240}
1241
1242static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1243 struct ocfs2_lock_res *lockres)
1244{
1245 int kick = 0;
1246
1247 mlog_entry_void();
1248
1249 /* If we know that another node is waiting on our lock, kick
1250 * the vote thread pre-emptively when we reach a release
1251 * condition. */
1252 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1253 switch(lockres->l_blocking) {
1254 case LKM_EXMODE:
1255 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1256 kick = 1;
1257 break;
1258 case LKM_PRMODE:
1259 if (!lockres->l_ex_holders)
1260 kick = 1;
1261 break;
1262 default:
1263 BUG();
1264 }
1265 }
1266
1267 if (kick)
1268 ocfs2_kick_vote_thread(osb);
1269
1270 mlog_exit_void();
1271}
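
Condensing the switch above: when the blocked waiter wants EX, the kick can only happen once both the reader and writer counts hit zero; when it wants PR, only the EX holders matter, since PR is compatible with the readers we still hold.
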
1272
1273void ocfs2_data_unlock(struct inode *inode,
1274 int write)
1275{
1276 int level = write ? LKM_EXMODE : LKM_PRMODE;
1277 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1278
1279 mlog_entry_void();
1280
1281 mlog(0, "inode %"MLFu64" drop %s DATA lock\n",
1282 OCFS2_I(inode)->ip_blkno,
1283 write ? "EXMODE" : "PRMODE");
1284
1285 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1286 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1287
1288 mlog_exit_void();
1289}
1290
1291#define OCFS2_SEC_BITS 34
1292#define OCFS2_SEC_SHIFT (64 - 34)
1293#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1)
1294
1295/* LVB only has room for 64 bits of time here so we pack it for
1296 * now. */
1297static u64 ocfs2_pack_timespec(struct timespec *spec)
1298{
1299 u64 res;
1300 u64 sec = spec->tv_sec;
1301 u32 nsec = spec->tv_nsec;
1302
1303 res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
1304
1305 return res;
1306}
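
A worked instance of the packing above. OCFS2_SEC_SHIFT is 64 - 34 = 30, and tv_nsec is always under 10^9 < 2^30, so the nanoseconds fit in the low bits unmodified:

	struct timespec ts = { .tv_sec = 5, .tv_nsec = 999999999 };
	u64 packed = ocfs2_pack_timespec(&ts);

	/* packed == (5ULL << 30) | 999999999 == 0x17B9AC9FF         */
	/* ocfs2_unpack_timespec() below inverts it:                 */
	/*   packed >> OCFS2_SEC_SHIFT == 5          (tv_sec)        */
	/*   packed &  OCFS2_NSEC_MASK == 999999999  (tv_nsec)       */
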
1307
1308/* Call this with the lockres locked. I am reasonably sure we don't
1309 * need ip_lock in this function as anyone who would be changing those
1310 * values is supposed to be blocked in ocfs2_meta_lock right now. */
1311static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1312{
1313 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1314 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1315 struct ocfs2_meta_lvb *lvb;
1316
1317 mlog_entry_void();
1318
1319 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1320
1321 lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION);
1322 lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
1323 lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
1324 lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
1325 lvb->lvb_igid = cpu_to_be32(inode->i_gid);
1326 lvb->lvb_imode = cpu_to_be16(inode->i_mode);
1327 lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
1328 lvb->lvb_iatime_packed =
1329 cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
1330 lvb->lvb_ictime_packed =
1331 cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
1332 lvb->lvb_imtime_packed =
1333 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
1334
1335 mlog_meta_lvb(0, lockres);
1336
1337 mlog_exit_void();
1338}
1339
1340static void ocfs2_unpack_timespec(struct timespec *spec,
1341 u64 packed_time)
1342{
1343 spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
1344 spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
1345}
1346
1347static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1348{
1349 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1350 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1351 struct ocfs2_meta_lvb *lvb;
1352
1353 mlog_entry_void();
1354
1355 mlog_meta_lvb(0, lockres);
1356
1357 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1358
1359 /* We're safe here without the lockres lock... */
1360 spin_lock(&oi->ip_lock);
1361 oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
1362 i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
1363
1364 /* fast-symlinks are a special case */
1365 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
1366 inode->i_blocks = 0;
1367 else
1368 inode->i_blocks =
1369 ocfs2_align_bytes_to_sectors(i_size_read(inode));
1370
1371 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
1372 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
1373 inode->i_mode = be16_to_cpu(lvb->lvb_imode);
1374 inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
1375 ocfs2_unpack_timespec(&inode->i_atime,
1376 be64_to_cpu(lvb->lvb_iatime_packed));
1377 ocfs2_unpack_timespec(&inode->i_mtime,
1378 be64_to_cpu(lvb->lvb_imtime_packed));
1379 ocfs2_unpack_timespec(&inode->i_ctime,
1380 be64_to_cpu(lvb->lvb_ictime_packed));
1381 spin_unlock(&oi->ip_lock);
1382
1383 mlog_exit_void();
1384}
1385
1386static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
1387{
1388 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1389
1390 if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
1391 return 1;
1392 return 0;
1393}
1394
1395/* Determine whether a lock resource needs to be refreshed, and
1396 * arbitrate who gets to refresh it.
1397 *
1398 * 0 means no refresh needed.
1399 *
1400 * > 0 means you need to refresh this and you MUST call
1401 * ocfs2_complete_lock_res_refresh afterwards. */
1402static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
1403{
1404 unsigned long flags;
1405 int status = 0;
1406
1407 mlog_entry_void();
1408
1409refresh_check:
1410 spin_lock_irqsave(&lockres->l_lock, flags);
1411 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
1412 spin_unlock_irqrestore(&lockres->l_lock, flags);
1413 goto bail;
1414 }
1415
1416 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
1417 spin_unlock_irqrestore(&lockres->l_lock, flags);
1418
1419 ocfs2_wait_on_refreshing_lock(lockres);
1420 goto refresh_check;
1421 }
1422
1423 /* Ok, I'll be the one to refresh this lock. */
1424 lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
1425 spin_unlock_irqrestore(&lockres->l_lock, flags);
1426
1427 status = 1;
1428bail:
1429 mlog_exit(status);
1430 return status;
1431}
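
The contract spelled out above implies a fixed caller shape; a minimal sketch, where refresh_state() is a hypothetical stand-in for whatever re-reads the protected state (ocfs2_meta_lock_update() and ocfs2_super_lock() below are the real instances):

	status = ocfs2_should_refresh_lock_res(lockres);
	if (!status)
		return 0;	/* no refresh needed, or another task beat us */

	status = refresh_state(inode);	/* hypothetical helper */

	/* must be called by the arbitration winner even on failure; a
	 * non-zero status leaves OCFS2_LOCK_NEEDS_REFRESH set to retry */
	ocfs2_complete_lock_res_refresh(lockres, status);
	return status;
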
1432
1433/* If status is non-zero, I'll mark it as not being in refresh
1434 * anymore, but I won't clear the needs refresh flag. */
1435static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
1436 int status)
1437{
1438 unsigned long flags;
1439 mlog_entry_void();
1440
1441 spin_lock_irqsave(&lockres->l_lock, flags);
1442 lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
1443 if (!status)
1444 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
1445 spin_unlock_irqrestore(&lockres->l_lock, flags);
1446
1447 wake_up(&lockres->l_event);
1448
1449 mlog_exit_void();
1450}
1451
1452/* may or may not return a bh if it went to disk. */
1453static int ocfs2_meta_lock_update(struct inode *inode,
1454 struct buffer_head **bh)
1455{
1456 int status = 0;
1457 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1458 struct ocfs2_lock_res *lockres;
1459 struct ocfs2_dinode *fe;
1460
1461 mlog_entry_void();
1462
1463 spin_lock(&oi->ip_lock);
1464 if (oi->ip_flags & OCFS2_INODE_DELETED) {
1465 mlog(0, "Orphaned inode %"MLFu64" was deleted while we "
1466 "were waiting on a lock. ip_flags = 0x%x\n",
1467 oi->ip_blkno, oi->ip_flags);
1468 spin_unlock(&oi->ip_lock);
1469 status = -ENOENT;
1470 goto bail;
1471 }
1472 spin_unlock(&oi->ip_lock);
1473
1474 lockres = &oi->ip_meta_lockres;
1475
1476 if (!ocfs2_should_refresh_lock_res(lockres))
1477 goto bail;
1478
1479 /* This will discard any caching information we might have had
1480 * for the inode metadata. */
1481 ocfs2_metadata_cache_purge(inode);
1482
1483 /* will do nothing for inode types that don't use the extent
1484 * map (directories, bitmap files, etc) */
1485 ocfs2_extent_map_trunc(inode, 0);
1486
1487 if (ocfs2_meta_lvb_is_trustable(lockres)) {
1488 mlog(0, "Trusting LVB on inode %"MLFu64"\n",
1489 oi->ip_blkno);
1490 ocfs2_refresh_inode_from_lvb(inode);
1491 } else {
1492 /* Boo, we have to go to disk. */
1493 /* read bh, cast, ocfs2_refresh_inode */
1494 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
1495 bh, OCFS2_BH_CACHED, inode);
1496 if (status < 0) {
1497 mlog_errno(status);
1498 goto bail_refresh;
1499 }
1500 fe = (struct ocfs2_dinode *) (*bh)->b_data;
1501
1502 /* This is a good chance to make sure we're not
1503 * locking an invalid object.
1504 *
1505 * We bug on a stale inode here because we checked
1506 * above whether it was wiped from disk. The wiping
1507 * node provides a guarantee that we receive that
1508 * message and can mark the inode before dropping any
1509 * locks associated with it. */
1510 if (!OCFS2_IS_VALID_DINODE(fe)) {
1511 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1512 status = -EIO;
1513 goto bail_refresh;
1514 }
1515 mlog_bug_on_msg(inode->i_generation !=
1516 le32_to_cpu(fe->i_generation),
1517 "Invalid dinode %"MLFu64" disk generation: %u "
1518 "inode->i_generation: %u\n",
1519 oi->ip_blkno, le32_to_cpu(fe->i_generation),
1520 inode->i_generation);
1521 mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
1522 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
1523 "Stale dinode %"MLFu64" dtime: %"MLFu64" "
1524 "flags: 0x%x\n", oi->ip_blkno,
1525 le64_to_cpu(fe->i_dtime),
1526 le32_to_cpu(fe->i_flags));
1527
1528 ocfs2_refresh_inode(inode, fe);
1529 }
1530
1531 status = 0;
1532bail_refresh:
1533 ocfs2_complete_lock_res_refresh(lockres, status);
1534bail:
1535 mlog_exit(status);
1536 return status;
1537}
1538
1539static int ocfs2_assign_bh(struct inode *inode,
1540 struct buffer_head **ret_bh,
1541 struct buffer_head *passed_bh)
1542{
1543 int status;
1544
1545 if (passed_bh) {
1546 /* Ok, the update went to disk for us, use the
1547 * returned bh. */
1548 *ret_bh = passed_bh;
1549 get_bh(*ret_bh);
1550
1551 return 0;
1552 }
1553
1554 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1555 OCFS2_I(inode)->ip_blkno,
1556 ret_bh,
1557 OCFS2_BH_CACHED,
1558 inode);
1559 if (status < 0)
1560 mlog_errno(status);
1561
1562 return status;
1563}
1564
1565/*
1566 * returns < 0 error if the callback will never be called, otherwise
1567 * the result of the lock will be communicated via the callback.
1568 */
1569int ocfs2_meta_lock_full(struct inode *inode,
1570 struct ocfs2_journal_handle *handle,
1571 struct buffer_head **ret_bh,
1572 int ex,
1573 int arg_flags)
1574{
1575 int status, level, dlm_flags, acquired;
1576 struct ocfs2_lock_res *lockres;
1577 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1578 struct buffer_head *local_bh = NULL;
1579
1580 BUG_ON(!inode);
1581
1582 mlog_entry_void();
1583
1584 mlog(0, "inode %"MLFu64", take %s META lock\n",
1585 OCFS2_I(inode)->ip_blkno,
1586 ex ? "EXMODE" : "PRMODE");
1587
1588 status = 0;
1589 acquired = 0;
1590 /* We'll allow faking a readonly metadata lock for
1591 * rodevices. */
1592 if (ocfs2_is_hard_readonly(osb)) {
1593 if (ex)
1594 status = -EROFS;
1595 goto bail;
1596 }
1597
1598 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1599 wait_event(osb->recovery_event,
1600 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1601
1602 acquired = 0;
1603 lockres = &OCFS2_I(inode)->ip_meta_lockres;
1604 level = ex ? LKM_EXMODE : LKM_PRMODE;
1605 dlm_flags = 0;
1606 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1607 dlm_flags |= LKM_NOQUEUE;
1608
1609 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1610 if (status < 0) {
1611 if (status != -EAGAIN && status != -EIOCBRETRY)
1612 mlog_errno(status);
1613 goto bail;
1614 }
1615
1616 /* Notify the error cleanup path to drop the cluster lock. */
1617 acquired = 1;
1618
1619 /* We wait twice because a node may have died while we were in
1620 * the lower dlm layers. The second time though, we've
1621 * committed to owning this lock so we don't allow signals to
1622 * abort the operation. */
1623 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1624 wait_event(osb->recovery_event,
1625 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1626
1627 /* This is fun. The caller may want a bh back, or it may
1628 * not. ocfs2_meta_lock_update definitely wants one in, but
1629 * may or may not read one, depending on what's in the
1630 * LVB. The result of all of this is that we've *only* gone to
1631 * disk if we have to, so the complexity is worthwhile. */
1632 status = ocfs2_meta_lock_update(inode, &local_bh);
1633 if (status < 0) {
1634 if (status != -ENOENT)
1635 mlog_errno(status);
1636 goto bail;
1637 }
1638
1639 if (ret_bh) {
1640 status = ocfs2_assign_bh(inode, ret_bh, local_bh);
1641 if (status < 0) {
1642 mlog_errno(status);
1643 goto bail;
1644 }
1645 }
1646
1647 if (handle) {
1648 status = ocfs2_handle_add_lock(handle, inode);
1649 if (status < 0)
1650 mlog_errno(status);
1651 }
1652
1653bail:
1654 if (status < 0) {
1655 if (ret_bh && (*ret_bh)) {
1656 brelse(*ret_bh);
1657 *ret_bh = NULL;
1658 }
1659 if (acquired)
1660 ocfs2_meta_unlock(inode, ex);
1661 }
1662
1663 if (local_bh)
1664 brelse(local_bh);
1665
1666 mlog_exit(status);
1667 return status;
1668}
1669
1670/*
1671 * This is working around a lock inversion between tasks acquiring DLM locks
1672 * while holding a page lock and the vote thread which blocks dlm lock acquiry
1673 * while acquiring page locks.
1674 *
1675 * ** These _with_page variants are only intended to be called from aop
1676 * methods that hold page locks and return a very specific *positive* error
1677 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1678 *
1679 * The DLM is called such that it returns -EAGAIN if it would have blocked
1680 * waiting for the vote thread. In that case we unlock our page so the vote
1681 * thread can make progress. Once we've done this we have to return
1682 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
1683 * into the VFS who will then immediately retry the aop call.
1684 *
1685 * We do a blocking lock and immediate unlock before returning, though, so that
1686 * the lock has a great chance of being cached on this node by the time the VFS
1687 * calls back to retry the aop. This has the potential to livelock as nodes
1688 * ping locks back and forth, but that's a risk we're willing to take in
1689 * order to keep the fix for the lock inversion simple.
1690 */
1691int ocfs2_meta_lock_with_page(struct inode *inode,
1692 struct ocfs2_journal_handle *handle,
1693 struct buffer_head **ret_bh,
1694 int ex,
1695 struct page *page)
1696{
1697 int ret;
1698
1699 ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
1700 OCFS2_LOCK_NONBLOCK);
1701 if (ret == -EAGAIN) {
1702 unlock_page(page);
1703 if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
1704 ocfs2_meta_unlock(inode, ex);
1705 ret = AOP_TRUNCATED_PAGE;
1706 }
1707
1708 return ret;
1709}
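
A sketch of the aop-side shape the comment above describes. This is illustrative only, not quoted from fs/ocfs2/aops.c; the readpage prototype is the stock 2.6 address_space_operations one:

	static int example_readpage(struct file *file, struct page *page)
	{
		struct inode *inode = page->mapping->host;
		int ret;

		ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
		if (ret != 0)
			return ret;	/* may be AOP_TRUNCATED_PAGE; the
					 * VFS retries the whole aop call */

		/* ... the page is still locked; do the real work ... */

		ocfs2_meta_unlock(inode, 0);
		return 0;
	}
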
1710
1711void ocfs2_meta_unlock(struct inode *inode,
1712 int ex)
1713{
1714 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1715 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
1716
1717 mlog_entry_void();
1718
1719 mlog(0, "inode %"MLFu64" drop %s META lock\n",
1720 OCFS2_I(inode)->ip_blkno,
1721 ex ? "EXMODE" : "PRMODE");
1722
1723 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1724 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1725
1726 mlog_exit_void();
1727}
1728
1729int ocfs2_super_lock(struct ocfs2_super *osb,
1730 int ex)
1731{
1732 int status;
1733 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1734 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1735 struct buffer_head *bh;
1736 struct ocfs2_slot_info *si = osb->slot_info;
1737
1738 mlog_entry_void();
1739
1740 if (ocfs2_is_hard_readonly(osb))
1741 return -EROFS;
1742
1743 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
1744 if (status < 0) {
1745 mlog_errno(status);
1746 goto bail;
1747 }
1748
1749 /* The super block lock path is really in the best position to
1750 * know when resources covered by the lock need to be
1751 * refreshed, so we do it here. Of course, making sense of
1752 * everything is up to the caller :) */
1753 status = ocfs2_should_refresh_lock_res(lockres);
1754 if (status < 0) {
1755 mlog_errno(status);
1756 goto bail;
1757 }
1758 if (status) {
1759 bh = si->si_bh;
1760 status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
1761 si->si_inode);
1762 if (status == 0)
1763 ocfs2_update_slot_info(si);
1764
1765 ocfs2_complete_lock_res_refresh(lockres, status);
1766
1767 if (status < 0)
1768 mlog_errno(status);
1769 }
1770bail:
1771 mlog_exit(status);
1772 return status;
1773}
1774
1775void ocfs2_super_unlock(struct ocfs2_super *osb,
1776 int ex)
1777{
1778 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1779 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1780
1781 ocfs2_cluster_unlock(osb, lockres, level);
1782}
1783
1784int ocfs2_rename_lock(struct ocfs2_super *osb)
1785{
1786 int status;
1787 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1788
1789 if (ocfs2_is_hard_readonly(osb))
1790 return -EROFS;
1791
1792 status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
1793 if (status < 0)
1794 mlog_errno(status);
1795
1796 return status;
1797}
1798
1799void ocfs2_rename_unlock(struct ocfs2_super *osb)
1800{
1801 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1802
1803 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
1804}
1805
1806/* Reference counting of the dlm debug structure. We want this because
1807 * open references on the debug inodes can live on after a mount, so
1808 * we can't rely on the ocfs2_super to always exist. */
1809static void ocfs2_dlm_debug_free(struct kref *kref)
1810{
1811 struct ocfs2_dlm_debug *dlm_debug;
1812
1813 dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
1814
1815 kfree(dlm_debug);
1816}
1817
1818void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
1819{
1820 if (dlm_debug)
1821 kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
1822}
1823
1824static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
1825{
1826 kref_get(&debug->d_refcnt);
1827}
1828
1829struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
1830{
1831 struct ocfs2_dlm_debug *dlm_debug;
1832
1833 dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
1834 if (!dlm_debug) {
1835 mlog_errno(-ENOMEM);
1836 goto out;
1837 }
1838
1839 kref_init(&dlm_debug->d_refcnt);
1840 INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
1841 dlm_debug->d_locking_state = NULL;
1842out:
1843 return dlm_debug;
1844}
1845
1846/* Access to this is arbitrated for us via seq_file->sem. */
1847struct ocfs2_dlm_seq_priv {
1848 struct ocfs2_dlm_debug *p_dlm_debug;
1849 struct ocfs2_lock_res p_iter_res;
1850 struct ocfs2_lock_res p_tmp_res;
1851};
1852
1853static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
1854 struct ocfs2_dlm_seq_priv *priv)
1855{
1856 struct ocfs2_lock_res *iter, *ret = NULL;
1857 struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
1858
1859 assert_spin_locked(&ocfs2_dlm_tracking_lock);
1860
1861 list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
1862 /* discover the head of the list */
1863 if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
1864 mlog(0, "End of list found, %p\n", ret);
1865 break;
1866 }
1867
1868 /* We track our "dummy" iteration lockres entries by a NULL
1869 * l_ops field. */
1870 if (iter->l_ops != NULL) {
1871 ret = iter;
1872 break;
1873 }
1874 }
1875
1876 return ret;
1877}
1878
1879static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
1880{
1881 struct ocfs2_dlm_seq_priv *priv = m->private;
1882 struct ocfs2_lock_res *iter;
1883
1884 spin_lock(&ocfs2_dlm_tracking_lock);
1885 iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
1886 if (iter) {
1887 /* Since lockres' have the lifetime of their container
1888 * (which can be inodes, ocfs2_supers, etc) we want to
1889 * copy this out to a temporary lockres while still
1890 * under the spinlock. Obviously after this we can't
1891 * trust any pointers on the copy returned, but that's
1892 * ok as the information we want isn't typically held
1893 * in them. */
1894 priv->p_tmp_res = *iter;
1895 iter = &priv->p_tmp_res;
1896 }
1897 spin_unlock(&ocfs2_dlm_tracking_lock);
1898
1899 return iter;
1900}
1901
1902static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
1903{
1904}
1905
1906static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
1907{
1908 struct ocfs2_dlm_seq_priv *priv = m->private;
1909 struct ocfs2_lock_res *iter = v;
1910 struct ocfs2_lock_res *dummy = &priv->p_iter_res;
1911
1912 spin_lock(&ocfs2_dlm_tracking_lock);
1913 iter = ocfs2_dlm_next_res(iter, priv);
1914 list_del_init(&dummy->l_debug_list);
1915 if (iter) {
1916 list_add(&dummy->l_debug_list, &iter->l_debug_list);
1917 priv->p_tmp_res = *iter;
1918 iter = &priv->p_tmp_res;
1919 }
1920 spin_unlock(&ocfs2_dlm_tracking_lock);
1921
1922 return iter;
1923}
1924
1925/* So that debugfs.ocfs2 can determine which format is being used */
1926#define OCFS2_DLM_DEBUG_STR_VERSION 1
1927static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
1928{
1929 int i;
1930 char *lvb;
1931 struct ocfs2_lock_res *lockres = v;
1932
1933 if (!lockres)
1934 return -EINVAL;
1935
1936 seq_printf(m, "0x%x\t"
1937 "%.*s\t"
1938 "%d\t"
1939 "0x%lx\t"
1940 "0x%x\t"
1941 "0x%x\t"
1942 "%u\t"
1943 "%u\t"
1944 "%d\t"
1945 "%d\t",
1946 OCFS2_DLM_DEBUG_STR_VERSION,
1947 OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
1948 lockres->l_level,
1949 lockres->l_flags,
1950 lockres->l_action,
1951 lockres->l_unlock_action,
1952 lockres->l_ro_holders,
1953 lockres->l_ex_holders,
1954 lockres->l_requested,
1955 lockres->l_blocking);
1956
1957 /* Dump the raw LVB */
1958 lvb = lockres->l_lksb.lvb;
1959	for (i = 0; i < DLM_LVB_LEN; i++)
1960		seq_printf(m, "0x%x\t", (unsigned char) lvb[i]);
1961
1962 /* End the line */
1963 seq_printf(m, "\n");
1964 return 0;
1965}
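/* Illustrative only -- given the format string above, one record in the
 * locking_state file would look roughly like (all values hypothetical):
 *
 *	0x1	M000...	3	0x41	0x0	0x0	0	1	-1	-1	0x0	0x0 ...
 *
 * i.e. version, lock name, level, flags, action, unlock_action, ro/ex
 * holder counts, requested/blocking levels, then the raw LVB bytes. */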
1966
1967static struct seq_operations ocfs2_dlm_seq_ops = {
1968 .start = ocfs2_dlm_seq_start,
1969 .stop = ocfs2_dlm_seq_stop,
1970 .next = ocfs2_dlm_seq_next,
1971 .show = ocfs2_dlm_seq_show,
1972};
1973
1974static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
1975{
1976 struct seq_file *seq = (struct seq_file *) file->private_data;
1977 struct ocfs2_dlm_seq_priv *priv = seq->private;
1978 struct ocfs2_lock_res *res = &priv->p_iter_res;
1979
1980 ocfs2_remove_lockres_tracking(res);
1981 ocfs2_put_dlm_debug(priv->p_dlm_debug);
1982 return seq_release_private(inode, file);
1983}
1984
1985static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
1986{
1987 int ret;
1988 struct ocfs2_dlm_seq_priv *priv;
1989 struct seq_file *seq;
1990 struct ocfs2_super *osb;
1991
1992 priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
1993 if (!priv) {
1994 ret = -ENOMEM;
1995 mlog_errno(ret);
1996 goto out;
1997 }
1998 osb = (struct ocfs2_super *) inode->u.generic_ip;
1999 ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2000 priv->p_dlm_debug = osb->osb_dlm_debug;
2001 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2002
2003 ret = seq_open(file, &ocfs2_dlm_seq_ops);
2004 if (ret) {
2005 kfree(priv);
2006 mlog_errno(ret);
2007 goto out;
2008 }
2009
2010 seq = (struct seq_file *) file->private_data;
2011 seq->private = priv;
2012
2013 ocfs2_add_lockres_tracking(&priv->p_iter_res,
2014 priv->p_dlm_debug);
2015
2016out:
2017 return ret;
2018}
2019
2020static struct file_operations ocfs2_dlm_debug_fops = {
2021 .open = ocfs2_dlm_debug_open,
2022 .release = ocfs2_dlm_debug_release,
2023 .read = seq_read,
2024 .llseek = seq_lseek,
2025};
2026
2027static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
2028{
2029 int ret = 0;
2030 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2031
2032 dlm_debug->d_locking_state = debugfs_create_file("locking_state",
2033 S_IFREG|S_IRUSR,
2034 osb->osb_debug_root,
2035 osb,
2036 &ocfs2_dlm_debug_fops);
2037 if (!dlm_debug->d_locking_state) {
2038 ret = -EINVAL;
2039 mlog(ML_ERROR,
2040 "Unable to create locking state debugfs file.\n");
2041 goto out;
2042 }
2043
2044 ocfs2_get_dlm_debug(dlm_debug);
2045out:
2046 return ret;
2047}
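/* The file created above hangs off the per-mount debugfs directory
 * (osb->osb_debug_root, set up elsewhere), presumably ending up at a
 * path like <debugfs>/ocfs2/<uuid>/locking_state; debugfs.ocfs2 parses
 * the version-tagged records emitted by ocfs2_dlm_seq_show(). */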
2048
2049static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2050{
2051 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2052
2053 if (dlm_debug) {
2054 debugfs_remove(dlm_debug->d_locking_state);
2055 ocfs2_put_dlm_debug(dlm_debug);
2056 }
2057}
2058
2059int ocfs2_dlm_init(struct ocfs2_super *osb)
2060{
2061 int status;
2062 u32 dlm_key;
2063 struct dlm_ctxt *dlm;
2064
2065 mlog_entry_void();
2066
2067 status = ocfs2_dlm_init_debug(osb);
2068 if (status < 0) {
2069 mlog_errno(status);
2070 goto bail;
2071 }
2072
2073 /* launch vote thread */
2074 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
2075 osb->osb_id);
2076 if (IS_ERR(osb->vote_task)) {
2077 status = PTR_ERR(osb->vote_task);
2078 osb->vote_task = NULL;
2079 mlog_errno(status);
2080 goto bail;
2081 }
2082
2083	/* used by the dlm code to make message headers unique; each
2084 * node in this domain must agree on this. */
2085 dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
2086
2087 /* for now, uuid == domain */
2088 dlm = dlm_register_domain(osb->uuid_str, dlm_key);
2089 if (IS_ERR(dlm)) {
2090 status = PTR_ERR(dlm);
2091 mlog_errno(status);
2092 goto bail;
2093 }
2094
2095 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2096 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2097
2098 dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
2099
2100 osb->dlm = dlm;
2101
2102 status = 0;
2103bail:
2104 if (status < 0) {
2105 ocfs2_dlm_shutdown_debug(osb);
2106 if (osb->vote_task)
2107 kthread_stop(osb->vote_task);
2108 }
2109
2110 mlog_exit(status);
2111 return status;
2112}
2113
2114void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2115{
2116 mlog_entry_void();
2117
2118 dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
2119
2120 ocfs2_drop_osb_locks(osb);
2121
2122 if (osb->vote_task) {
2123 kthread_stop(osb->vote_task);
2124 osb->vote_task = NULL;
2125 }
2126
2127 ocfs2_lock_res_free(&osb->osb_super_lockres);
2128 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2129
2130 dlm_unregister_domain(osb->dlm);
2131 osb->dlm = NULL;
2132
2133 ocfs2_dlm_shutdown_debug(osb);
2134
2135 mlog_exit_void();
2136}
2137
2138static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
2139{
2140 struct ocfs2_lock_res *lockres = opaque;
2141 unsigned long flags;
2142
2143 mlog_entry_void();
2144
2145 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
2146 lockres->l_unlock_action);
2147
2148 spin_lock_irqsave(&lockres->l_lock, flags);
2149 /* We tried to cancel a convert request, but it was already
2150 * granted. All we want to do here is clear our unlock
2151 * state. The wake_up call done at the bottom is redundant
2152 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
2153 * hurt anything anyway */
2154 if (status == DLM_CANCELGRANT &&
2155 lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2156 mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
2157
2158 /* We don't clear the busy flag in this case as it
2159 * should have been cleared by the ast which the dlm
2160 * has called. */
2161 goto complete_unlock;
2162 }
2163
2164 if (status != DLM_NORMAL) {
2165		mlog(ML_ERROR, "DLM passed status %d for lock %s, "
2166 "unlock_action %d\n", status, lockres->l_name,
2167 lockres->l_unlock_action);
2168 spin_unlock_irqrestore(&lockres->l_lock, flags);
2169 return;
2170 }
2171
2172 switch(lockres->l_unlock_action) {
2173 case OCFS2_UNLOCK_CANCEL_CONVERT:
2174 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
2175 lockres->l_action = OCFS2_AST_INVALID;
2176 break;
2177 case OCFS2_UNLOCK_DROP_LOCK:
2178 lockres->l_level = LKM_IVMODE;
2179 break;
2180 default:
2181 BUG();
2182 }
2183
2184 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
2185complete_unlock:
2186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
2187 spin_unlock_irqrestore(&lockres->l_lock, flags);
2188
2189 wake_up(&lockres->l_event);
2190
2191 mlog_exit_void();
2192}
2193
2194typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);
2195
2196struct drop_lock_cb {
2197 ocfs2_pre_drop_cb_t *drop_func;
2198 void *drop_data;
2199};
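/* Sketch of how a pre-drop callback gets wired up (callback name and
 * data hypothetical; ocfs2_meta_pre_drop below is the real in-tree user):
 *
 *	static void my_pre_drop(struct ocfs2_lock_res *lockres, void *data)
 *	{
 *		... last-chance work before the unlock, e.g. LVB stuffing ...
 *	}
 *
 *	struct drop_lock_cb dcb = { my_pre_drop, my_data, };
 *	ocfs2_drop_lock(osb, lockres, &dcb);
 */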
2200
2201static int ocfs2_drop_lock(struct ocfs2_super *osb,
2202 struct ocfs2_lock_res *lockres,
2203 struct drop_lock_cb *dcb)
2204{
2205 enum dlm_status status;
2206 unsigned long flags;
2207
2208 /* We didn't get anywhere near actually using this lockres. */
2209 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2210 goto out;
2211
2212 spin_lock_irqsave(&lockres->l_lock, flags);
2213
2214 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
2215 "lockres %s, flags 0x%lx\n",
2216 lockres->l_name, lockres->l_flags);
2217
2218 while (lockres->l_flags & OCFS2_LOCK_BUSY) {
2219 mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
2220 "%u, unlock_action = %u\n",
2221 lockres->l_name, lockres->l_flags, lockres->l_action,
2222 lockres->l_unlock_action);
2223
2224 spin_unlock_irqrestore(&lockres->l_lock, flags);
2225
2226 /* XXX: Today we just wait on any busy
2227 * locks... Perhaps we need to cancel converts in the
2228 * future? */
2229 ocfs2_wait_on_busy_lock(lockres);
2230
2231 spin_lock_irqsave(&lockres->l_lock, flags);
2232 }
2233
2234 if (dcb)
2235 dcb->drop_func(lockres, dcb->drop_data);
2236
2237 if (lockres->l_flags & OCFS2_LOCK_BUSY)
2238 mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
2239 lockres->l_name);
2240 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2241 mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
2242
2243 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
2244 spin_unlock_irqrestore(&lockres->l_lock, flags);
2245 goto out;
2246 }
2247
2248 lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
2249
2250 /* make sure we never get here while waiting for an ast to
2251 * fire. */
2252 BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
2253
2254 /* is this necessary? */
2255 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2256 lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
2257 spin_unlock_irqrestore(&lockres->l_lock, flags);
2258
2259 mlog(0, "lock %s\n", lockres->l_name);
2260
2261 status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
2262 lockres->l_ops->unlock_ast, lockres);
2263 if (status != DLM_NORMAL) {
2264 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2265 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
2266 dlm_print_one_lock(lockres->l_lksb.lockid);
2267 BUG();
2268 }
2269	mlog(0, "lock %s, successful return from dlmunlock\n",
2270 lockres->l_name);
2271
2272 ocfs2_wait_on_busy_lock(lockres);
2273out:
2274 mlog_exit(0);
2275 return 0;
2276}
2277
2278/* Mark the lockres as being dropped. It will no longer be
2279 * queued if blocking, but we still may have to wait on it
2280 * being dequeued from the vote thread before we can consider
2281 * it safe to drop.
2282 *
2283 * You can *not* attempt to call cluster_lock on this lockres anymore. */
2284void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
2285{
2286 int status;
2287 struct ocfs2_mask_waiter mw;
2288 unsigned long flags;
2289
2290 ocfs2_init_mask_waiter(&mw);
2291
2292 spin_lock_irqsave(&lockres->l_lock, flags);
2293 lockres->l_flags |= OCFS2_LOCK_FREEING;
2294 while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
2295 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
2296 spin_unlock_irqrestore(&lockres->l_lock, flags);
2297
2298 mlog(0, "Waiting on lockres %s\n", lockres->l_name);
2299
2300 status = ocfs2_wait_for_mask(&mw);
2301 if (status)
2302 mlog_errno(status);
2303
2304 spin_lock_irqsave(&lockres->l_lock, flags);
2305 }
2306 spin_unlock_irqrestore(&lockres->l_lock, flags);
2307}
2308
2309static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
2310{
2311 int status;
2312
2313 mlog_entry_void();
2314
2315 ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
2316
2317 status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
2318 if (status < 0)
2319 mlog_errno(status);
2320
2321 ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
2322
2323 status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
2324 if (status < 0)
2325 mlog_errno(status);
2326
2327 mlog_exit(status);
2328}
2329
2330static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
2331{
2332 struct inode *inode = data;
2333
2334 /* the metadata lock requires a bit more work as we have an
2335 * LVB to worry about. */
2336 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2337 lockres->l_level == LKM_EXMODE &&
2338 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2339 __ocfs2_stuff_meta_lvb(inode);
2340}
2341
2342int ocfs2_drop_inode_locks(struct inode *inode)
2343{
2344 int status, err;
2345 struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
2346
2347 mlog_entry_void();
2348
2349 /* No need to call ocfs2_mark_lockres_freeing here -
2350 * ocfs2_clear_inode has done it for us. */
2351
2352 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2353 &OCFS2_I(inode)->ip_data_lockres,
2354 NULL);
2355 if (err < 0)
2356 mlog_errno(err);
2357
2358 status = err;
2359
2360 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2361 &OCFS2_I(inode)->ip_meta_lockres,
2362 &meta_dcb);
2363 if (err < 0)
2364 mlog_errno(err);
2365 if (err < 0 && !status)
2366 status = err;
2367
2368 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2369 &OCFS2_I(inode)->ip_rw_lockres,
2370 NULL);
2371 if (err < 0)
2372 mlog_errno(err);
2373 if (err < 0 && !status)
2374 status = err;
2375
2376 mlog_exit(status);
2377 return status;
2378}
2379
2380static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2381 int new_level)
2382{
2383 assert_spin_locked(&lockres->l_lock);
2384
2385 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
2386
2387 if (lockres->l_level <= new_level) {
2388 mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
2389 lockres->l_level, new_level);
2390 BUG();
2391 }
2392
2393 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
2394 lockres->l_name, new_level, lockres->l_blocking);
2395
2396 lockres->l_action = OCFS2_AST_DOWNCONVERT;
2397 lockres->l_requested = new_level;
2398 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2399}
2400
2401static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2402 struct ocfs2_lock_res *lockres,
2403 int new_level,
2404 int lvb)
2405{
2406 int ret, dlm_flags = LKM_CONVERT;
2407 enum dlm_status status;
2408
2409 mlog_entry_void();
2410
2411 if (lvb)
2412 dlm_flags |= LKM_VALBLK;
2413
2414 status = dlmlock(osb->dlm,
2415 new_level,
2416 &lockres->l_lksb,
2417 dlm_flags,
2418 lockres->l_name,
2419 lockres->l_ops->ast,
2420 lockres,
2421 lockres->l_ops->bast);
2422 if (status != DLM_NORMAL) {
2423 ocfs2_log_dlm_error("dlmlock", status, lockres);
2424 ret = -EINVAL;
2425 ocfs2_recover_from_dlm_error(lockres, 1);
2426 goto bail;
2427 }
2428
2429 ret = 0;
2430bail:
2431 mlog_exit(ret);
2432 return ret;
2433}
2434
2435/* returns 1 when the caller should unlock and call dlmunlock */
2436static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
2437 struct ocfs2_lock_res *lockres)
2438{
2439 assert_spin_locked(&lockres->l_lock);
2440
2441 mlog_entry_void();
2442 mlog(0, "lock %s\n", lockres->l_name);
2443
2444 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2445 /* If we're already trying to cancel a lock conversion
2446 * then just drop the spinlock and allow the caller to
2447 * requeue this lock. */
2448
2449 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
2450 return 0;
2451 }
2452
2453	/* were we in a convert when the bast fired? */
2454 BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
2455 lockres->l_action != OCFS2_AST_DOWNCONVERT);
2456 /* set things up for the unlockast to know to just
2457 * clear out the ast_action and unset busy, etc. */
2458 lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
2459
2460 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
2461 "lock %s, invalid flags: 0x%lx\n",
2462 lockres->l_name, lockres->l_flags);
2463
2464 return 1;
2465}
2466
2467static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2468 struct ocfs2_lock_res *lockres)
2469{
2470 int ret;
2471 enum dlm_status status;
2472
2473 mlog_entry_void();
2474 mlog(0, "lock %s\n", lockres->l_name);
2475
2476 ret = 0;
2477 status = dlmunlock(osb->dlm,
2478 &lockres->l_lksb,
2479 LKM_CANCEL,
2480 lockres->l_ops->unlock_ast,
2481 lockres);
2482 if (status != DLM_NORMAL) {
2483 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2484 ret = -EINVAL;
2485 ocfs2_recover_from_dlm_error(lockres, 0);
2486 }
2487
2488 mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
2489
2490 mlog_exit(ret);
2491 return ret;
2492}
2493
2494static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
2495 struct ocfs2_lock_res *lockres,
2496 int new_level)
2497{
2498 int ret;
2499
2500 mlog_entry_void();
2501
2502 BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
2503
2504 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
2505 ret = 0;
2506 mlog(0, "lockres %s currently being refreshed -- backing "
2507 "off!\n", lockres->l_name);
2508 } else if (new_level == LKM_PRMODE)
2509 ret = !lockres->l_ex_holders &&
2510 ocfs2_inode_fully_checkpointed(inode);
2511 else /* Must be NLMODE we're converting to. */
2512 ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
2513 ocfs2_inode_fully_checkpointed(inode);
2514
2515 mlog_exit(ret);
2516 return ret;
2517}
2518
2519static int ocfs2_do_unblock_meta(struct inode *inode,
2520 int *requeue)
2521{
2522 int new_level;
2523 int set_lvb = 0;
2524 int ret = 0;
2525 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
2526 unsigned long flags;
2527
2528 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2529
2530 mlog_entry_void();
2531
2532 spin_lock_irqsave(&lockres->l_lock, flags);
2533
2534 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2535
2536 mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
2537 lockres->l_blocking);
2538
2539 BUG_ON(lockres->l_level != LKM_EXMODE &&
2540 lockres->l_level != LKM_PRMODE);
2541
2542 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2543 *requeue = 1;
2544 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2545 spin_unlock_irqrestore(&lockres->l_lock, flags);
2546 if (ret) {
2547 ret = ocfs2_cancel_convert(osb, lockres);
2548 if (ret < 0)
2549 mlog_errno(ret);
2550 }
2551 goto leave;
2552 }
2553
2554 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2555
2556 mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
2557 lockres->l_level, lockres->l_blocking, new_level);
2558
2559 if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
2560 if (lockres->l_level == LKM_EXMODE)
2561 set_lvb = 1;
2562
2563		/* If the lock hasn't been refreshed yet (rare), then
2564		 * our in-memory inode values are old and we skip
2565		 * stuffing the lvb. There's no need to actually clear
2566		 * out the lvb here as its value is still valid. */
2567 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
2568 if (set_lvb)
2569 __ocfs2_stuff_meta_lvb(inode);
2570 } else
2571 mlog(0, "lockres %s: downconverting stale lock!\n",
2572 lockres->l_name);
2573
2574 mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
2575 "l_blocking=%d, new_level=%d\n",
2576 lockres->l_level, lockres->l_blocking, new_level);
2577
2578 ocfs2_prepare_downconvert(lockres, new_level);
2579 spin_unlock_irqrestore(&lockres->l_lock, flags);
2580 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
2581 goto leave;
2582 }
2583 if (!ocfs2_inode_fully_checkpointed(inode))
2584 ocfs2_start_checkpoint(osb);
2585
2586 *requeue = 1;
2587 spin_unlock_irqrestore(&lockres->l_lock, flags);
2588 ret = 0;
2589leave:
2590 mlog_exit(ret);
2591 return ret;
2592}
2593
2594static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
2595 struct ocfs2_lock_res *lockres,
2596 int *requeue,
2597 ocfs2_convert_worker_t *worker)
2598{
2599 unsigned long flags;
2600 int blocking;
2601 int new_level;
2602 int ret = 0;
2603
2604 mlog_entry_void();
2605
2606 spin_lock_irqsave(&lockres->l_lock, flags);
2607
2608 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2609
2610recheck:
2611 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2612 *requeue = 1;
2613 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2614 spin_unlock_irqrestore(&lockres->l_lock, flags);
2615 if (ret) {
2616 ret = ocfs2_cancel_convert(osb, lockres);
2617 if (ret < 0)
2618 mlog_errno(ret);
2619 }
2620 goto leave;
2621 }
2622
2623 /* if we're blocking an exclusive and we have *any* holders,
2624 * then requeue. */
2625 if ((lockres->l_blocking == LKM_EXMODE)
2626 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
2627 spin_unlock_irqrestore(&lockres->l_lock, flags);
2628 *requeue = 1;
2629 ret = 0;
2630 goto leave;
2631 }
2632
2633 /* If it's a PR we're blocking, then only
2634 * requeue if we've got any EX holders */
2635 if (lockres->l_blocking == LKM_PRMODE &&
2636 lockres->l_ex_holders) {
2637 spin_unlock_irqrestore(&lockres->l_lock, flags);
2638 *requeue = 1;
2639 ret = 0;
2640 goto leave;
2641 }
2642
2643 /* If we get here, then we know that there are no more
2644 * incompatible holders (and anyone asking for an incompatible
2645 * lock is blocked). We can now downconvert the lock */
2646 if (!worker)
2647 goto downconvert;
2648
2649 /* Some lockres types want to do a bit of work before
2650 * downconverting a lock. Allow that here. The worker function
2651 * may sleep, so we save off a copy of what we're blocking as
2652 * it may change while we're not holding the spin lock. */
2653 blocking = lockres->l_blocking;
2654 spin_unlock_irqrestore(&lockres->l_lock, flags);
2655
2656 worker(lockres, blocking);
2657
2658 spin_lock_irqsave(&lockres->l_lock, flags);
2659 if (blocking != lockres->l_blocking) {
2660 /* If this changed underneath us, then we can't drop
2661 * it just yet. */
2662 goto recheck;
2663 }
2664
2665downconvert:
2666 *requeue = 0;
2667 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2668
2669 ocfs2_prepare_downconvert(lockres, new_level);
2670 spin_unlock_irqrestore(&lockres->l_lock, flags);
2671 ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
2672leave:
2673 mlog_exit(ret);
2674 return ret;
2675}
2676
2677static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2678 int blocking)
2679{
2680 struct inode *inode;
2681 struct address_space *mapping;
2682
2683 mlog_entry_void();
2684
2685 inode = ocfs2_lock_res_inode(lockres);
2686 mapping = inode->i_mapping;
2687
2688 if (filemap_fdatawrite(mapping)) {
2689		mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!\n",
2690 OCFS2_I(inode)->ip_blkno);
2691 }
2692 sync_mapping_buffers(mapping);
2693 if (blocking == LKM_EXMODE) {
2694 truncate_inode_pages(mapping, 0);
2695 unmap_mapping_range(mapping, 0, 0, 0);
2696 } else {
2697 /* We only need to wait on the I/O if we're not also
2698 * truncating pages because truncate_inode_pages waits
2699 * for us above. We don't truncate pages if we're
2700 * blocking anything < EXMODE because we want to keep
2701 * them around in that case. */
2702 filemap_fdatawait(mapping);
2703 }
2704
2705 mlog_exit_void();
2706}
2707
2708int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
2709 int *requeue)
2710{
2711 int status;
2712 struct inode *inode;
2713 struct ocfs2_super *osb;
2714
2715 mlog_entry_void();
2716
2717 inode = ocfs2_lock_res_inode(lockres);
2718 osb = OCFS2_SB(inode->i_sb);
2719
2720 mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
2721
2722 status = ocfs2_generic_unblock_lock(osb,
2723 lockres,
2724 requeue,
2725 ocfs2_data_convert_worker);
2726 if (status < 0)
2727 mlog_errno(status);
2728
2729 mlog(0, "inode %"MLFu64", requeue = %d\n",
2730 OCFS2_I(inode)->ip_blkno, *requeue);
2731
2732 mlog_exit(status);
2733 return status;
2734}
2735
2736static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
2737 int *requeue)
2738{
2739 int status;
2740 struct inode *inode;
2741
2742 mlog_entry_void();
2743
2744 mlog(0, "Unblock lockres %s\n", lockres->l_name);
2745
2746 inode = ocfs2_lock_res_inode(lockres);
2747
2748 status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
2749 lockres,
2750 requeue,
2751 NULL);
2752 if (status < 0)
2753 mlog_errno(status);
2754
2755 mlog_exit(status);
2756 return status;
2757}
2758
2759
2760int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
2761 int *requeue)
2762{
2763 int status;
2764 struct inode *inode;
2765
2766 mlog_entry_void();
2767
2768 inode = ocfs2_lock_res_inode(lockres);
2769
2770 mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
2771
2772 status = ocfs2_do_unblock_meta(inode, requeue);
2773 if (status < 0)
2774 mlog_errno(status);
2775
2776 mlog(0, "inode %"MLFu64", requeue = %d\n",
2777 OCFS2_I(inode)->ip_blkno, *requeue);
2778
2779 mlog_exit(status);
2780 return status;
2781}
2782
2783/* Generic unblock function for any lockres whose private data is an
2784 * ocfs2_super pointer. */
2785static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
2786 int *requeue)
2787{
2788 int status;
2789 struct ocfs2_super *osb;
2790
2791 mlog_entry_void();
2792
2793 mlog(0, "Unblock lockres %s\n", lockres->l_name);
2794
2795 osb = ocfs2_lock_res_super(lockres);
2796
2797 status = ocfs2_generic_unblock_lock(osb,
2798 lockres,
2799 requeue,
2800 NULL);
2801 if (status < 0)
2802 mlog_errno(status);
2803
2804 mlog_exit(status);
2805 return status;
2806}
2807
2808void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
2809 struct ocfs2_lock_res *lockres)
2810{
2811 int status;
2812 int requeue = 0;
2813 unsigned long flags;
2814
2815 /* Our reference to the lockres in this function can be
2816 * considered valid until we remove the OCFS2_LOCK_QUEUED
2817 * flag. */
2818
2819 mlog_entry_void();
2820
2821 BUG_ON(!lockres);
2822 BUG_ON(!lockres->l_ops);
2823 BUG_ON(!lockres->l_ops->unblock);
2824
2825 mlog(0, "lockres %s blocked.\n", lockres->l_name);
2826
2827 /* Detect whether a lock has been marked as going away while
2828 * the vote thread was processing other things. A lock can
2829 * still be marked with OCFS2_LOCK_FREEING after this check,
2830 * but short circuiting here will still save us some
2831	 * work. */
2832 spin_lock_irqsave(&lockres->l_lock, flags);
2833 if (lockres->l_flags & OCFS2_LOCK_FREEING)
2834 goto unqueue;
2835 spin_unlock_irqrestore(&lockres->l_lock, flags);
2836
2837 status = lockres->l_ops->unblock(lockres, &requeue);
2838 if (status < 0)
2839 mlog_errno(status);
2840
2841 spin_lock_irqsave(&lockres->l_lock, flags);
2842unqueue:
2843 if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
2844 lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
2845 } else
2846 ocfs2_schedule_blocked_lock(osb, lockres);
2847
2848 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
2849 requeue ? "yes" : "no");
2850 spin_unlock_irqrestore(&lockres->l_lock, flags);
2851
2852 mlog_exit_void();
2853}
2854
2855static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
2856 struct ocfs2_lock_res *lockres)
2857{
2858 mlog_entry_void();
2859
2860 assert_spin_locked(&lockres->l_lock);
2861
2862 if (lockres->l_flags & OCFS2_LOCK_FREEING) {
2863 /* Do not schedule a lock for downconvert when it's on
2864 * the way to destruction - any nodes wanting access
2865 * to the resource will get it soon. */
2866 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
2867 lockres->l_name, lockres->l_flags);
2868 return;
2869 }
2870
2871 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
2872
2873 spin_lock(&osb->vote_task_lock);
2874 if (list_empty(&lockres->l_blocked_list)) {
2875 list_add_tail(&lockres->l_blocked_list,
2876 &osb->blocked_lock_list);
2877 osb->blocked_lock_count++;
2878 }
2879 spin_unlock(&osb->vote_task_lock);
2880
2881 mlog_exit_void();
2882}
2883
2884/* This aids in debugging situations where a bad LVB might be involved. */
2885void ocfs2_dump_meta_lvb_info(u64 level,
2886 const char *function,
2887 unsigned int line,
2888 struct ocfs2_lock_res *lockres)
2889{
2890 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
2891
2892 mlog(level, "LVB information for %s (called from %s:%u):\n",
2893 lockres->l_name, function, line);
2894 mlog(level, "version: %u, clusters: %u\n",
2895 be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
2896 mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n",
2897 be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid),
2898 be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode));
2899 mlog(level, "nlink %u, atime_packed 0x%"MLFx64", "
2900 "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n",
2901 be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed),
2902 be64_to_cpu(lvb->lvb_ictime_packed),
2903 be64_to_cpu(lvb->lvb_imtime_packed));
2904}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
new file mode 100644
index 00000000000..8f2d1db2d9e
--- /dev/null
+++ b/fs/ocfs2/dlmglue.h
@@ -0,0 +1,111 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmglue.h
5 *
6 * Function prototypes and structures for the DLM glue layer
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26
27#ifndef DLMGLUE_H
28#define DLMGLUE_H
29
30#define OCFS2_LVB_VERSION 2
31
32struct ocfs2_meta_lvb {
33 __be32 lvb_version;
34 __be32 lvb_iclusters;
35 __be32 lvb_iuid;
36 __be32 lvb_igid;
37 __be64 lvb_iatime_packed;
38 __be64 lvb_ictime_packed;
39 __be64 lvb_imtime_packed;
40 __be64 lvb_isize;
41 __be16 lvb_imode;
42 __be16 lvb_inlink;
43 __be32 lvb_reserved[3];
44};
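/* Note: the fields above total 64 bytes (4 x __be32 + 4 x __be64 +
 * 2 x __be16 + 12 reserved bytes), which is assumed here to match the
 * dlm's DLM_LVB_LEN so the struct can be overlaid directly on
 * lksb.lvb, as ocfs2_dump_meta_lvb_info() does. */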
45
46/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
47/* don't wait on recovery. */
48#define OCFS2_META_LOCK_RECOVERY (0x01)
49/* Instruct the dlm not to queue ourselves on the other node. */
50#define OCFS2_META_LOCK_NOQUEUE (0x02)
51/* don't block waiting for the vote thread, instead return -EAGAIN */
52#define OCFS2_LOCK_NONBLOCK (0x04)
53
54int ocfs2_dlm_init(struct ocfs2_super *osb);
55void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
56void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
57void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
58 enum ocfs2_lock_type type,
59 struct inode *inode);
60void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
61int ocfs2_create_new_inode_locks(struct inode *inode);
62int ocfs2_drop_inode_locks(struct inode *inode);
63int ocfs2_data_lock_full(struct inode *inode,
64 int write,
65 int arg_flags);
66#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
67int ocfs2_data_lock_with_page(struct inode *inode,
68 int write,
69 struct page *page);
70void ocfs2_data_unlock(struct inode *inode,
71 int write);
72int ocfs2_rw_lock(struct inode *inode, int write);
73void ocfs2_rw_unlock(struct inode *inode, int write);
74int ocfs2_meta_lock_full(struct inode *inode,
75 struct ocfs2_journal_handle *handle,
76 struct buffer_head **ret_bh,
77 int ex,
78 int arg_flags);
79int ocfs2_meta_lock_with_page(struct inode *inode,
80 struct ocfs2_journal_handle *handle,
81 struct buffer_head **ret_bh,
82 int ex,
83 struct page *page);
84/* 99% of the time we don't want to supply any additional flags --
85 * those are for very specific cases only. */
86#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
87void ocfs2_meta_unlock(struct inode *inode,
88 int ex);
89int ocfs2_super_lock(struct ocfs2_super *osb,
90 int ex);
91void ocfs2_super_unlock(struct ocfs2_super *osb,
92 int ex);
93int ocfs2_rename_lock(struct ocfs2_super *osb);
94void ocfs2_rename_unlock(struct ocfs2_super *osb);
95void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
96
97/* for the vote thread */
98void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
99 struct ocfs2_lock_res *lockres);
100
101struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
102void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
103
104/* aids in debugging and tracking lvbs */
105void ocfs2_dump_meta_lvb_info(u64 level,
106 const char *function,
107 unsigned int line,
108 struct ocfs2_lock_res *lockres);
109#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
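/* Example (hypothetical call site): dump an inode's metadata LVB at the
 * default log mask while chasing a suspect lock value block:
 *
 *	mlog_meta_lvb(0, &OCFS2_I(inode)->ip_meta_lockres);
 */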
110
111#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
new file mode 100644
index 00000000000..f226b220762
--- /dev/null
+++ b/fs/ocfs2/endian.h
@@ -0,0 +1,45 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef OCFS2_ENDIAN_H
23#define OCFS2_ENDIAN_H
24
25static inline void le16_add_cpu(__le16 *var, u16 val)
26{
27 *var = cpu_to_le16(le16_to_cpu(*var) + val);
28}
29
30static inline void le32_add_cpu(__le32 *var, u32 val)
31{
32 *var = cpu_to_le32(le32_to_cpu(*var) + val);
33}
34
35static inline void le32_and_cpu(__le32 *var, u32 val)
36{
37 *var = cpu_to_le32(le32_to_cpu(*var) & val);
38}
39
40static inline void be32_add_cpu(__be32 *var, u32 val)
41{
42 *var = cpu_to_be32(be32_to_cpu(*var) + val);
43}
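/* Usage sketch (field name hypothetical): these helpers read-modify-write
 * an on-disk value in place, so a caller can bump a little-endian counter
 * without spelling out the conversions:
 *
 *	le16_add_cpu(&di->i_links_count, 1);
 *
 * which expands to *var = cpu_to_le16(le16_to_cpu(*var) + 1). */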
44
45#endif /* OCFS2_ENDIAN_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
new file mode 100644
index 00000000000..5810160d92a
--- /dev/null
+++ b/fs/ocfs2/export.c
@@ -0,0 +1,248 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * export.c
5 *
6 * Functions to facilitate NFS exporting
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28
29#define MLOG_MASK_PREFIX ML_EXPORT
30#include <cluster/masklog.h>
31
32#include "ocfs2.h"
33
34#include "dir.h"
35#include "dlmglue.h"
36#include "export.h"
37#include "inode.h"
38
39#include "buffer_head_io.h"
40
41struct ocfs2_inode_handle
42{
43 u64 ih_blkno;
44 u32 ih_generation;
45};
46
47static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
48{
49 struct ocfs2_inode_handle *handle = vobjp;
50 struct inode *inode;
51 struct dentry *result;
52
53 mlog_entry("(0x%p, 0x%p)\n", sb, handle);
54
55 if (handle->ih_blkno == 0) {
56 mlog_errno(-ESTALE);
57 return ERR_PTR(-ESTALE);
58 }
59
60 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
61
62 if (IS_ERR(inode)) {
63 mlog_errno(PTR_ERR(inode));
64 return (void *)inode;
65 }
66
67 if (handle->ih_generation != inode->i_generation) {
68 iput(inode);
69 mlog_errno(-ESTALE);
70 return ERR_PTR(-ESTALE);
71 }
72
73 result = d_alloc_anon(inode);
74
75 if (!result) {
76 iput(inode);
77 mlog_errno(-ENOMEM);
78 return ERR_PTR(-ENOMEM);
79 }
80
81 mlog_exit_ptr(result);
82 return result;
83}
84
85static struct dentry *ocfs2_get_parent(struct dentry *child)
86{
87 int status;
88 u64 blkno;
89 struct dentry *parent;
90 struct inode *inode;
91 struct inode *dir = child->d_inode;
92 struct buffer_head *dirent_bh = NULL;
93 struct ocfs2_dir_entry *dirent;
94
95 mlog_entry("(0x%p, '%.*s')\n", child,
96 child->d_name.len, child->d_name.name);
97
98 mlog(0, "find parent of directory %"MLFu64"\n",
99 OCFS2_I(dir)->ip_blkno);
100
101 status = ocfs2_meta_lock(dir, NULL, NULL, 0);
102 if (status < 0) {
103 if (status != -ENOENT)
104 mlog_errno(status);
105 parent = ERR_PTR(status);
106 goto bail;
107 }
108
109 status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
110 &dirent);
111 if (status < 0) {
112 parent = ERR_PTR(-ENOENT);
113 goto bail_unlock;
114 }
115
116 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
117 if (IS_ERR(inode)) {
118 mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
119 parent = ERR_PTR(-EACCES);
120 goto bail_unlock;
121 }
122
123 parent = d_alloc_anon(inode);
124 if (!parent) {
125 iput(inode);
126 parent = ERR_PTR(-ENOMEM);
127 }
128
129bail_unlock:
130 ocfs2_meta_unlock(dir, 0);
131
132 if (dirent_bh)
133 brelse(dirent_bh);
134
135bail:
136 mlog_exit_ptr(parent);
137
138 return parent;
139}
140
141static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
142 int connectable)
143{
144 struct inode *inode = dentry->d_inode;
145 int len = *max_len;
146 int type = 1;
147 u64 blkno;
148 u32 generation;
149
150 mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
151 dentry->d_name.len, dentry->d_name.name,
152 fh, len, connectable);
153
154 if (len < 3 || (connectable && len < 6)) {
155 mlog(ML_ERROR, "fh buffer is too small for encoding\n");
156 type = 255;
157 goto bail;
158 }
159
160 blkno = OCFS2_I(inode)->ip_blkno;
161 generation = inode->i_generation;
162
163 mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
164 blkno, generation);
165
166 len = 3;
167 fh[0] = cpu_to_le32((u32)(blkno >> 32));
168 fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
169 fh[2] = cpu_to_le32(generation);
170
171 if (connectable && !S_ISDIR(inode->i_mode)) {
172 struct inode *parent;
173
174 spin_lock(&dentry->d_lock);
175
176 parent = dentry->d_parent->d_inode;
177 blkno = OCFS2_I(parent)->ip_blkno;
178 generation = parent->i_generation;
179
180 fh[3] = cpu_to_le32((u32)(blkno >> 32));
181 fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
182 fh[5] = cpu_to_le32(generation);
183
184 spin_unlock(&dentry->d_lock);
185
186 len = 6;
187 type = 2;
188
189 mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
190 blkno, generation);
191 }
192
193 *max_len = len;
194
195bail:
196 mlog_exit(type);
197 return type;
198}
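/* Resulting handle layout, as built above (type 1 stops after fh[2];
 * type 2 appends the parent triple for connectable handles):
 *
 *	fh[0] = blkno >> 32		fh[3] = parent blkno >> 32
 *	fh[1] = blkno & 0xffffffff	fh[4] = parent blkno & 0xffffffff
 *	fh[2] = i_generation		fh[5] = parent i_generation
 */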
199
200static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
201 int fh_len, int fileid_type,
202 int (*acceptable)(void *context,
203 struct dentry *de),
204 void *context)
205{
206 struct ocfs2_inode_handle handle, parent;
207 struct dentry *ret = NULL;
208
209 mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
210 sb, fh, fh_len, fileid_type, acceptable, context);
211
212 if (fh_len < 3 || fileid_type > 2)
213 goto bail;
214
215 if (fileid_type == 2) {
216 if (fh_len < 6)
217 goto bail;
218
219 parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
220 parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
221 parent.ih_generation = le32_to_cpu(fh[5]);
222
223 mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
224 parent.ih_blkno, parent.ih_generation);
225 }
226
227 handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
228 handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
229 handle.ih_generation = le32_to_cpu(fh[2]);
230
231	mlog(0, "Decoding fh: blkno: %"MLFu64", generation: %u\n",
232 handle.ih_blkno, handle.ih_generation);
233
234 ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
235 acceptable, context);
236
237bail:
238 mlog_exit_ptr(ret);
239 return ret;
240}
241
242struct export_operations ocfs2_export_ops = {
243 .decode_fh = ocfs2_decode_fh,
244 .encode_fh = ocfs2_encode_fh,
245
246 .get_parent = ocfs2_get_parent,
247 .get_dentry = ocfs2_get_dentry,
248};
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
new file mode 100644
index 00000000000..5b77ee7866e
--- /dev/null
+++ b/fs/ocfs2/export.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * export.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_EXPORT_H
27#define OCFS2_EXPORT_H
28
29extern struct export_operations ocfs2_export_ops;
30
31#endif /* OCFS2_EXPORT_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
new file mode 100644
index 00000000000..f2fb40cd296
--- /dev/null
+++ b/fs/ocfs2/extent_map.c
@@ -0,0 +1,994 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * extent_map.c
5 *
6 * In-memory extent map for OCFS2. Man, this code was prettier in
7 * the library.
8 *
9 * Copyright (C) 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License, version 2, as published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/init.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31
32#define MLOG_MASK_PREFIX ML_EXTENT_MAP
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "extent_map.h"
38#include "inode.h"
39#include "super.h"
40
41#include "buffer_head_io.h"
42
43
44/*
45 * SUCK SUCK SUCK
46 * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
47 */
48
49struct ocfs2_extent_map_entry {
50 struct rb_node e_node;
51 int e_tree_depth;
52 struct ocfs2_extent_rec e_rec;
53};
54
55struct ocfs2_em_insert_context {
56 int need_left;
57 int need_right;
58 struct ocfs2_extent_map_entry *new_ent;
59 struct ocfs2_extent_map_entry *old_ent;
60 struct ocfs2_extent_map_entry *left_ent;
61 struct ocfs2_extent_map_entry *right_ent;
62};
63
64static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
65
66
67static struct ocfs2_extent_map_entry *
68ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
69 u32 cpos, u32 clusters,
70 struct rb_node ***ret_p,
71 struct rb_node **ret_parent);
72static int ocfs2_extent_map_insert(struct inode *inode,
73 struct ocfs2_extent_rec *rec,
74 int tree_depth);
75static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
76 struct ocfs2_extent_map_entry *ent);
77static int ocfs2_extent_map_find_leaf(struct inode *inode,
78 u32 cpos, u32 clusters,
79 struct ocfs2_extent_list *el);
80static int ocfs2_extent_map_lookup_read(struct inode *inode,
81 u32 cpos, u32 clusters,
82 struct ocfs2_extent_map_entry **ret_ent);
83static int ocfs2_extent_map_try_insert(struct inode *inode,
84 struct ocfs2_extent_rec *rec,
85 int tree_depth,
86 struct ocfs2_em_insert_context *ctxt);
87
88/* returns 1 only if the rec contains all the given clusters -- that is,
89 * the rec's cpos is <= the query cpos and the rec endpoint (cpos +
90 * clusters) is >= the query's endpoint */
91static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
92 u32 cpos, u32 clusters)
93{
94 if (le32_to_cpu(rec->e_cpos) > cpos)
95 return 0;
96 if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
97 le32_to_cpu(rec->e_clusters))
98 return 0;
99 return 1;
100}
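/* Worked example: a rec with e_cpos = 8 and e_clusters = 16 covers the
 * half-open range [8, 24).  A query of cpos = 10, clusters = 4 ([10, 14))
 * is fully contained, so this returns 1; a query of cpos = 20,
 * clusters = 8 ([20, 28)) straddles the right edge and returns 0. */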
101
102
103/*
104 * Find an entry in the tree that intersects the region passed in.
105 * Note that this will find straddled intervals, it is up to the
106 * callers to enforce any boundary conditions.
107 *
108 * Callers must hold ip_lock. This lookup is not guaranteed to return
109 * a tree_depth 0 match, and as such can race inserts if the lock
110 * were not held.
111 *
112 * The rb_node garbage lets insertion share the search. Trivial
113 * callers pass NULL.
114 */
115static struct ocfs2_extent_map_entry *
116ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
117 u32 cpos, u32 clusters,
118 struct rb_node ***ret_p,
119 struct rb_node **ret_parent)
120{
121 struct rb_node **p = &em->em_extents.rb_node;
122 struct rb_node *parent = NULL;
123 struct ocfs2_extent_map_entry *ent = NULL;
124
125 while (*p)
126 {
127 parent = *p;
128 ent = rb_entry(parent, struct ocfs2_extent_map_entry,
129 e_node);
130 if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
131 p = &(*p)->rb_left;
132 ent = NULL;
133 } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
134 le32_to_cpu(ent->e_rec.e_clusters))) {
135 p = &(*p)->rb_right;
136 ent = NULL;
137 } else
138 break;
139 }
140
141 if (ret_p != NULL)
142 *ret_p = p;
143 if (ret_parent != NULL)
144 *ret_parent = parent;
145 return ent;
146}
147
148/*
149 * Find the leaf containing the interval we want. While we're on our
150 * way down the tree, fill in every record we see at any depth, because
151 * we might want it later.
152 *
153 * Note that this code is run without ip_lock. That's because it
154 * sleeps while reading. If someone is also filling the extent list at
155 * the same time we are, we might have to restart.
156 */
157static int ocfs2_extent_map_find_leaf(struct inode *inode,
158 u32 cpos, u32 clusters,
159 struct ocfs2_extent_list *el)
160{
161 int i, ret;
162 struct buffer_head *eb_bh = NULL;
163 u64 blkno;
164 u32 rec_end;
165 struct ocfs2_extent_block *eb;
166 struct ocfs2_extent_rec *rec;
167
168 /*
169 * The bh data containing the el cannot change here, because
170 * we hold alloc_sem. So we can do this without other
171 * locks.
172 */
173 while (el->l_tree_depth)
174 {
175 blkno = 0;
176 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
177 rec = &el->l_recs[i];
178 rec_end = (le32_to_cpu(rec->e_cpos) +
179 le32_to_cpu(rec->e_clusters));
180
181 ret = -EBADR;
182 if (rec_end > OCFS2_I(inode)->ip_clusters) {
183 mlog_errno(ret);
184 goto out_free;
185 }
186
187 if (rec_end <= cpos) {
188 ret = ocfs2_extent_map_insert(inode, rec,
189 le16_to_cpu(el->l_tree_depth));
190 if (ret && (ret != -EEXIST)) {
191 mlog_errno(ret);
192 goto out_free;
193 }
194 continue;
195 }
196 if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
197 ret = ocfs2_extent_map_insert(inode, rec,
198 le16_to_cpu(el->l_tree_depth));
199 if (ret && (ret != -EEXIST)) {
200 mlog_errno(ret);
201 goto out_free;
202 }
203 continue;
204 }
205
206 /*
207 * We've found a record that matches our
208 * interval. We don't insert it because we're
209 * about to traverse it.
210 */
211
212			/* Check to see if we're straddling */
213 ret = -ESRCH;
214 if (!ocfs2_extent_rec_contains_clusters(rec,
215 cpos,
216 clusters)) {
217 mlog_errno(ret);
218 goto out_free;
219 }
220
221 /*
222 * If we've already found a record, the el has
223 * two records covering the same interval.
224 * EEEK!
225 */
226 ret = -EBADR;
227 if (blkno) {
228 mlog_errno(ret);
229 goto out_free;
230 }
231
232 blkno = le64_to_cpu(rec->e_blkno);
233 }
234
235 /*
236 * We don't support holes, and we're still up
237 * in the branches, so we'd better have found someone
238 */
239 ret = -EBADR;
240 if (!blkno) {
241 mlog_errno(ret);
242 goto out_free;
243 }
244
245 if (eb_bh) {
246 brelse(eb_bh);
247 eb_bh = NULL;
248 }
249 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
250 blkno, &eb_bh, OCFS2_BH_CACHED,
251 inode);
252 if (ret) {
253 mlog_errno(ret);
254 goto out_free;
255 }
256 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
257 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
258 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
259 ret = -EIO;
260 goto out_free;
261 }
262 el = &eb->h_list;
263 }
264
265 if (el->l_tree_depth)
266 BUG();
267
268 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
269 rec = &el->l_recs[i];
270 ret = ocfs2_extent_map_insert(inode, rec,
271 le16_to_cpu(el->l_tree_depth));
272 if (ret) {
273 mlog_errno(ret);
274 goto out_free;
275 }
276 }
277
278 ret = 0;
279
280out_free:
281 if (eb_bh)
282 brelse(eb_bh);
283
284 return ret;
285}
286
287/*
288 * This lookup actually will read from disk. It has one invariant:
289 * It will never re-traverse blocks. This means that all inserts should
290 * be new regions or more granular regions (both allowed by insert).
291 */
292static int ocfs2_extent_map_lookup_read(struct inode *inode,
293 u32 cpos,
294 u32 clusters,
295 struct ocfs2_extent_map_entry **ret_ent)
296{
297 int ret;
298 u64 blkno;
299 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
300 struct ocfs2_extent_map_entry *ent;
301 struct buffer_head *bh = NULL;
302 struct ocfs2_extent_block *eb;
303 struct ocfs2_dinode *di;
304 struct ocfs2_extent_list *el;
305
306 spin_lock(&OCFS2_I(inode)->ip_lock);
307 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
308 if (ent) {
309 if (!ent->e_tree_depth) {
310 spin_unlock(&OCFS2_I(inode)->ip_lock);
311 *ret_ent = ent;
312 return 0;
313 }
314 blkno = le64_to_cpu(ent->e_rec.e_blkno);
315 spin_unlock(&OCFS2_I(inode)->ip_lock);
316
317 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
318 OCFS2_BH_CACHED, inode);
319 if (ret) {
320 mlog_errno(ret);
321 if (bh)
322 brelse(bh);
323 return ret;
324 }
325 eb = (struct ocfs2_extent_block *)bh->b_data;
326 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
327 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
328 brelse(bh);
329 return -EIO;
330 }
331 el = &eb->h_list;
332 } else {
333 spin_unlock(&OCFS2_I(inode)->ip_lock);
334
335 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
336 OCFS2_I(inode)->ip_blkno, &bh,
337 OCFS2_BH_CACHED, inode);
338 if (ret) {
339 mlog_errno(ret);
340 if (bh)
341 brelse(bh);
342 return ret;
343 }
344 di = (struct ocfs2_dinode *)bh->b_data;
345 if (!OCFS2_IS_VALID_DINODE(di)) {
346 brelse(bh);
347 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
348 return -EIO;
349 }
350 el = &di->id2.i_list;
351 }
352
353 ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
354 brelse(bh);
355 if (ret) {
356 mlog_errno(ret);
357 return ret;
358 }
359
360 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
361 if (!ent) {
362 ret = -ESRCH;
363 mlog_errno(ret);
364 return ret;
365 }
366
367 if (ent->e_tree_depth)
368 BUG(); /* FIXME: Make sure this isn't a corruption */
369
370 *ret_ent = ent;
371
372 return 0;
373}
374
375/*
376 * Callers must hold ip_lock. This can insert pieces of the tree,
377 * thus racing lookup if the lock weren't held.
378 */
379static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
380 struct ocfs2_extent_map_entry *ent)
381{
382 struct rb_node **p, *parent;
383 struct ocfs2_extent_map_entry *old_ent;
384
385 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
386 le32_to_cpu(ent->e_rec.e_clusters),
387 &p, &parent);
388 if (old_ent)
389 return -EEXIST;
390
391 rb_link_node(&ent->e_node, parent, p);
392 rb_insert_color(&ent->e_node, &em->em_extents);
393
394 return 0;
395}
396
397
398/*
399 * Simple rule: on any return code other than -EAGAIN, anything left
400 * in the insert_context will be freed.
401 */
402static int ocfs2_extent_map_try_insert(struct inode *inode,
403 struct ocfs2_extent_rec *rec,
404 int tree_depth,
405 struct ocfs2_em_insert_context *ctxt)
406{
407 int ret;
408 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
409 struct ocfs2_extent_map_entry *old_ent;
410
411 ctxt->need_left = 0;
412 ctxt->need_right = 0;
413 ctxt->old_ent = NULL;
414
415 spin_lock(&OCFS2_I(inode)->ip_lock);
416 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
417 if (!ret) {
418 ctxt->new_ent = NULL;
419 goto out_unlock;
420 }
421
422 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
423 le32_to_cpu(rec->e_clusters), NULL,
424 NULL);
425
426 if (!old_ent)
427 BUG();
428
429 ret = -EEXIST;
430 if (old_ent->e_tree_depth < tree_depth)
431 goto out_unlock;
432
433 if (old_ent->e_tree_depth == tree_depth) {
434 if (!memcmp(rec, &old_ent->e_rec,
435 sizeof(struct ocfs2_extent_rec)))
436 ret = 0;
437
438 /* FIXME: Should this be ESRCH/EBADR??? */
439 goto out_unlock;
440 }
441
442 /*
443 * We do it in this order specifically so that no actual tree
444 * changes occur until we have all the pieces we need. We
445 * don't want malloc failures to leave an inconsistent tree.
446 * Whenever we drop the lock, another process could be
447 * inserting. Also note that, if another process just beat us
448 * to an insert, we might not need the same pieces we needed
449 * the first go round. In the end, the pieces we need will
450 * be used, and the pieces we don't will be freed.
451 */
452 ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
453 le32_to_cpu(old_ent->e_rec.e_cpos));
454 ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
455 le32_to_cpu(old_ent->e_rec.e_clusters)) >
456 (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
457 ret = -EAGAIN;
458 if (ctxt->need_left) {
459 if (!ctxt->left_ent)
460 goto out_unlock;
461 *(ctxt->left_ent) = *old_ent;
462 ctxt->left_ent->e_rec.e_clusters =
463 cpu_to_le32(le32_to_cpu(rec->e_cpos) -
464 le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
465 }
466 if (ctxt->need_right) {
467 if (!ctxt->right_ent)
468 goto out_unlock;
469 *(ctxt->right_ent) = *old_ent;
470 ctxt->right_ent->e_rec.e_cpos =
471 cpu_to_le32(le32_to_cpu(rec->e_cpos) +
472 le32_to_cpu(rec->e_clusters));
473 ctxt->right_ent->e_rec.e_clusters =
474 cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
475 le32_to_cpu(old_ent->e_rec.e_clusters)) -
476 le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
477 }
478
479 rb_erase(&old_ent->e_node, &em->em_extents);
480 /* Now that he's erased, set him up for deletion */
481 ctxt->old_ent = old_ent;
482
483 if (ctxt->need_left) {
484 ret = ocfs2_extent_map_insert_entry(em,
485 ctxt->left_ent);
486 if (ret)
487 goto out_unlock;
488 ctxt->left_ent = NULL;
489 }
490
491 if (ctxt->need_right) {
492 ret = ocfs2_extent_map_insert_entry(em,
493 ctxt->right_ent);
494 if (ret)
495 goto out_unlock;
496 ctxt->right_ent = NULL;
497 }
498
499 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
500
501 if (!ret)
502 ctxt->new_ent = NULL;
503
504out_unlock:
505 spin_unlock(&OCFS2_I(inode)->ip_lock);
506
507 return ret;
508}
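/* Worked example of the split above (numbers hypothetical): inserting a
 * depth-0 record for [30, 50) over an existing deeper record covering
 * [0, 100) sets both need_left and need_right, yielding a left piece
 * [0, 30), the new record [30, 50), and a right piece [50, 100); the old
 * entry is erased and parked on ctxt->old_ent for the caller to free. */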
509
510
511static int ocfs2_extent_map_insert(struct inode *inode,
512 struct ocfs2_extent_rec *rec,
513 int tree_depth)
514{
515 int ret;
516 struct ocfs2_em_insert_context ctxt = {0, };
517
518 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
519 OCFS2_I(inode)->ip_map.em_clusters) {
520 ret = -EBADR;
521 mlog_errno(ret);
522 return ret;
523 }
524
525 /* Zero e_clusters means a truncated tail record. It better be EOF */
526 if (!rec->e_clusters) {
527 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
528 OCFS2_I(inode)->ip_map.em_clusters) {
529 ret = -EBADR;
530 mlog_errno(ret);
531 return ret;
532 }
533
534 /* Ignore the truncated tail */
535 return 0;
536 }
537
538 ret = -ENOMEM;
539 ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
540 GFP_KERNEL);
541 if (!ctxt.new_ent) {
542 mlog_errno(ret);
543 return ret;
544 }
545
546 ctxt.new_ent->e_rec = *rec;
547 ctxt.new_ent->e_tree_depth = tree_depth;
548
549 do {
550 ret = -ENOMEM;
551 if (ctxt.need_left && !ctxt.left_ent) {
552 ctxt.left_ent =
553 kmem_cache_alloc(ocfs2_em_ent_cachep,
554 GFP_KERNEL);
555 if (!ctxt.left_ent)
556 break;
557 }
558 if (ctxt.need_right && !ctxt.right_ent) {
559 ctxt.right_ent =
560 kmem_cache_alloc(ocfs2_em_ent_cachep,
561 GFP_KERNEL);
562 if (!ctxt.right_ent)
563 break;
564 }
565
566 ret = ocfs2_extent_map_try_insert(inode, rec,
567 tree_depth, &ctxt);
568 } while (ret == -EAGAIN);
569
570 if (ret < 0)
571 mlog_errno(ret);
572
573 if (ctxt.left_ent)
574 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
575 if (ctxt.right_ent)
576 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
577 if (ctxt.old_ent)
578 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
579 if (ctxt.new_ent)
580 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
581
582 return ret;
583}
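/*
 * Editorial note, not part of the original patch: the -EAGAIN
 * handshake between ocfs2_extent_map_insert() and
 * ocfs2_extent_map_try_insert() above is the usual
 * allocate-outside-the-lock, retry-under-it idiom:
 *
 *	do {
 *		allocate whatever entries the last attempt asked for
 *		(ctxt.need_left / ctxt.need_right), with no lock held;
 *		ret = ocfs2_extent_map_try_insert(...);
 *		(under ip_lock; returns -EAGAIN if pieces are missing)
 *	} while (ret == -EAGAIN);
 *	free any pieces that went unused;
 */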
584
585/*
586 * Append this record to the tail of the extent map. It must be
587 * tree_depth 0. The record might be an extension of an existing
588 * record, a case that must be handled.  E.g.:
589 *
590 * Existing record in the extent map:
591 *
592 * cpos = 10, len = 10
593 * |---------|
594 *
595 * New Record:
596 *
597 * cpos = 10, len = 20
598 * |------------------|
599 *
600 * The passed record is the new on-disk record.  The new_clusters value
601 * is how many clusters were added to the file.  If the append is a
602 * contiguous append, new_clusters has already been added to
603 * rec->e_clusters.  If the append is an entirely new extent, then
604 * rec->e_clusters equals new_clusters.
605 */
606int ocfs2_extent_map_append(struct inode *inode,
607 struct ocfs2_extent_rec *rec,
608 u32 new_clusters)
609{
610 int ret;
611 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
612 struct ocfs2_extent_map_entry *ent;
613 struct ocfs2_extent_rec *old;
614
615 BUG_ON(!new_clusters);
616 BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
617
618 if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
619 /*
620 * Size changed underneath us on disk. Drop any
621 * straddling records and update our idea of
622 * i_clusters
623 */
624 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
625 em->em_clusters = OCFS2_I(inode)->ip_clusters;
626 }
627
628 mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
629 le32_to_cpu(rec->e_clusters)) !=
630 (em->em_clusters + new_clusters),
631 "Inode %"MLFu64":\n"
632 "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
633 "em->em_clusters = %u + new_clusters = %u = %u\n",
634 OCFS2_I(inode)->ip_blkno,
635 le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
636 le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
637 em->em_clusters, new_clusters,
638 em->em_clusters + new_clusters);
639
640 em->em_clusters += new_clusters;
641
642 ret = -ENOENT;
643 if (le32_to_cpu(rec->e_clusters) > new_clusters) {
644 /* This is a contiguous append */
645 ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
646 NULL, NULL);
647 if (ent) {
648 old = &ent->e_rec;
649 BUG_ON((le32_to_cpu(rec->e_cpos) +
650 le32_to_cpu(rec->e_clusters)) !=
651 (le32_to_cpu(old->e_cpos) +
652 le32_to_cpu(old->e_clusters) +
653 new_clusters));
654 if (ent->e_tree_depth == 0) {
655 BUG_ON(le32_to_cpu(old->e_cpos) !=
656 le32_to_cpu(rec->e_cpos));
657 BUG_ON(le64_to_cpu(old->e_blkno) !=
658 le64_to_cpu(rec->e_blkno));
659 ret = 0;
660 }
661 /*
662 * Let non-leafs fall through as -ENOENT to
663 * force insertion of the new leaf.
664 */
665 le32_add_cpu(&old->e_clusters, new_clusters);
666 }
667 }
668
669 if (ret == -ENOENT)
670 ret = ocfs2_extent_map_insert(inode, rec, 0);
671 if (ret < 0)
672 mlog_errno(ret);
673 return ret;
674}
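/*
 * Editorial usage sketch, not part of the original patch.  The record
 * values are invented and the surrounding journaling is elided; a real
 * caller must satisfy the consistency BUG_ONs above.
 */
static void example_append(struct inode *inode)
{
	struct ocfs2_extent_rec rec;
	int ret;

	rec.e_cpos = cpu_to_le32(10);	   /* record starts at cluster 10 */
	rec.e_clusters = cpu_to_le32(20);  /* was 17 on disk, grew by 3 */
	rec.e_blkno = cpu_to_le64(0x2000); /* invented physical block */

	/* last argument is how many clusters the file actually grew by */
	ret = ocfs2_extent_map_append(inode, &rec, 3);
	if (ret)
		mlog_errno(ret);
}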
675
676#if 0
677/* Code here is included but compiled out, as it completes the extent
678 * map API and may be used in the future. */
679
680/*
681 * Look up the record containing this cluster offset. This record is
682 * part of the extent map. Do not free it. Any changes you make to
683 * it will reflect in the extent map. So, if your last extent
684 * is (cpos = 10, clusters = 10) and you truncate the file by 5
685 * clusters, you can do:
686 *
687 * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
688 * rec->e_clusters -= 5;
689 *
690 * The lookup does not read from disk. If the map isn't filled in for
691 * an entry, you won't find it.
692 *
693 * Also note that the returned record is valid until alloc_sem is
694 * dropped. After that, truncate and extend can happen. Caveat Emptor.
695 */
696int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
697 struct ocfs2_extent_rec **rec,
698 int *tree_depth)
699{
700 int ret = -ENOENT;
701 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
702 struct ocfs2_extent_map_entry *ent;
703
704 *rec = NULL;
705
706 if (cpos >= OCFS2_I(inode)->ip_clusters)
707 return -EINVAL;
708
709 if (cpos >= em->em_clusters) {
710 /*
711 * Size changed underneath us on disk. Drop any
712 * straddling records and update our idea of
713 * i_clusters
714 */
715 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
716		em->em_clusters = OCFS2_I(inode)->ip_clusters;
717 }
718
719 ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
720 NULL, NULL);
721
722 if (ent) {
723 *rec = &ent->e_rec;
724 if (tree_depth)
725 *tree_depth = ent->e_tree_depth;
726 ret = 0;
727 }
728
729 return ret;
730}
731
732int ocfs2_extent_map_get_clusters(struct inode *inode,
733 u32 v_cpos, int count,
734 u32 *p_cpos, int *ret_count)
735{
736 int ret;
737 u32 coff, ccount;
738 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
739 struct ocfs2_extent_map_entry *ent = NULL;
740
741 *p_cpos = ccount = 0;
742
743 if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
744 return -EINVAL;
745
746 if ((v_cpos + count) > em->em_clusters) {
747 /*
748 * Size changed underneath us on disk. Drop any
749 * straddling records and update our idea of
750 * i_clusters
751 */
752 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
753 em->em_clusters = OCFS2_I(inode)->ip_clusters;
754 }
755
756
757 ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
758 if (ret)
759 return ret;
760
761 if (ent) {
762 /* We should never find ourselves straddling an interval */
763 if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
764 v_cpos,
765 count))
766 return -ESRCH;
767
768 coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
769 *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
770 le64_to_cpu(ent->e_rec.e_blkno)) +
771 coff;
772
773 if (ret_count)
774 *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
775
776 return 0;
777 }
778
779
780 return -ENOENT;
781}
782
783#endif /* 0 */
784
785int ocfs2_extent_map_get_blocks(struct inode *inode,
786 u64 v_blkno, int count,
787 u64 *p_blkno, int *ret_count)
788{
789 int ret;
790 u64 boff;
791 u32 cpos, clusters;
792 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
793 struct ocfs2_extent_map_entry *ent = NULL;
794 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
795 struct ocfs2_extent_rec *rec;
796
797 *p_blkno = 0;
798
799 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
800 clusters = ocfs2_blocks_to_clusters(inode->i_sb,
801 (u64)count + bpc - 1);
802 if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
803 ret = -EINVAL;
804 mlog_errno(ret);
805 return ret;
806 }
807
808 if ((cpos + clusters) > em->em_clusters) {
809 /*
810 * Size changed underneath us on disk. Drop any
811 * straddling records and update our idea of
812 * i_clusters
813 */
814 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
815 em->em_clusters = OCFS2_I(inode)->ip_clusters;
816 }
817
818 ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
819 if (ret) {
820 mlog_errno(ret);
821 return ret;
822 }
823
824 if (ent)
825 {
826 rec = &ent->e_rec;
827
828 /* We should never find ourselves straddling an interval */
829 if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
830 ret = -ESRCH;
831 mlog_errno(ret);
832 return ret;
833 }
834
835 boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
836 le32_to_cpu(rec->e_cpos));
837 boff += (v_blkno & (u64)(bpc - 1));
838 *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
839
840 if (ret_count) {
841 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
842 le32_to_cpu(rec->e_clusters)) - boff;
843 }
844
845 return 0;
846 }
847
848 return -ENOENT;
849}
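/*
 * Editorial usage sketch, not part of the original patch: translate a
 * single logical block to its physical block.  Per the contract noted
 * in extent_map.h, the caller holds ip_alloc_sem across the lookup.
 */
static int example_get_block(struct inode *inode, u64 v_blkno, u64 *p_blkno)
{
	int ret, count;

	down_read(&OCFS2_I(inode)->ip_alloc_sem);
	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, 1,
					  p_blkno, &count);
	up_read(&OCFS2_I(inode)->ip_alloc_sem);

	/* on success, blocks *p_blkno through *p_blkno + count - 1 are
	 * physically contiguous from v_blkno onward */
	return ret;
}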
850
851int ocfs2_extent_map_init(struct inode *inode)
852{
853 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
854
855 em->em_extents = RB_ROOT;
856 em->em_clusters = 0;
857
858 return 0;
859}
860
861/* Needs the lock */
862static void __ocfs2_extent_map_drop(struct inode *inode,
863 u32 new_clusters,
864 struct rb_node **free_head,
865 struct ocfs2_extent_map_entry **tail_ent)
866{
867 struct rb_node *node, *next;
868 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
869 struct ocfs2_extent_map_entry *ent;
870
871 *free_head = NULL;
872
873 ent = NULL;
874 node = rb_last(&em->em_extents);
875 while (node)
876 {
877 next = rb_prev(node);
878
879 ent = rb_entry(node, struct ocfs2_extent_map_entry,
880 e_node);
881 if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
882 break;
883
884 rb_erase(&ent->e_node, &em->em_extents);
885
886 node->rb_right = *free_head;
887 *free_head = node;
888
889 ent = NULL;
890 node = next;
891 }
892
893 /* Do we have an entry straddling new_clusters? */
894 if (tail_ent) {
895 if (ent &&
896 ((le32_to_cpu(ent->e_rec.e_cpos) +
897 le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
898 *tail_ent = ent;
899 else
900 *tail_ent = NULL;
901 }
902}
903
904static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
905{
906 struct rb_node *node;
907 struct ocfs2_extent_map_entry *ent;
908
909 while (free_head) {
910 node = free_head;
911 free_head = node->rb_right;
912
913 ent = rb_entry(node, struct ocfs2_extent_map_entry,
914 e_node);
915 kmem_cache_free(ocfs2_em_ent_cachep, ent);
916 }
917}
918
919/*
920 * Remove all entries past new_clusters, inclusive of an entry that
921 * contains new_clusters. This is effectively a cache forget.
922 *
923 * If you want to also clip the last extent by some number of clusters,
924 * you need to call ocfs2_extent_map_trunc().
925 * This code does not check or modify ip_clusters.
926 */
927int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
928{
929 struct rb_node *free_head = NULL;
930 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
931 struct ocfs2_extent_map_entry *ent;
932
933 spin_lock(&OCFS2_I(inode)->ip_lock);
934
935 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
936
937 if (ent) {
938 rb_erase(&ent->e_node, &em->em_extents);
939 ent->e_node.rb_right = free_head;
940 free_head = &ent->e_node;
941 }
942
943 spin_unlock(&OCFS2_I(inode)->ip_lock);
944
945 if (free_head)
946 __ocfs2_extent_map_drop_cleanup(free_head);
947
948 return 0;
949}
950
951/*
952 * Remove all entries past new_clusters and also clip any extent
953 * straddling new_clusters, if there is one. This does not check
954 * or modify ip_clusters
955 */
956int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
957{
958 struct rb_node *free_head = NULL;
959 struct ocfs2_extent_map_entry *ent = NULL;
960
961 spin_lock(&OCFS2_I(inode)->ip_lock);
962
963 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
964
965 if (ent)
966 ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
967 le32_to_cpu(ent->e_rec.e_cpos));
968
969 OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
970
971 spin_unlock(&OCFS2_I(inode)->ip_lock);
972
973 if (free_head)
974 __ocfs2_extent_map_drop_cleanup(free_head);
975
976 return 0;
977}
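/*
 * Editorial note with invented numbers, not part of the original
 * patch: given a cached record (e_cpos = 10, e_clusters = 10) and
 * new_clusters = 15,
 *
 *   ocfs2_extent_map_drop(inode, 15)  forgets the record entirely
 *       (it straddles cluster 15) and leaves em_clusters alone;
 *   ocfs2_extent_map_trunc(inode, 15) clips it to e_clusters = 5 and
 *       sets em_clusters = 15.
 *
 * Neither touches ip_clusters; callers reconcile that separately.
 */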
978
979int __init init_ocfs2_extent_maps(void)
980{
981 ocfs2_em_ent_cachep =
982 kmem_cache_create("ocfs2_em_ent",
983 sizeof(struct ocfs2_extent_map_entry),
984 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
985 if (!ocfs2_em_ent_cachep)
986 return -ENOMEM;
987
988 return 0;
989}
990
991void __exit exit_ocfs2_extent_maps(void)
992{
993 kmem_cache_destroy(ocfs2_em_ent_cachep);
994}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
new file mode 100644
index 00000000000..fa3745efa88
--- /dev/null
+++ b/fs/ocfs2/extent_map.h
@@ -0,0 +1,46 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * extent_map.h
5 *
6 * In-memory file extent mappings for OCFS2.
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 021110-1307, USA.
23 */
24
25#ifndef _EXTENT_MAP_H
26#define _EXTENT_MAP_H
27
28int init_ocfs2_extent_maps(void);
29void exit_ocfs2_extent_maps(void);
30
31/*
32 * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
33 * to be held. The allocation cannot change at all while the map is
34 * in the process of being updated.
35 */
36int ocfs2_extent_map_init(struct inode *inode);
37int ocfs2_extent_map_append(struct inode *inode,
38 struct ocfs2_extent_rec *rec,
39 u32 new_clusters);
40int ocfs2_extent_map_get_blocks(struct inode *inode,
41 u64 v_blkno, int count,
42 u64 *p_blkno, int *ret_count);
43int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
44int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
45
46#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
new file mode 100644
index 00000000000..eaf33caa0a1
--- /dev/null
+++ b/fs/ocfs2/file.c
@@ -0,0 +1,1238 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * file.c
5 *
6 * File open, close, extend, truncate
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/capability.h>
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/pagemap.h>
32#include <linux/uio.h>
33
34#define MLOG_MASK_PREFIX ML_INODE
35#include <cluster/masklog.h>
36
37#include "ocfs2.h"
38
39#include "alloc.h"
40#include "aops.h"
41#include "dir.h"
42#include "dlmglue.h"
43#include "extent_map.h"
44#include "file.h"
45#include "sysfile.h"
46#include "inode.h"
47#include "journal.h"
48#include "mmap.h"
49#include "suballoc.h"
50#include "super.h"
51
52#include "buffer_head_io.h"
53
54static int ocfs2_sync_inode(struct inode *inode)
55{
56 filemap_fdatawrite(inode->i_mapping);
57 return sync_mapping_buffers(inode->i_mapping);
58}
59
60static int ocfs2_file_open(struct inode *inode, struct file *file)
61{
62 int status;
63 int mode = file->f_flags;
64 struct ocfs2_inode_info *oi = OCFS2_I(inode);
65
66 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
67 file->f_dentry->d_name.len, file->f_dentry->d_name.name);
68
69 spin_lock(&oi->ip_lock);
70
71 /* Check that the inode hasn't been wiped from disk by another
72 * node. If it hasn't then we're safe as long as we hold the
73 * spin lock until our increment of open count. */
74 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
75 spin_unlock(&oi->ip_lock);
76
77 status = -ENOENT;
78 goto leave;
79 }
80
81 if (mode & O_DIRECT)
82 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
83
84 oi->ip_open_count++;
85 spin_unlock(&oi->ip_lock);
86 status = 0;
87leave:
88 mlog_exit(status);
89 return status;
90}
91
92static int ocfs2_file_release(struct inode *inode, struct file *file)
93{
94 struct ocfs2_inode_info *oi = OCFS2_I(inode);
95
96 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
97 file->f_dentry->d_name.len,
98 file->f_dentry->d_name.name);
99
100 spin_lock(&oi->ip_lock);
101 if (!--oi->ip_open_count)
102 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
103 spin_unlock(&oi->ip_lock);
104
105 mlog_exit(0);
106
107 return 0;
108}
109
110static int ocfs2_sync_file(struct file *file,
111 struct dentry *dentry,
112 int datasync)
113{
114 int err = 0;
115 journal_t *journal;
116 struct inode *inode = dentry->d_inode;
117 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
118
119 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
120 dentry->d_name.len, dentry->d_name.name);
121
122 err = ocfs2_sync_inode(dentry->d_inode);
123 if (err)
124 goto bail;
125
126 journal = osb->journal->j_journal;
127 err = journal_force_commit(journal);
128
129bail:
130 mlog_exit(err);
131
132 return (err < 0) ? -EIO : 0;
133}
134
135int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
136 struct inode *inode,
137 struct buffer_head *fe_bh,
138 u64 new_i_size)
139{
140 int status;
141
142 mlog_entry_void();
143 i_size_write(inode, new_i_size);
144 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
145 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
146
147 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
148 if (status < 0) {
149 mlog_errno(status);
150 goto bail;
151 }
152
153bail:
154 mlog_exit(status);
155 return status;
156}
157
158static int ocfs2_simple_size_update(struct inode *inode,
159 struct buffer_head *di_bh,
160 u64 new_i_size)
161{
162 int ret;
163 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
164 struct ocfs2_journal_handle *handle = NULL;
165
166 handle = ocfs2_start_trans(osb, NULL,
167 OCFS2_INODE_UPDATE_CREDITS);
168 if (handle == NULL) {
169 ret = -ENOMEM;
170 mlog_errno(ret);
171 goto out;
172 }
173
174 ret = ocfs2_set_inode_size(handle, inode, di_bh,
175 new_i_size);
176 if (ret < 0)
177 mlog_errno(ret);
178
179 ocfs2_commit_trans(handle);
180out:
181 return ret;
182}
183
184static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
185 struct inode *inode,
186 struct buffer_head *fe_bh,
187 u64 new_i_size)
188{
189 int status;
190 struct ocfs2_journal_handle *handle;
191
192 mlog_entry_void();
193
194 /* TODO: This needs to actually orphan the inode in this
195 * transaction. */
196
197 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
198 if (IS_ERR(handle)) {
199 status = PTR_ERR(handle);
200 mlog_errno(status);
201 goto out;
202 }
203
204 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
205 if (status < 0)
206 mlog_errno(status);
207
208 ocfs2_commit_trans(handle);
209out:
210 mlog_exit(status);
211 return status;
212}
213
214static int ocfs2_truncate_file(struct inode *inode,
215 struct buffer_head *di_bh,
216 u64 new_i_size)
217{
218 int status = 0;
219 struct ocfs2_dinode *fe = NULL;
220 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
221 struct ocfs2_truncate_context *tc = NULL;
222
223	mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64")\n",
224 OCFS2_I(inode)->ip_blkno, new_i_size);
225
226 truncate_inode_pages(inode->i_mapping, new_i_size);
227
228 fe = (struct ocfs2_dinode *) di_bh->b_data;
229 if (!OCFS2_IS_VALID_DINODE(fe)) {
230 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
231 status = -EIO;
232 goto bail;
233 }
234
235 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
236 "Inode %"MLFu64", inode i_size = %lld != di "
237 "i_size = %"MLFu64", i_flags = 0x%x\n",
238 OCFS2_I(inode)->ip_blkno,
239 i_size_read(inode),
240 le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags));
241
242 if (new_i_size > le64_to_cpu(fe->i_size)) {
243 mlog(0, "asked to truncate file with size (%"MLFu64") "
244 "to size (%"MLFu64")!\n",
245 le64_to_cpu(fe->i_size), new_i_size);
246 status = -EINVAL;
247 mlog_errno(status);
248 goto bail;
249 }
250
251 mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n",
252 le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size);
253
254	/* let's handle the simple truncate cases before doing any more
255	 * cluster locking. */
256 if (new_i_size == le64_to_cpu(fe->i_size))
257 goto bail;
258
259 if (le32_to_cpu(fe->i_clusters) ==
260 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
261 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
262 fe->i_clusters);
263		/* No allocation change is required, so let's fast path
264		 * this truncate. */
265 status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
266 if (status < 0)
267 mlog_errno(status);
268 goto bail;
269 }
270
271 /* This forces other nodes to sync and drop their pages */
272 status = ocfs2_data_lock(inode, 1);
273 if (status < 0) {
274 mlog_errno(status);
275 goto bail;
276 }
277 ocfs2_data_unlock(inode, 1);
278
279 /* alright, we're going to need to do a full blown alloc size
280 * change. Orphan the inode so that recovery can complete the
281	 * truncate if necessary.  This also takes care of updating
282	 * i_size. */
283 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
284 if (status < 0) {
285 mlog_errno(status);
286 goto bail;
287 }
288
289 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
290 if (status < 0) {
291 mlog_errno(status);
292 goto bail;
293 }
294
295 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
296 if (status < 0) {
297 mlog_errno(status);
298 goto bail;
299 }
300
301 /* TODO: orphan dir cleanup here. */
302bail:
303
304 mlog_exit(status);
305 return status;
306}
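/*
 * Editorial summary of ocfs2_truncate_file()'s decision ladder, not
 * part of the original patch:
 *
 *   new_i_size == i_size      -> nothing to do
 *   cluster count unchanged   -> fast path via ocfs2_simple_size_update()
 *   otherwise                 -> data lock/unlock to make other nodes
 *                                drop their pages, orphan the inode,
 *                                then prepare and commit a full truncate
 */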
307
308/*
309 * extend allocation only here.
310 * we'll update all the disk stuff, and oip->alloc_size
311 *
312 * Expects the relevant locks to be held, a transaction started, and
313 * enough data / metadata reservations in the contexts.
314 *
315 * Will return -EAGAIN, and a reason if a restart is needed.
316 * If passed in, *reason_ret will always be set, even on error.
317 */
318int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
319 struct inode *inode,
320 u32 clusters_to_add,
321 struct buffer_head *fe_bh,
322 struct ocfs2_journal_handle *handle,
323 struct ocfs2_alloc_context *data_ac,
324 struct ocfs2_alloc_context *meta_ac,
325 enum ocfs2_alloc_restarted *reason_ret)
326{
327 int status = 0;
328 int free_extents;
329 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
330 enum ocfs2_alloc_restarted reason = RESTART_NONE;
331 u32 bit_off, num_bits;
332 u64 block;
333
334 BUG_ON(!clusters_to_add);
335
336 free_extents = ocfs2_num_free_extents(osb, inode, fe);
337 if (free_extents < 0) {
338 status = free_extents;
339 mlog_errno(status);
340 goto leave;
341 }
342
343	/* there are two situations which could cause us to EAGAIN in the
344	 * we-need-more-metadata case:
345 * 1) we haven't reserved *any*
346 * 2) we are so fragmented, we've needed to add metadata too
347 * many times. */
348 if (!free_extents && !meta_ac) {
349 mlog(0, "we haven't reserved any metadata!\n");
350 status = -EAGAIN;
351 reason = RESTART_META;
352 goto leave;
353 } else if ((!free_extents)
354 && (ocfs2_alloc_context_bits_left(meta_ac)
355 < ocfs2_extend_meta_needed(fe))) {
356 mlog(0, "filesystem is really fragmented...\n");
357 status = -EAGAIN;
358 reason = RESTART_META;
359 goto leave;
360 }
361
362 status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
363 &bit_off, &num_bits);
364 if (status < 0) {
365 if (status != -ENOSPC)
366 mlog_errno(status);
367 goto leave;
368 }
369
370 BUG_ON(num_bits > clusters_to_add);
371
372 /* reserve our write early -- insert_extent may update the inode */
373 status = ocfs2_journal_access(handle, inode, fe_bh,
374 OCFS2_JOURNAL_ACCESS_WRITE);
375 if (status < 0) {
376 mlog_errno(status);
377 goto leave;
378 }
379
380 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
381 mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n",
382 num_bits, bit_off, OCFS2_I(inode)->ip_blkno);
383 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
384 num_bits, meta_ac);
385 if (status < 0) {
386 mlog_errno(status);
387 goto leave;
388 }
389
390 le32_add_cpu(&fe->i_clusters, num_bits);
391 spin_lock(&OCFS2_I(inode)->ip_lock);
392 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
393 spin_unlock(&OCFS2_I(inode)->ip_lock);
394
395 status = ocfs2_journal_dirty(handle, fe_bh);
396 if (status < 0) {
397 mlog_errno(status);
398 goto leave;
399 }
400
401 clusters_to_add -= num_bits;
402
403 if (clusters_to_add) {
404 mlog(0, "need to alloc once more, clusters = %u, wanted = "
405 "%u\n", fe->i_clusters, clusters_to_add);
406 status = -EAGAIN;
407 reason = RESTART_TRANS;
408 }
409
410leave:
411 mlog_exit(status);
412 if (reason_ret)
413 *reason_ret = reason;
414 return status;
415}
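/*
 * Editorial schematic of the restart contract, condensed from the
 * caller below (no new behaviour):
 *
 *	status = ocfs2_do_extend_allocation(..., &why);
 *	if (status == -EAGAIN && clusters_to_add) {
 *		if (why == RESTART_META)
 *			redo the reservations from the top (restart_all);
 *		else, for RESTART_TRANS,
 *			ocfs2_extend_trans() and retry the same handle
 *			(restarted_transaction);
 *	}
 */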
416
417static int ocfs2_extend_allocation(struct inode *inode,
418 u32 clusters_to_add)
419{
420 int status = 0;
421 int restart_func = 0;
422 int drop_alloc_sem = 0;
423 int credits, num_free_extents;
424 u32 prev_clusters;
425 struct buffer_head *bh = NULL;
426 struct ocfs2_dinode *fe = NULL;
427 struct ocfs2_journal_handle *handle = NULL;
428 struct ocfs2_alloc_context *data_ac = NULL;
429 struct ocfs2_alloc_context *meta_ac = NULL;
430 enum ocfs2_alloc_restarted why;
431 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
432
433 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
434
435 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
436 OCFS2_BH_CACHED, inode);
437 if (status < 0) {
438 mlog_errno(status);
439 goto leave;
440 }
441
442 fe = (struct ocfs2_dinode *) bh->b_data;
443 if (!OCFS2_IS_VALID_DINODE(fe)) {
444 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
445 status = -EIO;
446 goto leave;
447 }
448
449restart_all:
450 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
451
452 mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
453 "clusters_to_add = %u\n",
454 OCFS2_I(inode)->ip_blkno, i_size_read(inode),
455 fe->i_clusters, clusters_to_add);
456
457 handle = ocfs2_alloc_handle(osb);
458 if (handle == NULL) {
459 status = -ENOMEM;
460 mlog_errno(status);
461 goto leave;
462 }
463
464 num_free_extents = ocfs2_num_free_extents(osb,
465 inode,
466 fe);
467 if (num_free_extents < 0) {
468 status = num_free_extents;
469 mlog_errno(status);
470 goto leave;
471 }
472
473 if (!num_free_extents) {
474 status = ocfs2_reserve_new_metadata(osb,
475 handle,
476 fe,
477 &meta_ac);
478 if (status < 0) {
479 if (status != -ENOSPC)
480 mlog_errno(status);
481 goto leave;
482 }
483 }
484
485 status = ocfs2_reserve_clusters(osb,
486 handle,
487 clusters_to_add,
488 &data_ac);
489 if (status < 0) {
490 if (status != -ENOSPC)
491 mlog_errno(status);
492 goto leave;
493 }
494
495	/* blocks people in read/write from reading our allocation
496 * until we're done changing it. We depend on i_mutex to block
497 * other extend/truncate calls while we're here. Ordering wrt
498 * start_trans is important here -- always do it before! */
499 down_write(&OCFS2_I(inode)->ip_alloc_sem);
500 drop_alloc_sem = 1;
501
502 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
503 handle = ocfs2_start_trans(osb, handle, credits);
504 if (IS_ERR(handle)) {
505 status = PTR_ERR(handle);
506 handle = NULL;
507 mlog_errno(status);
508 goto leave;
509 }
510
511restarted_transaction:
512	/* reserve a write to the file entry early on - that way if we
513 * run out of credits in the allocation path, we can still
514 * update i_size. */
515 status = ocfs2_journal_access(handle, inode, bh,
516 OCFS2_JOURNAL_ACCESS_WRITE);
517 if (status < 0) {
518 mlog_errno(status);
519 goto leave;
520 }
521
522 prev_clusters = OCFS2_I(inode)->ip_clusters;
523
524 status = ocfs2_do_extend_allocation(osb,
525 inode,
526 clusters_to_add,
527 bh,
528 handle,
529 data_ac,
530 meta_ac,
531 &why);
532 if ((status < 0) && (status != -EAGAIN)) {
533 if (status != -ENOSPC)
534 mlog_errno(status);
535 goto leave;
536 }
537
538 status = ocfs2_journal_dirty(handle, bh);
539 if (status < 0) {
540 mlog_errno(status);
541 goto leave;
542 }
543
544 spin_lock(&OCFS2_I(inode)->ip_lock);
545 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
546 spin_unlock(&OCFS2_I(inode)->ip_lock);
547
548 if (why != RESTART_NONE && clusters_to_add) {
549 if (why == RESTART_META) {
550 mlog(0, "restarting function.\n");
551 restart_func = 1;
552 } else {
553 BUG_ON(why != RESTART_TRANS);
554
555 mlog(0, "restarting transaction.\n");
556 /* TODO: This can be more intelligent. */
557 credits = ocfs2_calc_extend_credits(osb->sb,
558 fe,
559 clusters_to_add);
560 status = ocfs2_extend_trans(handle, credits);
561 if (status < 0) {
562 /* handle still has to be committed at
563 * this point. */
564 status = -ENOMEM;
565 mlog_errno(status);
566 goto leave;
567 }
568 goto restarted_transaction;
569 }
570 }
571
572 mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
573 fe->i_clusters, fe->i_size);
574 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
575 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
576
577leave:
578 if (drop_alloc_sem) {
579 up_write(&OCFS2_I(inode)->ip_alloc_sem);
580 drop_alloc_sem = 0;
581 }
582 if (handle) {
583 ocfs2_commit_trans(handle);
584 handle = NULL;
585 }
586 if (data_ac) {
587 ocfs2_free_alloc_context(data_ac);
588 data_ac = NULL;
589 }
590 if (meta_ac) {
591 ocfs2_free_alloc_context(meta_ac);
592 meta_ac = NULL;
593 }
594 if ((!status) && restart_func) {
595 restart_func = 0;
596 goto restart_all;
597 }
598 if (bh) {
599 brelse(bh);
600 bh = NULL;
601 }
602
603 mlog_exit(status);
604 return status;
605}
606
607/* Some parts of this are taken from generic_cont_expand, which turned out
608 * to be too fragile to do exactly what we need without us having to
609 * worry about recursive locking in ->commit_write(). */
610static int ocfs2_write_zero_page(struct inode *inode,
611 u64 size)
612{
613 struct address_space *mapping = inode->i_mapping;
614 struct page *page;
615 unsigned long index;
616 unsigned int offset;
617 struct ocfs2_journal_handle *handle = NULL;
618 int ret;
619
620 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
621 /* ugh. in prepare/commit_write, if from==to==start of block, we
622 ** skip the prepare. make sure we never send an offset for the start
623 ** of a block
624 */
625 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
626 offset++;
627 }
628 index = size >> PAGE_CACHE_SHIFT;
629
630 page = grab_cache_page(mapping, index);
631 if (!page) {
632 ret = -ENOMEM;
633 mlog_errno(ret);
634 goto out;
635 }
636
637 ret = ocfs2_prepare_write(NULL, page, offset, offset);
638 if (ret < 0) {
639 mlog_errno(ret);
640 goto out_unlock;
641 }
642
643 if (ocfs2_should_order_data(inode)) {
644 handle = ocfs2_start_walk_page_trans(inode, page, offset,
645 offset);
646 if (IS_ERR(handle)) {
647 ret = PTR_ERR(handle);
648 handle = NULL;
649 goto out_unlock;
650 }
651 }
652
653 /* must not update i_size! */
654 ret = block_commit_write(page, offset, offset);
655 if (ret < 0)
656 mlog_errno(ret);
657 else
658 ret = 0;
659
660 if (handle)
661 ocfs2_commit_trans(handle);
662out_unlock:
663 unlock_page(page);
664 page_cache_release(page);
665out:
666 return ret;
667}
668
669static int ocfs2_zero_extend(struct inode *inode,
670 u64 zero_to_size)
671{
672 int ret = 0;
673 u64 start_off;
674 struct super_block *sb = inode->i_sb;
675
676 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
677 while (start_off < zero_to_size) {
678 ret = ocfs2_write_zero_page(inode, start_off);
679 if (ret < 0) {
680 mlog_errno(ret);
681 goto out;
682 }
683
684 start_off += sb->s_blocksize;
685 }
686
687out:
688 return ret;
689}
690
691static int ocfs2_extend_file(struct inode *inode,
692 struct buffer_head *di_bh,
693 u64 new_i_size)
694{
695 int ret = 0;
696 u32 clusters_to_add;
697
698 /* setattr sometimes calls us like this. */
699 if (new_i_size == 0)
700 goto out;
701
702 if (i_size_read(inode) == new_i_size)
703 goto out;
704 BUG_ON(new_i_size < i_size_read(inode));
705
706 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
707 OCFS2_I(inode)->ip_clusters;
708
709 if (clusters_to_add) {
710 ret = ocfs2_extend_allocation(inode, clusters_to_add);
711 if (ret < 0) {
712 mlog_errno(ret);
713 goto out;
714 }
715
716 ret = ocfs2_zero_extend(inode, new_i_size);
717 if (ret < 0) {
718 mlog_errno(ret);
719 goto out;
720 }
721 }
722
723 /* No allocation required, we just use this helper to
724 * do a trivial update of i_size. */
725 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
726 if (ret < 0) {
727 mlog_errno(ret);
728 goto out;
729 }
730
731out:
732 return ret;
733}
734
735int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
736{
737 int status = 0, size_change;
738 struct inode *inode = dentry->d_inode;
739 struct super_block *sb = inode->i_sb;
740 struct ocfs2_super *osb = OCFS2_SB(sb);
741 struct buffer_head *bh = NULL;
742 struct ocfs2_journal_handle *handle = NULL;
743
744 mlog_entry("(0x%p, '%.*s')\n", dentry,
745 dentry->d_name.len, dentry->d_name.name);
746
747 if (attr->ia_valid & ATTR_MODE)
748 mlog(0, "mode change: %d\n", attr->ia_mode);
749 if (attr->ia_valid & ATTR_UID)
750 mlog(0, "uid change: %d\n", attr->ia_uid);
751 if (attr->ia_valid & ATTR_GID)
752 mlog(0, "gid change: %d\n", attr->ia_gid);
753 if (attr->ia_valid & ATTR_SIZE)
754 mlog(0, "size change...\n");
755 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
756 mlog(0, "time change...\n");
757
758#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
759 | ATTR_GID | ATTR_UID | ATTR_MODE)
760 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
761 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
762 return 0;
763 }
764
765 status = inode_change_ok(inode, attr);
766 if (status)
767 return status;
768
769 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
770 if (size_change) {
771 status = ocfs2_rw_lock(inode, 1);
772 if (status < 0) {
773 mlog_errno(status);
774 goto bail;
775 }
776 }
777
778 status = ocfs2_meta_lock(inode, NULL, &bh, 1);
779 if (status < 0) {
780 if (status != -ENOENT)
781 mlog_errno(status);
782 goto bail_unlock_rw;
783 }
784
785 if (size_change && attr->ia_size != i_size_read(inode)) {
786 if (i_size_read(inode) > attr->ia_size)
787 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
788 else
789 status = ocfs2_extend_file(inode, bh, attr->ia_size);
790 if (status < 0) {
791 if (status != -ENOSPC)
792 mlog_errno(status);
793 status = -ENOSPC;
794 goto bail_unlock;
795 }
796 }
797
798 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
799 if (IS_ERR(handle)) {
800 status = PTR_ERR(handle);
801 mlog_errno(status);
802 goto bail_unlock;
803 }
804
805 status = inode_setattr(inode, attr);
806 if (status < 0) {
807 mlog_errno(status);
808 goto bail_commit;
809 }
810
811 status = ocfs2_mark_inode_dirty(handle, inode, bh);
812 if (status < 0)
813 mlog_errno(status);
814
815bail_commit:
816 ocfs2_commit_trans(handle);
817bail_unlock:
818 ocfs2_meta_unlock(inode, 1);
819bail_unlock_rw:
820 if (size_change)
821 ocfs2_rw_unlock(inode, 1);
822bail:
823 if (bh)
824 brelse(bh);
825
826 mlog_exit(status);
827 return status;
828}
829
830int ocfs2_getattr(struct vfsmount *mnt,
831 struct dentry *dentry,
832 struct kstat *stat)
833{
834 struct inode *inode = dentry->d_inode;
835 struct super_block *sb = dentry->d_inode->i_sb;
836 struct ocfs2_super *osb = sb->s_fs_info;
837 int err;
838
839 mlog_entry_void();
840
841 err = ocfs2_inode_revalidate(dentry);
842 if (err) {
843 if (err != -ENOENT)
844 mlog_errno(err);
845 goto bail;
846 }
847
848 generic_fillattr(inode, stat);
849
850 /* We set the blksize from the cluster size for performance */
851 stat->blksize = osb->s_clustersize;
852
853bail:
854 mlog_exit(err);
855
856 return err;
857}
858
859static int ocfs2_write_remove_suid(struct inode *inode)
860{
861 int ret;
862 struct buffer_head *bh = NULL;
863 struct ocfs2_inode_info *oi = OCFS2_I(inode);
864 struct ocfs2_journal_handle *handle;
865 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
866 struct ocfs2_dinode *di;
867
868 mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno,
869 inode->i_mode);
870
871 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
872 if (handle == NULL) {
873 ret = -ENOMEM;
874 mlog_errno(ret);
875 goto out;
876 }
877
878 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
879 if (ret < 0) {
880 mlog_errno(ret);
881 goto out_trans;
882 }
883
884 ret = ocfs2_journal_access(handle, inode, bh,
885 OCFS2_JOURNAL_ACCESS_WRITE);
886 if (ret < 0) {
887 mlog_errno(ret);
888 goto out_bh;
889 }
890
891 inode->i_mode &= ~S_ISUID;
892 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
893 inode->i_mode &= ~S_ISGID;
894
895 di = (struct ocfs2_dinode *) bh->b_data;
896 di->i_mode = cpu_to_le16(inode->i_mode);
897
898 ret = ocfs2_journal_dirty(handle, bh);
899 if (ret < 0)
900 mlog_errno(ret);
901out_bh:
902 brelse(bh);
903out_trans:
904 ocfs2_commit_trans(handle);
905out:
906 mlog_exit(ret);
907 return ret;
908}
909
910static inline int ocfs2_write_should_remove_suid(struct inode *inode)
911{
912 mode_t mode = inode->i_mode;
913
914 if (!capable(CAP_FSETID)) {
915 if (unlikely(mode & S_ISUID))
916 return 1;
917
918 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
919 return 1;
920 }
921 return 0;
922}
923
924static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
925 const char __user *buf,
926 size_t count,
927 loff_t pos)
928{
929 struct iovec local_iov = { .iov_base = (void __user *)buf,
930 .iov_len = count };
931 int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
932 u32 clusters;
933 struct file *filp = iocb->ki_filp;
934 struct inode *inode = filp->f_dentry->d_inode;
935 loff_t newsize, saved_pos;
936#ifdef OCFS2_ORACORE_WORKAROUNDS
937 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
938#endif
939
940 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
941 (unsigned int)count,
942 filp->f_dentry->d_name.len,
943 filp->f_dentry->d_name.name);
944
945 /* happy write of zero bytes */
946 if (count == 0)
947 return 0;
948
949 if (!inode) {
950 mlog(0, "bad inode\n");
951 return -EIO;
952 }
953
954#ifdef OCFS2_ORACORE_WORKAROUNDS
955 /* ugh, work around some applications which open everything O_DIRECT +
956 * O_APPEND and really don't mean to use O_DIRECT. */
957 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
958 (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT))
959 filp->f_flags &= ~O_DIRECT;
960#endif
961
962 mutex_lock(&inode->i_mutex);
963 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
964 if (filp->f_flags & O_DIRECT) {
965 have_alloc_sem = 1;
966 down_read(&inode->i_alloc_sem);
967 }
968
969 /* concurrent O_DIRECT writes are allowed */
970 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
971 ret = ocfs2_rw_lock(inode, rw_level);
972 if (ret < 0) {
973 rw_level = -1;
974 mlog_errno(ret);
975 goto out;
976 }
977
978 /*
979 * We sample i_size under a read level meta lock to see if our write
980 * is extending the file, if it is we back off and get a write level
981 * meta lock.
982 */
983 meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
984 for(;;) {
985 ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
986 if (ret < 0) {
987 meta_level = -1;
988 mlog_errno(ret);
989 goto out;
990 }
991
992 /* Clear suid / sgid if necessary. We do this here
993 * instead of later in the write path because
994 * remove_suid() calls ->setattr without any hint that
995 * we may have already done our cluster locking. Since
996 * ocfs2_setattr() *must* take cluster locks to
997		 * proceed, this will lead us to recursively lock the
998 * inode. There's also the dinode i_size state which
999 * can be lost via setattr during extending writes (we
1000		 * set inode->i_size at the end of a write.) */
1001 if (ocfs2_write_should_remove_suid(inode)) {
1002 if (meta_level == 0) {
1003 ocfs2_meta_unlock(inode, meta_level);
1004 meta_level = 1;
1005 continue;
1006 }
1007
1008 ret = ocfs2_write_remove_suid(inode);
1009 if (ret < 0) {
1010 mlog_errno(ret);
1011 goto out;
1012 }
1013 }
1014
1015 /* work on a copy of ppos until we're sure that we won't have
1016 * to recalculate it due to relocking. */
1017 if (filp->f_flags & O_APPEND) {
1018 saved_pos = i_size_read(inode);
1019 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
1020 } else {
1021 saved_pos = iocb->ki_pos;
1022 }
1023 newsize = count + saved_pos;
1024
1025 mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
1026 saved_pos, newsize, i_size_read(inode));
1027
1028 /* No need for a higher level metadata lock if we're
1029 * never going past i_size. */
1030 if (newsize <= i_size_read(inode))
1031 break;
1032
1033 if (meta_level == 0) {
1034 ocfs2_meta_unlock(inode, meta_level);
1035 meta_level = 1;
1036 continue;
1037 }
1038
1039 spin_lock(&OCFS2_I(inode)->ip_lock);
1040 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
1041 OCFS2_I(inode)->ip_clusters;
1042 spin_unlock(&OCFS2_I(inode)->ip_lock);
1043
1044 mlog(0, "Writing at EOF, may need more allocation: "
1045 "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
1046 i_size_read(inode), newsize, clusters);
1047
1048 /* We only want to continue the rest of this loop if
1049 * our extend will actually require more
1050 * allocation. */
1051 if (!clusters)
1052 break;
1053
1054 ret = ocfs2_extend_allocation(inode, clusters);
1055 if (ret < 0) {
1056 if (ret != -ENOSPC)
1057 mlog_errno(ret);
1058 goto out;
1059 }
1060
1061 /* Fill any holes which would've been created by this
1062 * write. If we're O_APPEND, this will wind up
1063 * (correctly) being a noop. */
1064 ret = ocfs2_zero_extend(inode, (u64) newsize - count);
1065 if (ret < 0) {
1066 mlog_errno(ret);
1067 goto out;
1068 }
1069 break;
1070 }
1071
1072 /* ok, we're done with i_size and alloc work */
1073 iocb->ki_pos = saved_pos;
1074 ocfs2_meta_unlock(inode, meta_level);
1075 meta_level = -1;
1076
1077 /* communicate with ocfs2_dio_end_io */
1078 ocfs2_iocb_set_rw_locked(iocb);
1079
1080#ifdef OCFS2_ORACORE_WORKAROUNDS
1081 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
1082 filp->f_flags & O_DIRECT) {
1083 unsigned int saved_flags = filp->f_flags;
1084 int sector_size = 1 << osb->s_sectsize_bits;
1085
1086 if ((saved_pos & (sector_size - 1)) ||
1087 (count & (sector_size - 1)) ||
1088 ((unsigned long)buf & (sector_size - 1))) {
1089 filp->f_flags |= O_SYNC;
1090 filp->f_flags &= ~O_DIRECT;
1091 }
1092
1093 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1094 &iocb->ki_pos);
1095
1096 filp->f_flags = saved_flags;
1097 } else
1098#endif
1099 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1100 &iocb->ki_pos);
1101
1102 /* buffered aio wouldn't have proper lock coverage today */
1103 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1104
1105 /*
1106 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
1107 * function pointer which is called when o_direct io completes so that
1108 * it can unlock our rw lock. (it's the clustered equivalent of
1109 * i_alloc_sem; protects truncate from racing with pending ios).
1110 * Unfortunately there are error cases which call end_io and others
1111	 * that don't, so we don't have to unlock the rw_lock if either an
1112 * async dio is going to do it in the future or an end_io after an
1113 * error has already done it.
1114 */
1115 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1116 rw_level = -1;
1117 have_alloc_sem = 0;
1118 }
1119
1120out:
1121 if (meta_level != -1)
1122 ocfs2_meta_unlock(inode, meta_level);
1123 if (have_alloc_sem)
1124 up_read(&inode->i_alloc_sem);
1125 if (rw_level != -1)
1126 ocfs2_rw_unlock(inode, rw_level);
1127 mutex_unlock(&inode->i_mutex);
1128
1129 mlog_exit(ret);
1130 return ret;
1131}
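/*
 * Editorial schematic of the meta-lock upgrade loop in
 * ocfs2_file_aio_write() above, not new behaviour:
 *
 *	level = (O_APPEND) ? 1 : 0;
 *	for (;;) {
 *		ocfs2_meta_lock(inode, NULL, NULL, level);
 *		if (the write stays within i_size)
 *			break;			read level was enough
 *		if (level == 0) {
 *			ocfs2_meta_unlock(inode, 0);
 *			level = 1;		retry at write level
 *			continue;
 *		}
 *		extend the allocation, zero the gap, break;
 *	}
 */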
1132
1133static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1134 char __user *buf,
1135 size_t count,
1136 loff_t pos)
1137{
1138 int ret = 0, rw_level = -1, have_alloc_sem = 0;
1139 struct file *filp = iocb->ki_filp;
1140 struct inode *inode = filp->f_dentry->d_inode;
1141#ifdef OCFS2_ORACORE_WORKAROUNDS
1142 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1143#endif
1144
1145 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
1146 (unsigned int)count,
1147 filp->f_dentry->d_name.len,
1148 filp->f_dentry->d_name.name);
1149
1150 if (!inode) {
1151 ret = -EINVAL;
1152 mlog_errno(ret);
1153 goto bail;
1154 }
1155
1156#ifdef OCFS2_ORACORE_WORKAROUNDS
1157 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
1158 if (filp->f_flags & O_DIRECT) {
1159 int sector_size = 1 << osb->s_sectsize_bits;
1160
1161 if ((pos & (sector_size - 1)) ||
1162 (count & (sector_size - 1)) ||
1163 ((unsigned long)buf & (sector_size - 1)) ||
1164 (i_size_read(inode) & (sector_size -1))) {
1165 filp->f_flags &= ~O_DIRECT;
1166 }
1167 }
1168 }
1169#endif
1170
1171 /*
1172 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
1173 * need locks to protect pending reads from racing with truncate.
1174 */
1175 if (filp->f_flags & O_DIRECT) {
1176 down_read(&inode->i_alloc_sem);
1177 have_alloc_sem = 1;
1178
1179 ret = ocfs2_rw_lock(inode, 0);
1180 if (ret < 0) {
1181 mlog_errno(ret);
1182 goto bail;
1183 }
1184 rw_level = 0;
1185 /* communicate with ocfs2_dio_end_io */
1186 ocfs2_iocb_set_rw_locked(iocb);
1187 }
1188
1189 ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
1190 if (ret == -EINVAL)
1191 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
1192
1193 /* buffered aio wouldn't have proper lock coverage today */
1194 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1195
1196 /* see ocfs2_file_aio_write */
1197 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1198 rw_level = -1;
1199 have_alloc_sem = 0;
1200 }
1201
1202bail:
1203 if (have_alloc_sem)
1204 up_read(&inode->i_alloc_sem);
1205 if (rw_level != -1)
1206 ocfs2_rw_unlock(inode, rw_level);
1207 mlog_exit(ret);
1208
1209 return ret;
1210}
1211
1212struct inode_operations ocfs2_file_iops = {
1213 .setattr = ocfs2_setattr,
1214 .getattr = ocfs2_getattr,
1215};
1216
1217struct inode_operations ocfs2_special_file_iops = {
1218 .setattr = ocfs2_setattr,
1219 .getattr = ocfs2_getattr,
1220};
1221
1222struct file_operations ocfs2_fops = {
1223 .read = do_sync_read,
1224 .write = do_sync_write,
1225 .sendfile = generic_file_sendfile,
1226 .mmap = ocfs2_mmap,
1227 .fsync = ocfs2_sync_file,
1228 .release = ocfs2_file_release,
1229 .open = ocfs2_file_open,
1230 .aio_read = ocfs2_file_aio_read,
1231 .aio_write = ocfs2_file_aio_write,
1232};
1233
1234struct file_operations ocfs2_dops = {
1235 .read = generic_read_dir,
1236 .readdir = ocfs2_readdir,
1237 .fsync = ocfs2_sync_file,
1238};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
new file mode 100644
index 00000000000..a5ea33b2406
--- /dev/null
+++ b/fs/ocfs2/file.h
@@ -0,0 +1,57 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * file.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_FILE_H
27#define OCFS2_FILE_H
28
29extern struct file_operations ocfs2_fops;
30extern struct file_operations ocfs2_dops;
31extern struct inode_operations ocfs2_file_iops;
32extern struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context;
34
35enum ocfs2_alloc_restarted {
36 RESTART_NONE = 0,
37 RESTART_TRANS,
38 RESTART_META
39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode,
42 u32 clusters_to_add,
43 struct buffer_head *fe_bh,
44 struct ocfs2_journal_handle *handle,
45 struct ocfs2_alloc_context *data_ac,
46 struct ocfs2_alloc_context *meta_ac,
47 enum ocfs2_alloc_restarted *reason);
48int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
49int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
50 struct kstat *stat);
51
52int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
53 struct inode *inode,
54 struct buffer_head *fe_bh,
55 u64 new_i_size);
56
57#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
new file mode 100644
index 00000000000..0bbd22f46c8
--- /dev/null
+++ b/fs/ocfs2/heartbeat.c
@@ -0,0 +1,378 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * heartbeat.c
5 *
6 * Register ourselves with the heartbeat service, keep our node maps
7 * up to date, and fire off recovery when needed.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/kmod.h>
32
33#include <cluster/heartbeat.h>
34#include <cluster/nodemanager.h>
35
36#include <dlm/dlmapi.h>
37
38#define MLOG_MASK_PREFIX ML_SUPER
39#include <cluster/masklog.h>
40
41#include "ocfs2.h"
42
43#include "alloc.h"
44#include "heartbeat.h"
45#include "inode.h"
46#include "journal.h"
47#include "vote.h"
48
49#include "buffer_head_io.h"
50
51#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
52#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
53
54static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
55 int bit);
56static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
57 int bit);
58static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
59static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
60 struct ocfs2_node_map *from);
61static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
62 struct ocfs2_node_map *from);
63
64void ocfs2_init_node_maps(struct ocfs2_super *osb)
65{
66 spin_lock_init(&osb->node_map_lock);
67 ocfs2_node_map_init(&osb->mounted_map);
68 ocfs2_node_map_init(&osb->recovery_map);
69 ocfs2_node_map_init(&osb->umount_map);
70}
71
72static void ocfs2_do_node_down(int node_num,
73 struct ocfs2_super *osb)
74{
75 BUG_ON(osb->node_num == node_num);
76
77 mlog(0, "ocfs2: node down event for %d\n", node_num);
78
79 if (!osb->dlm) {
80 /*
81 * No DLM means we're not even ready to participate yet.
82 * We check the slots after the DLM comes up, so we will
83 * notice the node death then. We can safely ignore it
84 * here.
85 */
86 return;
87 }
88
89 if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
90 /* If a node is in the umount map, then we've been
91 * expecting him to go down and we know ahead of time
92 * that recovery is not necessary. */
93 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
94 return;
95 }
96
97 ocfs2_recovery_thread(osb, node_num);
98
99 ocfs2_remove_node_from_vote_queues(osb, node_num);
100}
101
102static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
103 int node_num,
104 void *data)
105{
106 ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
107}
108
109/* Called from the dlm when it's about to evict a node. We may also
110 * get a heartbeat callback later. */
111static void ocfs2_dlm_eviction_cb(int node_num,
112 void *data)
113{
114 struct ocfs2_super *osb = (struct ocfs2_super *) data;
115 struct super_block *sb = osb->sb;
116
117 mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
118 MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
119
120 ocfs2_do_node_down(node_num, osb);
121}
122
123static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
124 int node_num,
125 void *data)
126{
127 struct ocfs2_super *osb = data;
128
129 BUG_ON(osb->node_num == node_num);
130
131 mlog(0, "node up event for %d\n", node_num);
132 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
133}
134
135void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
136{
137 o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
138 ocfs2_hb_node_down_cb, osb,
139 OCFS2_HB_NODE_DOWN_PRI);
140
141 o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
142 ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
143
144 /* Not exactly a heartbeat callback, but leads to essentially
145 * the same path so we set it up here. */
146 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
147 ocfs2_dlm_eviction_cb,
148 osb);
149}
150
151/* Most functions here are just stubs for now... */
152int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
153{
154 int status;
155
156 status = o2hb_register_callback(&osb->osb_hb_down);
157 if (status < 0) {
158 mlog_errno(status);
159 goto bail;
160 }
161
162 status = o2hb_register_callback(&osb->osb_hb_up);
163 if (status < 0)
164 mlog_errno(status);
165
166bail:
167 return status;
168}
169
170void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
171{
172 int status;
173
174 status = o2hb_unregister_callback(&osb->osb_hb_down);
175 if (status < 0)
176 mlog_errno(status);
177
178 status = o2hb_unregister_callback(&osb->osb_hb_up);
179 if (status < 0)
180 mlog_errno(status);
181}
182
183void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
184{
185 int ret;
186 char *argv[5], *envp[3];
187
188 if (!osb->uuid_str) {
189 /* This can happen if we don't get far enough in mount... */
190		mlog(0, "No UUID with which to stop heartbeat!\n");
191 return;
192 }
193
194 argv[0] = (char *)o2nm_get_hb_ctl_path();
195 argv[1] = "-K";
196 argv[2] = "-u";
197 argv[3] = osb->uuid_str;
198 argv[4] = NULL;
199
200 mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
201
202 /* minimal command environment taken from cpu_run_sbin_hotplug */
203 envp[0] = "HOME=/";
204 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
205 envp[2] = NULL;
206
207 ret = call_usermodehelper(argv[0], argv, envp, 1);
208 if (ret < 0)
209 mlog_errno(ret);
210}
211
212/* special case -1 for now
213 * TODO: should *really* make sure the calling func never passes -1!! */
214void ocfs2_node_map_init(struct ocfs2_node_map *map)
215{
216 map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
217 memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
218 sizeof(unsigned long));
219}
220
221static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
222 int bit)
223{
224 set_bit(bit, map->map);
225}
226
227void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
228 struct ocfs2_node_map *map,
229 int bit)
230{
231 if (bit==-1)
232 return;
233 BUG_ON(bit >= map->num_nodes);
234 spin_lock(&osb->node_map_lock);
235 __ocfs2_node_map_set_bit(map, bit);
236 spin_unlock(&osb->node_map_lock);
237}
238
239static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
240 int bit)
241{
242 clear_bit(bit, map->map);
243}
244
245void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
246 struct ocfs2_node_map *map,
247 int bit)
248{
249 if (bit==-1)
250 return;
251 BUG_ON(bit >= map->num_nodes);
252 spin_lock(&osb->node_map_lock);
253 __ocfs2_node_map_clear_bit(map, bit);
254 spin_unlock(&osb->node_map_lock);
255}
256
257int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
258 struct ocfs2_node_map *map,
259 int bit)
260{
261 int ret;
262 if (bit >= map->num_nodes) {
263 mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
264 BUG();
265 }
266 spin_lock(&osb->node_map_lock);
267 ret = test_bit(bit, map->map);
268 spin_unlock(&osb->node_map_lock);
269 return ret;
270}
271
272static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
273{
274 int bit;
275 bit = find_next_bit(map->map, map->num_nodes, 0);
276 if (bit < map->num_nodes)
277 return 0;
278 return 1;
279}
280
281int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
282 struct ocfs2_node_map *map)
283{
284 int ret;
285 BUG_ON(map->num_nodes == 0);
286 spin_lock(&osb->node_map_lock);
287 ret = __ocfs2_node_map_is_empty(map);
288 spin_unlock(&osb->node_map_lock);
289 return ret;
290}
291
292static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
293 struct ocfs2_node_map *from)
294{
295 BUG_ON(from->num_nodes == 0);
296 ocfs2_node_map_init(target);
297 __ocfs2_node_map_set(target, from);
298}
299
300/* returns 1 if bit is the only bit set in target, 0 otherwise */
301int ocfs2_node_map_is_only(struct ocfs2_super *osb,
302 struct ocfs2_node_map *target,
303 int bit)
304{
305 struct ocfs2_node_map temp;
306 int ret;
307
308 spin_lock(&osb->node_map_lock);
309 __ocfs2_node_map_dup(&temp, target);
310 __ocfs2_node_map_clear_bit(&temp, bit);
311 ret = __ocfs2_node_map_is_empty(&temp);
312 spin_unlock(&osb->node_map_lock);
313
314 return ret;
315}
316
317static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
318 struct ocfs2_node_map *from)
319{
320 int num_longs, i;
321
322 BUG_ON(target->num_nodes != from->num_nodes);
323 BUG_ON(target->num_nodes == 0);
324
325 num_longs = BITS_TO_LONGS(target->num_nodes);
326 for (i = 0; i < num_longs; i++)
327 target->map[i] = from->map[i];
328}
329
330/* Returns whether the recovery bit was actually set - it won't be
331 * if the node was already marked as needing recovery. */
332int ocfs2_recovery_map_set(struct ocfs2_super *osb,
333 int num)
334{
335 int set = 0;
336
337 spin_lock(&osb->node_map_lock);
338
339 __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
340
341 if (!test_bit(num, osb->recovery_map.map)) {
342 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
343 set = 1;
344 }
345
346 spin_unlock(&osb->node_map_lock);
347
348 return set;
349}
350
351void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
352 int num)
353{
354 ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
355}
356
357int ocfs2_node_map_iterate(struct ocfs2_super *osb,
358 struct ocfs2_node_map *map,
359 int idx)
360{
361 int i = idx;
362
363 idx = O2NM_INVALID_NODE_NUM;
364 spin_lock(&osb->node_map_lock);
365 if ((i != O2NM_INVALID_NODE_NUM) &&
366 (i >= 0) &&
367 (i < map->num_nodes)) {
368 while(i < map->num_nodes) {
369 if (test_bit(i, map->map)) {
370 idx = i;
371 break;
372 }
373 i++;
374 }
375 }
376 spin_unlock(&osb->node_map_lock);
377 return idx;
378}
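A minimal sketch of walking every node recorded in one of these maps using only the iterate helper above (the function name and printk are hypothetical, not part of this patch):

static void example_walk_map(struct ocfs2_super *osb,
			     struct ocfs2_node_map *map)
{
	/* returns the first set bit at or after index 0, or
	 * O2NM_INVALID_NODE_NUM once the map is exhausted */
	int node = ocfs2_node_map_iterate(osb, map, 0);

	while (node != O2NM_INVALID_NODE_NUM) {
		printk(KERN_INFO "node %d is set\n", node);
		node = ocfs2_node_map_iterate(osb, map, node + 1);
	}
}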
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
new file mode 100644
index 00000000000..e8fb079122e
--- /dev/null
+++ b/fs/ocfs2/heartbeat.h
@@ -0,0 +1,67 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * heartbeat.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_HEARTBEAT_H
27#define OCFS2_HEARTBEAT_H
28
29void ocfs2_init_node_maps(struct ocfs2_super *osb);
30
31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
32int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
33void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
34void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
35
36/* node map functions - used to keep track of mounted and in-recovery
37 * nodes. */
38void ocfs2_node_map_init(struct ocfs2_node_map *map);
39int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
40 struct ocfs2_node_map *map);
41void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
42 struct ocfs2_node_map *map,
43 int bit);
44void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
45 struct ocfs2_node_map *map,
46 int bit);
47int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
48 struct ocfs2_node_map *map,
49 int bit);
50int ocfs2_node_map_iterate(struct ocfs2_super *osb,
51 struct ocfs2_node_map *map,
52 int idx);
53static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
54 struct ocfs2_node_map *map)
55{
56 return ocfs2_node_map_iterate(osb, map, 0);
57}
58int ocfs2_recovery_map_set(struct ocfs2_super *osb,
59 int num);
60void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
61 int num);
62/* returns 1 if bit is the only bit set in target, 0 otherwise */
63int ocfs2_node_map_is_only(struct ocfs2_super *osb,
64 struct ocfs2_node_map *target,
65 int bit);
66
67#endif /* OCFS2_HEARTBEAT_H */
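A sketch of the mount-side lifecycle these prototypes imply: set up, register, then unwind on unmount. The error label and surrounding context are assumptions for illustration, not code from this patch:

	ocfs2_setup_hb_callbacks(osb);

	status = ocfs2_register_hb_callbacks(osb);
	if (status < 0)
		goto out;	/* assumed label; nothing was registered */

	/* ... filesystem mounted and heartbeating ... */

	ocfs2_clear_hb_callbacks(osb);	/* unregister both callbacks */
	ocfs2_stop_heartbeat(osb);	/* run hb_ctl -K for our region */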
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
new file mode 100644
index 00000000000..d4ecc062771
--- /dev/null
+++ b/fs/ocfs2/inode.c
@@ -0,0 +1,1140 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * inode.c
5 *
6 * vfs' aops, fops, dops and iops
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/pagemap.h>
31#include <linux/smp_lock.h>
32
33#include <asm/byteorder.h>
34
35#define MLOG_MASK_PREFIX ML_INODE
36#include <cluster/masklog.h>
37
38#include "ocfs2.h"
39
40#include "alloc.h"
41#include "dlmglue.h"
42#include "extent_map.h"
43#include "file.h"
44#include "inode.h"
45#include "journal.h"
46#include "namei.h"
47#include "suballoc.h"
48#include "super.h"
49#include "symlink.h"
50#include "sysfile.h"
51#include "uptodate.h"
52#include "vote.h"
53
54#include "buffer_head_io.h"
55
56#define OCFS2_FI_FLAG_NOWAIT 0x1
57#define OCFS2_FI_FLAG_DELETE 0x2
58struct ocfs2_find_inode_args
59{
60 u64 fi_blkno;
61 unsigned long fi_ino;
62 unsigned int fi_flags;
63};
64
65static int ocfs2_read_locked_inode(struct inode *inode,
66 struct ocfs2_find_inode_args *args);
67static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
68static int ocfs2_find_actor(struct inode *inode, void *opaque);
69static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
70 struct inode *inode,
71 struct buffer_head *fe_bh);
72
73struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
74 u64 blkno,
75 int delete_vote)
76{
77 struct ocfs2_find_inode_args args;
78
79 /* ocfs2_ilookup_for_vote should *only* be called from the
80 * vote thread */
81 BUG_ON(current != osb->vote_task);
82
83 args.fi_blkno = blkno;
84 args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
85 if (delete_vote)
86 args.fi_flags |= OCFS2_FI_FLAG_DELETE;
87 args.fi_ino = ino_from_blkno(osb->sb, blkno);
88 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
89}
90
91struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno)
92{
93 struct inode *inode = NULL;
94 struct super_block *sb = osb->sb;
95 struct ocfs2_find_inode_args args;
96
97 mlog_entry("(blkno = %"MLFu64")\n", blkno);
98
99 /* Ok. By now we've either got the offsets passed to us by the
100 * caller, or we just pulled them off the bh. Let's do some
101 * sanity checks to make sure they're OK. */
102 if (blkno == 0) {
103 inode = ERR_PTR(-EINVAL);
104 mlog_errno(PTR_ERR(inode));
105 goto bail;
106 }
107
108 args.fi_blkno = blkno;
109 args.fi_flags = 0;
110 args.fi_ino = ino_from_blkno(sb, blkno);
111
112 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
113 ocfs2_init_locked_inode, &args);
114 /* inode was *not* in the inode cache. 2.6.x requires
115 * us to do our own read_inode call and unlock it
116 * afterwards. */
117 if (inode && inode->i_state & I_NEW) {
118 mlog(0, "Inode was not in inode cache, reading it.\n");
119 ocfs2_read_locked_inode(inode, &args);
120 unlock_new_inode(inode);
121 }
122 if (inode == NULL) {
123 inode = ERR_PTR(-ENOMEM);
124 mlog_errno(PTR_ERR(inode));
125 goto bail;
126 }
127 if (is_bad_inode(inode)) {
128 iput(inode);
129 inode = ERR_PTR(-ESTALE);
130 mlog_errno(PTR_ERR(inode));
131 goto bail;
132 }
133
134bail:
135 if (!IS_ERR(inode)) {
136 mlog(0, "returning inode with number %"MLFu64"\n",
137 OCFS2_I(inode)->ip_blkno);
138 mlog_exit_ptr(inode);
139 } else
140 mlog_errno(PTR_ERR(inode));
141
142 return inode;
143}
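A hypothetical caller of ocfs2_iget, matching the conventions above (an ERR_PTR is returned on failure, never NULL; the helper name is made up for illustration):

static int example_use_inode(struct ocfs2_super *osb, u64 blkno)
{
	struct inode *inode = ocfs2_iget(osb, blkno);

	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* -EINVAL, -ENOMEM or -ESTALE */

	/* ... operate on the inode under the usual cluster locks ... */

	iput(inode);	/* drop the reference iget gave us */
	return 0;
}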
144
145
146/*
147 * here's how inodes get read from disk:
148 * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR
149 * found? : return the in-memory inode
150 * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE
151 */
152
153static int ocfs2_find_actor(struct inode *inode, void *opaque)
154{
155 struct ocfs2_find_inode_args *args = NULL;
156 struct ocfs2_inode_info *oi = OCFS2_I(inode);
157 int ret = 0;
158
159 mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
160
161 args = opaque;
162
163 mlog_bug_on_msg(!inode, "No inode in find actor!\n");
164
165 if (oi->ip_blkno != args->fi_blkno)
166 goto bail;
167
168 /* OCFS2_FI_FLAG_NOWAIT is *only* set from
169 * ocfs2_ilookup_for_vote which won't create an inode for one
170 * that isn't found. The vote thread must not get an inode
171 * which is in the process of going away - otherwise
172 * the call to __wait_on_freeing_inode in find_inode_fast will
173 * cause it to deadlock on an inode which may be waiting on a
174 * vote (or lock release) in delete_inode */
175 if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
176 (inode->i_state & (I_FREEING|I_CLEAR))) {
177 /* As stated above, we're not going to return an
178 * inode. In the case of a delete vote, the voting
179 * code is going to signal the other node to go
180 * ahead. Mark that state here, so this freeing inode
181 * has the state when it gets to delete_inode. */
182 if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
183 spin_lock(&oi->ip_lock);
184 ocfs2_mark_inode_remotely_deleted(inode);
185 spin_unlock(&oi->ip_lock);
186 }
187 goto bail;
188 }
189
190 ret = 1;
191bail:
192 mlog_exit(ret);
193 return ret;
194}
195
196/*
197 * initialize the new inode, but don't do anything that would cause
198 * us to sleep.
199 * return 0 on success, 1 on failure
200 */
201static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
202{
203 struct ocfs2_find_inode_args *args = opaque;
204
205 mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
206
207 inode->i_ino = args->fi_ino;
208 OCFS2_I(inode)->ip_blkno = args->fi_blkno;
209
210 mlog_exit(0);
211 return 0;
212}
213
214int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
215 int create_ino)
216{
217 struct super_block *sb;
218 struct ocfs2_super *osb;
219 int status = -EINVAL;
220
221 mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size);
222
223 sb = inode->i_sb;
224 osb = OCFS2_SB(sb);
225
226 /* this means that read_inode cannot create a superblock inode
227 * today. change if needed. */
228 if (!OCFS2_IS_VALID_DINODE(fe) ||
229 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
230 mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", "
231 "signature = %.*s, flags = 0x%x\n",
232 inode->i_ino, le64_to_cpu(fe->i_blkno), 7,
233 fe->i_signature, le32_to_cpu(fe->i_flags));
234 goto bail;
235 }
236
237 if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
238 mlog(ML_ERROR, "file entry generation does not match "
239 "superblock! osb->fs_generation=%x, "
240 "fe->i_fs_generation=%x\n",
241 osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
242 goto bail;
243 }
244
245 inode->i_version = 1;
246 inode->i_generation = le32_to_cpu(fe->i_generation);
247 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
248 inode->i_mode = le16_to_cpu(fe->i_mode);
249 inode->i_uid = le32_to_cpu(fe->i_uid);
250 inode->i_gid = le32_to_cpu(fe->i_gid);
251 inode->i_blksize = (u32)osb->s_clustersize;
252
253 /* Fast symlinks will have i_size but no allocated clusters. */
254 if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
255 inode->i_blocks = 0;
256 else
257 inode->i_blocks =
258 ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
259 inode->i_mapping->a_ops = &ocfs2_aops;
260 inode->i_flags |= S_NOATIME;
261 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
262 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
263 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
264 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
265 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
266 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
267
268 if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno))
269 mlog(ML_ERROR,
270 "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n",
271 OCFS2_I(inode)->ip_blkno, fe->i_blkno);
272
273 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
274 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
275
276 if (create_ino)
277 inode->i_ino = ino_from_blkno(inode->i_sb,
278 le64_to_cpu(fe->i_blkno));
279
280 mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n",
281 fe->i_blkno, inode->i_ino, create_ino ? "true" : "false");
282
283 inode->i_nlink = le16_to_cpu(fe->i_links_count);
284
285 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
286 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
287 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
288 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
289 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
290 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
291 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
292 /* we can't actually hit this as read_inode can't
293 * handle superblocks today ;-) */
294 BUG();
295 }
296
297 switch (inode->i_mode & S_IFMT) {
298 case S_IFREG:
299 inode->i_fop = &ocfs2_fops;
300 inode->i_op = &ocfs2_file_iops;
301 i_size_write(inode, le64_to_cpu(fe->i_size));
302 break;
303 case S_IFDIR:
304 inode->i_op = &ocfs2_dir_iops;
305 inode->i_fop = &ocfs2_dops;
306 i_size_write(inode, le64_to_cpu(fe->i_size));
307 break;
308 case S_IFLNK:
309 if (ocfs2_inode_is_fast_symlink(inode))
310 inode->i_op = &ocfs2_fast_symlink_inode_operations;
311 else
312 inode->i_op = &ocfs2_symlink_inode_operations;
313 i_size_write(inode, le64_to_cpu(fe->i_size));
314 break;
315 default:
316 inode->i_op = &ocfs2_special_file_iops;
317 init_special_inode(inode, inode->i_mode,
318 inode->i_rdev);
319 break;
320 }
321
322 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
323 OCFS2_LOCK_TYPE_RW, inode);
324 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
325 OCFS2_LOCK_TYPE_META, inode);
326 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
327 OCFS2_LOCK_TYPE_DATA, inode);
328
329 status = 0;
330bail:
331 mlog_exit(status);
332 return status;
333}
334
335static int ocfs2_read_locked_inode(struct inode *inode,
336 struct ocfs2_find_inode_args *args)
337{
338 struct super_block *sb;
339 struct ocfs2_super *osb;
340 struct ocfs2_dinode *fe;
341 struct buffer_head *bh = NULL;
342 int status;
343 int sysfile = 0;
344
345 mlog_entry("(0x%p, 0x%p)\n", inode, args);
346
347 status = -EINVAL;
348 if (inode == NULL || inode->i_sb == NULL) {
349 mlog(ML_ERROR, "bad inode\n");
350 goto bail;
351 }
352 sb = inode->i_sb;
353 osb = OCFS2_SB(sb);
354
355 if (!args) {
356 mlog(ML_ERROR, "bad inode args\n");
357 make_bad_inode(inode);
358 goto bail;
359 }
360
361 /* Read the FE off disk. This is safe because the kernel only
362 * does one read_inode2 for a new inode, and if it doesn't
363 * exist yet then nobody can be working on it! */
364 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL);
365 if (status < 0) {
366 mlog_errno(status);
367 make_bad_inode(inode);
368 goto bail;
369 }
370
371 fe = (struct ocfs2_dinode *) bh->b_data;
372 if (!OCFS2_IS_VALID_DINODE(fe)) {
373 mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
374 fe->i_blkno, 7, fe->i_signature);
375 make_bad_inode(inode);
376 goto bail;
377 }
378
379 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
380 sysfile = 1;
381
382 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
383 S_ISBLK(le16_to_cpu(fe->i_mode)))
384 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
385
386 status = -EINVAL;
387 if (ocfs2_populate_inode(inode, fe, 0) < 0) {
388 mlog(ML_ERROR, "populate inode failed! i_blkno=%"MLFu64", "
389 "i_ino=%lu\n", fe->i_blkno, inode->i_ino);
390 make_bad_inode(inode);
391 goto bail;
392 }
393
394 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
395
396 if (sysfile)
397 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
398
399 status = 0;
400
401bail:
402 if (args && bh)
403 brelse(bh);
404
405 mlog_exit(status);
406 return status;
407}
408
409void ocfs2_sync_blockdev(struct super_block *sb)
410{
411 sync_blockdev(sb->s_bdev);
412}
413
414static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
415 struct inode *inode,
416 struct buffer_head *fe_bh)
417{
418 int status = 0;
419 struct ocfs2_journal_handle *handle = NULL;
420 struct ocfs2_truncate_context *tc = NULL;
421 struct ocfs2_dinode *fe;
422
423 mlog_entry_void();
424
425 fe = (struct ocfs2_dinode *) fe_bh->b_data;
426
427 /* zero allocation, zero truncate :) */
428 if (!fe->i_clusters)
429 goto bail;
430
431 handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
432 if (IS_ERR(handle)) {
433 status = PTR_ERR(handle);
434 handle = NULL;
435 mlog_errno(status);
436 goto bail;
437 }
438
439 status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
440 if (status < 0) {
441 mlog_errno(status);
442 goto bail;
443 }
444
445 ocfs2_commit_trans(handle);
446 handle = NULL;
447
448 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
449 if (status < 0) {
450 mlog_errno(status);
451 goto bail;
452 }
453
454 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
455 if (status < 0) {
456 mlog_errno(status);
457 goto bail;
458 }
459bail:
460 if (handle)
461 ocfs2_commit_trans(handle);
462
463 mlog_exit(status);
464 return status;
465}
466
467static int ocfs2_remove_inode(struct inode *inode,
468 struct buffer_head *di_bh,
469 struct inode *orphan_dir_inode,
470 struct buffer_head *orphan_dir_bh)
471{
472 int status;
473 struct inode *inode_alloc_inode = NULL;
474 struct buffer_head *inode_alloc_bh = NULL;
475 struct ocfs2_journal_handle *handle;
476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
477 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
478
479 inode_alloc_inode =
480 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
481 le16_to_cpu(di->i_suballoc_slot));
482 if (!inode_alloc_inode) {
483 status = -EEXIST;
484 mlog_errno(status);
485 goto bail;
486 }
487
488 mutex_lock(&inode_alloc_inode->i_mutex);
489 status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1);
490 if (status < 0) {
491 mutex_unlock(&inode_alloc_inode->i_mutex);
492
493 mlog_errno(status);
494 goto bail;
495 }
496
497 handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS);
498 if (IS_ERR(handle)) {
499 status = PTR_ERR(handle);
500 mlog_errno(status);
501 goto bail_unlock;
502 }
503
504 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
505 orphan_dir_bh);
506 if (status < 0) {
507 mlog_errno(status);
508 goto bail_commit;
509 }
510
511 /* set the inode's dtime */
512 status = ocfs2_journal_access(handle, inode, di_bh,
513 OCFS2_JOURNAL_ACCESS_WRITE);
514 if (status < 0) {
515 mlog_errno(status);
516 goto bail_commit;
517 }
518
519 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
520 le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
521
522 status = ocfs2_journal_dirty(handle, di_bh);
523 if (status < 0) {
524 mlog_errno(status);
525 goto bail_commit;
526 }
527
528 ocfs2_remove_from_cache(inode, di_bh);
529
530 status = ocfs2_free_dinode(handle, inode_alloc_inode,
531 inode_alloc_bh, di);
532 if (status < 0)
533 mlog_errno(status);
534
535bail_commit:
536 ocfs2_commit_trans(handle);
537bail_unlock:
538 ocfs2_meta_unlock(inode_alloc_inode, 1);
539 mutex_unlock(&inode_alloc_inode->i_mutex);
540 brelse(inode_alloc_bh);
541bail:
542 iput(inode_alloc_inode);
543
544 return status;
545}
546
547static int ocfs2_wipe_inode(struct inode *inode,
548 struct buffer_head *di_bh)
549{
550 int status, orphaned_slot;
551 struct inode *orphan_dir_inode = NULL;
552 struct buffer_head *orphan_dir_bh = NULL;
553 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
554
555 /* We've already voted on this so it should be readonly - no
556 * spinlock needed. */
557 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
558 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
559 ORPHAN_DIR_SYSTEM_INODE,
560 orphaned_slot);
561 if (!orphan_dir_inode) {
562 status = -EEXIST;
563 mlog_errno(status);
564 goto bail;
565 }
566
567 /* Lock the orphan dir. The lock will be held for the entire
568 * delete_inode operation. We do this now to avoid races with
569 * recovery completion on other nodes. */
570 mutex_lock(&orphan_dir_inode->i_mutex);
571 status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1);
572 if (status < 0) {
573 mutex_unlock(&orphan_dir_inode->i_mutex);
574
575 mlog_errno(status);
576 goto bail;
577 }
578
579 /* we do this while holding the orphan dir lock because we
580 * don't want recovery being run from another node to vote for
581 * an inode delete on us -- this will result in two nodes
582 * truncating the same file! */
583 status = ocfs2_truncate_for_delete(osb, inode, di_bh);
584 if (status < 0) {
585 mlog_errno(status);
586 goto bail_unlock_dir;
587 }
588
589 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
590 orphan_dir_bh);
591 if (status < 0)
592 mlog_errno(status);
593
594bail_unlock_dir:
595 ocfs2_meta_unlock(orphan_dir_inode, 1);
596 mutex_unlock(&orphan_dir_inode->i_mutex);
597 brelse(orphan_dir_bh);
598bail:
599 iput(orphan_dir_inode);
600
601 return status;
602}
603
604/* There is a series of simple checks that should be done before a
605 * vote is even considered. Encapsulate those in this function. */
606static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
607{
608 int ret = 0;
609 struct ocfs2_inode_info *oi = OCFS2_I(inode);
610 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
611
612 /* We shouldn't be getting here for the root directory
613 * inode.. */
614 if (inode == osb->root_inode) {
615 mlog(ML_ERROR, "Skipping delete of root inode.\n");
616 goto bail;
617 }
618
619 /* If we're coming from process_vote we can't go into our own
620 * voting [hello, deadlock city!], so unfortunately we just
621 * have to skip deleting this guy. That's OK though because
622 * the node who's doing the actual deleting should handle it
623 * anyway. */
624 if (current == osb->vote_task) {
625 mlog(0, "Skipping delete of %lu because we're currently "
626 "in process_vote\n", inode->i_ino);
627 goto bail;
628 }
629
630 spin_lock(&oi->ip_lock);
631 /* OCFS2 *never* deletes system files. This should technically
632 * never get here as system file inodes should always have a
633 * positive link count. */
634 if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
635 mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n",
636 oi->ip_blkno);
637 goto bail_unlock;
638 }
639
640 /* If we have voted "yes" on the wipe of this inode for
641 * another node, it will be marked here so we can safely skip
642 * it. Recovery will clean up any inodes we might inadvertently
643 * skip here. */
644 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
645 mlog(0, "Skipping delete of %lu because another node "
646 "has done this for us.\n", inode->i_ino);
647 goto bail_unlock;
648 }
649
650 ret = 1;
651bail_unlock:
652 spin_unlock(&oi->ip_lock);
653bail:
654 return ret;
655}
656
657/* Query the cluster to determine whether we should wipe an inode from
658 * disk or not.
659 *
660 * Requires the inode to have the cluster lock. */
661static int ocfs2_query_inode_wipe(struct inode *inode,
662 struct buffer_head *di_bh,
663 int *wipe)
664{
665 int status = 0;
666 struct ocfs2_inode_info *oi = OCFS2_I(inode);
667 struct ocfs2_dinode *di;
668
669 *wipe = 0;
670
671 /* While we were waiting for the cluster lock in
672 * ocfs2_delete_inode, another node might have asked to delete
673 * the inode. Recheck our flags to catch this. */
674 if (!ocfs2_inode_is_valid_to_delete(inode)) {
675 mlog(0, "Skipping delete of %"MLFu64" because flags changed\n",
676 oi->ip_blkno);
677 goto bail;
678 }
679
680 /* Now that we have an up to date inode, we can double check
681 * the link count. */
682 if (inode->i_nlink) {
683 mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n",
684 oi->ip_blkno, inode->i_nlink);
685 goto bail;
686 }
687
688 /* Do some basic inode verification... */
689 di = (struct ocfs2_dinode *) di_bh->b_data;
690 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
691 /* for lack of a better error? */
692 status = -EEXIST;
693 mlog(ML_ERROR,
694 "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! "
695 "Disk flags 0x%x, inode flags 0x%x\n",
696 oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags);
697 goto bail;
698 }
699
700 /* has someone already deleted us?! baaad... */
701 if (di->i_dtime) {
702 status = -EEXIST;
703 mlog_errno(status);
704 goto bail;
705 }
706
707 status = ocfs2_request_delete_vote(inode);
708 /* -EBUSY means that other nodes are still using the
709 * inode. We're done here though, so avoid doing anything on
710 * disk and let them worry about deleting it. */
711 if (status == -EBUSY) {
712 status = 0;
713 mlog(0, "Skipping delete of %"MLFu64" because it is in use on"
714 "other nodes\n", oi->ip_blkno);
715 goto bail;
716 }
717 if (status < 0) {
718 mlog_errno(status);
719 goto bail;
720 }
721
722 spin_lock(&oi->ip_lock);
723 if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
724 /* Nobody knew which slot this inode was orphaned
725 * into. This may happen during node death and
726 * recovery knows how to clean it up so we can safely
727 * ignore this inode from now on. */
728 mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n",
729 oi->ip_blkno);
730 } else {
731 *wipe = 1;
732
733 mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n",
734 oi->ip_blkno, oi->ip_orphaned_slot);
735 }
736 spin_unlock(&oi->ip_lock);
737
738bail:
739 return status;
740}
741
742/* Support function for ocfs2_delete_inode. Will help us keep the
743 * inode data in a consistent state for clear_inode. Always truncates
744 * pages, optionally syncing them first. */
745static void ocfs2_cleanup_delete_inode(struct inode *inode,
746 int sync_data)
747{
748 mlog(0, "Cleanup inode %"MLFu64", sync = %d\n",
749 OCFS2_I(inode)->ip_blkno, sync_data);
750 if (sync_data)
751 write_inode_now(inode, 1);
752 truncate_inode_pages(&inode->i_data, 0);
753}
754
755void ocfs2_delete_inode(struct inode *inode)
756{
757 int wipe, status;
758 sigset_t blocked, oldset;
759 struct buffer_head *di_bh = NULL;
760
761 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
762
763 if (is_bad_inode(inode)) {
764 mlog(0, "Skipping delete of bad inode\n");
765 goto bail;
766 }
767
768 if (!ocfs2_inode_is_valid_to_delete(inode)) {
769 /* It's probably not necessary to truncate_inode_pages
770 * here but we do it for safety anyway (it will most
771 * likely be a no-op). */
772 ocfs2_cleanup_delete_inode(inode, 0);
773 goto bail;
774 }
775
776 /* We want to block signals in delete_inode as the lock and
777 * messaging paths may return us -ERESTARTSYS. Which would
778 * cause us to exit early, resulting in inodes being orphaned
779 * forever. */
780 sigfillset(&blocked);
781 status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
782 if (status < 0) {
783 mlog_errno(status);
784 ocfs2_cleanup_delete_inode(inode, 1);
785 goto bail;
786 }
787
788 /* Lock down the inode. This gives us an up to date view of
789 * its metadata (for verification), and allows us to
790 * serialize delete_inode votes.
791 *
792 * Even though we might be doing a truncate, we don't take the
793 * allocation lock here as it won't be needed - nobody will
794 * have the file open.
795 */
796 status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
797 if (status < 0) {
798 if (status != -ENOENT)
799 mlog_errno(status);
800 ocfs2_cleanup_delete_inode(inode, 0);
801 goto bail_unblock;
802 }
803
804 /* Query the cluster. This will be the final decision made
805 * before we go ahead and wipe the inode. */
806 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
807 if (!wipe || status < 0) {
808 /* Error and inode busy vote both mean we won't be
809 * removing the inode, so they take almost the same
810 * path. */
811 if (status < 0)
812 mlog_errno(status);
813
814 /* Someone in the cluster has voted to not wipe this
815 * inode, or it was never completely orphaned. Write
816 * out the pages and exit now. */
817 ocfs2_cleanup_delete_inode(inode, 1);
818 goto bail_unlock_inode;
819 }
820
821 ocfs2_cleanup_delete_inode(inode, 0);
822
823 status = ocfs2_wipe_inode(inode, di_bh);
824 if (status < 0) {
825 mlog_errno(status);
826 goto bail_unlock_inode;
827 }
828
829 /* Mark the inode as successfully deleted. This is important
830 * for ocfs2_clear_inode as it will check this flag and skip
831 * any checkpointing work */
832 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
833
834bail_unlock_inode:
835 ocfs2_meta_unlock(inode, 1);
836 brelse(di_bh);
837bail_unblock:
838 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
839 if (status < 0)
840 mlog_errno(status);
841bail:
842 clear_inode(inode);
843 mlog_exit_void();
844}
845
846void ocfs2_clear_inode(struct inode *inode)
847{
848 int status;
849 struct ocfs2_inode_info *oi = OCFS2_I(inode);
850
851 mlog_entry_void();
852
853 if (!inode)
854 goto bail;
855
856 mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n",
857 OCFS2_I(inode)->ip_blkno, inode->i_nlink);
858
859 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
860 "Inode=%lu\n", inode->i_ino);
861
862 /* Do these before all the other work so that we don't bounce
863 * the vote thread while waiting to destroy the locks. */
864 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
865 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
866 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
867
868 /* We may very well get a clear_inode before all of an inode's
869 * metadata has hit disk. Of course, we can't drop any cluster
870 * locks until the journal has finished with it. The only
871 * exception here are successfully wiped inodes - their
872 * metadata can now be considered to be part of the system
873 * inodes from which it came. */
874 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
875 ocfs2_checkpoint_inode(inode);
876
877 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
878 "Clear inode of %"MLFu64", inode has io markers\n",
879 oi->ip_blkno);
880
881 ocfs2_extent_map_drop(inode, 0);
882 ocfs2_extent_map_init(inode);
883
884 status = ocfs2_drop_inode_locks(inode);
885 if (status < 0)
886 mlog_errno(status);
887
888 ocfs2_lock_res_free(&oi->ip_rw_lockres);
889 ocfs2_lock_res_free(&oi->ip_meta_lockres);
890 ocfs2_lock_res_free(&oi->ip_data_lockres);
891
892 ocfs2_metadata_cache_purge(inode);
893
894 mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
895 "Clear inode of %"MLFu64", inode has %u cache items\n",
896 oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
897
898 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
899 "Clear inode of %"MLFu64", inode has a bad flag\n",
900 oi->ip_blkno);
901
902 mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
903 "Clear inode of %"MLFu64", inode is locked\n",
904 oi->ip_blkno);
905
906 mlog_bug_on_msg(down_trylock(&oi->ip_io_sem),
907 "Clear inode of %"MLFu64", io_sem is locked\n",
908 oi->ip_blkno);
909 up(&oi->ip_io_sem);
910
911 /*
912 * down_trylock() returns 0, down_write_trylock() returns 1
913 * kernel 1, world 0
914 */
915 mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
916 "Clear inode of %"MLFu64", alloc_sem is locked\n",
917 oi->ip_blkno);
918 up_write(&oi->ip_alloc_sem);
919
920 mlog_bug_on_msg(oi->ip_open_count,
921 "Clear inode of %"MLFu64" has open count %d\n",
922 oi->ip_blkno, oi->ip_open_count);
923 mlog_bug_on_msg(!list_empty(&oi->ip_handle_list),
924 "Clear inode of %"MLFu64" has non empty handle list\n",
925 oi->ip_blkno);
926 mlog_bug_on_msg(oi->ip_handle,
927 "Clear inode of %"MLFu64" has non empty handle pointer\n",
928 oi->ip_blkno);
929
930 /* Clear all other flags. */
931 oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
932 oi->ip_created_trans = 0;
933 oi->ip_last_trans = 0;
934 oi->ip_dir_start_lookup = 0;
935 oi->ip_blkno = 0ULL;
936
937bail:
938 mlog_exit_void();
939}
940
941/* Called under inode_lock, with no more references on the
942 * struct inode, so it's safe here to check the flags field
943 * and to manipulate i_nlink without any other locks. */
944void ocfs2_drop_inode(struct inode *inode)
945{
946 struct ocfs2_inode_info *oi = OCFS2_I(inode);
947
948 mlog_entry_void();
949
950 mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n",
951 oi->ip_blkno, inode->i_nlink, oi->ip_flags);
952
953 /* Testing ip_orphaned_slot here wouldn't work because we may
954 * not have gotten a delete_inode vote from any other nodes
955 * yet. */
956 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) {
957 mlog(0, "Inode was orphaned on another node, clearing nlink.\n");
958 inode->i_nlink = 0;
959 }
960
961 generic_drop_inode(inode);
962
963 mlog_exit_void();
964}
965
966/*
967 * TODO: this should probably be merged into ocfs2_get_block
968 *
969 * However, you now need to pay attention to the cont_prepare_write()
970 * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
971 * expects never to extend).
972 */
973struct buffer_head *ocfs2_bread(struct inode *inode,
974 int block, int *err, int reada)
975{
976 struct buffer_head *bh = NULL;
977 int tmperr;
978 u64 p_blkno;
979 int readflags = OCFS2_BH_CACHED;
980
981#if 0
982 /* only turn this on if we know we can deal with read_block
983 * returning nothing */
984 if (reada)
985 readflags |= OCFS2_BH_READAHEAD;
986#endif
987
988 if (((u64)block << inode->i_sb->s_blocksize_bits) >=
989 i_size_read(inode)) {
990 BUG_ON(!reada);
991 return NULL;
992 }
993
994 tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
995 &p_blkno, NULL);
996 if (tmperr < 0) {
997 mlog_errno(tmperr);
998 goto fail;
999 }
1000
1001 tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
1002 readflags, inode);
1003 if (tmperr < 0)
1004 goto fail;
1005
1006 tmperr = 0;
1007
1008 *err = 0;
1009 return bh;
1010
1011fail:
1012 if (bh) {
1013 brelse(bh);
1014 bh = NULL;
1015 }
1016 *err = -EIO;
1017 return NULL;
1018}
1019
1020/*
1021 * This is called from our getattr.
1022 */
1023int ocfs2_inode_revalidate(struct dentry *dentry)
1024{
1025 struct inode *inode = dentry->d_inode;
1026 int status = 0;
1027
1028 mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode,
1029 inode ? OCFS2_I(inode)->ip_blkno : 0ULL);
1030
1031 if (!inode) {
1032 mlog(0, "eep, no inode!\n");
1033 status = -ENOENT;
1034 goto bail;
1035 }
1036
1037 spin_lock(&OCFS2_I(inode)->ip_lock);
1038 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
1039 spin_unlock(&OCFS2_I(inode)->ip_lock);
1040 mlog(0, "inode deleted!\n");
1041 status = -ENOENT;
1042 goto bail;
1043 }
1044 spin_unlock(&OCFS2_I(inode)->ip_lock);
1045
1046 /* Let ocfs2_meta_lock do the work of updating our struct
1047 * inode for us. */
1048 status = ocfs2_meta_lock(inode, NULL, NULL, 0);
1049 if (status < 0) {
1050 if (status != -ENOENT)
1051 mlog_errno(status);
1052 goto bail;
1053 }
1054 ocfs2_meta_unlock(inode, 0);
1055bail:
1056 mlog_exit(status);
1057
1058 return status;
1059}
1060
1061/*
1062 * Updates a disk inode from a
1063 * struct inode.
1064 * Only takes ip_lock.
1065 */
1066int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
1067 struct inode *inode,
1068 struct buffer_head *bh)
1069{
1070 int status;
1071 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
1072
1073 mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno);
1074
1075 status = ocfs2_journal_access(handle, inode, bh,
1076 OCFS2_JOURNAL_ACCESS_WRITE);
1077 if (status < 0) {
1078 mlog_errno(status);
1079 goto leave;
1080 }
1081
1082 spin_lock(&OCFS2_I(inode)->ip_lock);
1083 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1084 spin_unlock(&OCFS2_I(inode)->ip_lock);
1085
1086 fe->i_size = cpu_to_le64(i_size_read(inode));
1087 fe->i_links_count = cpu_to_le16(inode->i_nlink);
1088 fe->i_uid = cpu_to_le32(inode->i_uid);
1089 fe->i_gid = cpu_to_le32(inode->i_gid);
1090 fe->i_mode = cpu_to_le16(inode->i_mode);
1091 fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
1092 fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
1093 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1094 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1095 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
1096 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1097
1098 status = ocfs2_journal_dirty(handle, bh);
1099 if (status < 0)
1100 mlog_errno(status);
1101
1102 status = 0;
1103leave:
1104
1105 mlog_exit(status);
1106 return status;
1107}
1108
1109/*
1110 *
1111 * Updates a struct inode from a disk inode.
1112 * Does no I/O; only takes ip_lock.
1113 */
1114void ocfs2_refresh_inode(struct inode *inode,
1115 struct ocfs2_dinode *fe)
1116{
1117 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1118
1119 spin_lock(&OCFS2_I(inode)->ip_lock);
1120
1121 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1122 i_size_write(inode, le64_to_cpu(fe->i_size));
1123 inode->i_nlink = le16_to_cpu(fe->i_links_count);
1124 inode->i_uid = le32_to_cpu(fe->i_uid);
1125 inode->i_gid = le32_to_cpu(fe->i_gid);
1126 inode->i_mode = le16_to_cpu(fe->i_mode);
1127 inode->i_blksize = (u32) osb->s_clustersize;
1128 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
1129 inode->i_blocks = 0;
1130 else
1131 inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
1132 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
1133 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
1134 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
1135 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
1136 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
1137 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
1138
1139 spin_unlock(&OCFS2_I(inode)->ip_lock);
1140}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
new file mode 100644
index 00000000000..9b017743365
--- /dev/null
+++ b/fs/ocfs2/inode.h
@@ -0,0 +1,145 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * inode.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_INODE_H
27#define OCFS2_INODE_H
28
29/* OCFS2 Inode Private Data */
30struct ocfs2_inode_info
31{
32 u64 ip_blkno;
33
34 struct ocfs2_lock_res ip_rw_lockres;
35 struct ocfs2_lock_res ip_meta_lockres;
36 struct ocfs2_lock_res ip_data_lockres;
37
38 /* protects allocation changes on this inode. */
39 struct rw_semaphore ip_alloc_sem;
40
41 /* These fields are protected by ip_lock */
42 spinlock_t ip_lock;
43 u32 ip_open_count;
44 u32 ip_clusters;
45 struct ocfs2_extent_map ip_map;
46 struct list_head ip_io_markers;
47 int ip_orphaned_slot;
48
49 struct semaphore ip_io_sem;
50
51 /* Used by the journalling code to attach an inode to a
52 * handle. These are protected by ip_io_sem in order to lock
53 * out other I/O to the inode until we either commit or
54 * abort. */
55 struct list_head ip_handle_list;
56 struct ocfs2_journal_handle *ip_handle;
57
58 u32 ip_flags; /* see below */
59
60 /* protected by recovery_lock. */
61 struct inode *ip_next_orphan;
62
63 u32 ip_dir_start_lookup;
64
65 /* next two are protected by trans_inc_lock */
66 /* which transaction were we created on? Zero if none. */
67 unsigned long ip_created_trans;
68 /* last transaction we were a part of. */
69 unsigned long ip_last_trans;
70
71 struct ocfs2_caching_info ip_metadata_cache;
72
73 struct inode vfs_inode;
74};
75
76/*
77 * Flags for the ip_flags field
78 */
79/* System file inodes */
80#define OCFS2_INODE_SYSTEM_FILE 0x00000001
81#define OCFS2_INODE_JOURNAL 0x00000002
82#define OCFS2_INODE_BITMAP 0x00000004
83/* This inode has been wiped from disk */
84#define OCFS2_INODE_DELETED 0x00000008
85/* Another node is deleting, so our delete is a nop */
86#define OCFS2_INODE_SKIP_DELETE 0x00000010
87/* Has the inode been orphaned on another node?
88 *
89 * This hints to ocfs2_drop_inode that it should clear i_nlink before
90 * continuing.
91 *
92 * We *only* set this on unlink vote from another node. If the inode
93 * was locally orphaned, then we're sure of the state and don't need
94 * to twiddle i_nlink later - it's either zero or not depending on
95 * whether our unlink succeeded. Otherwise we got this from a node
96 * whose intention was to orphan the inode, however it may have
97 * crashed or failed, so we let ocfs2_drop_inode zero the value and
98 * rely on ocfs2_delete_inode to sort things out under the proper
99 * cluster locks.
100 */
101#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
102/* Does someone have the file open O_DIRECT */
103#define OCFS2_INODE_OPEN_DIRECT 0x00000040
104/* Indicates that the metadata cache should be used as an array. */
105#define OCFS2_INODE_CACHE_INLINE 0x00000080
106
107static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
108{
109 return container_of(inode, struct ocfs2_inode_info, vfs_inode);
110}
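The flag comments above share one convention: ip_flags is normally read and updated under ip_lock (see ocfs2_inode_is_valid_to_delete in inode.c). A minimal sketch of a conforming test, with a hypothetical helper name:

static inline int example_inode_was_wiped(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	int ret;

	spin_lock(&oi->ip_lock);
	ret = !!(oi->ip_flags & OCFS2_INODE_DELETED);
	spin_unlock(&oi->ip_lock);

	return ret;
}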
111
112#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
113#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
114
115extern kmem_cache_t *ocfs2_inode_cache;
116
117extern struct address_space_operations ocfs2_aops;
118
119struct buffer_head *ocfs2_bread(struct inode *inode, int block,
120 int *err, int reada);
121void ocfs2_clear_inode(struct inode *inode);
122void ocfs2_delete_inode(struct inode *inode);
123void ocfs2_drop_inode(struct inode *inode);
124struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff);
125struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
126 u64 blkno,
127 int delete_vote);
128int ocfs2_inode_init_private(struct inode *inode);
129int ocfs2_inode_revalidate(struct dentry *dentry);
130int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
131 int create_ino);
132void ocfs2_read_inode(struct inode *inode);
133void ocfs2_read_inode2(struct inode *inode, void *opaque);
134ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
135 size_t size, loff_t *offp);
136void ocfs2_sync_blockdev(struct super_block *sb);
137void ocfs2_refresh_inode(struct inode *inode,
138 struct ocfs2_dinode *fe);
139int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
140 struct inode *inode,
141 struct buffer_head *bh);
142int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
143int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
144
145#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
new file mode 100644
index 00000000000..303c8d96457
--- /dev/null
+++ b/fs/ocfs2/journal.c
@@ -0,0 +1,1652 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * journal.c
5 *
6 * Defines functions of journalling api
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/kthread.h>
31
32#define MLOG_MASK_PREFIX ML_JOURNAL
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "extent_map.h"
40#include "heartbeat.h"
41#include "inode.h"
42#include "journal.h"
43#include "localalloc.h"
44#include "namei.h"
45#include "slot_map.h"
46#include "super.h"
47#include "vote.h"
48#include "sysfile.h"
49
50#include "buffer_head_io.h"
51
52spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
53
54static int ocfs2_force_read_journal(struct inode *inode);
55static int ocfs2_recover_node(struct ocfs2_super *osb,
56 int node_num);
57static int __ocfs2_recovery_thread(void *arg);
58static int ocfs2_commit_cache(struct ocfs2_super *osb);
59static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
60static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
61 struct ocfs2_journal_handle *handle);
62static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle);
63static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
64 int dirty);
65static int ocfs2_trylock_journal(struct ocfs2_super *osb,
66 int slot_num);
67static int ocfs2_recover_orphans(struct ocfs2_super *osb,
68 int slot);
69static int ocfs2_commit_thread(void *arg);
70
71static int ocfs2_commit_cache(struct ocfs2_super *osb)
72{
73 int status = 0;
74 unsigned int flushed;
75 unsigned long old_id;
76 struct ocfs2_journal *journal = NULL;
77
78 mlog_entry_void();
79
80 journal = osb->journal;
81
82 /* Flush all pending commits and checkpoint the journal. */
83 down_write(&journal->j_trans_barrier);
84
85 if (atomic_read(&journal->j_num_trans) == 0) {
86 up_write(&journal->j_trans_barrier);
87 mlog(0, "No transactions for me to flush!\n");
88 goto finally;
89 }
90
91 journal_lock_updates(journal->j_journal);
92 status = journal_flush(journal->j_journal);
93 journal_unlock_updates(journal->j_journal);
94 if (status < 0) {
95 up_write(&journal->j_trans_barrier);
96 mlog_errno(status);
97 goto finally;
98 }
99
100 old_id = ocfs2_inc_trans_id(journal);
101
102 flushed = atomic_read(&journal->j_num_trans);
103 atomic_set(&journal->j_num_trans, 0);
104 up_write(&journal->j_trans_barrier);
105
106 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
107 journal->j_trans_id, flushed);
108
109 ocfs2_kick_vote_thread(osb);
110 wake_up(&journal->j_checkpointed);
111finally:
112 mlog_exit(status);
113 return status;
114}
115
116struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb)
117{
118 struct ocfs2_journal_handle *retval = NULL;
119
120 retval = kcalloc(1, sizeof(*retval), GFP_KERNEL);
121 if (!retval) {
122 mlog(ML_ERROR, "Failed to allocate memory for journal "
123 "handle!\n");
124 return NULL;
125 }
126
127 retval->max_buffs = 0;
128 retval->num_locks = 0;
129 retval->k_handle = NULL;
130
131 INIT_LIST_HEAD(&retval->locks);
132 INIT_LIST_HEAD(&retval->inode_list);
133 retval->journal = osb->journal;
134
135 return retval;
136}
137
138/* pass it NULL and it will allocate a new handle object for you. If
139 * you pass it a handle however, it may still return an error, in which
140 * case it has freed the passed handle for you. */
141struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
142 struct ocfs2_journal_handle *handle,
143 int max_buffs)
144{
145 int ret;
146 journal_t *journal = osb->journal->j_journal;
147
148 mlog_entry("(max_buffs = %d)\n", max_buffs);
149
150 if (!osb || !osb->journal->j_journal)
151 BUG();
152
153 if (ocfs2_is_hard_readonly(osb)) {
154 ret = -EROFS;
155 goto done_free;
156 }
157
158 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
159 BUG_ON(max_buffs <= 0);
160
161 /* JBD might support this, but our journalling code doesn't yet. */
162 if (journal_current_handle()) {
163 mlog(ML_ERROR, "Recursive transaction attempted!\n");
164 BUG();
165 }
166
167 if (!handle)
168 handle = ocfs2_alloc_handle(osb);
169 if (!handle) {
170 ret = -ENOMEM;
171 mlog(ML_ERROR, "Failed to allocate memory for journal "
172 "handle!\n");
173 goto done_free;
174 }
175
176 handle->max_buffs = max_buffs;
177
178 down_read(&osb->journal->j_trans_barrier);
179
180 /* actually start the transaction now */
181 handle->k_handle = journal_start(journal, max_buffs);
182 if (IS_ERR(handle->k_handle)) {
183 up_read(&osb->journal->j_trans_barrier);
184
185 ret = PTR_ERR(handle->k_handle);
186 handle->k_handle = NULL;
187 mlog_errno(ret);
188
189 if (is_journal_aborted(journal)) {
190 ocfs2_abort(osb->sb, "Detected aborted journal");
191 ret = -EROFS;
192 }
193 goto done_free;
194 }
195
196 atomic_inc(&(osb->journal->j_num_trans));
197 handle->flags |= OCFS2_HANDLE_STARTED;
198
199 mlog_exit_ptr(handle);
200 return handle;
201
202done_free:
203 if (handle)
204 ocfs2_commit_unstarted_handle(handle); /* will kfree handle */
205
206 mlog_exit(ret);
207 return ERR_PTR(ret);
208}
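Both calling conventions from the comment above, in a trimmed sketch (error handling and the surrounding declarations are assumed, not code from this patch):

	/* convention 1: let ocfs2_start_trans allocate the handle */
	handle = ocfs2_start_trans(osb, NULL, max_buffs);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* convention 2: pre-allocate, e.g. to attach locks or inodes
	 * before starting. On error the passed handle has already been
	 * freed for us, so only the ERR_PTR needs handling. */
	handle = ocfs2_alloc_handle(osb);
	if (!handle)
		return -ENOMEM;
	handle = ocfs2_start_trans(osb, handle, max_buffs);
	if (IS_ERR(handle))
		return PTR_ERR(handle);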
209
210void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
211 struct inode *inode)
212{
213 BUG_ON(!handle);
214 BUG_ON(!inode);
215
216 atomic_inc(&inode->i_count);
217
218 /* we're obviously changing it... */
219 mutex_lock(&inode->i_mutex);
220
221 /* sanity check */
222 BUG_ON(OCFS2_I(inode)->ip_handle);
223 BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
224
225 OCFS2_I(inode)->ip_handle = handle;
226 list_del(&(OCFS2_I(inode)->ip_handle_list));
227 list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
228}
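A sketch of how this pairs with commit: the inode reference and i_mutex taken here are only released when the handle is committed (via ocfs2_handle_unlock_inodes below). Names outside the shown API, such as credits, are assumed:

	handle = ocfs2_start_trans(osb, NULL, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	ocfs2_handle_add_inode(handle, inode);	/* i_mutex held from here */

	/* ... journalled updates against the inode ... */

	ocfs2_commit_trans(handle);	/* drops i_mutex and the reference */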
229
230static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
231{
232 struct list_head *p, *n;
233 struct inode *inode;
234 struct ocfs2_inode_info *oi;
235
236 list_for_each_safe(p, n, &handle->inode_list) {
237 oi = list_entry(p, struct ocfs2_inode_info,
238 ip_handle_list);
239 inode = &oi->vfs_inode;
240
241 OCFS2_I(inode)->ip_handle = NULL;
242 list_del_init(&OCFS2_I(inode)->ip_handle_list);
243
244 mutex_unlock(&inode->i_mutex);
245 iput(inode);
246 }
247}
248
249/* This is trivial so we do it out of the main commit
250 * paths. Beware, it can be called from start_trans too! */
251static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle)
252{
253 mlog_entry_void();
254
255 BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
256
257 ocfs2_handle_unlock_inodes(handle);
258 /* You are allowed to add journal locks before the transaction
259 * has started. */
260 ocfs2_handle_cleanup_locks(handle->journal, handle);
261
262 kfree(handle);
263
264 mlog_exit_void();
265}
266
267void ocfs2_commit_trans(struct ocfs2_journal_handle *handle)
268{
269 handle_t *jbd_handle;
270 int retval;
271 struct ocfs2_journal *journal = handle->journal;
272
273 mlog_entry_void();
274
275 BUG_ON(!handle);
276
277 if (!(handle->flags & OCFS2_HANDLE_STARTED)) {
278 ocfs2_commit_unstarted_handle(handle);
279 mlog_exit_void();
280 return;
281 }
282
283 /* release inode semaphores we took during this transaction */
284 ocfs2_handle_unlock_inodes(handle);
285
286 /* ocfs2_extend_trans may have had to call journal_restart
287 * which will always commit the transaction, but may return
288 * error for any number of reasons. If this is the case, we
289 * clear k_handle as it's not valid any more. */
290 if (handle->k_handle) {
291 jbd_handle = handle->k_handle;
292
293 if (handle->flags & OCFS2_HANDLE_SYNC)
294 jbd_handle->h_sync = 1;
295 else
296 jbd_handle->h_sync = 0;
297
298 /* actually stop the transaction. if we've set h_sync,
299 * it'll have been committed when we return */
300 retval = journal_stop(jbd_handle);
301 if (retval < 0) {
302 mlog_errno(retval);
303 mlog(ML_ERROR, "Could not commit transaction\n");
304 BUG();
305 }
306
307 handle->k_handle = NULL; /* it's been freed in journal_stop */
308 }
309
310 ocfs2_handle_cleanup_locks(journal, handle);
311
312 up_read(&journal->j_trans_barrier);
313
314 kfree(handle);
315 mlog_exit_void();
316}
317
318/*
319 * 'nblocks' is what you want to add to the current
320 * transaction. extend_trans will either extend the current handle by
321 * nblocks, or commit it and start a new one with nblocks credits.
322 *
323 * WARNING: This will not release any semaphores or disk locks taken
324 * during the transaction, so make sure they were taken *before*
325 * start_trans or we'll have ordering deadlocks.
326 *
327 * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
328 * good because transaction ids haven't yet been recorded on the
329 * cluster locks associated with this handle.
330 */
331int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
332 int nblocks)
333{
334 int status;
335
336 BUG_ON(!handle);
337 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
338 BUG_ON(!nblocks);
339
340 mlog_entry_void();
341
342 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
343
344 status = journal_extend(handle->k_handle, nblocks);
345 if (status < 0) {
346 mlog_errno(status);
347 goto bail;
348 }
349
350 if (status > 0) {
351 mlog(0, "journal_extend failed, trying journal_restart\n");
352 status = journal_restart(handle->k_handle, nblocks);
353 if (status < 0) {
354 handle->k_handle = NULL;
355 mlog_errno(status);
356 goto bail;
357 }
358 handle->max_buffs = nblocks;
359 } else
360 handle->max_buffs += nblocks;
361
362 status = 0;
363bail:
364
365 mlog_exit(status);
366 return status;
367}
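A usage sketch tying the warnings above together: cluster locks are taken before the transaction starts, and the handle is grown when more credits turn out to be needed (a hypothetical flow, not from this patch):

	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* mid-transaction we discover we need more room; this either
	 * extends the running handle or commits it and restarts */
	status = ocfs2_extend_trans(handle, 2);
	if (status < 0)
		mlog_errno(status);

	ocfs2_commit_trans(handle);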
368
369int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
370 struct inode *inode,
371 struct buffer_head *bh,
372 int type)
373{
374 int status;
375
376 BUG_ON(!inode);
377 BUG_ON(!handle);
378 BUG_ON(!bh);
379 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
380
381 mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n",
382 (unsigned long long)bh->b_blocknr, type,
383 (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
384 "OCFS2_JOURNAL_ACCESS_CREATE" :
385 "OCFS2_JOURNAL_ACCESS_WRITE",
386 bh->b_size);
387
388 /* we can safely remove this assertion after testing. */
389 if (!buffer_uptodate(bh)) {
390 mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
391 mlog(ML_ERROR, "b_blocknr=%llu\n",
392 (unsigned long long)bh->b_blocknr);
393 BUG();
394 }
395
396 /* Set the current transaction information on the inode so
397 * that the locking code knows whether it can drop its locks
398 * on this inode or not. We're protected from the commit
399 * thread updating the current transaction id until
400 * ocfs2_commit_trans() because ocfs2_start_trans() took
401 * j_trans_barrier for us. */
402 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
403
404 down(&OCFS2_I(inode)->ip_io_sem);
405 switch (type) {
406 case OCFS2_JOURNAL_ACCESS_CREATE:
407 case OCFS2_JOURNAL_ACCESS_WRITE:
408 status = journal_get_write_access(handle->k_handle, bh);
409 break;
410
411 case OCFS2_JOURNAL_ACCESS_UNDO:
412 status = journal_get_undo_access(handle->k_handle, bh);
413 break;
414
415 default:
416 status = -EINVAL;
417		mlog(ML_ERROR, "Unknown access type!\n");
418 }
419 up(&OCFS2_I(inode)->ip_io_sem);
420
421 if (status < 0)
422 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
423 status, type);
424
425 mlog_exit(status);
426 return status;
427}
428
429int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
430 struct buffer_head *bh)
431{
432 int status;
433
434 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
435
436 mlog_entry("(bh->b_blocknr=%llu)\n",
437 (unsigned long long)bh->b_blocknr);
438
439 status = journal_dirty_metadata(handle->k_handle, bh);
440 if (status < 0)
441 mlog(ML_ERROR, "Could not dirty metadata buffer. "
442 "(bh->b_blocknr=%llu)\n",
443 (unsigned long long)bh->b_blocknr);
444
445 mlog_exit(status);
446 return status;
447}
448
449int ocfs2_journal_dirty_data(handle_t *handle,
450 struct buffer_head *bh)
451{
452 int err = journal_dirty_data(handle, bh);
453 if (err)
454 mlog_errno(err);
455 /* TODO: When we can handle it, abort the handle and go RO on
456 * error here. */
457
458 return err;
459}
460
461/* We always assume you're adding a metadata lock at level 'ex' */
462int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
463 struct inode *inode)
464{
465 int status;
466 struct ocfs2_journal_lock *lock;
467
468 BUG_ON(!inode);
469
470 lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS);
471 if (!lock) {
472 status = -ENOMEM;
473 mlog_errno(-ENOMEM);
474 goto bail;
475 }
476
477 if (!igrab(inode))
478 BUG();
479 lock->jl_inode = inode;
480
481 list_add_tail(&(lock->jl_lock_list), &(handle->locks));
482 handle->num_locks++;
483
484 status = 0;
485bail:
486 mlog_exit(status);
487 return status;
488}
489
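/*
 * Editor's sketch of the intended pairing (assumed caller, not part
 * of this change): take an EX meta lock and queue it on the handle so
 * that ocfs2_handle_cleanup_locks() drops it at commit time.
 */
static int example_lock_until_commit(struct ocfs2_journal_handle *handle,
				     struct inode *inode)
{
	int status;

	status = ocfs2_meta_lock(inode, NULL, NULL, 1); /* level 'ex' */
	if (status < 0)
		return status;

	/* On success the handle now holds an igrab'd reference and will
	 * call ocfs2_meta_unlock(inode, 1) followed by iput() for us. */
	status = ocfs2_handle_add_lock(handle, inode);
	if (status < 0)
		ocfs2_meta_unlock(inode, 1);

	return status;
}
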
490static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
491 struct ocfs2_journal_handle *handle)
492{
493 struct list_head *p, *n;
494 struct ocfs2_journal_lock *lock;
495 struct inode *inode;
496
497 list_for_each_safe(p, n, &(handle->locks)) {
498 lock = list_entry(p, struct ocfs2_journal_lock,
499 jl_lock_list);
500 list_del(&lock->jl_lock_list);
501 handle->num_locks--;
502
503 inode = lock->jl_inode;
504 ocfs2_meta_unlock(inode, 1);
505 if (atomic_read(&inode->i_count) == 1)
506 mlog(ML_ERROR,
507			     "Doing a last iput on inode %"MLFu64"!",
508 OCFS2_I(inode)->ip_blkno);
509 iput(inode);
510 kmem_cache_free(ocfs2_lock_cache, lock);
511 }
512}
513
514#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5)
515
516void ocfs2_set_journal_params(struct ocfs2_super *osb)
517{
518 journal_t *journal = osb->journal->j_journal;
519
520 spin_lock(&journal->j_state_lock);
521 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
522 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
523 journal->j_flags |= JFS_BARRIER;
524 else
525 journal->j_flags &= ~JFS_BARRIER;
526 spin_unlock(&journal->j_state_lock);
527}
528
529int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
530{
531 int status = -1;
532 struct inode *inode = NULL; /* the journal inode */
533 journal_t *j_journal = NULL;
534 struct ocfs2_dinode *di = NULL;
535 struct buffer_head *bh = NULL;
536 struct ocfs2_super *osb;
537 int meta_lock = 0;
538
539 mlog_entry_void();
540
541 BUG_ON(!journal);
542
543 osb = journal->j_osb;
544
545 /* already have the inode for our journal */
546 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
547 osb->slot_num);
548 if (inode == NULL) {
549 status = -EACCES;
550 mlog_errno(status);
551 goto done;
552 }
553 if (is_bad_inode(inode)) {
554 mlog(ML_ERROR, "access error (bad inode)\n");
555 iput(inode);
556 inode = NULL;
557 status = -EACCES;
558 goto done;
559 }
560
561 SET_INODE_JOURNAL(inode);
562 OCFS2_I(inode)->ip_open_count++;
563
564 status = ocfs2_meta_lock(inode, NULL, &bh, 1);
565 if (status < 0) {
566 if (status != -ERESTARTSYS)
567 mlog(ML_ERROR, "Could not get lock on journal!\n");
568 goto done;
569 }
570
571 meta_lock = 1;
572 di = (struct ocfs2_dinode *)bh->b_data;
573
574 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
575 mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
576 inode->i_size);
577 status = -EINVAL;
578 goto done;
579 }
580
581 mlog(0, "inode->i_size = %lld\n", inode->i_size);
582 mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks);
583 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
584
585	/* call the kernel's journal init function now */
586 j_journal = journal_init_inode(inode);
587 if (j_journal == NULL) {
588 mlog(ML_ERROR, "Linux journal layer error\n");
589 status = -EINVAL;
590 goto done;
591 }
592
593 mlog(0, "Returned from journal_init_inode\n");
594 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
595
596 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
597 OCFS2_JOURNAL_DIRTY_FL);
598
599 journal->j_journal = j_journal;
600 journal->j_inode = inode;
601 journal->j_bh = bh;
602
603 ocfs2_set_journal_params(osb);
604
605 journal->j_state = OCFS2_JOURNAL_LOADED;
606
607 status = 0;
608done:
609 if (status < 0) {
610 if (meta_lock)
611 ocfs2_meta_unlock(inode, 1);
612 if (bh != NULL)
613 brelse(bh);
614 if (inode) {
615 OCFS2_I(inode)->ip_open_count--;
616 iput(inode);
617 }
618 }
619
620 mlog_exit(status);
621 return status;
622}
623
624static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
625 int dirty)
626{
627 int status;
628 unsigned int flags;
629 struct ocfs2_journal *journal = osb->journal;
630 struct buffer_head *bh = journal->j_bh;
631 struct ocfs2_dinode *fe;
632
633 mlog_entry_void();
634
635 fe = (struct ocfs2_dinode *)bh->b_data;
636 if (!OCFS2_IS_VALID_DINODE(fe)) {
637 /* This is called from startup/shutdown which will
638 * handle the errors in a specific manner, so no need
639 * to call ocfs2_error() here. */
640 mlog(ML_ERROR, "Journal dinode %"MLFu64" has invalid "
641 "signature: %.*s", fe->i_blkno, 7, fe->i_signature);
642 status = -EIO;
643 goto out;
644 }
645
646 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
647 if (dirty)
648 flags |= OCFS2_JOURNAL_DIRTY_FL;
649 else
650 flags &= ~OCFS2_JOURNAL_DIRTY_FL;
651 fe->id1.journal1.ij_flags = cpu_to_le32(flags);
652
653 status = ocfs2_write_block(osb, bh, journal->j_inode);
654 if (status < 0)
655 mlog_errno(status);
656
657out:
658 mlog_exit(status);
659 return status;
660}
661
662/*
663 * If the journal has been kmalloc'd, it needs to be freed after this
664 * call.
665 */
666void ocfs2_journal_shutdown(struct ocfs2_super *osb)
667{
668 struct ocfs2_journal *journal = NULL;
669 int status = 0;
670 struct inode *inode = NULL;
671 int num_running_trans = 0;
672
673 mlog_entry_void();
674
675 if (!osb)
676 BUG();
677
678 journal = osb->journal;
679 if (!journal)
680 goto done;
681
682 inode = journal->j_inode;
683
684 if (journal->j_state != OCFS2_JOURNAL_LOADED)
685 goto done;
686
687 /* need to inc inode use count as journal_destroy will iput. */
688 if (!igrab(inode))
689 BUG();
690
691 num_running_trans = atomic_read(&(osb->journal->j_num_trans));
692 if (num_running_trans > 0)
693 mlog(0, "Shutting down journal: must wait on %d "
694 "running transactions!\n",
695 num_running_trans);
696
697	/* Do a commit_cache here. It will flush our journal, *and*
698	 * release any locks that are still held.
699	 * Set the SHUTDOWN flag and release the trans lock;
700	 * the commit thread will take the trans lock for us below. */
701 journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;
702
703	/* OCFS2_JOURNAL_IN_SHUTDOWN signals commit_cache not to
704	 * drop the trans_lock (which we want to hold until we
705	 * completely destroy the journal). */
706 if (osb->commit_task) {
707 /* Wait for the commit thread */
708 mlog(0, "Waiting for ocfs2commit to exit....\n");
709 kthread_stop(osb->commit_task);
710 osb->commit_task = NULL;
711 }
712
713 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
714
715 status = ocfs2_journal_toggle_dirty(osb, 0);
716 if (status < 0)
717 mlog_errno(status);
718
719 /* Shutdown the kernel journal system */
720 journal_destroy(journal->j_journal);
721
722 OCFS2_I(inode)->ip_open_count--;
723
724 /* unlock our journal */
725 ocfs2_meta_unlock(inode, 1);
726
727 brelse(journal->j_bh);
728 journal->j_bh = NULL;
729
730 journal->j_state = OCFS2_JOURNAL_FREE;
731
732// up_write(&journal->j_trans_barrier);
733done:
734 if (inode)
735 iput(inode);
736 mlog_exit_void();
737}
738
739static void ocfs2_clear_journal_error(struct super_block *sb,
740 journal_t *journal,
741 int slot)
742{
743 int olderr;
744
745 olderr = journal_errno(journal);
746 if (olderr) {
747 mlog(ML_ERROR, "File system error %d recorded in "
748 "journal %u.\n", olderr, slot);
749 mlog(ML_ERROR, "File system on device %s needs checking.\n",
750 sb->s_id);
751
752 journal_ack_err(journal);
753 journal_clear_err(journal);
754 }
755}
756
757int ocfs2_journal_load(struct ocfs2_journal *journal)
758{
759 int status = 0;
760 struct ocfs2_super *osb;
761
762 mlog_entry_void();
763
764 if (!journal)
765 BUG();
766
767 osb = journal->j_osb;
768
769 status = journal_load(journal->j_journal);
770 if (status < 0) {
771 mlog(ML_ERROR, "Failed to load journal!\n");
772 goto done;
773 }
774
775 ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
776
777 status = ocfs2_journal_toggle_dirty(osb, 1);
778 if (status < 0) {
779 mlog_errno(status);
780 goto done;
781 }
782
783 /* Launch the commit thread */
784 osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
785 osb->osb_id);
786 if (IS_ERR(osb->commit_task)) {
787 status = PTR_ERR(osb->commit_task);
788 osb->commit_task = NULL;
789 mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d",
790 status);
791 goto done;
792 }
793
794done:
795 mlog_exit(status);
796 return status;
797}
798
799
800/* 'full' flag tells us whether we clear out all blocks or if we just
801 * mark the journal clean */
802int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
803{
804 int status;
805
806 mlog_entry_void();
807
808 if (!journal)
809 BUG();
810
811 status = journal_wipe(journal->j_journal, full);
812 if (status < 0) {
813 mlog_errno(status);
814 goto bail;
815 }
816
817 status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
818 if (status < 0)
819 mlog_errno(status);
820
821bail:
822 mlog_exit(status);
823 return status;
824}
825
826/*
827 * JBD might read a cached version of another node's journal file. We
828 * don't want this, as the file changes often and we get no
829 * notification of those changes. The only way to be sure we have the
830 * most up-to-date version of those blocks is to force-read them off
831 * disk. Just searching through the buffer cache won't work, as there
832 * may be pages backing this file which are still marked up to date.
833 * We know things can't change in this file underneath us, as we hold
834 * the lock by now :)
835 */
836static int ocfs2_force_read_journal(struct inode *inode)
837{
838 int status = 0;
839 int i, p_blocks;
840 u64 v_blkno, p_blkno;
841#define CONCURRENT_JOURNAL_FILL 32
842 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
843
844 mlog_entry_void();
845
846 BUG_ON(inode->i_blocks !=
847 ocfs2_align_bytes_to_sectors(i_size_read(inode)));
848
849 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
850
851 mlog(0, "Force reading %lu blocks\n",
852 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9)));
853
854 v_blkno = 0;
855 while (v_blkno <
856 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
857
858 status = ocfs2_extent_map_get_blocks(inode, v_blkno,
859 1, &p_blkno,
860 &p_blocks);
861 if (status < 0) {
862 mlog_errno(status);
863 goto bail;
864 }
865
866 if (p_blocks > CONCURRENT_JOURNAL_FILL)
867 p_blocks = CONCURRENT_JOURNAL_FILL;
868
869 status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
870 p_blkno, p_blocks, bhs, 0,
871 inode);
872 if (status < 0) {
873 mlog_errno(status);
874 goto bail;
875 }
876
877 for(i = 0; i < p_blocks; i++) {
878 brelse(bhs[i]);
879 bhs[i] = NULL;
880 }
881
882 v_blkno += p_blocks;
883 }
884
885bail:
886 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
887 if (bhs[i])
888 brelse(bhs[i]);
889 mlog_exit(status);
890 return status;
891}
892
893struct ocfs2_la_recovery_item {
894 struct list_head lri_list;
895 int lri_slot;
896 struct ocfs2_dinode *lri_la_dinode;
897 struct ocfs2_dinode *lri_tl_dinode;
898};
899
900/* Does the second half of the recovery process. By this point, the
901 * node is marked clean and can actually be considered recovered,
902 * hence it's no longer in the recovery map, but there's still some
903 * cleanup we can do which shouldn't happen within the recovery thread
904 * as locking in that context becomes very difficult if we are to take
905 * recovering nodes into account.
906 *
907 * NOTE: This function can and will sleep on recovery of other nodes
908 * during cluster locking, just like any other ocfs2 process.
909 */
910void ocfs2_complete_recovery(void *data)
911{
912 int ret;
913 struct ocfs2_super *osb = data;
914 struct ocfs2_journal *journal = osb->journal;
915 struct ocfs2_dinode *la_dinode, *tl_dinode;
916 struct ocfs2_la_recovery_item *item;
917 struct list_head *p, *n;
918 LIST_HEAD(tmp_la_list);
919
920 mlog_entry_void();
921
922 mlog(0, "completing recovery from keventd\n");
923
924 spin_lock(&journal->j_lock);
925 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
926 spin_unlock(&journal->j_lock);
927
928 list_for_each_safe(p, n, &tmp_la_list) {
929 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
930 list_del_init(&item->lri_list);
931
932 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
933
934 la_dinode = item->lri_la_dinode;
935 if (la_dinode) {
936 mlog(0, "Clean up local alloc %"MLFu64"\n",
937 la_dinode->i_blkno);
938
939 ret = ocfs2_complete_local_alloc_recovery(osb,
940 la_dinode);
941 if (ret < 0)
942 mlog_errno(ret);
943
944 kfree(la_dinode);
945 }
946
947 tl_dinode = item->lri_tl_dinode;
948 if (tl_dinode) {
949 mlog(0, "Clean up truncate log %"MLFu64"\n",
950 tl_dinode->i_blkno);
951
952 ret = ocfs2_complete_truncate_log_recovery(osb,
953 tl_dinode);
954 if (ret < 0)
955 mlog_errno(ret);
956
957 kfree(tl_dinode);
958 }
959
960 ret = ocfs2_recover_orphans(osb, item->lri_slot);
961 if (ret < 0)
962 mlog_errno(ret);
963
964 kfree(item);
965 }
966
967 mlog(0, "Recovery completion\n");
968 mlog_exit_void();
969}
970
971/* NOTE: This function always eats your references to la_dinode and
972 * tl_dinode, either manually on error, or by passing them to
973 * ocfs2_complete_recovery */
974static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
975 int slot_num,
976 struct ocfs2_dinode *la_dinode,
977 struct ocfs2_dinode *tl_dinode)
978{
979 struct ocfs2_la_recovery_item *item;
980
981 item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL);
982 if (!item) {
983 /* Though we wish to avoid it, we are in fact safe in
984 * skipping local alloc cleanup as fsck.ocfs2 is more
985 * than capable of reclaiming unused space. */
986 if (la_dinode)
987 kfree(la_dinode);
988
989 if (tl_dinode)
990 kfree(tl_dinode);
991
992 mlog_errno(-ENOMEM);
993 return;
994 }
995
996 INIT_LIST_HEAD(&item->lri_list);
997 item->lri_la_dinode = la_dinode;
998 item->lri_slot = slot_num;
999 item->lri_tl_dinode = tl_dinode;
1000
1001 spin_lock(&journal->j_lock);
1002 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
1003 queue_work(ocfs2_wq, &journal->j_recovery_work);
1004 spin_unlock(&journal->j_lock);
1005}
1006
1007/* Called by the mount code to queue the last part of recovery for
1008 * its own slot. */
1009void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1010{
1011 struct ocfs2_journal *journal = osb->journal;
1012
1013 if (osb->dirty) {
1014 /* No need to queue up our truncate_log as regular
1015 * cleanup will catch that. */
1016 ocfs2_queue_recovery_completion(journal,
1017 osb->slot_num,
1018 osb->local_alloc_copy,
1019 NULL);
1020 ocfs2_schedule_truncate_log_flush(osb, 0);
1021
1022 osb->local_alloc_copy = NULL;
1023 osb->dirty = 0;
1024 }
1025}
1026
1027static int __ocfs2_recovery_thread(void *arg)
1028{
1029 int status, node_num;
1030 struct ocfs2_super *osb = arg;
1031
1032 mlog_entry_void();
1033
1034 status = ocfs2_wait_on_mount(osb);
1035 if (status < 0) {
1036 goto bail;
1037 }
1038
1039restart:
1040 status = ocfs2_super_lock(osb, 1);
1041 if (status < 0) {
1042 mlog_errno(status);
1043 goto bail;
1044 }
1045
1046 while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
1047 node_num = ocfs2_node_map_first_set_bit(osb,
1048 &osb->recovery_map);
1049 if (node_num == O2NM_INVALID_NODE_NUM) {
1050 mlog(0, "Out of nodes to recover.\n");
1051 break;
1052 }
1053
1054 status = ocfs2_recover_node(osb, node_num);
1055 if (status < 0) {
1056 mlog(ML_ERROR,
1057 "Error %d recovering node %d on device (%u,%u)!\n",
1058 status, node_num,
1059 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1060 mlog(ML_ERROR, "Volume requires unmount.\n");
1061 continue;
1062 }
1063
1064 ocfs2_recovery_map_clear(osb, node_num);
1065 }
1066 ocfs2_super_unlock(osb, 1);
1067
1068 /* We always run recovery on our own orphan dir - the dead
1069 * node(s) may have voted "no" on an inode delete earlier. A
1070 * revote is therefore required. */
1071 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1072 NULL);
1073
1074bail:
1075 down(&osb->recovery_lock);
1076 if (!status &&
1077 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
1078 up(&osb->recovery_lock);
1079 goto restart;
1080 }
1081
1082 osb->recovery_thread_task = NULL;
1083 mb(); /* sync with ocfs2_recovery_thread_running */
1084 wake_up(&osb->recovery_event);
1085
1086 up(&osb->recovery_lock);
1087
1088 mlog_exit(status);
1089	/* No one is calling kthread_stop() for us, so the kthread() API
1090	 * requires that we call do_exit(). It isn't exported, but
1091	 * complete_and_exit() seems to be a minimal wrapper around it. */
1092 complete_and_exit(NULL, status);
1093 return status;
1094}
1095
1096void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1097{
1098 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1099 node_num, osb->node_num);
1100
1101 down(&osb->recovery_lock);
1102 if (osb->disable_recovery)
1103 goto out;
1104
1105 /* People waiting on recovery will wait on
1106 * the recovery map to empty. */
1107 if (!ocfs2_recovery_map_set(osb, node_num))
1108		mlog(0, "node %d already in recovery.\n", node_num);
1109
1110 mlog(0, "starting recovery thread...\n");
1111
1112 if (osb->recovery_thread_task)
1113 goto out;
1114
1115 osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
1116 "ocfs2rec-%d", osb->osb_id);
1117 if (IS_ERR(osb->recovery_thread_task)) {
1118 mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
1119 osb->recovery_thread_task = NULL;
1120 }
1121
1122out:
1123 up(&osb->recovery_lock);
1124 wake_up(&osb->recovery_event);
1125
1126 mlog_exit_void();
1127}
1128
1129/* Does the actual journal replay and marks the journal inode as
1130 * clean. Will only replay if the journal inode is marked dirty. */
1131static int ocfs2_replay_journal(struct ocfs2_super *osb,
1132 int node_num,
1133 int slot_num)
1134{
1135 int status;
1136 int got_lock = 0;
1137 unsigned int flags;
1138 struct inode *inode = NULL;
1139 struct ocfs2_dinode *fe;
1140 journal_t *journal = NULL;
1141 struct buffer_head *bh = NULL;
1142
1143 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
1144 slot_num);
1145 if (inode == NULL) {
1146 status = -EACCES;
1147 mlog_errno(status);
1148 goto done;
1149 }
1150 if (is_bad_inode(inode)) {
1151 status = -EACCES;
1152 iput(inode);
1153 inode = NULL;
1154 mlog_errno(status);
1155 goto done;
1156 }
1157 SET_INODE_JOURNAL(inode);
1158
1159 status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
1160 OCFS2_META_LOCK_RECOVERY);
1161 if (status < 0) {
1162 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
1163 if (status != -ERESTARTSYS)
1164 mlog(ML_ERROR, "Could not lock journal!\n");
1165 goto done;
1166 }
1167 got_lock = 1;
1168
1169 fe = (struct ocfs2_dinode *) bh->b_data;
1170
1171 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
1172
1173 if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
1174 mlog(0, "No recovery required for node %d\n", node_num);
1175 goto done;
1176 }
1177
1178 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
1179 node_num, slot_num,
1180 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1181
1182 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1183
1184 status = ocfs2_force_read_journal(inode);
1185 if (status < 0) {
1186 mlog_errno(status);
1187 goto done;
1188 }
1189
1190 mlog(0, "calling journal_init_inode\n");
1191 journal = journal_init_inode(inode);
1192 if (journal == NULL) {
1193 mlog(ML_ERROR, "Linux journal layer error\n");
1194 status = -EIO;
1195 goto done;
1196 }
1197
1198 status = journal_load(journal);
1199 if (status < 0) {
1200 mlog_errno(status);
1201 if (!igrab(inode))
1202 BUG();
1203 journal_destroy(journal);
1204 goto done;
1205 }
1206
1207 ocfs2_clear_journal_error(osb->sb, journal, slot_num);
1208
1209 /* wipe the journal */
1210 mlog(0, "flushing the journal.\n");
1211 journal_lock_updates(journal);
1212 status = journal_flush(journal);
1213 journal_unlock_updates(journal);
1214 if (status < 0)
1215 mlog_errno(status);
1216
1217 /* This will mark the node clean */
1218 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
1219 flags &= ~OCFS2_JOURNAL_DIRTY_FL;
1220 fe->id1.journal1.ij_flags = cpu_to_le32(flags);
1221
1222 status = ocfs2_write_block(osb, bh, inode);
1223 if (status < 0)
1224 mlog_errno(status);
1225
1226 if (!igrab(inode))
1227 BUG();
1228
1229 journal_destroy(journal);
1230
1231done:
1232	/* drop the lock on this node's journal */
1233 if (got_lock)
1234 ocfs2_meta_unlock(inode, 1);
1235
1236 if (inode)
1237 iput(inode);
1238
1239 if (bh)
1240 brelse(bh);
1241
1242 mlog_exit(status);
1243 return status;
1244}
1245
1246/*
1247 * Do the most important parts of node recovery:
1248 * - Replay its journal
1249 * - Stamp a clean local allocator file
1250 * - Stamp a clean truncate log
1251 * - Mark the node clean
1252 *
1253 * If this function completes without error, a node in OCFS2 can be
1254 * said to have been safely recovered. As a result, failure during the
1255 * second part of a node's recovery process (local alloc recovery) is
1256 * far less concerning.
1257 */
1258static int ocfs2_recover_node(struct ocfs2_super *osb,
1259 int node_num)
1260{
1261 int status = 0;
1262 int slot_num;
1263 struct ocfs2_slot_info *si = osb->slot_info;
1264 struct ocfs2_dinode *la_copy = NULL;
1265 struct ocfs2_dinode *tl_copy = NULL;
1266
1267 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1268 node_num, osb->node_num);
1269
1270 mlog(0, "checking node %d\n", node_num);
1271
1272 /* Should not ever be called to recover ourselves -- in that
1273 * case we should've called ocfs2_journal_load instead. */
1274 if (osb->node_num == node_num)
1275 BUG();
1276
1277 slot_num = ocfs2_node_num_to_slot(si, node_num);
1278 if (slot_num == OCFS2_INVALID_SLOT) {
1279 status = 0;
1280 mlog(0, "no slot for this node, so no recovery required.\n");
1281 goto done;
1282 }
1283
1284 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1285
1286 status = ocfs2_replay_journal(osb, node_num, slot_num);
1287 if (status < 0) {
1288 mlog_errno(status);
1289 goto done;
1290 }
1291
1292 /* Stamp a clean local alloc file AFTER recovering the journal... */
1293 status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
1294 if (status < 0) {
1295 mlog_errno(status);
1296 goto done;
1297 }
1298
1299 /* An error from begin_truncate_log_recovery is not
1300 * serious enough to warrant halting the rest of
1301 * recovery. */
1302 status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
1303 if (status < 0)
1304 mlog_errno(status);
1305
1306 /* Likewise, this would be a strange but ultimately not so
1307 * harmful place to get an error... */
1308 ocfs2_clear_slot(si, slot_num);
1309 status = ocfs2_update_disk_slots(osb, si);
1310 if (status < 0)
1311 mlog_errno(status);
1312
1313 /* This will kfree the memory pointed to by la_copy and tl_copy */
1314 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1315 tl_copy);
1316
1317 status = 0;
1318done:
1319
1320 mlog_exit(status);
1321 return status;
1322}
1323
1324/* Test node liveness by trylocking its journal. If we get the lock,
1325 * we drop it here. Return 0 if we got the lock, -EAGAIN if the node
1326 * is still alive (we couldn't get the lock), and < 0 on error. */
1327static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1328 int slot_num)
1329{
1330 int status, flags;
1331 struct inode *inode = NULL;
1332
1333 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
1334 slot_num);
1335 if (inode == NULL) {
1336 mlog(ML_ERROR, "access error\n");
1337 status = -EACCES;
1338 goto bail;
1339 }
1340 if (is_bad_inode(inode)) {
1341 mlog(ML_ERROR, "access error (bad inode)\n");
1342 iput(inode);
1343 inode = NULL;
1344 status = -EACCES;
1345 goto bail;
1346 }
1347 SET_INODE_JOURNAL(inode);
1348
1349 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
1350 status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags);
1351 if (status < 0) {
1352 if (status != -EAGAIN)
1353 mlog_errno(status);
1354 goto bail;
1355 }
1356
1357 ocfs2_meta_unlock(inode, 1);
1358bail:
1359 if (inode)
1360 iput(inode);
1361
1362 return status;
1363}
1364
1365/* Call this underneath ocfs2_super_lock. It also assumes that the
1366 * slot info struct has been updated from disk. */
1367int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1368{
1369 int status, i, node_num;
1370 struct ocfs2_slot_info *si = osb->slot_info;
1371
1372 /* This is called with the super block cluster lock, so we
1373 * know that the slot map can't change underneath us. */
1374
1375 spin_lock(&si->si_lock);
1376 for(i = 0; i < si->si_num_slots; i++) {
1377 if (i == osb->slot_num)
1378 continue;
1379 if (ocfs2_is_empty_slot(si, i))
1380 continue;
1381
1382 node_num = si->si_global_node_nums[i];
1383 if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
1384 continue;
1385 spin_unlock(&si->si_lock);
1386
1387		/* Ok, we have a slot occupied by another node which
1388		 * is not in the recovery map. We trylock its journal
1389		 * file here to test whether it's alive. */
1390 status = ocfs2_trylock_journal(osb, i);
1391 if (!status) {
1392 /* Since we're called from mount, we know that
1393 * the recovery thread can't race us on
1394 * setting / checking the recovery bits. */
1395 ocfs2_recovery_thread(osb, node_num);
1396 } else if ((status < 0) && (status != -EAGAIN)) {
1397 mlog_errno(status);
1398 goto bail;
1399 }
1400
1401 spin_lock(&si->si_lock);
1402 }
1403 spin_unlock(&si->si_lock);
1404
1405 status = 0;
1406bail:
1407 mlog_exit(status);
1408 return status;
1409}
1410
1411static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1412 int slot)
1413{
1414 int status = 0;
1415 int have_disk_lock = 0;
1416 struct inode *inode = NULL;
1417 struct inode *iter;
1418 struct inode *orphan_dir_inode = NULL;
1419 unsigned long offset, blk, local;
1420 struct buffer_head *bh = NULL;
1421 struct ocfs2_dir_entry *de;
1422 struct super_block *sb = osb->sb;
1423 struct ocfs2_inode_info *oi;
1424
1425 mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
1426
1427 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1428 ORPHAN_DIR_SYSTEM_INODE,
1429 slot);
1430 if (!orphan_dir_inode) {
1431 status = -ENOENT;
1432 mlog_errno(status);
1433 goto out;
1434 }
1435
1436 mutex_lock(&orphan_dir_inode->i_mutex);
1437 status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
1438 if (status < 0) {
1439 mutex_unlock(&orphan_dir_inode->i_mutex);
1440 mlog_errno(status);
1441 goto out;
1442 }
1443 have_disk_lock = 1;
1444
1445 offset = 0;
1446 iter = NULL;
1447 while(offset < i_size_read(orphan_dir_inode)) {
1448 blk = offset >> sb->s_blocksize_bits;
1449
1450 bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
1451 if (!bh)
1452 status = -EINVAL;
1453 if (status < 0) {
1454 mutex_unlock(&orphan_dir_inode->i_mutex);
1455 if (bh)
1456 brelse(bh);
1457 mlog_errno(status);
1458 goto out;
1459 }
1460
1461 local = 0;
1462 while(offset < i_size_read(orphan_dir_inode)
1463 && local < sb->s_blocksize) {
1464 de = (struct ocfs2_dir_entry *) (bh->b_data + local);
1465
1466 if (!ocfs2_check_dir_entry(orphan_dir_inode,
1467 de, bh, local)) {
1468 mutex_unlock(&orphan_dir_inode->i_mutex);
1469 status = -EINVAL;
1470 mlog_errno(status);
1471 brelse(bh);
1472 goto out;
1473 }
1474
1475 local += le16_to_cpu(de->rec_len);
1476 offset += le16_to_cpu(de->rec_len);
1477
1478 /* I guess we silently fail on no inode? */
1479 if (!le64_to_cpu(de->inode))
1480 continue;
1481 if (de->file_type > OCFS2_FT_MAX) {
1482 mlog(ML_ERROR,
1483 "block %llu contains invalid de: "
1484 "inode = %"MLFu64", rec_len = %u, "
1485 "name_len = %u, file_type = %u, "
1486 "name='%.*s'\n",
1487 (unsigned long long)bh->b_blocknr,
1488 le64_to_cpu(de->inode),
1489 le16_to_cpu(de->rec_len),
1490 de->name_len,
1491 de->file_type,
1492 de->name_len,
1493 de->name);
1494 continue;
1495 }
1496 if (de->name_len == 1 && !strncmp(".", de->name, 1))
1497 continue;
1498 if (de->name_len == 2 && !strncmp("..", de->name, 2))
1499 continue;
1500
1501 iter = ocfs2_iget(osb, le64_to_cpu(de->inode));
1502 if (IS_ERR(iter))
1503 continue;
1504
1505 mlog(0, "queue orphan %"MLFu64"\n",
1506 OCFS2_I(iter)->ip_blkno);
1507 OCFS2_I(iter)->ip_next_orphan = inode;
1508 inode = iter;
1509 }
1510 brelse(bh);
1511 }
1512 mutex_unlock(&orphan_dir_inode->i_mutex);
1513
1514 ocfs2_meta_unlock(orphan_dir_inode, 0);
1515 have_disk_lock = 0;
1516
1517 iput(orphan_dir_inode);
1518 orphan_dir_inode = NULL;
1519
1520 while (inode) {
1521 oi = OCFS2_I(inode);
1522 mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno);
1523
1524 iter = oi->ip_next_orphan;
1525
1526 spin_lock(&oi->ip_lock);
1527 /* Delete voting may have set these on the assumption
1528 * that the other node would wipe them successfully.
1529 * If they are still in the node's orphan dir, we need
1530 * to reset that state. */
1531 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
1532
1533 /* Set the proper information to get us going into
1534 * ocfs2_delete_inode. */
1535 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
1536 oi->ip_orphaned_slot = slot;
1537 spin_unlock(&oi->ip_lock);
1538
1539 iput(inode);
1540
1541 inode = iter;
1542 }
1543
1544out:
1545 if (have_disk_lock)
1546 ocfs2_meta_unlock(orphan_dir_inode, 0);
1547
1548 if (orphan_dir_inode)
1549 iput(orphan_dir_inode);
1550
1551 return status;
1552}
1553
1554static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
1555{
1556 /* This check is good because ocfs2 will wait on our recovery
1557	 * thread before changing the volume state to anything other
1558	 * than MOUNTED or DISABLED. */
1559 wait_event(osb->osb_mount_event,
1560 atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
1561 atomic_read(&osb->vol_state) == VOLUME_DISABLED);
1562
1563 /* If there's an error on mount, then we may never get to the
1564	 * MOUNTED flag, but VOLUME_DISABLED is set right before
1565 * dismount_volume() so we can trust it. */
1566 if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
1567 mlog(0, "mount error, exiting!\n");
1568 return -EBUSY;
1569 }
1570
1571 return 0;
1572}
1573
1574static int ocfs2_commit_thread(void *arg)
1575{
1576 int status;
1577 struct ocfs2_super *osb = arg;
1578 struct ocfs2_journal *journal = osb->journal;
1579
1580 /* we can trust j_num_trans here because _should_stop() is only set in
1581 * shutdown and nobody other than ourselves should be able to start
1582 * transactions. committing on shutdown might take a few iterations
1583 * as final transactions put deleted inodes on the list */
1584 while (!(kthread_should_stop() &&
1585 atomic_read(&journal->j_num_trans) == 0)) {
1586
1587 wait_event_interruptible_timeout(osb->checkpoint_event,
1588 atomic_read(&journal->j_num_trans)
1589 || kthread_should_stop(),
1590 OCFS2_CHECKPOINT_INTERVAL);
1591
1592 status = ocfs2_commit_cache(osb);
1593 if (status < 0)
1594 mlog_errno(status);
1595
1596		if (kthread_should_stop() && atomic_read(&journal->j_num_trans)) {
1597 mlog(ML_KTHREAD,
1598 "commit_thread: %u transactions pending on "
1599 "shutdown\n",
1600 atomic_read(&journal->j_num_trans));
1601 }
1602 }
1603
1604 return 0;
1605}
1606
1607/* Look for a dirty journal without taking any cluster locks. Used for
1608 * hard readonly access to determine whether the file system journals
1609 * require recovery. */
1610int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
1611{
1612 int ret = 0;
1613 unsigned int slot;
1614 struct buffer_head *di_bh;
1615 struct ocfs2_dinode *di;
1616 struct inode *journal = NULL;
1617
1618 for(slot = 0; slot < osb->max_slots; slot++) {
1619 journal = ocfs2_get_system_file_inode(osb,
1620 JOURNAL_SYSTEM_INODE,
1621 slot);
1622 if (!journal || is_bad_inode(journal)) {
1623 ret = -EACCES;
1624 mlog_errno(ret);
1625 goto out;
1626 }
1627
1628 di_bh = NULL;
1629 ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
1630 0, journal);
1631 if (ret < 0) {
1632 mlog_errno(ret);
1633 goto out;
1634 }
1635
1636 di = (struct ocfs2_dinode *) di_bh->b_data;
1637
1638 if (le32_to_cpu(di->id1.journal1.ij_flags) &
1639 OCFS2_JOURNAL_DIRTY_FL)
1640 ret = -EROFS;
1641
1642 brelse(di_bh);
1643 if (ret)
1644 break;
1645 }
1646
1647out:
1648 if (journal)
1649 iput(journal);
1650
1651 return ret;
1652}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
new file mode 100644
index 00000000000..7d0a816184f
--- /dev/null
+++ b/fs/ocfs2/journal.h
@@ -0,0 +1,457 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * journal.h
5 *
6 * Defines journalling api and structures.
7 *
8 * Copyright (C) 2003, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_JOURNAL_H
27#define OCFS2_JOURNAL_H
28
29#include <linux/fs.h>
30#include <linux/jbd.h>
31
32#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ)
33
34enum ocfs2_journal_state {
35 OCFS2_JOURNAL_FREE = 0,
36 OCFS2_JOURNAL_LOADED,
37 OCFS2_JOURNAL_IN_SHUTDOWN,
38};
39
40struct ocfs2_super;
41struct ocfs2_dinode;
42struct ocfs2_journal_handle;
43
44struct ocfs2_journal {
45 enum ocfs2_journal_state j_state; /* Journals current state */
46
47 journal_t *j_journal; /* The kernels journal type */
48 struct inode *j_inode; /* Kernel inode pointing to
49 * this journal */
50 struct ocfs2_super *j_osb; /* pointer to the super
51 * block for the node
52 * we're currently
53 * running on -- not
54 * necessarily the super
55 * block from the node
56 * which we usually run
57 * from (recovery,
58 * etc) */
59 struct buffer_head *j_bh; /* Journal disk inode block */
60 atomic_t j_num_trans; /* Number of transactions
61 * currently in the system. */
62 unsigned long j_trans_id;
63 struct rw_semaphore j_trans_barrier;
64 wait_queue_head_t j_checkpointed;
65
66 spinlock_t j_lock;
67 struct list_head j_la_cleanups;
68 struct work_struct j_recovery_work;
69};
70
71extern spinlock_t trans_inc_lock;
72
73/* wrap j_trans_id so we never have it equal to zero. */
74static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
75{
76 unsigned long old_id;
77 spin_lock(&trans_inc_lock);
78 old_id = j->j_trans_id++;
79 if (unlikely(!j->j_trans_id))
80 j->j_trans_id = 1;
81 spin_unlock(&trans_inc_lock);
82 return old_id;
83}
84
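/* For example (editor's note): when j_trans_id is ULONG_MAX, the
 * caller gets ULONG_MAX back and the counter then wraps to 1 rather
 * than 0, so a zero trans id is never handed out. */
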
85static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
86 struct inode *inode)
87{
88 spin_lock(&trans_inc_lock);
89 OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
90 spin_unlock(&trans_inc_lock);
91}
92
93/* Used to figure out whether it's safe to drop a metadata lock on an
94 * inode. Returns true if all the inode's changes have been
95 * checkpointed to disk. You should be holding the spinlock on the
96 * metadata lock while calling this to be sure that nobody can take
97 * the lock and put it on another transaction. */
98static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
99{
100 int ret;
101 struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
102
103 spin_lock(&trans_inc_lock);
104 ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
105 spin_unlock(&trans_inc_lock);
106 return ret;
107}
108
109/* Convenience function to check whether an inode is still new (has
110 * never hit disk). As a favor, it sets created_trans = 0 once the
111 * inode has been checkpointed. Returns '1' if the inode is still new. */
112static inline int ocfs2_inode_is_new(struct inode *inode)
113{
114 int ret;
115
116 /* System files are never "new" as they're written out by
117 * mkfs. This helps us early during mount, before we have the
118 * journal open and j_trans_id could be junk. */
119 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
120 return 0;
121 spin_lock(&trans_inc_lock);
122 ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
123 OCFS2_I(inode)->ip_created_trans));
124 if (!ret)
125 OCFS2_I(inode)->ip_created_trans = 0;
126 spin_unlock(&trans_inc_lock);
127 return ret;
128}
129
130static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
131 struct inode *inode)
132{
133 spin_lock(&trans_inc_lock);
134 OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
135 spin_unlock(&trans_inc_lock);
136}
137
138extern kmem_cache_t *ocfs2_lock_cache;
139
140struct ocfs2_journal_lock {
141 struct inode *jl_inode;
142 struct list_head jl_lock_list;
143};
144
145struct ocfs2_journal_handle {
146 handle_t *k_handle; /* kernel handle. */
147 struct ocfs2_journal *journal;
148 u32 flags; /* see flags below. */
149 int max_buffs; /* Buffs reserved by this handle */
150
151 /* The following two fields are for ocfs2_handle_add_lock */
152 int num_locks;
153	struct list_head locks;		/* Cluster locks to be
154					 * released when this
155					 * handle commits. */
156
157 struct list_head inode_list;
158};
159
160#define OCFS2_HANDLE_STARTED 1
161/* should we sync-commit this handle? */
162#define OCFS2_HANDLE_SYNC 2
163static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
164{
165 return handle->flags & OCFS2_HANDLE_STARTED;
166}
167
168static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
169{
170 if (sync)
171 handle->flags |= OCFS2_HANDLE_SYNC;
172 else
173 handle->flags &= ~OCFS2_HANDLE_SYNC;
174}
175
176/* Exported only for the journal struct init code in super.c. Do not call. */
177void ocfs2_complete_recovery(void *data);
178
179/*
180 * Journal Control:
181 * Initialize, Load, Shutdown, Wipe a journal.
182 *
183 * ocfs2_journal_init - Initialize journal structures in the OSB.
184 * ocfs2_journal_load - Load the given journal off disk. Replay it if
185 *			there are transactions still in there.
186 * ocfs2_journal_shutdown - Shutdown a journal, this will flush all
187 * uncommitted, uncheckpointed transactions.
188 * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally
189 * zero out each block.
190 * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb.
191 * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
192 * event on.
193 * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
194 */
195void ocfs2_set_journal_params(struct ocfs2_super *osb);
196int ocfs2_journal_init(struct ocfs2_journal *journal,
197 int *dirty);
198void ocfs2_journal_shutdown(struct ocfs2_super *osb);
199int ocfs2_journal_wipe(struct ocfs2_journal *journal,
200 int full);
201int ocfs2_journal_load(struct ocfs2_journal *journal);
202int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
203void ocfs2_recovery_thread(struct ocfs2_super *osb,
204 int node_num);
205int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
206void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
207
208static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
209{
210 atomic_set(&osb->needs_checkpoint, 1);
211 wake_up(&osb->checkpoint_event);
212}
213
214static inline void ocfs2_checkpoint_inode(struct inode *inode)
215{
216 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
217
218 if (!ocfs2_inode_fully_checkpointed(inode)) {
219 /* WARNING: This only kicks off a single
220 * checkpoint. If someone races you and adds more
221 * metadata to the journal, you won't know, and will
222		 * wind up waiting *a lot* longer than necessary. Right
223 * now we only use this in clear_inode so that's
224 * OK. */
225 ocfs2_start_checkpoint(osb);
226
227 wait_event(osb->journal->j_checkpointed,
228 ocfs2_inode_fully_checkpointed(inode));
229 }
230}
231
232/*
233 * Transaction Handling:
234 * Manage the lifetime of a transaction handle.
235 *
236 * ocfs2_alloc_handle - Only allocate a handle so we can start putting
237 * cluster locks on it. To actually change blocks,
238 * call ocfs2_start_trans with the handle returned
239 * from this function. You may call ocfs2_commit_trans
240 * at any time in the lifetime of a handle.
241 * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of
242 * the number of blocks that will be changed during
243 * this handle.
244 * ocfs2_commit_trans - Complete a handle.
245 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may
246 * commit the handle to disk in the process, but will
247 * not release any locks taken during the transaction.
248 * ocfs2_journal_access - Notify the handle that we want to journal this
249 * buffer. Will have to call ocfs2_journal_dirty once
250 *			we've actually dirtied it. Type is one of the
OCFS2_JOURNAL_ACCESS_* values defined below.
251 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
252 * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
253 * the current handle commits.
254 * ocfs2_handle_add_lock - Sometimes we need to delay lock release
255 * until after a transaction has been completed. Use
256 * ocfs2_handle_add_lock to indicate that a lock needs
257 * to be released at the end of that handle. Locks
258 * will be released in the order that they are added.
259 * ocfs2_handle_add_inode - Add a locked inode to a transaction.
260 */
261
262/* You must always start_trans with a number of buffs > 0, but it's
263 * perfectly legal to go through an entire transaction without having
264 * dirtied any buffers. */
265struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
266struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
267 struct ocfs2_journal_handle *handle,
268 int max_buffs);
269void ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
270int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
271 int nblocks);
272
273/*
274 * Create access is for when we get a newly created buffer and we're
275 * not going to read it off disk, but rather fill it ourselves. Right
276 * now, we don't do anything special with this (it turns into a write
277 * request), but this is a good placeholder in case we do...
278 *
279 * Write access is for when we read a block off disk and are going to
280 * modify it. This way the journalling layer knows it may need to make
281 * a copy of that block (if it's part of another, uncommitted
282 * transaction) before we do so.
283 */
284#define OCFS2_JOURNAL_ACCESS_CREATE 0
285#define OCFS2_JOURNAL_ACCESS_WRITE 1
286#define OCFS2_JOURNAL_ACCESS_UNDO 2
287
288int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
289 struct inode *inode,
290 struct buffer_head *bh,
291 int type);
292/*
293 * A word about the journal_access/journal_dirty "dance". It is
294 * entirely legal to journal_access a buffer more than once (as long
295 * as the access type is the same -- I'm not sure what will happen if
296 * access type is different but this should never happen anyway). It is
297 * also legal to journal_dirty a buffer more than once. In fact, you
298 * can even journal_access a buffer after you've done a
299 * journal_access/journal_dirty pair. The only thing you cannot do
300 * however, is journal_dirty a buffer which you haven't yet passed to
301 * journal_access at least once.
302 *
303 * That said, 99% of the time this doesn't matter and this is what the
304 * path looks like:
305 *
306 * <read a bh>
307 * ocfs2_journal_access(handle, bh, OCFS2_JOURNAL_ACCESS_WRITE);
308 * <modify the bh>
309 * ocfs2_journal_dirty(handle, bh);
310 */
311int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
312 struct buffer_head *bh);
313int ocfs2_journal_dirty_data(handle_t *handle,
314 struct buffer_head *bh);
315int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
316 struct inode *inode);
317/*
318 * Use this to protect from other processes reading buffer state while
319 * it's in flight.
320 */
321void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
322 struct inode *inode);
323
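/*
 * A minimal illustration of the transaction lifetime documented above
 * (an editor's sketch, not part of this change; the function and
 * variable names are assumptions). Error handling mirrors the
 * ocfs2_shutdown_local_alloc() caller in localalloc.c, which assumes
 * ocfs2_start_trans() consumes the handle on failure.
 */
static inline int example_simple_update(struct ocfs2_super *osb,
					struct inode *inode,
					struct buffer_head *bh)
{
	struct ocfs2_journal_handle *handle;
	int status;

	handle = ocfs2_alloc_handle(osb);
	if (!handle)
		return -ENOMEM;

	/* 1 credit: a simple inode update (OCFS2_INODE_UPDATE_CREDITS
	 * below). */
	handle = ocfs2_start_trans(osb, handle, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	status = ocfs2_journal_access(handle, inode, bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out;

	/* ... modify bh->b_data here ... */

	status = ocfs2_journal_dirty(handle, bh);
out:
	ocfs2_commit_trans(handle);
	return status;
}
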
324/*
325 * Credit Macros:
326 * Convenience macros to calculate number of credits needed.
327 *
328 * For convenience's sake, I have a set of macros here which calculate
329 * the *maximum* number of sectors which will be changed for various
330 * metadata updates.
331 */
332
333/* simple file updates like chmod, etc. */
334#define OCFS2_INODE_UPDATE_CREDITS 1
335
336/* get one bit out of a suballocator: dinode + group descriptor +
337 * prev. group desc. if we relink. */
338#define OCFS2_SUBALLOC_ALLOC (3)
339
340/* dinode + group descriptor update. We don't relink on free yet. */
341#define OCFS2_SUBALLOC_FREE (2)
342
343#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
344#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
345 + OCFS2_TRUNCATE_LOG_UPDATE)
346
347/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
348 * bitmap block for the new bit) */
349#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
350
351/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
352 * group descriptor + mkdir/symlink blocks */
353#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \
354 + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
355
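/* Worked out numerically (editor's note):
 * OCFS2_MKNOD_CREDITS = 3 + 3 + (1 + 2) = 9 credits. */
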
356/* local alloc metadata change + main bitmap updates */
357#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
358 + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
359
360/* used when we don't need an allocation change for a dir extend. One
361 * for the dinode, one for the new block. */
362#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
363
364/* file update (nlink, etc) + dir entry block */
365#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
366
367/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
368 * dir inode link */
369#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \
370 + OCFS2_LINK_CREDITS)
371
372/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
373 * inode alloc group descriptor */
374#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
375
376/* dinode update, old dir dinode update, new dir dinode update, old
377 * dir dir entry, new dir dir entry, dir entry update for renaming
378 * directory + target unlink */
379#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
380 + OCFS2_UNLINK_CREDITS)
381
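/* Numerically (editor's note): OCFS2_LINK_CREDITS = 1 + 1 = 2,
 * OCFS2_UNLINK_CREDITS = (2 * 1) + 1 + 2 = 5, and
 * OCFS2_RENAME_CREDITS = (3 * 1) + 3 + 5 = 11. */
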
382static inline int ocfs2_calc_extend_credits(struct super_block *sb,
383 struct ocfs2_dinode *fe,
384 u32 bits_wanted)
385{
386 int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
387
388 /* bitmap dinode, group desc. + relinked group. */
389 bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
390
391	/* we might need to shift tree depth so let's assume an
392 * absolute worst case of complete fragmentation. Even with
393 * that, we only need one update for the dinode, and then
394 * however many metadata chunks needed * a remaining suballoc
395 * alloc. */
396 sysfile_bitmap_blocks = 1 +
397 (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
398
399 /* this does not include *new* metadata blocks, which are
400 * accounted for in sysfile_bitmap_blocks. fe +
401 * prev. last_eb_blk + blocks along edge of tree.
402 * calc_symlink_credits passes because we just need 1
403 * credit for the dinode there. */
404 dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
405
406 return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
407}
408
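/* For instance (editor's sketch, which assumes l_tree_depth == 0 and
 * that ocfs2_extend_meta_needed(fe) returns 1 for such a dinode):
 * bitmap_blocks (3) + sysfile_bitmap_blocks (1 + 2 * 1) +
 * dinode_blocks (1 + 1 + 0) = 8 credits. */
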
409static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
410{
411 int blocks = OCFS2_MKNOD_CREDITS;
412
413 /* links can be longer than one block so we may update many
414 * within our single allocated extent. */
415 blocks += ocfs2_clusters_to_blocks(sb, 1);
416
417 return blocks;
418}
419
420static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
421 unsigned int cpg)
422{
423 int blocks;
424 int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
425 /* parent inode update + new block group header + bitmap inode update
426 + bitmap blocks affected */
427 blocks = 1 + 1 + 1 + bitmap_blocks;
428 return blocks;
429}
430
431static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
432 unsigned int clusters_to_del,
433 struct ocfs2_dinode *fe,
434 struct ocfs2_extent_list *last_el)
435{
436 /* for dinode + all headers in this pass + update to next leaf */
437 u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
438 u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
439 int credits = 1 + tree_depth + 1;
440 int i;
441
442 i = next_free - 1;
443 BUG_ON(i < 0);
444
445 /* We may be deleting metadata blocks, so metadata alloc dinode +
446 one desc. block for each possible delete. */
447 if (tree_depth && next_free == 1 &&
448 le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
449 credits += 1 + tree_depth;
450
451 /* update to the truncate log. */
452 credits += OCFS2_TRUNCATE_LOG_UPDATE;
453
454 return credits;
455}
456
457#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
new file mode 100644
index 00000000000..149b3518166
--- /dev/null
+++ b/fs/ocfs2/localalloc.c
@@ -0,0 +1,983 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * localalloc.c
5 *
6 * Node local data allocation
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/bitops.h>
31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "super.h"
44#include "sysfile.h"
45
46#include "buffer_head_io.h"
47
48#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
49
50static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
51
52static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
53
54static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
55 struct ocfs2_dinode *alloc,
56 u32 numbits);
57
58static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
59
60static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
61 struct ocfs2_journal_handle *handle,
62 struct ocfs2_dinode *alloc,
63 struct inode *main_bm_inode,
64 struct buffer_head *main_bm_bh);
65
66static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
67 struct ocfs2_journal_handle *handle,
68 struct ocfs2_alloc_context **ac,
69 struct inode **bitmap_inode,
70 struct buffer_head **bitmap_bh);
71
72static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
73 struct ocfs2_journal_handle *handle,
74 struct ocfs2_alloc_context *ac);
75
76static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
77 struct inode *local_alloc_inode);
78
79/*
80 * Determine how large our local alloc window should be, in bits.
81 *
82 * These values (and the behavior in ocfs2_alloc_should_use_local) have
83 * been chosen so that most allocations, including new block groups go
84 * through local alloc.
85 */
86static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
87{
88 BUG_ON(osb->s_clustersize_bits < 12);
89
90 return 2048 >> (osb->s_clustersize_bits - 12);
91}
92
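/* e.g. (editor's note): 4K clusters (s_clustersize_bits == 12) yield
 * a 2048-bit window, 8K clusters yield 1024 bits, and 1M clusters
 * (s_clustersize_bits == 20) yield 8 bits -- in each case the window
 * covers 8MB worth of clusters. */
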
93/*
94 * Tell us whether a given allocation should use the local alloc
95 * file. Otherwise, it has to go to the main bitmap.
96 */
97int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
98{
99 int la_bits = ocfs2_local_alloc_window_bits(osb);
100
101 if (osb->local_alloc_state != OCFS2_LA_ENABLED)
102 return 0;
103
104 /* la_bits should be at least twice the size (in clusters) of
105 * a new block group. We want to be sure block group
106 * allocations go through the local alloc, so allow an
107 * allocation to take up to half the bitmap. */
108 if (bits > (la_bits / 2))
109 return 0;
110
111 return 1;
112}
113
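/* Concretely (editor's note): with 4K clusters the window is 2048
 * bits, so allocations of up to 1024 clusters (4MB) go through the
 * local alloc, while anything larger falls through to the main
 * bitmap. */
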
114int ocfs2_load_local_alloc(struct ocfs2_super *osb)
115{
116 int status = 0;
117 struct ocfs2_dinode *alloc = NULL;
118 struct buffer_head *alloc_bh = NULL;
119 u32 num_used;
120 struct inode *inode = NULL;
121 struct ocfs2_local_alloc *la;
122
123 mlog_entry_void();
124
125 /* read the alloc off disk */
126 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
127 osb->slot_num);
128 if (!inode) {
129 status = -EINVAL;
130 mlog_errno(status);
131 goto bail;
132 }
133
134 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
135 &alloc_bh, 0, inode);
136 if (status < 0) {
137 mlog_errno(status);
138 goto bail;
139 }
140
141 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
142 la = OCFS2_LOCAL_ALLOC(alloc);
143
144 if (!(le32_to_cpu(alloc->i_flags) &
145 (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
146 mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n",
147 OCFS2_I(inode)->ip_blkno);
148 status = -EINVAL;
149 goto bail;
150 }
151
152 if ((la->la_size == 0) ||
153 (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
154 mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
155 le16_to_cpu(la->la_size));
156 status = -EINVAL;
157 goto bail;
158 }
159
160 /* do a little verification. */
161 num_used = ocfs2_local_alloc_count_bits(alloc);
162
163 /* hopefully the local alloc has always been recovered before
164 * we load it. */
165 if (num_used
166 || alloc->id1.bitmap1.i_used
167 || alloc->id1.bitmap1.i_total
168 || la->la_bm_off)
169 mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
170 "found = %u, set = %u, taken = %u, off = %u\n",
171 num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
172 le32_to_cpu(alloc->id1.bitmap1.i_total),
173 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
174
175 osb->local_alloc_bh = alloc_bh;
176 osb->local_alloc_state = OCFS2_LA_ENABLED;
177
178bail:
179 if (status < 0)
180 if (alloc_bh)
181 brelse(alloc_bh);
182 if (inode)
183 iput(inode);
184
185 mlog_exit(status);
186 return status;
187}
188
189/*
190 * Return any unused bits to the bitmap and write out a clean
191 * local_alloc.
192 *
193 * The local alloc buffer is taken off osb->local_alloc_bh; be warned
194 * that it *will* be brelse'd and NULL'd out here.
195 */
196void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
197{
198 int status;
199 struct ocfs2_journal_handle *handle = NULL;
200 struct inode *local_alloc_inode = NULL;
201 struct buffer_head *bh = NULL;
202 struct buffer_head *main_bm_bh = NULL;
203 struct inode *main_bm_inode = NULL;
204 struct ocfs2_dinode *alloc_copy = NULL;
205 struct ocfs2_dinode *alloc = NULL;
206
207 mlog_entry_void();
208
209 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
210 goto bail;
211
212 local_alloc_inode =
213 ocfs2_get_system_file_inode(osb,
214 LOCAL_ALLOC_SYSTEM_INODE,
215 osb->slot_num);
216 if (!local_alloc_inode) {
217 status = -ENOENT;
218 mlog_errno(status);
219 goto bail;
220 }
221
222 osb->local_alloc_state = OCFS2_LA_DISABLED;
223
224 handle = ocfs2_alloc_handle(osb);
225 if (!handle) {
226 status = -ENOMEM;
227 mlog_errno(status);
228 goto bail;
229 }
230
231 main_bm_inode = ocfs2_get_system_file_inode(osb,
232 GLOBAL_BITMAP_SYSTEM_INODE,
233 OCFS2_INVALID_SLOT);
234 if (!main_bm_inode) {
235 status = -EINVAL;
236 mlog_errno(status);
237 goto bail;
238 }
239
240 ocfs2_handle_add_inode(handle, main_bm_inode);
241 status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
242 if (status < 0) {
243 mlog_errno(status);
244 goto bail;
245 }
246
247 /* WINDOW_MOVE_CREDITS is a bit heavy... */
248 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
249 if (IS_ERR(handle)) {
250 mlog_errno(PTR_ERR(handle));
251 handle = NULL;
252 goto bail;
253 }
254
255 bh = osb->local_alloc_bh;
256 alloc = (struct ocfs2_dinode *) bh->b_data;
257
258 alloc_copy = kmalloc(bh->b_size, GFP_KERNEL);
259 if (!alloc_copy) {
260 status = -ENOMEM;
261 goto bail;
262 }
263 memcpy(alloc_copy, alloc, bh->b_size);
264
265 status = ocfs2_journal_access(handle, local_alloc_inode, bh,
266 OCFS2_JOURNAL_ACCESS_WRITE);
267 if (status < 0) {
268 mlog_errno(status);
269 goto bail;
270 }
271
272 ocfs2_clear_local_alloc(alloc);
273
274 status = ocfs2_journal_dirty(handle, bh);
275 if (status < 0) {
276 mlog_errno(status);
277 goto bail;
278 }
279
280 brelse(bh);
281 osb->local_alloc_bh = NULL;
282 osb->local_alloc_state = OCFS2_LA_UNUSED;
283
284 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
285 main_bm_inode, main_bm_bh);
286 if (status < 0)
287 mlog_errno(status);
288
289bail:
290 if (handle)
291 ocfs2_commit_trans(handle);
292
293 if (main_bm_bh)
294 brelse(main_bm_bh);
295
296 if (main_bm_inode)
297 iput(main_bm_inode);
298
299 if (local_alloc_inode)
300 iput(local_alloc_inode);
301
302 if (alloc_copy)
303 kfree(alloc_copy);
304
305 mlog_exit_void();
306}
307
308/*
309 * We want to free the bitmap bits outside of any recovery context as
310 * we'll need a cluster lock to do so, but we must clear the local
311 * alloc before giving up the recovered node's journal. To solve this,
312 * we kmalloc a copy of the local alloc before it's changed, for the
313 * caller to process with ocfs2_complete_local_alloc_recovery().
314 */
315int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
316 int slot_num,
317 struct ocfs2_dinode **alloc_copy)
318{
319 int status = 0;
320 struct buffer_head *alloc_bh = NULL;
321 struct inode *inode = NULL;
322 struct ocfs2_dinode *alloc;
323
324 mlog_entry("(slot_num = %d)\n", slot_num);
325
326 *alloc_copy = NULL;
327
328 inode = ocfs2_get_system_file_inode(osb,
329 LOCAL_ALLOC_SYSTEM_INODE,
330 slot_num);
331 if (!inode) {
332 status = -EINVAL;
333 mlog_errno(status);
334 goto bail;
335 }
336
337 mutex_lock(&inode->i_mutex);
338
339 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
340 &alloc_bh, 0, inode);
341 if (status < 0) {
342 mlog_errno(status);
343 goto bail;
344 }
345
346 *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
347 if (!(*alloc_copy)) {
348 status = -ENOMEM;
349 goto bail;
350 }
351 memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
352
353 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
354 ocfs2_clear_local_alloc(alloc);
355
356 status = ocfs2_write_block(osb, alloc_bh, inode);
357 if (status < 0)
358 mlog_errno(status);
359
360bail:
361 if ((status < 0) && (*alloc_copy)) {
362 kfree(*alloc_copy);
363 *alloc_copy = NULL;
364 }
365
366 if (alloc_bh)
367 brelse(alloc_bh);
368
369 if (inode) {
370 mutex_unlock(&inode->i_mutex);
371 iput(inode);
372 }
373
374 mlog_exit(status);
375 return status;
376}
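/*
 * A hedged sketch of the two-step recovery dance described above
 * (hypothetical caller; the real sequencing lives in the journal
 * recovery code): step 1 runs while we still hold the dead node's
 * journal, step 2 afterwards, once cluster locks can be taken.
 */
#if 0	/* illustration only */
static int recover_slot_local_alloc(struct ocfs2_super *osb, int slot_num)
{
	struct ocfs2_dinode *la_copy = NULL;
	int status;

	/* Step 1: stamp a clean local alloc on disk, keep a copy. */
	status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
	if (status < 0)
		return status;

	/* ... journal recovery completes, recovery map is cleared ... */

	/* Step 2: give the copied window's unused bits back. */
	status = ocfs2_complete_local_alloc_recovery(osb, la_copy);
	kfree(la_copy);
	return status;
}
#endif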
377
378/*
379 * Step 2: By now, we've completed the journal recovery, we've stamped
380 * a clean local alloc on disk and dropped the node out of the
381 * recovery map. DLM locks will no longer stall, so let's clear out the
382 * main bitmap.
383 */
384int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
385 struct ocfs2_dinode *alloc)
386{
387 int status;
388 struct ocfs2_journal_handle *handle = NULL;
389 struct buffer_head *main_bm_bh = NULL;
390 struct inode *main_bm_inode = NULL;
391
392 mlog_entry_void();
393
394 handle = ocfs2_alloc_handle(osb);
395 if (!handle) {
396 status = -ENOMEM;
397 mlog_errno(status);
398 goto bail;
399 }
400
401 main_bm_inode = ocfs2_get_system_file_inode(osb,
402 GLOBAL_BITMAP_SYSTEM_INODE,
403 OCFS2_INVALID_SLOT);
404 if (!main_bm_inode) {
405 status = -EINVAL;
406 mlog_errno(status);
407 goto bail;
408 }
409
410 ocfs2_handle_add_inode(handle, main_bm_inode);
411 status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
412 if (status < 0) {
413 mlog_errno(status);
414 goto bail;
415 }
416
417 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
418 if (IS_ERR(handle)) {
419 status = PTR_ERR(handle);
420 handle = NULL;
421 mlog_errno(status);
422 goto bail;
423 }
424
425 /* we want the bitmap change to be recorded on disk asap */
426 ocfs2_handle_set_sync(handle, 1);
427
428 status = ocfs2_sync_local_to_main(osb, handle, alloc,
429 main_bm_inode, main_bm_bh);
430 if (status < 0)
431 mlog_errno(status);
432
433bail:
434 if (handle)
435 ocfs2_commit_trans(handle);
436
437 if (main_bm_bh)
438 brelse(main_bm_bh);
439
440 if (main_bm_inode)
441 iput(main_bm_inode);
442
443 mlog_exit(status);
444 return status;
445}
446
447/*
448 * make sure we've got at least bits_wanted contiguous bits in the
449 * local alloc. You lose them when you drop i_mutex.
450 *
451 * We will add ourselves to the transaction passed in, but may start
452 * our own in order to shift windows.
453 */
454int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
455 struct ocfs2_journal_handle *passed_handle,
456 u32 bits_wanted,
457 struct ocfs2_alloc_context *ac)
458{
459 int status;
460 struct ocfs2_dinode *alloc;
461 struct inode *local_alloc_inode;
462 unsigned int free_bits;
463
464 mlog_entry_void();
465
466 BUG_ON(!passed_handle);
467 BUG_ON(!ac);
468 BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED);
469
470 local_alloc_inode =
471 ocfs2_get_system_file_inode(osb,
472 LOCAL_ALLOC_SYSTEM_INODE,
473 osb->slot_num);
474 if (!local_alloc_inode) {
475 status = -ENOENT;
476 mlog_errno(status);
477 goto bail;
478 }
479 ocfs2_handle_add_inode(passed_handle, local_alloc_inode);
480
481 if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
482 status = -ENOSPC;
483 goto bail;
484 }
485
486 if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
487 mlog(0, "Asking for more than my max window size!\n");
488 status = -ENOSPC;
489 goto bail;
490 }
491
492 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
493
494 if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
495 ocfs2_local_alloc_count_bits(alloc)) {
496 ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has "
497			    "%u used bits, but a count shows %u",
498 le64_to_cpu(alloc->i_blkno),
499 le32_to_cpu(alloc->id1.bitmap1.i_used),
500 ocfs2_local_alloc_count_bits(alloc));
501 status = -EIO;
502 goto bail;
503 }
504
505 free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
506 le32_to_cpu(alloc->id1.bitmap1.i_used);
507 if (bits_wanted > free_bits) {
508 /* uhoh, window change time. */
509 status =
510 ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
511 if (status < 0) {
512 if (status != -ENOSPC)
513 mlog_errno(status);
514 goto bail;
515 }
516 }
517
518 ac->ac_inode = igrab(local_alloc_inode);
519 get_bh(osb->local_alloc_bh);
520 ac->ac_bh = osb->local_alloc_bh;
521 ac->ac_which = OCFS2_AC_USE_LOCAL;
522 status = 0;
523bail:
524 if (local_alloc_inode)
525 iput(local_alloc_inode);
526
527 mlog_exit(status);
528 return status;
529}
530
531int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
532 struct ocfs2_journal_handle *handle,
533 struct ocfs2_alloc_context *ac,
534 u32 min_bits,
535 u32 *bit_off,
536 u32 *num_bits)
537{
538 int status, start;
539 struct inode *local_alloc_inode;
540 u32 bits_wanted;
541 void *bitmap;
542 struct ocfs2_dinode *alloc;
543 struct ocfs2_local_alloc *la;
544
545 mlog_entry_void();
546 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
547
548 bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
549 local_alloc_inode = ac->ac_inode;
550 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
551 la = OCFS2_LOCAL_ALLOC(alloc);
552
553 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
554 if (start == -1) {
555 /* TODO: Shouldn't we just BUG here? */
556 status = -ENOSPC;
557 mlog_errno(status);
558 goto bail;
559 }
560
561 bitmap = la->la_bitmap;
562 *bit_off = le32_to_cpu(la->la_bm_off) + start;
563 /* local alloc is always contiguous by nature -- we never
564 * delete bits from it! */
565 *num_bits = bits_wanted;
566
567 status = ocfs2_journal_access(handle, local_alloc_inode,
568 osb->local_alloc_bh,
569 OCFS2_JOURNAL_ACCESS_WRITE);
570 if (status < 0) {
571 mlog_errno(status);
572 goto bail;
573 }
574
575 while(bits_wanted--)
576 ocfs2_set_bit(start++, bitmap);
577
578 alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
579 le32_to_cpu(alloc->id1.bitmap1.i_used));
580
581 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
582 if (status < 0) {
583 mlog_errno(status);
584 goto bail;
585 }
586
587 status = 0;
588bail:
589 mlog_exit(status);
590 return status;
591}
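/*
 * A minimal sketch of the reserve/claim pairing (hypothetical helper;
 * names invented for illustration). Note the ordering constraint from
 * the BUG_ON in the reserve path: reservation happens on a handle that
 * has not been started yet, while the claim journals blocks and so
 * must run inside the started transaction.
 */
#if 0	/* illustration only */
static int grab_local_clusters(struct ocfs2_super *osb,
			       struct ocfs2_journal_handle *handle,
			       struct ocfs2_alloc_context *ac,
			       u32 wanted, u32 *off, u32 *len)
{
	int status;

	status = ocfs2_reserve_local_alloc_bits(osb, handle, wanted, ac);
	if (status < 0)
		return status;	/* -ENOSPC: caller retries on the main bitmap */

	/* ... ocfs2_start_trans() with enough credits goes here ... */

	return ocfs2_claim_local_alloc_bits(osb, handle, ac, wanted,
					    off, len);
}
#endif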
592
593static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
594{
595 int i;
596 u8 *buffer;
597 u32 count = 0;
598 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
599
600 mlog_entry_void();
601
602 buffer = la->la_bitmap;
603 for (i = 0; i < le16_to_cpu(la->la_size); i++)
604 count += hweight8(buffer[i]);
605
606 mlog_exit(count);
607 return count;
608}
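/*
 * Worked example for the popcount above: hweight8(0xb6) counts the set
 * bits of 1011_0110 and returns 5, so a two byte bitmap { 0xb6, 0x01 }
 * yields a count of 6.
 */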
609
610static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
611 struct ocfs2_dinode *alloc,
612 u32 numbits)
613{
614 int numfound, bitoff, left, startoff, lastzero;
615 void *bitmap = NULL;
616
617 mlog_entry("(numbits wanted = %u)\n", numbits);
618
619 if (!alloc->id1.bitmap1.i_total) {
620 mlog(0, "No bits in my window!\n");
621 bitoff = -1;
622 goto bail;
623 }
624
625 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
626
627 numfound = bitoff = startoff = 0;
628 lastzero = -1;
629 left = le32_to_cpu(alloc->id1.bitmap1.i_total);
630 while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
631 if (bitoff == left) {
632 /* mlog(0, "bitoff (%d) == left", bitoff); */
633 break;
634 }
635 /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
636 "numfound = %d\n", bitoff, startoff, numfound);*/
637
638 /* Ok, we found a zero bit... is it contig. or do we
639 * start over?*/
640 if (bitoff == startoff) {
641 /* we found a zero */
642 numfound++;
643 startoff++;
644 } else {
645 /* got a zero after some ones */
646 numfound = 1;
647 startoff = bitoff+1;
648 }
649 /* we got everything we needed */
650 if (numfound == numbits) {
651 /* mlog(0, "Found it all!\n"); */
652 break;
653 }
654 }
655
656 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
657 numfound);
658
659 if (numfound == numbits)
660 bitoff = startoff - numfound;
661 else
662 bitoff = -1;
663
664bail:
665 mlog_exit(bitoff);
666 return bitoff;
667}
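/*
 * A worked trace of the scan above (toy bitmap assumed): with bits
 * 1,0,0,1,0,0,0,1 at offsets 0..7 and numbits = 3, the run at offsets
 * 1-2 grows numfound to 2 before the set bit at 3 resets it; the run
 * at offsets 4-6 then reaches numfound == 3 with startoff == 7, so the
 * function returns startoff - numfound = 4, the start of the run.
 */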
668
669static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
670{
671 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
672 int i;
673 mlog_entry_void();
674
675 alloc->id1.bitmap1.i_total = 0;
676 alloc->id1.bitmap1.i_used = 0;
677 la->la_bm_off = 0;
678 for(i = 0; i < le16_to_cpu(la->la_size); i++)
679 la->la_bitmap[i] = 0;
680
681 mlog_exit_void();
682}
683
684#if 0
685/* turn this on and uncomment below to aid debugging window shifts. */
686static void ocfs2_verify_zero_bits(unsigned long *bitmap,
687 unsigned int start,
688 unsigned int count)
689{
690 unsigned int tmp = count;
691 while(tmp--) {
692 if (ocfs2_test_bit(start + tmp, bitmap)) {
693 printk("ocfs2_verify_zero_bits: start = %u, count = "
694 "%u\n", start, count);
695 printk("ocfs2_verify_zero_bits: bit %u is set!",
696 start + tmp);
697 BUG();
698 }
699 }
700}
701#endif
702
703/*
704 * sync the local alloc to main bitmap.
705 *
706 * assumes you've already locked the main bitmap -- the bitmap inode
707 * passed is used for caching.
708 */
709static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
710 struct ocfs2_journal_handle *handle,
711 struct ocfs2_dinode *alloc,
712 struct inode *main_bm_inode,
713 struct buffer_head *main_bm_bh)
714{
715 int status = 0;
716 int bit_off, left, count, start;
717 u64 la_start_blk;
718 u64 blkno;
719 void *bitmap;
720 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
721
722 mlog_entry("total = %u, COUNT = %u, used = %u\n",
723 le32_to_cpu(alloc->id1.bitmap1.i_total),
724 ocfs2_local_alloc_count_bits(alloc),
725 le32_to_cpu(alloc->id1.bitmap1.i_used));
726
727 if (!alloc->id1.bitmap1.i_total) {
728 mlog(0, "nothing to sync!\n");
729 goto bail;
730 }
731
732 if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
733 le32_to_cpu(alloc->id1.bitmap1.i_total)) {
734 mlog(0, "all bits were taken!\n");
735 goto bail;
736 }
737
738 la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
739 le32_to_cpu(la->la_bm_off));
740 bitmap = la->la_bitmap;
741 start = count = bit_off = 0;
742 left = le32_to_cpu(alloc->id1.bitmap1.i_total);
743
744 while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
745 != -1) {
746 if ((bit_off < left) && (bit_off == start)) {
747 count++;
748 start++;
749 continue;
750 }
751 if (count) {
752 blkno = la_start_blk +
753 ocfs2_clusters_to_blocks(osb->sb,
754 start - count);
755
756 mlog(0, "freeing %u bits starting at local "
757 "alloc bit %u (la_start_blk = %"MLFu64", "
758 "blkno = %"MLFu64")\n", count, start - count,
759 la_start_blk, blkno);
760
761 status = ocfs2_free_clusters(handle, main_bm_inode,
762 main_bm_bh, blkno, count);
763 if (status < 0) {
764 mlog_errno(status);
765 goto bail;
766 }
767 }
768 if (bit_off >= left)
769 break;
770 count = 1;
771 start = bit_off + 1;
772 }
773
774bail:
775 mlog_exit(status);
776 return status;
777}
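/*
 * A worked example of the run detection above (toy window assumed):
 * with i_total = 8 and window bits 1,0,0,1,1,0,0,0 (set = handed out
 * to files, clear = never used), the loop collects the clear run at
 * bits 1-2 until the set bit at 3 breaks it and frees those 2 clusters,
 * then collects bits 5-7 and frees those 3 clusters when the scan runs
 * off the end. Set bits stay allocated in the main bitmap.
 */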
778
779static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
780 struct ocfs2_journal_handle *handle,
781 struct ocfs2_alloc_context **ac,
782 struct inode **bitmap_inode,
783 struct buffer_head **bitmap_bh)
784{
785 int status;
786
787 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
788 if (!(*ac)) {
789 status = -ENOMEM;
790 mlog_errno(status);
791 goto bail;
792 }
793
794 (*ac)->ac_handle = handle;
795 (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
796
797 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
798 if (status < 0) {
799 if (status != -ENOSPC)
800 mlog_errno(status);
801 goto bail;
802 }
803
804 *bitmap_inode = (*ac)->ac_inode;
805 igrab(*bitmap_inode);
806 *bitmap_bh = (*ac)->ac_bh;
807 get_bh(*bitmap_bh);
808 status = 0;
809bail:
810 if ((status < 0) && *ac) {
811 ocfs2_free_alloc_context(*ac);
812 *ac = NULL;
813 }
814
815 mlog_exit(status);
816 return status;
817}
818
819/*
820 * Expects the main bitmap lock to already be held by the caller.
821 */
822static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
823 struct ocfs2_journal_handle *handle,
824 struct ocfs2_alloc_context *ac)
825{
826 int status = 0;
827 u32 cluster_off, cluster_count;
828 struct ocfs2_dinode *alloc = NULL;
829 struct ocfs2_local_alloc *la;
830
831 mlog_entry_void();
832
833 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
834 la = OCFS2_LOCAL_ALLOC(alloc);
835
836 if (alloc->id1.bitmap1.i_total)
837 mlog(0, "asking me to alloc a new window over a non-empty "
838 "one\n");
839
840 mlog(0, "Allocating %u clusters for a new window.\n",
841 ocfs2_local_alloc_window_bits(osb));
842 /* we used the generic suballoc reserve function, but we set
843 * everything up nicely, so there's no reason why we can't use
844 * the more specific cluster api to claim bits. */
845 status = ocfs2_claim_clusters(osb, handle, ac,
846 ocfs2_local_alloc_window_bits(osb),
847 &cluster_off, &cluster_count);
848 if (status < 0) {
849 if (status != -ENOSPC)
850 mlog_errno(status);
851 goto bail;
852 }
853
854 la->la_bm_off = cpu_to_le32(cluster_off);
855 alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
856 /* just in case... In the future when we find space ourselves,
857 * we don't have to get all contiguous -- but we'll have to
858 * set all previously used bits in bitmap and update
859 * la_bits_set before setting the bits in the main bitmap. */
860 alloc->id1.bitmap1.i_used = 0;
861 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
862 le16_to_cpu(la->la_size));
863
864 mlog(0, "New window allocated:\n");
865 mlog(0, "window la_bm_off = %u\n",
866 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
867 mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
868
869bail:
870 mlog_exit(status);
871 return status;
872}
873
874/* Note that we do *NOT* lock the local alloc inode here as
875 * it's been locked already for us. */
876static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
877 struct inode *local_alloc_inode)
878{
879 int status = 0;
880 struct buffer_head *main_bm_bh = NULL;
881 struct inode *main_bm_inode = NULL;
882 struct ocfs2_journal_handle *handle = NULL;
883 struct ocfs2_dinode *alloc;
884 struct ocfs2_dinode *alloc_copy = NULL;
885 struct ocfs2_alloc_context *ac = NULL;
886
887 mlog_entry_void();
888
889 handle = ocfs2_alloc_handle(osb);
890 if (!handle) {
891 status = -ENOMEM;
892 mlog_errno(status);
893 goto bail;
894 }
895
896 /* This will lock the main bitmap for us. */
897 status = ocfs2_local_alloc_reserve_for_window(osb,
898 handle,
899 &ac,
900 &main_bm_inode,
901 &main_bm_bh);
902 if (status < 0) {
903 if (status != -ENOSPC)
904 mlog_errno(status);
905 goto bail;
906 }
907
908 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
909 if (IS_ERR(handle)) {
910 status = PTR_ERR(handle);
911 handle = NULL;
912 mlog_errno(status);
913 goto bail;
914 }
915
916 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
917
918 /* We want to clear the local alloc before doing anything
919 * else, so that if we error later during this operation,
920 * local alloc shutdown won't try to double free main bitmap
921 * bits. Make a copy so the sync function knows which bits to
922 * free. */
923 alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL);
924 if (!alloc_copy) {
925 status = -ENOMEM;
926 mlog_errno(status);
927 goto bail;
928 }
929 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
930
931 status = ocfs2_journal_access(handle, local_alloc_inode,
932 osb->local_alloc_bh,
933 OCFS2_JOURNAL_ACCESS_WRITE);
934 if (status < 0) {
935 mlog_errno(status);
936 goto bail;
937 }
938
939 ocfs2_clear_local_alloc(alloc);
940
941 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
942 if (status < 0) {
943 mlog_errno(status);
944 goto bail;
945 }
946
947 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
948 main_bm_inode, main_bm_bh);
949 if (status < 0) {
950 mlog_errno(status);
951 goto bail;
952 }
953
954 status = ocfs2_local_alloc_new_window(osb, handle, ac);
955 if (status < 0) {
956 if (status != -ENOSPC)
957 mlog_errno(status);
958 goto bail;
959 }
960
961 atomic_inc(&osb->alloc_stats.moves);
962
963 status = 0;
964bail:
965 if (handle)
966 ocfs2_commit_trans(handle);
967
968 if (main_bm_bh)
969 brelse(main_bm_bh);
970
971 if (main_bm_inode)
972 iput(main_bm_inode);
973
974 if (alloc_copy)
975 kfree(alloc_copy);
976
977 if (ac)
978 ocfs2_free_alloc_context(ac);
979
980 mlog_exit(status);
981 return status;
982}
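/*
 * Editorial summary of the slide above: reserve a fresh window in the
 * main bitmap (which also takes the bitmap lock), journal a cleared
 * local alloc, return the old window's unused bits via
 * ocfs2_sync_local_to_main(), then point the window at the newly
 * claimed clusters. The clear is journaled before the sync so that a
 * crash mid-slide cannot lead to a double free of main bitmap bits.
 */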
983
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
new file mode 100644
index 00000000000..30f88ce14e4
--- /dev/null
+++ b/fs/ocfs2/localalloc.h
@@ -0,0 +1,56 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * localalloc.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_LOCALALLOC_H
27#define OCFS2_LOCALALLOC_H
28
29int ocfs2_load_local_alloc(struct ocfs2_super *osb);
30
31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
32
33int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
34 int node_num,
35 struct ocfs2_dinode **alloc_copy);
36
37int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
38 struct ocfs2_dinode *alloc);
39
40int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
41 u64 bits);
42
43struct ocfs2_alloc_context;
44int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
45 struct ocfs2_journal_handle *passed_handle,
46 u32 bits_wanted,
47 struct ocfs2_alloc_context *ac);
48
49int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
50 struct ocfs2_journal_handle *handle,
51 struct ocfs2_alloc_context *ac,
52 u32 min_bits,
53 u32 *bit_off,
54 u32 *num_bits);
55
56#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
new file mode 100644
index 00000000000..843cf9ddefe
--- /dev/null
+++ b/fs/ocfs2/mmap.c
@@ -0,0 +1,98 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * mmap.c
5 *
6 * Code to deal with the mess that is clustered mmap.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/pagemap.h>
31#include <linux/uio.h>
32#include <linux/signal.h>
33#include <linux/rbtree.h>
34
35#define MLOG_MASK_PREFIX ML_FILE_IO
36#include <cluster/masklog.h>
37
38#include "ocfs2.h"
39
40#include "dlmglue.h"
41#include "file.h"
42#include "inode.h"
43#include "mmap.h"
44
45static struct page *ocfs2_nopage(struct vm_area_struct * area,
46 unsigned long address,
47 int *type)
48{
49 struct inode *inode = area->vm_file->f_dentry->d_inode;
50 struct page *page = NOPAGE_SIGBUS;
51 sigset_t blocked, oldset;
52 int ret;
53
54 mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
55
56 /* The best way to deal with signals in this path is
57 * to block them upfront, rather than allowing the
58 * locking paths to return -ERESTARTSYS. */
59 sigfillset(&blocked);
60
61 /* We should technically never get a bad ret return
62 * from sigprocmask */
63 ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
64 if (ret < 0) {
65 mlog_errno(ret);
66 goto out;
67 }
68
69 page = filemap_nopage(area, address, type);
70
71 ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
72 if (ret < 0)
73 mlog_errno(ret);
74out:
75 mlog_exit_ptr(page);
76 return page;
77}
78
79static struct vm_operations_struct ocfs2_file_vm_ops = {
80 .nopage = ocfs2_nopage,
81};
82
83int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
84{
85 /* We don't want to support shared writable mappings yet. */
86 if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
87 && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
88 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
89 /* This is -EINVAL because generic_file_readonly_mmap
90 * returns it in a similar situation. */
91 return -EINVAL;
92 }
93
94 file_accessed(file);
95 vma->vm_ops = &ocfs2_file_vm_ops;
96 return 0;
97}
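/*
 * Editorial note on the check above, seen from userspace (hypothetical
 * call): mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)
 * fails with EINVAL, while MAP_PRIVATE mappings -- and shared mappings
 * of read-only descriptors, which lack VM_MAYWRITE -- pass the check
 * and fault through ocfs2_nopage().
 */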
98
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h
new file mode 100644
index 00000000000..1274ee0f1fe
--- /dev/null
+++ b/fs/ocfs2/mmap.h
@@ -0,0 +1,6 @@
1#ifndef OCFS2_MMAP_H
2#define OCFS2_MMAP_H
3
4int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
5
6#endif /* OCFS2_MMAP_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
new file mode 100644
index 00000000000..f6b77ff1d2b
--- /dev/null
+++ b/fs/ocfs2/namei.c
@@ -0,0 +1,2264 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * namei.c
5 *
6 * Create and rename file, directory, symlinks
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * Portions of this code from linux/fs/ext3/dir.c
11 *
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 *
17 * from
18 *
19 * linux/fs/minix/dir.c
20 *
21 * Copyright (C) 1991, 1992 Linus Torvalds
22 *
23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public
25 * License as published by the Free Software Foundation; either
26 * version 2 of the License, or (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
31 * General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public
34 * License along with this program; if not, write to the
35 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
36 * Boston, MA 021110-1307, USA.
37 */
38
39#include <linux/fs.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/highmem.h>
43
44#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h>
46
47#include "ocfs2.h"
48
49#include "alloc.h"
50#include "dcache.h"
51#include "dir.h"
52#include "dlmglue.h"
53#include "extent_map.h"
54#include "file.h"
55#include "inode.h"
56#include "journal.h"
57#include "namei.h"
58#include "suballoc.h"
59#include "symlink.h"
60#include "sysfile.h"
61#include "uptodate.h"
62#include "vote.h"
63
64#include "buffer_head_io.h"
65
66#define NAMEI_RA_CHUNKS 2
67#define NAMEI_RA_BLOCKS 4
68#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
69#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
70
71static inline int ocfs2_search_dirblock(struct buffer_head *bh,
72 struct inode *dir,
73 const char *name, int namelen,
74 unsigned long offset,
75 struct ocfs2_dir_entry **res_dir);
76
77static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
78 struct inode *dir,
79 struct ocfs2_dir_entry *de_del,
80 struct buffer_head *bh);
81
82static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
83 struct inode *dir,
84 const char *name, int namelen,
85 struct inode *inode, u64 blkno,
86 struct buffer_head *parent_fe_bh,
87 struct buffer_head *insert_bh);
88
89static int ocfs2_mknod_locked(struct ocfs2_super *osb,
90 struct inode *dir,
91 struct dentry *dentry, int mode,
92 dev_t dev,
93 struct buffer_head **new_fe_bh,
94 struct buffer_head *parent_fe_bh,
95 struct ocfs2_journal_handle *handle,
96 struct inode **ret_inode,
97 struct ocfs2_alloc_context *inode_ac);
98
99static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
100 struct ocfs2_journal_handle *handle,
101 struct inode *parent,
102 struct inode *inode,
103 struct buffer_head *fe_bh,
104 struct ocfs2_alloc_context *data_ac);
105
106static int ocfs2_double_lock(struct ocfs2_super *osb,
107 struct ocfs2_journal_handle *handle,
108 struct buffer_head **bh1,
109 struct inode *inode1,
110 struct buffer_head **bh2,
111 struct inode *inode2);
112
113static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
114 struct ocfs2_journal_handle *handle,
115 struct inode *inode,
116 char *name,
117 struct buffer_head **de_bh);
118
119static int ocfs2_orphan_add(struct ocfs2_super *osb,
120 struct ocfs2_journal_handle *handle,
121 struct inode *inode,
122 struct ocfs2_dinode *fe,
123 char *name,
124 struct buffer_head *de_bh);
125
126static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
127 struct ocfs2_journal_handle *handle,
128 struct inode *inode,
129 const char *symname);
130
131static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle,
132 struct dentry *dentry,
133 struct inode *inode, u64 blkno,
134 struct buffer_head *parent_fe_bh,
135 struct buffer_head *insert_bh)
136{
137 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
138 dentry->d_name.name, dentry->d_name.len,
139 inode, blkno, parent_fe_bh, insert_bh);
140}
141
142/* An orphan dir name is an 8 byte value, printed as a hex string */
143#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
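/*
 * Worked example (block number assumed): an inode at blkno 0x1234 is
 * orphaned under the name "0000000000001234" -- two hex characters per
 * byte of the u64, hence the 16 character OCFS2_ORPHAN_NAMELEN.
 */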
144
145static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
146 struct nameidata *nd)
147{
148 int status;
149 u64 blkno;
150 struct buffer_head *dirent_bh = NULL;
151 struct inode *inode = NULL;
152 struct dentry *ret;
153 struct ocfs2_dir_entry *dirent;
154 struct ocfs2_inode_info *oi;
155
156 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
157 dentry->d_name.len, dentry->d_name.name);
158
159 if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
160 ret = ERR_PTR(-ENAMETOOLONG);
161 goto bail;
162 }
163
164 mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len,
165 dentry->d_name.name, OCFS2_I(dir)->ip_blkno);
166
167 status = ocfs2_meta_lock(dir, NULL, NULL, 0);
168 if (status < 0) {
169 if (status != -ENOENT)
170 mlog_errno(status);
171 ret = ERR_PTR(status);
172 goto bail;
173 }
174
175 status = ocfs2_find_files_on_disk(dentry->d_name.name,
176 dentry->d_name.len, &blkno,
177 dir, &dirent_bh, &dirent);
178 if (status < 0)
179 goto bail_add;
180
181 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
182 if (IS_ERR(inode)) {
183 mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
184 ret = ERR_PTR(-EACCES);
185 goto bail_unlock;
186 }
187
188 oi = OCFS2_I(inode);
189 /* Clear any orphaned state... If we were able to look up the
190 * inode from a directory, it certainly can't be orphaned. We
191 * might have the bad state from a node which intended to
192 * orphan this inode but crashed before it could commit the
193 * unlink. */
194 spin_lock(&oi->ip_lock);
195 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
196 oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
197 spin_unlock(&oi->ip_lock);
198
199bail_add:
200
201 dentry->d_op = &ocfs2_dentry_ops;
202 ret = d_splice_alias(inode, dentry);
203
204bail_unlock:
205 /* Don't drop the cluster lock until *after* the d_add --
206 * unlink on another node will message us to remove that
207 * dentry under this lock so otherwise we can race this with
208 * the vote thread and have a stale dentry. */
209 ocfs2_meta_unlock(dir, 0);
210
211bail:
212 if (dirent_bh)
213 brelse(dirent_bh);
214
215 mlog_exit_ptr(ret);
216
217 return ret;
218}
219
220static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
221 struct ocfs2_journal_handle *handle,
222 struct inode *parent,
223 struct inode *inode,
224 struct buffer_head *fe_bh,
225 struct ocfs2_alloc_context *data_ac)
226{
227 int status;
228 struct buffer_head *new_bh = NULL;
229 struct ocfs2_dir_entry *de = NULL;
230
231 mlog_entry_void();
232
233 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
234 data_ac, NULL, &new_bh);
235 if (status < 0) {
236 mlog_errno(status);
237 goto bail;
238 }
239
240 ocfs2_set_new_buffer_uptodate(inode, new_bh);
241
242 status = ocfs2_journal_access(handle, inode, new_bh,
243 OCFS2_JOURNAL_ACCESS_CREATE);
244 if (status < 0) {
245 mlog_errno(status);
246 goto bail;
247 }
248 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
249
250 de = (struct ocfs2_dir_entry *) new_bh->b_data;
251 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
252 de->name_len = 1;
253 de->rec_len =
254 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
255 strcpy(de->name, ".");
256 ocfs2_set_de_type(de, S_IFDIR);
257 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
258 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
259 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
260 OCFS2_DIR_REC_LEN(1));
261 de->name_len = 2;
262 strcpy(de->name, "..");
263 ocfs2_set_de_type(de, S_IFDIR);
264
265 status = ocfs2_journal_dirty(handle, new_bh);
266 if (status < 0) {
267 mlog_errno(status);
268 goto bail;
269 }
270
271 i_size_write(inode, inode->i_sb->s_blocksize);
272 inode->i_nlink = 2;
273 inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
274 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
275 if (status < 0) {
276 mlog_errno(status);
277 goto bail;
278 }
279
280 status = 0;
281bail:
282 if (new_bh)
283 brelse(new_bh);
284
285 mlog_exit(status);
286 return status;
287}
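/*
 * A worked layout example for the block built above (4KB blocksize
 * assumed; the fixed dirent header -- __le64 inode, __le16 rec_len,
 * __u8 name_len, __u8 file_type -- is 12 bytes): "." gets rec_len
 * OCFS2_DIR_REC_LEN(1) = (1 + 12 + 3) & ~3 = 16, and ".." gets
 * rec_len 4096 - 16 = 4080, so the two entries exactly cover the block
 * and the slack after ".." is claimable by later inserts.
 */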
288
289static int ocfs2_mknod(struct inode *dir,
290 struct dentry *dentry,
291 int mode,
292 dev_t dev)
293{
294 int status = 0;
295 struct buffer_head *parent_fe_bh = NULL;
296 struct ocfs2_journal_handle *handle = NULL;
297 struct ocfs2_super *osb;
298 struct ocfs2_dinode *dirfe;
299 struct buffer_head *new_fe_bh = NULL;
300 struct buffer_head *de_bh = NULL;
301 struct inode *inode = NULL;
302 struct ocfs2_alloc_context *inode_ac = NULL;
303 struct ocfs2_alloc_context *data_ac = NULL;
304
305 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
306 (unsigned long)dev, dentry->d_name.len,
307 dentry->d_name.name);
308
309 /* get our super block */
310 osb = OCFS2_SB(dir->i_sb);
311
312 if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
313 mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n",
314 OCFS2_I(dir)->ip_blkno, dir->i_nlink);
315 status = -EMLINK;
316 goto leave;
317 }
318
319 handle = ocfs2_alloc_handle(osb);
320 if (handle == NULL) {
321 status = -ENOMEM;
322 mlog_errno(status);
323 goto leave;
324 }
325
326 status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
327 if (status < 0) {
328 if (status != -ENOENT)
329 mlog_errno(status);
330 goto leave;
331 }
332
333 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
334 if (!dirfe->i_links_count) {
335 /* can't make a file in a deleted directory. */
336 status = -ENOENT;
337 goto leave;
338 }
339
340 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
341 dentry->d_name.len);
342 if (status)
343 goto leave;
344
345 /* get a spot inside the dir. */
346 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
347 dentry->d_name.name,
348 dentry->d_name.len, &de_bh);
349 if (status < 0) {
350 mlog_errno(status);
351 goto leave;
352 }
353
354 /* reserve an inode spot */
355 status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
356 if (status < 0) {
357 if (status != -ENOSPC)
358 mlog_errno(status);
359 goto leave;
360 }
361
362	/* are we making a directory? If so, reserve a cluster for its
363 * 1st extent. */
364 if (S_ISDIR(mode)) {
365 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
366 if (status < 0) {
367 if (status != -ENOSPC)
368 mlog_errno(status);
369 goto leave;
370 }
371 }
372
373 handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS);
374 if (IS_ERR(handle)) {
375 status = PTR_ERR(handle);
376 handle = NULL;
377 mlog_errno(status);
378 goto leave;
379 }
380
381 /* do the real work now. */
382 status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
383 &new_fe_bh, parent_fe_bh, handle,
384 &inode, inode_ac);
385 if (status < 0) {
386 mlog_errno(status);
387 goto leave;
388 }
389
390 if (S_ISDIR(mode)) {
391 status = ocfs2_fill_new_dir(osb, handle, dir, inode,
392 new_fe_bh, data_ac);
393 if (status < 0) {
394 mlog_errno(status);
395 goto leave;
396 }
397
398 status = ocfs2_journal_access(handle, dir, parent_fe_bh,
399 OCFS2_JOURNAL_ACCESS_WRITE);
400 if (status < 0) {
401 mlog_errno(status);
402 goto leave;
403 }
404 le16_add_cpu(&dirfe->i_links_count, 1);
405 status = ocfs2_journal_dirty(handle, parent_fe_bh);
406 if (status < 0) {
407 mlog_errno(status);
408 goto leave;
409 }
410 dir->i_nlink++;
411 }
412
413 status = ocfs2_add_entry(handle, dentry, inode,
414 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
415 de_bh);
416 if (status < 0) {
417 mlog_errno(status);
418 goto leave;
419 }
420
421 insert_inode_hash(inode);
422 dentry->d_op = &ocfs2_dentry_ops;
423 d_instantiate(dentry, inode);
424 status = 0;
425leave:
426 if (handle)
427 ocfs2_commit_trans(handle);
428
429 if (status == -ENOSPC)
430 mlog(0, "Disk is full\n");
431
432 if (new_fe_bh)
433 brelse(new_fe_bh);
434
435 if (de_bh)
436 brelse(de_bh);
437
438 if (parent_fe_bh)
439 brelse(parent_fe_bh);
440
441 if ((status < 0) && inode)
442 iput(inode);
443
444 if (inode_ac)
445 ocfs2_free_alloc_context(inode_ac);
446
447 if (data_ac)
448 ocfs2_free_alloc_context(data_ac);
449
450 mlog_exit(status);
451
452 return status;
453}
454
455static int ocfs2_mknod_locked(struct ocfs2_super *osb,
456 struct inode *dir,
457 struct dentry *dentry, int mode,
458 dev_t dev,
459 struct buffer_head **new_fe_bh,
460 struct buffer_head *parent_fe_bh,
461 struct ocfs2_journal_handle *handle,
462 struct inode **ret_inode,
463 struct ocfs2_alloc_context *inode_ac)
464{
465 int status = 0;
466 struct ocfs2_dinode *fe = NULL;
467 struct ocfs2_extent_list *fel;
468 u64 fe_blkno = 0;
469 u16 suballoc_bit;
470 struct inode *inode = NULL;
471
472 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
473 (unsigned long)dev, dentry->d_name.len,
474 dentry->d_name.name);
475
476 *new_fe_bh = NULL;
477 *ret_inode = NULL;
478
479 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
480 &fe_blkno);
481 if (status < 0) {
482 mlog_errno(status);
483 goto leave;
484 }
485
486 inode = new_inode(dir->i_sb);
487	if (!inode) {
488		status = -ENOMEM;
489 mlog(ML_ERROR, "new_inode failed!\n");
490 goto leave;
491 }
492
493 /* populate as many fields early on as possible - many of
494 * these are used by the support functions here and in
495 * callers. */
496 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
497 OCFS2_I(inode)->ip_blkno = fe_blkno;
498 if (S_ISDIR(mode))
499 inode->i_nlink = 2;
500 else
501 inode->i_nlink = 1;
502 inode->i_mode = mode;
503 spin_lock(&osb->osb_lock);
504 inode->i_generation = osb->s_next_generation++;
505 spin_unlock(&osb->osb_lock);
506
507 *new_fe_bh = sb_getblk(osb->sb, fe_blkno);
508 if (!*new_fe_bh) {
509 status = -EIO;
510 mlog_errno(status);
511 goto leave;
512 }
513 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
514
515 status = ocfs2_journal_access(handle, inode, *new_fe_bh,
516 OCFS2_JOURNAL_ACCESS_CREATE);
517 if (status < 0) {
518 mlog_errno(status);
519 goto leave;
520 }
521
522 fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
523 memset(fe, 0, osb->sb->s_blocksize);
524
525 fe->i_generation = cpu_to_le32(inode->i_generation);
526 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
527 fe->i_blkno = cpu_to_le64(fe_blkno);
528 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
529 fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
530 fe->i_uid = cpu_to_le32(current->fsuid);
531 if (dir->i_mode & S_ISGID) {
532 fe->i_gid = cpu_to_le32(dir->i_gid);
533 if (S_ISDIR(mode))
534 mode |= S_ISGID;
535 } else
536 fe->i_gid = cpu_to_le32(current->fsgid);
537 fe->i_mode = cpu_to_le16(mode);
538 if (S_ISCHR(mode) || S_ISBLK(mode))
539 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
540
541 fe->i_links_count = cpu_to_le16(inode->i_nlink);
542
543 fe->i_last_eb_blk = 0;
544 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
545 le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
546 fe->i_atime = fe->i_ctime = fe->i_mtime =
547 cpu_to_le64(CURRENT_TIME.tv_sec);
548 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
549 cpu_to_le32(CURRENT_TIME.tv_nsec);
550 fe->i_dtime = 0;
551
552 fel = &fe->id2.i_list;
553 fel->l_tree_depth = 0;
554 fel->l_next_free_rec = 0;
555 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
556
557 status = ocfs2_journal_dirty(handle, *new_fe_bh);
558 if (status < 0) {
559 mlog_errno(status);
560 goto leave;
561 }
562
563 if (ocfs2_populate_inode(inode, fe, 1) < 0) {
564 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
565 "i_blkno=%"MLFu64", i_ino=%lu\n",
566 (unsigned long long) (*new_fe_bh)->b_blocknr,
567 fe->i_blkno, inode->i_ino);
568 BUG();
569 }
570
571 ocfs2_inode_set_new(osb, inode);
572 status = ocfs2_create_new_inode_locks(inode);
573 if (status < 0)
574 mlog_errno(status);
575
576 status = 0; /* error in ocfs2_create_new_inode_locks is not
577 * critical */
578
579 *ret_inode = inode;
580leave:
581 if (status < 0) {
582 if (*new_fe_bh) {
583 brelse(*new_fe_bh);
584 *new_fe_bh = NULL;
585 }
586 if (inode)
587 iput(inode);
588 }
589
590 mlog_exit(status);
591 return status;
592}
593
594static int ocfs2_mkdir(struct inode *dir,
595 struct dentry *dentry,
596 int mode)
597{
598 int ret;
599
600 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
601 dentry->d_name.len, dentry->d_name.name);
602 ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
603 mlog_exit(ret);
604
605 return ret;
606}
607
608static int ocfs2_create(struct inode *dir,
609 struct dentry *dentry,
610 int mode,
611 struct nameidata *nd)
612{
613 int ret;
614
615 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
616 dentry->d_name.len, dentry->d_name.name);
617 ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
618 mlog_exit(ret);
619
620 return ret;
621}
622
623static int ocfs2_link(struct dentry *old_dentry,
624 struct inode *dir,
625 struct dentry *dentry)
626{
627 struct ocfs2_journal_handle *handle = NULL;
628 struct inode *inode = old_dentry->d_inode;
629 int err;
630 struct buffer_head *fe_bh = NULL;
631 struct buffer_head *parent_fe_bh = NULL;
632 struct buffer_head *de_bh = NULL;
633 struct ocfs2_dinode *fe = NULL;
634 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
635
636 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
637 old_dentry->d_name.len, old_dentry->d_name.name,
638 dentry->d_name.len, dentry->d_name.name);
639
640 if (S_ISDIR(inode->i_mode)) {
641 err = -EPERM;
642 goto bail;
643 }
644
645 if (inode->i_nlink >= OCFS2_LINK_MAX) {
646 err = -EMLINK;
647 goto bail;
648 }
649
650 handle = ocfs2_alloc_handle(osb);
651 if (handle == NULL) {
652 err = -ENOMEM;
653 goto bail;
654 }
655
656 err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
657 if (err < 0) {
658 if (err != -ENOENT)
659 mlog_errno(err);
660 goto bail;
661 }
662
663 err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
664 dentry->d_name.len);
665 if (err)
666 goto bail;
667
668 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
669 dentry->d_name.name,
670 dentry->d_name.len, &de_bh);
671 if (err < 0) {
672 mlog_errno(err);
673 goto bail;
674 }
675
676 err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
677 if (err < 0) {
678 if (err != -ENOENT)
679 mlog_errno(err);
680 goto bail;
681 }
682
683 fe = (struct ocfs2_dinode *) fe_bh->b_data;
684 if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
685 err = -EMLINK;
686 goto bail;
687 }
688
689 handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS);
690 if (IS_ERR(handle)) {
691 err = PTR_ERR(handle);
692 handle = NULL;
693 mlog_errno(err);
694 goto bail;
695 }
696
697 err = ocfs2_journal_access(handle, inode, fe_bh,
698 OCFS2_JOURNAL_ACCESS_WRITE);
699 if (err < 0) {
700 mlog_errno(err);
701 goto bail;
702 }
703
704 inode->i_nlink++;
705 inode->i_ctime = CURRENT_TIME;
706 fe->i_links_count = cpu_to_le16(inode->i_nlink);
707 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
708 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
709
710 err = ocfs2_journal_dirty(handle, fe_bh);
711 if (err < 0) {
712 le16_add_cpu(&fe->i_links_count, -1);
713 inode->i_nlink--;
714 mlog_errno(err);
715 goto bail;
716 }
717
718 err = ocfs2_add_entry(handle, dentry, inode,
719 OCFS2_I(inode)->ip_blkno,
720 parent_fe_bh, de_bh);
721 if (err) {
722 le16_add_cpu(&fe->i_links_count, -1);
723 inode->i_nlink--;
724 mlog_errno(err);
725 goto bail;
726 }
727
728 atomic_inc(&inode->i_count);
729 dentry->d_op = &ocfs2_dentry_ops;
730 d_instantiate(dentry, inode);
731bail:
732 if (handle)
733 ocfs2_commit_trans(handle);
734 if (de_bh)
735 brelse(de_bh);
736 if (fe_bh)
737 brelse(fe_bh);
738 if (parent_fe_bh)
739 brelse(parent_fe_bh);
740
741 mlog_exit(err);
742
743 return err;
744}
745
746static int ocfs2_unlink(struct inode *dir,
747 struct dentry *dentry)
748{
749 int status;
750 unsigned int saved_nlink = 0;
751 struct inode *inode = dentry->d_inode;
752 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
753 u64 blkno;
754 struct ocfs2_dinode *fe = NULL;
755 struct buffer_head *fe_bh = NULL;
756 struct buffer_head *parent_node_bh = NULL;
757 struct ocfs2_journal_handle *handle = NULL;
758 struct ocfs2_dir_entry *dirent = NULL;
759 struct buffer_head *dirent_bh = NULL;
760 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
761 struct buffer_head *orphan_entry_bh = NULL;
762
763 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
764 dentry->d_name.len, dentry->d_name.name);
765
766 BUG_ON(dentry->d_parent->d_inode != dir);
767
768 mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
769
770 if (inode == osb->root_inode) {
771 mlog(0, "Cannot delete the root directory\n");
772 status = -EPERM;
773 goto leave;
774 }
775
776 handle = ocfs2_alloc_handle(osb);
777 if (handle == NULL) {
778 status = -ENOMEM;
779 mlog_errno(status);
780 goto leave;
781 }
782
783 status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
784 if (status < 0) {
785 if (status != -ENOENT)
786 mlog_errno(status);
787 goto leave;
788 }
789
790 status = ocfs2_find_files_on_disk(dentry->d_name.name,
791 dentry->d_name.len, &blkno,
792 dir, &dirent_bh, &dirent);
793 if (status < 0) {
794 if (status != -ENOENT)
795 mlog_errno(status);
796 goto leave;
797 }
798
799 if (OCFS2_I(inode)->ip_blkno != blkno) {
800 status = -ENOENT;
801
802 mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") "
803 "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno,
804 OCFS2_I(inode)->ip_flags);
805 goto leave;
806 }
807
808 status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
809 if (status < 0) {
810 if (status != -ENOENT)
811 mlog_errno(status);
812 goto leave;
813 }
814
815 if (S_ISDIR(inode->i_mode)) {
816 if (!ocfs2_empty_dir(inode)) {
817 status = -ENOTEMPTY;
818 goto leave;
819 } else if (inode->i_nlink != 2) {
820 status = -ENOTEMPTY;
821 goto leave;
822 }
823 }
824
825 /* There are still a few steps left until we can consider the
826 * unlink to have succeeded. Save off nlink here before
827 * modification so we can set it back in case we hit an issue
828 * before commit. */
829 saved_nlink = inode->i_nlink;
830 if (S_ISDIR(inode->i_mode))
831 inode->i_nlink = 0;
832 else
833 inode->i_nlink--;
834
835 status = ocfs2_request_unlink_vote(inode, dentry,
836 (unsigned int) inode->i_nlink);
837 if (status < 0) {
838 /* This vote should succeed under all normal
839 * circumstances. */
840 mlog_errno(status);
841 goto leave;
842 }
843
844 if (!inode->i_nlink) {
845 status = ocfs2_prepare_orphan_dir(osb, handle, inode,
846 orphan_name,
847 &orphan_entry_bh);
848 if (status < 0) {
849 mlog_errno(status);
850 goto leave;
851 }
852 }
853
854 handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS);
855 if (IS_ERR(handle)) {
856 status = PTR_ERR(handle);
857 handle = NULL;
858 mlog_errno(status);
859 goto leave;
860 }
861
862 status = ocfs2_journal_access(handle, inode, fe_bh,
863 OCFS2_JOURNAL_ACCESS_WRITE);
864 if (status < 0) {
865 mlog_errno(status);
866 goto leave;
867 }
868
869 fe = (struct ocfs2_dinode *) fe_bh->b_data;
870
871 if (!inode->i_nlink) {
872 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
873 orphan_entry_bh);
874 if (status < 0) {
875 mlog_errno(status);
876 goto leave;
877 }
878 }
879
880 /* delete the name from the parent dir */
881 status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
882 if (status < 0) {
883 mlog_errno(status);
884 goto leave;
885 }
886
887 /* We can set nlink on the dinode now. clear the saved version
888 * so that it doesn't get set later. */
889 fe->i_links_count = cpu_to_le16(inode->i_nlink);
890 saved_nlink = 0;
891
892 status = ocfs2_journal_dirty(handle, fe_bh);
893 if (status < 0) {
894 mlog_errno(status);
895 goto leave;
896 }
897
898 if (S_ISDIR(inode->i_mode)) {
899 dir->i_nlink--;
900 status = ocfs2_mark_inode_dirty(handle, dir,
901 parent_node_bh);
902 if (status < 0) {
903 mlog_errno(status);
904 dir->i_nlink++;
905 }
906 }
907
908leave:
909 if (status < 0 && saved_nlink)
910 inode->i_nlink = saved_nlink;
911
912 if (handle)
913 ocfs2_commit_trans(handle);
914
915 if (fe_bh)
916 brelse(fe_bh);
917
918 if (dirent_bh)
919 brelse(dirent_bh);
920
921 if (parent_node_bh)
922 brelse(parent_node_bh);
923
924 if (orphan_entry_bh)
925 brelse(orphan_entry_bh);
926
927 mlog_exit(status);
928
929 return status;
930}
931
932/*
933 * The only place this should be used is rename!
934 * If they have the same id, then the 1st one is the only one locked.
935 */
936static int ocfs2_double_lock(struct ocfs2_super *osb,
937 struct ocfs2_journal_handle *handle,
938 struct buffer_head **bh1,
939 struct inode *inode1,
940 struct buffer_head **bh2,
941 struct inode *inode2)
942{
943 int status;
944 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
945 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
946 struct buffer_head **tmpbh;
947 struct inode *tmpinode;
948
949 mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n",
950 oi1->ip_blkno, oi2->ip_blkno);
951
952 BUG_ON(!handle);
953
954 if (*bh1)
955 *bh1 = NULL;
956 if (*bh2)
957 *bh2 = NULL;
958
959 /* we always want to lock the one with the lower lockid first. */
960 if (oi1->ip_blkno != oi2->ip_blkno) {
961 if (oi1->ip_blkno < oi2->ip_blkno) {
962 /* switch id1 and id2 around */
963 mlog(0, "switching them around...\n");
964 tmpbh = bh2;
965 bh2 = bh1;
966 bh1 = tmpbh;
967
968 tmpinode = inode2;
969 inode2 = inode1;
970 inode1 = tmpinode;
971 }
972 /* lock id2 */
973 status = ocfs2_meta_lock(inode2, handle, bh2, 1);
974 if (status < 0) {
975 if (status != -ENOENT)
976 mlog_errno(status);
977 goto bail;
978 }
979 }
980 /* lock id1 */
981 status = ocfs2_meta_lock(inode1, handle, bh1, 1);
982 if (status < 0) {
983 if (status != -ENOENT)
984 mlog_errno(status);
985 goto bail;
986 }
987bail:
988 mlog_exit(status);
989 return status;
990}
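/*
 * A worked ordering example (block numbers assumed): for inode1 at
 * blkno 500 and inode2 at blkno 300 no swap happens and 300 is locked
 * first; for the reverse arguments the pair is swapped so 300 is still
 * locked first. Every double-locker therefore acquires the pair in the
 * same global order, which rules out an AB-BA deadlock.
 */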
991
992#define PARENT_INO(buffer) \
993 ((struct ocfs2_dir_entry *) \
994 ((char *)buffer + \
995 le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode
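/*
 * Editorial note on the macro above: the first entry in a directory's
 * block 0 is "." and the entry reached by skipping its rec_len is "..",
 * so PARENT_INO() reads the parent's inode number straight out of the
 * ".." dirent -- the exact layout ocfs2_fill_new_dir() writes.
 */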
996
997static int ocfs2_rename(struct inode *old_dir,
998 struct dentry *old_dentry,
999 struct inode *new_dir,
1000 struct dentry *new_dentry)
1001{
1002 int status = 0, rename_lock = 0;
1003 struct inode *old_inode = old_dentry->d_inode;
1004 struct inode *new_inode = new_dentry->d_inode;
1005 struct ocfs2_dinode *newfe = NULL;
1006 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
1007 struct buffer_head *orphan_entry_bh = NULL;
1008 struct buffer_head *newfe_bh = NULL;
1009 struct buffer_head *insert_entry_bh = NULL;
1010 struct ocfs2_super *osb = NULL;
1011 u64 newfe_blkno;
1012 struct ocfs2_journal_handle *handle = NULL;
1013 struct buffer_head *old_dir_bh = NULL;
1014 struct buffer_head *new_dir_bh = NULL;
1015 struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
1016 // and new_dentry
1017 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
1018 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1019 // this is the 1st dirent bh
1020 nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
1021 unsigned int links_count;
1022
1023 /* At some point it might be nice to break this function up a
1024 * bit. */
1025
1026 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
1027 old_dir, old_dentry, new_dir, new_dentry,
1028 old_dentry->d_name.len, old_dentry->d_name.name,
1029 new_dentry->d_name.len, new_dentry->d_name.name);
1030
1031 osb = OCFS2_SB(old_dir->i_sb);
1032
1033 if (new_inode) {
1034 if (!igrab(new_inode))
1035 BUG();
1036 }
1037
1038 if (atomic_read(&old_dentry->d_count) > 2) {
1039 shrink_dcache_parent(old_dentry);
1040 if (atomic_read(&old_dentry->d_count) > 2) {
1041 status = -EBUSY;
1042 goto bail;
1043 }
1044 }
1045
1046	/* Assume a directory hierarchy thusly:
1047 * a/b/c
1048 * a/d
1049 * a,b,c, and d are all directories.
1050 *
1051 * from cwd of 'a' on both nodes:
1052 * node1: mv b/c d
1053 * node2: mv d b/c
1054 *
1055 * And that's why, just like the VFS, we need a file system
1056 * rename lock. */
1057 if (old_dentry != new_dentry) {
1058 status = ocfs2_rename_lock(osb);
1059 if (status < 0) {
1060 mlog_errno(status);
1061 goto bail;
1062 }
1063 rename_lock = 1;
1064 }
1065
1066 handle = ocfs2_alloc_handle(osb);
1067 if (handle == NULL) {
1068 status = -ENOMEM;
1069 mlog_errno(status);
1070 goto bail;
1071 }
1072
1073 /* if old and new are the same, this'll just do one lock. */
1074 status = ocfs2_double_lock(osb, handle,
1075 &old_dir_bh, old_dir,
1076 &new_dir_bh, new_dir);
1077 if (status < 0) {
1078 mlog_errno(status);
1079 goto bail;
1080 }
1081
1082 /* make sure both dirs have bhs
1083 * get an extra ref on old_dir_bh if old==new */
1084 if (!new_dir_bh) {
1085 if (old_dir_bh) {
1086 new_dir_bh = old_dir_bh;
1087 get_bh(new_dir_bh);
1088 } else {
1089 mlog(ML_ERROR, "no old_dir_bh!\n");
1090 status = -EIO;
1091 goto bail;
1092 }
1093 }
1094
1095 if (S_ISDIR(old_inode->i_mode)) {
1096 /* Directories actually require metadata updates to
1097 * the directory info so we can't get away with not
1098 * doing node locking on it. */
1099 status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
1100 if (status < 0) {
1101 if (status != -ENOENT)
1102 mlog_errno(status);
1103 goto bail;
1104 }
1105
1106 status = ocfs2_request_rename_vote(old_inode, old_dentry);
1107 if (status < 0) {
1108 mlog_errno(status);
1109 goto bail;
1110 }
1111
1112 status = -EIO;
1113 old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
1114 if (!old_inode_de_bh)
1115 goto bail;
1116
1117 status = -EIO;
1118 if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
1119 OCFS2_I(old_dir)->ip_blkno)
1120 goto bail;
1121 status = -EMLINK;
1122	if (!new_inode && new_dir != old_dir &&
1123 new_dir->i_nlink >= OCFS2_LINK_MAX)
1124 goto bail;
1125 } else {
1126 /* Ah, the simple case - we're a file so just send a
1127 * message. */
1128 status = ocfs2_request_rename_vote(old_inode, old_dentry);
1129 if (status < 0) {
1130 mlog_errno(status);
1131 goto bail;
1132 }
1133 }
1134
1135 status = -ENOENT;
1136 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
1137 old_dentry->d_name.len,
1138 old_dir, &old_de);
1139 if (!old_de_bh)
1140 goto bail;
1141
1142 /*
1143	 * The inode number check is _not_ there because of possible IO errors.
1144 * We might rmdir the source, keep it as pwd of some process
1145 * and merrily kill the link to whatever was created under the
1146 * same name. Goodbye sticky bit ;-<
1147 */
1148 if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
1149 goto bail;
1150
1151 /* check if the target already exists (in which case we need
1152	 * to delete it) */
1153 status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
1154 new_dentry->d_name.len,
1155 &newfe_blkno, new_dir, &new_de_bh,
1156 &new_de);
1157 /* The only error we allow here is -ENOENT because the new
1158 * file not existing is perfectly valid. */
1159 if ((status < 0) && (status != -ENOENT)) {
1160		/* If we cannot find the file specified we should just
1161		 * return the error... */
1162 mlog_errno(status);
1163 goto bail;
1164 }
1165
1166 if (!new_de && new_inode)
1167		mlog(ML_ERROR, "inode %lu does not exist in its parent "
1168 "directory!", new_inode->i_ino);
1169
1170 /* In case we need to overwrite an existing file, we blow it
1171 * away first */
1172 if (new_de) {
1173 /* VFS didn't think there existed an inode here, but
1174 * someone else in the cluster must have raced our
1175 * rename to create one. Today we error cleanly, in
1176 * the future we should consider calling iget to build
1177 * a new struct inode for this entry. */
1178 if (!new_inode) {
1179 status = -EACCES;
1180
1181 mlog(0, "We found an inode for name %.*s but VFS "
1182 "didn't give us one.\n", new_dentry->d_name.len,
1183 new_dentry->d_name.name);
1184 goto bail;
1185 }
1186
1187 if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
1188 status = -EACCES;
1189
1190 mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") "
1191 "disagree. ip_flags = %x\n",
1192 OCFS2_I(new_inode)->ip_blkno, newfe_blkno,
1193 OCFS2_I(new_inode)->ip_flags);
1194 goto bail;
1195 }
1196
1197 status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1);
1198 if (status < 0) {
1199 if (status != -ENOENT)
1200 mlog_errno(status);
1201 goto bail;
1202 }
1203
1204 if (S_ISDIR(new_inode->i_mode))
1205 links_count = 0;
1206 else
1207 links_count = (unsigned int) (new_inode->i_nlink - 1);
1208
1209 status = ocfs2_request_unlink_vote(new_inode, new_dentry,
1210 links_count);
1211 if (status < 0) {
1212 mlog_errno(status);
1213 goto bail;
1214 }
1215
1216 newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
1217
1218 mlog(0, "aha rename over existing... new_de=%p "
1219 "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n",
1220 new_de, newfe_blkno, newfe_bh, newfe_bh ?
1221 (unsigned long long)newfe_bh->b_blocknr : 0ULL);
1222
1223 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1224 status = ocfs2_prepare_orphan_dir(osb, handle,
1225 new_inode,
1226 orphan_name,
1227 &orphan_entry_bh);
1228 if (status < 0) {
1229 mlog_errno(status);
1230 goto bail;
1231 }
1232 }
1233 } else {
1234 BUG_ON(new_dentry->d_parent->d_inode != new_dir);
1235
1236 status = ocfs2_check_dir_for_entry(new_dir,
1237 new_dentry->d_name.name,
1238 new_dentry->d_name.len);
1239 if (status)
1240 goto bail;
1241
1242 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
1243 new_dentry->d_name.name,
1244 new_dentry->d_name.len,
1245 &insert_entry_bh);
1246 if (status < 0) {
1247 mlog_errno(status);
1248 goto bail;
1249 }
1250 }
1251
1252 handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS);
1253 if (IS_ERR(handle)) {
1254 status = PTR_ERR(handle);
1255 handle = NULL;
1256 mlog_errno(status);
1257 goto bail;
1258 }
1259
1260 if (new_de) {
1261 if (S_ISDIR(new_inode->i_mode)) {
1262 if (!ocfs2_empty_dir(new_inode) ||
1263 new_inode->i_nlink != 2) {
1264 status = -ENOTEMPTY;
1265 goto bail;
1266 }
1267 }
1268 status = ocfs2_journal_access(handle, new_inode, newfe_bh,
1269 OCFS2_JOURNAL_ACCESS_WRITE);
1270 if (status < 0) {
1271 mlog_errno(status);
1272 goto bail;
1273 }
1274
1275 if (S_ISDIR(new_inode->i_mode) ||
1276 (newfe->i_links_count == cpu_to_le16(1))){
1277 status = ocfs2_orphan_add(osb, handle, new_inode,
1278 newfe, orphan_name,
1279 orphan_entry_bh);
1280 if (status < 0) {
1281 mlog_errno(status);
1282 goto bail;
1283 }
1284 }
1285
1286 /* change the dirent to point to the correct inode */
1287 status = ocfs2_journal_access(handle, new_dir, new_de_bh,
1288 OCFS2_JOURNAL_ACCESS_WRITE);
1289 if (status < 0) {
1290 mlog_errno(status);
1291 goto bail;
1292 }
1293 new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
1294 new_de->file_type = old_de->file_type;
1295 new_dir->i_version++;
1296 status = ocfs2_journal_dirty(handle, new_de_bh);
1297 if (status < 0) {
1298 mlog_errno(status);
1299 goto bail;
1300 }
1301
1302 if (S_ISDIR(new_inode->i_mode))
1303 newfe->i_links_count = 0;
1304 else
1305 le16_add_cpu(&newfe->i_links_count, -1);
1306
1307 status = ocfs2_journal_dirty(handle, newfe_bh);
1308 if (status < 0) {
1309 mlog_errno(status);
1310 goto bail;
1311 }
1312 } else {
1313 /* if the name was not found in new_dir, add it now */
1314 status = ocfs2_add_entry(handle, new_dentry, old_inode,
1315 OCFS2_I(old_inode)->ip_blkno,
1316 new_dir_bh, insert_entry_bh);
1317 }
1318
1319 old_inode->i_ctime = CURRENT_TIME;
1320 mark_inode_dirty(old_inode);
1321
1322 /* now that the name has been added to new_dir, remove the old name */
1323 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
1324 if (status < 0) {
1325 mlog_errno(status);
1326 goto bail;
1327 }
1328
1329 if (new_inode) {
1330 new_inode->i_nlink--;
1331 new_inode->i_ctime = CURRENT_TIME;
1332 }
1333 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1334 if (old_inode_de_bh) {
1335 status = ocfs2_journal_access(handle, old_inode,
1336 old_inode_de_bh,
1337 OCFS2_JOURNAL_ACCESS_WRITE);
1338 PARENT_INO(old_inode_de_bh->b_data) =
1339 cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
1340 status = ocfs2_journal_dirty(handle, old_inode_de_bh);
1341 old_dir->i_nlink--;
1342 if (new_inode) {
1343 new_inode->i_nlink--;
1344 } else {
1345 new_dir->i_nlink++;
1346 mark_inode_dirty(new_dir);
1347 }
1348 }
1349 mark_inode_dirty(old_dir);
1350 if (new_inode)
1351 mark_inode_dirty(new_inode);
1352
1353 if (old_dir != new_dir)
1354 if (new_dir_nlink != new_dir->i_nlink) {
1355 if (!new_dir_bh) {
1356 mlog(ML_ERROR, "need to change nlink for new "
1357 "dir %"MLFu64" from %d to %d but bh is "
1358 "NULL\n", OCFS2_I(new_dir)->ip_blkno,
1359 (int)new_dir_nlink, new_dir->i_nlink);
1360 } else {
1361 struct ocfs2_dinode *fe;
1362 status = ocfs2_journal_access(handle,
1363 new_dir,
1364 new_dir_bh,
1365 OCFS2_JOURNAL_ACCESS_WRITE);
1366 fe = (struct ocfs2_dinode *) new_dir_bh->b_data;
1367 fe->i_links_count = cpu_to_le16(new_dir->i_nlink);
1368 status = ocfs2_journal_dirty(handle, new_dir_bh);
1369 }
1370 }
1371
1372 if (old_dir_nlink != old_dir->i_nlink) {
1373 if (!old_dir_bh) {
1374 mlog(ML_ERROR, "need to change nlink for old dir "
1375 "%"MLFu64" from %d to %d but bh is NULL!\n",
1376 OCFS2_I(old_dir)->ip_blkno,
1377 (int)old_dir_nlink,
1378 old_dir->i_nlink);
1379 } else {
1380 struct ocfs2_dinode *fe;
1381 status = ocfs2_journal_access(handle, old_dir,
1382 old_dir_bh,
1383 OCFS2_JOURNAL_ACCESS_WRITE);
1384 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1385 fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
1386 status = ocfs2_journal_dirty(handle, old_dir_bh);
1387 }
1388 }
1389
1390 status = 0;
1391bail:
1392 if (rename_lock)
1393 ocfs2_rename_unlock(osb);
1394
1395 if (handle)
1396 ocfs2_commit_trans(handle);
1397
1398 if (new_inode)
1399 sync_mapping_buffers(old_inode->i_mapping);
1400
1401 if (new_inode)
1402 iput(new_inode);
1403 if (newfe_bh)
1404 brelse(newfe_bh);
1405 if (old_dir_bh)
1406 brelse(old_dir_bh);
1407 if (new_dir_bh)
1408 brelse(new_dir_bh);
1409 if (new_de_bh)
1410 brelse(new_de_bh);
1411 if (old_de_bh)
1412 brelse(old_de_bh);
1413 if (old_inode_de_bh)
1414 brelse(old_inode_de_bh);
1415 if (orphan_entry_bh)
1416 brelse(orphan_entry_bh);
1417 if (insert_entry_bh)
1418 brelse(insert_entry_bh);
1419
1420 mlog_exit(status);
1421
1422 return status;
1423}
1424
1425/*
1426 * we expect i_size = strlen(symname). Copy symname into the file
1427 * data, including the null terminator.
1428 */
1429static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1430 struct ocfs2_journal_handle *handle,
1431 struct inode *inode,
1432 const char *symname)
1433{
1434 struct buffer_head **bhs = NULL;
1435 const char *c;
1436 struct super_block *sb = osb->sb;
1437 u64 p_blkno;
1438 int p_blocks;
1439 int virtual, blocks, status, i, bytes_left;
1440
1441 bytes_left = i_size_read(inode) + 1;
1442 /* we can't trust i_blocks because we're actually going to
1443 * write i_size + 1 bytes. */
1444 blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
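	/* e.g. (illustrative): a 10-byte target on a 4 KB-block fs gives
	 * bytes_left = 11, so blocks = (11 + 4095) >> 12 = 1. */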
1445
1446 mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n",
1447 inode->i_blocks, i_size_read(inode), blocks);
1448
1449 /* Sanity check -- make sure we're going to fit. */
1450 if (bytes_left >
1451 ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) {
1452 status = -EIO;
1453 mlog_errno(status);
1454 goto bail;
1455 }
1456
1457 bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL);
1458 if (!bhs) {
1459 status = -ENOMEM;
1460 mlog_errno(status);
1461 goto bail;
1462 }
1463
1464 status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
1465 &p_blocks);
1466 if (status < 0) {
1467 mlog_errno(status);
1468 goto bail;
1469 }
1470
1471	/* symlinks can never be larger than one cluster so we know this
1472 * is all going to be contiguous, but do a sanity check
1473 * anyway. */
1474 if ((p_blocks << sb->s_blocksize_bits) < bytes_left) {
1475 status = -EIO;
1476 mlog_errno(status);
1477 goto bail;
1478 }
1479
1480 virtual = 0;
1481	while (bytes_left > 0) {
1482 c = &symname[virtual * sb->s_blocksize];
1483
1484 bhs[virtual] = sb_getblk(sb, p_blkno);
1485 if (!bhs[virtual]) {
1486 status = -ENOMEM;
1487 mlog_errno(status);
1488 goto bail;
1489 }
1490 ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
1491
1492 status = ocfs2_journal_access(handle, inode, bhs[virtual],
1493 OCFS2_JOURNAL_ACCESS_CREATE);
1494 if (status < 0) {
1495 mlog_errno(status);
1496 goto bail;
1497 }
1498
1499 memset(bhs[virtual]->b_data, 0, sb->s_blocksize);
1500
1501 memcpy(bhs[virtual]->b_data, c,
1502 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1503 bytes_left);
1504
1505 status = ocfs2_journal_dirty(handle, bhs[virtual]);
1506 if (status < 0) {
1507 mlog_errno(status);
1508 goto bail;
1509 }
1510
1511 virtual++;
1512 p_blkno++;
1513 bytes_left -= sb->s_blocksize;
1514 }
1515
1516 status = 0;
1517bail:
1518
1519 if (bhs) {
1520		for (i = 0; i < blocks; i++)
1521 if (bhs[i])
1522 brelse(bhs[i]);
1523 kfree(bhs);
1524 }
1525
1526 mlog_exit(status);
1527 return status;
1528}
1529
1530static int ocfs2_symlink(struct inode *dir,
1531 struct dentry *dentry,
1532 const char *symname)
1533{
1534 int status, l, credits;
1535 u64 newsize;
1536 struct ocfs2_super *osb = NULL;
1537 struct inode *inode = NULL;
1538 struct super_block *sb;
1539 struct buffer_head *new_fe_bh = NULL;
1540 struct buffer_head *de_bh = NULL;
1541 struct buffer_head *parent_fe_bh = NULL;
1542 struct ocfs2_dinode *fe = NULL;
1543 struct ocfs2_dinode *dirfe;
1544 struct ocfs2_journal_handle *handle = NULL;
1545 struct ocfs2_alloc_context *inode_ac = NULL;
1546 struct ocfs2_alloc_context *data_ac = NULL;
1547
1548 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1549 dentry, symname, dentry->d_name.len, dentry->d_name.name);
1550
1551 sb = dir->i_sb;
1552 osb = OCFS2_SB(sb);
1553
1554 l = strlen(symname) + 1;
1555
1556 credits = ocfs2_calc_symlink_credits(sb);
1557
1558 handle = ocfs2_alloc_handle(osb);
1559 if (handle == NULL) {
1560 status = -ENOMEM;
1561 mlog_errno(status);
1562 goto bail;
1563 }
1564
1565 /* lock the parent directory */
1566 status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
1567 if (status < 0) {
1568 if (status != -ENOENT)
1569 mlog_errno(status);
1570 goto bail;
1571 }
1572
1573 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1574 if (!dirfe->i_links_count) {
1575 /* can't make a file in a deleted directory. */
1576 status = -ENOENT;
1577 goto bail;
1578 }
1579
1580 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
1581 dentry->d_name.len);
1582 if (status)
1583 goto bail;
1584
1585 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1586 dentry->d_name.name,
1587 dentry->d_name.len, &de_bh);
1588 if (status < 0) {
1589 mlog_errno(status);
1590 goto bail;
1591 }
1592
1593 status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
1594 if (status < 0) {
1595 if (status != -ENOSPC)
1596 mlog_errno(status);
1597 goto bail;
1598 }
1599
1600 /* don't reserve bitmap space for fast symlinks. */
1601 if (l > ocfs2_fast_symlink_chars(sb)) {
1602 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
1603 if (status < 0) {
1604 if (status != -ENOSPC)
1605 mlog_errno(status);
1606 goto bail;
1607 }
1608 }
1609
1610 handle = ocfs2_start_trans(osb, handle, credits);
1611 if (IS_ERR(handle)) {
1612 status = PTR_ERR(handle);
1613 handle = NULL;
1614 mlog_errno(status);
1615 goto bail;
1616 }
1617
1618 status = ocfs2_mknod_locked(osb, dir, dentry,
1619 S_IFLNK | S_IRWXUGO, 0,
1620 &new_fe_bh, parent_fe_bh, handle,
1621 &inode, inode_ac);
1622 if (status < 0) {
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626
1627 fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
1628 inode->i_rdev = 0;
1629 newsize = l - 1;
1630 if (l > ocfs2_fast_symlink_chars(sb)) {
1631 inode->i_op = &ocfs2_symlink_inode_operations;
1632 status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
1633 handle, data_ac, NULL,
1634 NULL);
1635 if (status < 0) {
1636 if (status != -ENOSPC && status != -EINTR) {
1637 mlog(ML_ERROR, "Failed to extend file to "
1638 "%"MLFu64"\n",
1639 newsize);
1640 mlog_errno(status);
1641 status = -ENOSPC;
1642 }
1643 goto bail;
1644 }
1645 i_size_write(inode, newsize);
1646 inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
1647 } else {
1648 inode->i_op = &ocfs2_fast_symlink_inode_operations;
1649 memcpy((char *) fe->id2.i_symlink, symname, l);
1650 i_size_write(inode, newsize);
1651 inode->i_blocks = 0;
1652 }
1653
1654 status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
1655 if (status < 0) {
1656 mlog_errno(status);
1657 goto bail;
1658 }
1659
1660 if (!ocfs2_inode_is_fast_symlink(inode)) {
1661 status = ocfs2_create_symlink_data(osb, handle, inode,
1662 symname);
1663 if (status < 0) {
1664 mlog_errno(status);
1665 goto bail;
1666 }
1667 }
1668
1669 status = ocfs2_add_entry(handle, dentry, inode,
1670 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1671 de_bh);
1672 if (status < 0) {
1673 mlog_errno(status);
1674 goto bail;
1675 }
1676
1677 insert_inode_hash(inode);
1678 dentry->d_op = &ocfs2_dentry_ops;
1679 d_instantiate(dentry, inode);
1680bail:
1681 if (handle)
1682 ocfs2_commit_trans(handle);
1683 if (new_fe_bh)
1684 brelse(new_fe_bh);
1685 if (parent_fe_bh)
1686 brelse(parent_fe_bh);
1687 if (de_bh)
1688 brelse(de_bh);
1689 if (inode_ac)
1690 ocfs2_free_alloc_context(inode_ac);
1691 if (data_ac)
1692 ocfs2_free_alloc_context(data_ac);
1693 if ((status < 0) && inode)
1694 iput(inode);
1695
1696 mlog_exit(status);
1697
1698 return status;
1699}
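/*
 * For illustration: a "fast" symlink is one whose target fits inside
 * the inode itself (l <= ocfs2_fast_symlink_chars(sb)); its target is
 * memcpy()'d into fe->id2.i_symlink and i_blocks stays 0. Longer
 * targets get one cluster of real data blocks via
 * ocfs2_do_extend_allocation() and ocfs2_create_symlink_data() above.
 */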
1700
1701int ocfs2_check_dir_entry(struct inode * dir,
1702 struct ocfs2_dir_entry * de,
1703 struct buffer_head * bh,
1704 unsigned long offset)
1705{
1706 const char *error_msg = NULL;
1707 const int rlen = le16_to_cpu(de->rec_len);
1708
1709 if (rlen < OCFS2_DIR_REC_LEN(1))
1710 error_msg = "rec_len is smaller than minimal";
1711 else if (rlen % 4 != 0)
1712 error_msg = "rec_len % 4 != 0";
1713 else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
1714 error_msg = "rec_len is too small for name_len";
1715 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
1716 error_msg = "directory entry across blocks";
1717
1718 if (error_msg != NULL)
1719 mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - "
1720 "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n",
1721 OCFS2_I(dir)->ip_blkno, error_msg, offset,
1722 le64_to_cpu(de->inode), rlen, de->name_len);
1723 return error_msg == NULL ? 1 : 0;
1724}
1725
1726/* we don't always have a dentry for what we want to add, so callers
1727 * like the orphan dir code can call this instead.
1728 *
1729 * If you pass me insert_bh, I'll skip the search of the other dir
1730 * blocks and put the record in there.
1731 */
1732static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
1733 struct inode *dir,
1734 const char *name, int namelen,
1735 struct inode *inode, u64 blkno,
1736 struct buffer_head *parent_fe_bh,
1737 struct buffer_head *insert_bh)
1738{
1739 unsigned long offset;
1740 unsigned short rec_len;
1741 struct ocfs2_dir_entry *de, *de1;
1742 struct super_block *sb;
1743 int retval, status;
1744
1745 mlog_entry_void();
1746
1747 sb = dir->i_sb;
1748
1749 if (!namelen)
1750 return -EINVAL;
1751
1752 rec_len = OCFS2_DIR_REC_LEN(namelen);
1753 offset = 0;
1754 de = (struct ocfs2_dir_entry *) insert_bh->b_data;
1755 while (1) {
1756 BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
1757 /* These checks should've already been passed by the
1758 * prepare function, but I guess we can leave them
1759 * here anyway. */
1760 if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
1761 retval = -ENOENT;
1762 goto bail;
1763 }
1764 if (ocfs2_match(namelen, name, de)) {
1765 retval = -EEXIST;
1766 goto bail;
1767 }
1768 if (((le64_to_cpu(de->inode) == 0) &&
1769 (le16_to_cpu(de->rec_len) >= rec_len)) ||
1770 (le16_to_cpu(de->rec_len) >=
1771 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
1772 status = ocfs2_journal_access(handle, dir, insert_bh,
1773 OCFS2_JOURNAL_ACCESS_WRITE);
1774 /* By now the buffer is marked for journaling */
1775 offset += le16_to_cpu(de->rec_len);
1776 if (le64_to_cpu(de->inode)) {
1777 de1 = (struct ocfs2_dir_entry *)((char *) de +
1778 OCFS2_DIR_REC_LEN(de->name_len));
1779 de1->rec_len =
1780 cpu_to_le16(le16_to_cpu(de->rec_len) -
1781 OCFS2_DIR_REC_LEN(de->name_len));
1782 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1783 de = de1;
1784 }
1785 de->file_type = OCFS2_FT_UNKNOWN;
1786 if (blkno) {
1787 de->inode = cpu_to_le64(blkno);
1788 ocfs2_set_de_type(de, inode->i_mode);
1789 } else
1790 de->inode = 0;
1791 de->name_len = namelen;
1792 memcpy(de->name, name, namelen);
1793
1794 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1795 dir->i_version++;
1796 status = ocfs2_journal_dirty(handle, insert_bh);
1797 retval = 0;
1798 goto bail;
1799 }
1800 offset += le16_to_cpu(de->rec_len);
1801 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
1802 }
1803
1804	/* when you think about it, the BUG_ON() above should prevent us
1805	 * from ever getting here. */
1806 retval = -ENOSPC;
1807bail:
1808
1809 mlog_exit(retval);
1810 return retval;
1811}
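/*
 * Sketch of the split performed above (illustrative): inserting a new
 * name needing 16 bytes into a live entry "a" that owns 40 bytes but
 * only needs 16 of them turns
 *
 *	| a (rec_len 40)                    |
 * into
 *	| a (rec_len 16) | new (rec_len 24) |
 *
 * i.e. the new record inherits all of the slack left in the old one.
 */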
1812
1813
1814/*
1815 * ocfs2_delete_entry deletes a directory entry by merging it with the
1816 * previous entry
1817 */
1818static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
1819 struct inode *dir,
1820 struct ocfs2_dir_entry *de_del,
1821 struct buffer_head *bh)
1822{
1823 struct ocfs2_dir_entry *de, *pde;
1824 int i, status = -ENOENT;
1825
1826 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
1827
1828 i = 0;
1829 pde = NULL;
1830 de = (struct ocfs2_dir_entry *) bh->b_data;
1831 while (i < bh->b_size) {
1832 if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
1833 status = -EIO;
1834 mlog_errno(status);
1835 goto bail;
1836 }
1837 if (de == de_del) {
1838 status = ocfs2_journal_access(handle, dir, bh,
1839 OCFS2_JOURNAL_ACCESS_WRITE);
1840 if (status < 0) {
1841 status = -EIO;
1842 mlog_errno(status);
1843 goto bail;
1844 }
1845 if (pde)
1846 pde->rec_len =
1847 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1848 le16_to_cpu(de->rec_len));
1849 else
1850 de->inode = 0;
1851 dir->i_version++;
1852 status = ocfs2_journal_dirty(handle, bh);
1853 goto bail;
1854 }
1855 i += le16_to_cpu(de->rec_len);
1856 pde = de;
1857 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1858 }
1859bail:
1860 mlog_exit(status);
1861 return status;
1862}
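/*
 * Sketch of the merge above (illustrative): deleting "b" from a block
 * laid out as
 *
 *	| a (rec_len 16) | b (rec_len 24) | c ... |
 * leaves
 *	| a (rec_len 40)                  | c ... |
 *
 * the previous record absorbs the deleted one's space. Only a block's
 * first entry is instead freed by zeroing its inode field.
 */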
1863
1864/*
1865 * Returns 0 if not found, -1 on failure, and 1 on success
1866 */
1867static inline int ocfs2_search_dirblock(struct buffer_head *bh,
1868 struct inode *dir,
1869 const char *name, int namelen,
1870 unsigned long offset,
1871 struct ocfs2_dir_entry **res_dir)
1872{
1873 struct ocfs2_dir_entry *de;
1874 char *dlimit, *de_buf;
1875 int de_len;
1876 int ret = 0;
1877
1878 mlog_entry_void();
1879
1880 de_buf = bh->b_data;
1881 dlimit = de_buf + dir->i_sb->s_blocksize;
1882
1883 while (de_buf < dlimit) {
1884 /* this code is executed quadratically often */
1885 /* do minimal checking `by hand' */
1886
1887 de = (struct ocfs2_dir_entry *) de_buf;
1888
1889 if (de_buf + namelen <= dlimit &&
1890 ocfs2_match(namelen, name, de)) {
1891 /* found a match - just to be sure, do a full check */
1892 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
1893 ret = -1;
1894 goto bail;
1895 }
1896 *res_dir = de;
1897 ret = 1;
1898 goto bail;
1899 }
1900
1901 /* prevent looping on a bad block */
1902 de_len = le16_to_cpu(de->rec_len);
1903 if (de_len <= 0) {
1904 ret = -1;
1905 goto bail;
1906 }
1907
1908 de_buf += de_len;
1909 offset += de_len;
1910 }
1911
1912bail:
1913 mlog_exit(ret);
1914 return ret;
1915}
1916
1917struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
1918 struct inode *dir,
1919 struct ocfs2_dir_entry **res_dir)
1920{
1921 struct super_block *sb;
1922 struct buffer_head *bh_use[NAMEI_RA_SIZE];
1923 struct buffer_head *bh, *ret = NULL;
1924 unsigned long start, block, b;
1925 int ra_max = 0; /* Number of bh's in the readahead
1926 buffer, bh_use[] */
1927 int ra_ptr = 0; /* Current index into readahead
1928 buffer */
1929 int num = 0;
1930 int nblocks, i, err;
1931
1932 mlog_entry_void();
1933
1934 *res_dir = NULL;
1935 sb = dir->i_sb;
1936
1937 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
1938 start = OCFS2_I(dir)->ip_dir_start_lookup;
1939 if (start >= nblocks)
1940 start = 0;
1941 block = start;
1942
1943restart:
1944 do {
1945 /*
1946 * We deal with the read-ahead logic here.
1947 */
1948 if (ra_ptr >= ra_max) {
1949 /* Refill the readahead buffer */
1950 ra_ptr = 0;
1951 b = block;
1952 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
1953 /*
1954 * Terminate if we reach the end of the
1955 * directory and must wrap, or if our
1956 * search has finished at this block.
1957 */
1958 if (b >= nblocks || (num && block == start)) {
1959 bh_use[ra_max] = NULL;
1960 break;
1961 }
1962 num++;
1963
1964 /* XXX: questionable readahead stuff here */
1965 bh = ocfs2_bread(dir, b++, &err, 1);
1966 bh_use[ra_max] = bh;
1967#if 0 // ???
1968 if (bh)
1969 ll_rw_block(READ, 1, &bh);
1970#endif
1971 }
1972 }
1973 if ((bh = bh_use[ra_ptr++]) == NULL)
1974 goto next;
1975 wait_on_buffer(bh);
1976 if (!buffer_uptodate(bh)) {
1977 /* read error, skip block & hope for the best */
1978 brelse(bh);
1979 goto next;
1980 }
1981 i = ocfs2_search_dirblock(bh, dir, name, namelen,
1982 block << sb->s_blocksize_bits,
1983 res_dir);
1984 if (i == 1) {
1985 OCFS2_I(dir)->ip_dir_start_lookup = block;
1986 ret = bh;
1987 goto cleanup_and_exit;
1988 } else {
1989 brelse(bh);
1990 if (i < 0)
1991 goto cleanup_and_exit;
1992 }
1993 next:
1994 if (++block >= nblocks)
1995 block = 0;
1996 } while (block != start);
1997
1998 /*
1999 * If the directory has grown while we were searching, then
2000 * search the last part of the directory before giving up.
2001 */
2002 block = nblocks;
2003 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
2004 if (block < nblocks) {
2005 start = 0;
2006 goto restart;
2007 }
2008
2009cleanup_and_exit:
2010 /* Clean up the read-ahead blocks */
2011 for (; ra_ptr < ra_max; ra_ptr++)
2012 brelse(bh_use[ra_ptr]);
2013
2014 mlog_exit_ptr(ret);
2015 return ret;
2016}
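/*
 * For illustration: bh_use[] above is a small ring of NAMEI_RA_SIZE
 * read-ahead buffers. Each refill issues up to NAMEI_RA_SIZE block
 * reads ahead of the scan, which then consumes them one at a time via
 * ra_ptr, wrapping around the directory until it returns to the block
 * where the search started.
 */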
2017
2018static int ocfs2_blkno_stringify(u64 blkno, char *name)
2019{
2020 int status, namelen;
2021
2022 mlog_entry_void();
2023
2024 namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64,
2025 blkno);
2026 if (namelen <= 0) {
2027 if (namelen)
2028 status = namelen;
2029 else
2030 status = -EINVAL;
2031 mlog_errno(status);
2032 goto bail;
2033 }
2034 if (namelen != OCFS2_ORPHAN_NAMELEN) {
2035 status = -EINVAL;
2036 mlog_errno(status);
2037 goto bail;
2038 }
2039
2040 mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
2041 namelen);
2042
2043 status = 0;
2044bail:
2045 mlog_exit(status);
2046 return status;
2047}
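/*
 * For illustration: an inode at block 0x201 gets the orphan name
 * "0000000000000201" -- sixteen zero-padded hex digits, which is why
 * namelen above must equal OCFS2_ORPHAN_NAMELEN (assumed to be 16,
 * the printed width of a u64).
 */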
2048
2049static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2050 struct ocfs2_journal_handle *handle,
2051 struct inode *inode,
2052 char *name,
2053 struct buffer_head **de_bh)
2054{
2055 struct inode *orphan_dir_inode = NULL;
2056 struct buffer_head *orphan_dir_bh = NULL;
2057 int status = 0;
2058
2059 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2060 if (status < 0) {
2061 mlog_errno(status);
2062 goto leave;
2063 }
2064
2065 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2066 ORPHAN_DIR_SYSTEM_INODE,
2067 osb->slot_num);
2068 if (!orphan_dir_inode) {
2069 status = -ENOENT;
2070 mlog_errno(status);
2071 goto leave;
2072 }
2073
2074 ocfs2_handle_add_inode(handle, orphan_dir_inode);
2075 status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
2076 if (status < 0) {
2077 mlog_errno(status);
2078 goto leave;
2079 }
2080
2081 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
2082 orphan_dir_bh, name,
2083 OCFS2_ORPHAN_NAMELEN, de_bh);
2084 if (status < 0) {
2085 mlog_errno(status);
2086 goto leave;
2087 }
2088
2089leave:
2090 if (orphan_dir_inode)
2091 iput(orphan_dir_inode);
2092
2093 if (orphan_dir_bh)
2094 brelse(orphan_dir_bh);
2095
2096 mlog_exit(status);
2097 return status;
2098}
2099
2100static int ocfs2_orphan_add(struct ocfs2_super *osb,
2101 struct ocfs2_journal_handle *handle,
2102 struct inode *inode,
2103 struct ocfs2_dinode *fe,
2104 char *name,
2105 struct buffer_head *de_bh)
2106{
2107 struct inode *orphan_dir_inode = NULL;
2108 struct buffer_head *orphan_dir_bh = NULL;
2109 int status = 0;
2110 struct ocfs2_dinode *orphan_fe;
2111
2112 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
2113
2114 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2115 ORPHAN_DIR_SYSTEM_INODE,
2116 osb->slot_num);
2117 if (!orphan_dir_inode) {
2118 status = -ENOENT;
2119 mlog_errno(status);
2120 goto leave;
2121 }
2122
2123 status = ocfs2_read_block(osb,
2124 OCFS2_I(orphan_dir_inode)->ip_blkno,
2125 &orphan_dir_bh, OCFS2_BH_CACHED,
2126 orphan_dir_inode);
2127 if (status < 0) {
2128 mlog_errno(status);
2129 goto leave;
2130 }
2131
2132 status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
2133 OCFS2_JOURNAL_ACCESS_WRITE);
2134 if (status < 0) {
2135 mlog_errno(status);
2136 goto leave;
2137 }
2138
2139	/* we're a cluster filesystem, and nlink can change on disk from
2140	 * underneath us... */
2141 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2142 if (S_ISDIR(inode->i_mode))
2143 le16_add_cpu(&orphan_fe->i_links_count, 1);
2144 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
2145
2146 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2147 if (status < 0) {
2148 mlog_errno(status);
2149 goto leave;
2150 }
2151
2152 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
2153 OCFS2_ORPHAN_NAMELEN, inode,
2154 OCFS2_I(inode)->ip_blkno,
2155 orphan_dir_bh, de_bh);
2156 if (status < 0) {
2157 mlog_errno(status);
2158 goto leave;
2159 }
2160
2161 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
2162
2163 /* Record which orphan dir our inode now resides
2164 * in. delete_inode will use this to determine which orphan
2165 * dir to lock. */
2166 spin_lock(&OCFS2_I(inode)->ip_lock);
2167 OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
2168 spin_unlock(&OCFS2_I(inode)->ip_lock);
2169
2170 mlog(0, "Inode %"MLFu64" orphaned in slot %d\n",
2171 OCFS2_I(inode)->ip_blkno, osb->slot_num);
2172
2173leave:
2174 if (orphan_dir_inode)
2175 iput(orphan_dir_inode);
2176
2177 if (orphan_dir_bh)
2178 brelse(orphan_dir_bh);
2179
2180 mlog_exit(status);
2181 return status;
2182}
2183
2184/* unlike orphan_add, we expect the orphan dir to already be locked here. */
2185int ocfs2_orphan_del(struct ocfs2_super *osb,
2186 struct ocfs2_journal_handle *handle,
2187 struct inode *orphan_dir_inode,
2188 struct inode *inode,
2189 struct buffer_head *orphan_dir_bh)
2190{
2191 char name[OCFS2_ORPHAN_NAMELEN + 1];
2192 struct ocfs2_dinode *orphan_fe;
2193 int status = 0;
2194 struct buffer_head *target_de_bh = NULL;
2195 struct ocfs2_dir_entry *target_de = NULL;
2196
2197 mlog_entry_void();
2198
2199 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2200 if (status < 0) {
2201 mlog_errno(status);
2202 goto leave;
2203 }
2204
2205 mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n",
2206 name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN);
2207
2208	/* find its spot in the orphan directory */
2209 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
2210 orphan_dir_inode, &target_de);
2211 if (!target_de_bh) {
2212 status = -ENOENT;
2213 mlog_errno(status);
2214 goto leave;
2215 }
2216
2217 /* remove it from the orphan directory */
2218 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
2219 target_de_bh);
2220 if (status < 0) {
2221 mlog_errno(status);
2222 goto leave;
2223 }
2224
2225	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
2226 OCFS2_JOURNAL_ACCESS_WRITE);
2227 if (status < 0) {
2228 mlog_errno(status);
2229 goto leave;
2230 }
2231
2232 /* do the i_nlink dance! :) */
2233 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2234 if (S_ISDIR(inode->i_mode))
2235 le16_add_cpu(&orphan_fe->i_links_count, -1);
2236 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
2237
2238 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2239 if (status < 0) {
2240 mlog_errno(status);
2241 goto leave;
2242 }
2243
2244leave:
2245 if (target_de_bh)
2246 brelse(target_de_bh);
2247
2248 mlog_exit(status);
2249 return status;
2250}
2251
2252struct inode_operations ocfs2_dir_iops = {
2253 .create = ocfs2_create,
2254 .lookup = ocfs2_lookup,
2255 .link = ocfs2_link,
2256 .unlink = ocfs2_unlink,
2257 .rmdir = ocfs2_unlink,
2258 .symlink = ocfs2_symlink,
2259 .mkdir = ocfs2_mkdir,
2260 .mknod = ocfs2_mknod,
2261 .rename = ocfs2_rename,
2262 .setattr = ocfs2_setattr,
2263 .getattr = ocfs2_getattr,
2264};
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
new file mode 100644
index 00000000000..deaaa97dbf0
--- /dev/null
+++ b/fs/ocfs2/namei.h
@@ -0,0 +1,58 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * namei.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_NAMEI_H
27#define OCFS2_NAMEI_H
28
29extern struct inode_operations ocfs2_dir_iops;
30
31struct dentry *ocfs2_get_parent(struct dentry *child);
32
33int ocfs2_check_dir_entry (struct inode *dir,
34 struct ocfs2_dir_entry *de,
35 struct buffer_head *bh,
36 unsigned long offset);
37struct buffer_head *ocfs2_find_entry(const char *name,
38 int namelen,
39 struct inode *dir,
40 struct ocfs2_dir_entry **res_dir);
41int ocfs2_orphan_del(struct ocfs2_super *osb,
42 struct ocfs2_journal_handle *handle,
43 struct inode *orphan_dir_inode,
44 struct inode *inode,
45 struct buffer_head *orphan_dir_bh);
46
47static inline int ocfs2_match(int len,
48 const char * const name,
49 struct ocfs2_dir_entry *de)
50{
51 if (len != de->name_len)
52 return 0;
53 if (!de->inode)
54 return 0;
55 return !memcmp(name, de->name, len);
56}
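/*
 * Illustrative use (hypothetical caller): while scanning a directory
 * block,
 *
 *	if (ocfs2_match(namelen, name, de))
 *		... de is a live entry with exactly this name ...
 *
 * a zero de->inode marks a deleted record and is never matched, and
 * the memcmp() runs over exactly len bytes -- on-disk names are not
 * NUL-terminated.
 */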
57
58#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h
new file mode 100644
index 00000000000..0b499bccec5
--- /dev/null
+++ b/fs/ocfs2/ocfs1_fs_compat.h
@@ -0,0 +1,109 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs1_fs_compat.h
5 *
6 * OCFS1 volume header definitions. OCFS2 creates valid but unmountable
7 * OCFS1 volume headers on the first two sectors of an OCFS2 volume.
8 * This allows an OCFS1 driver to see the partition and cleanly fail to
9 * mount it.
10 *
11 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public
15 * License, version 2, as published by the Free Software Foundation.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public
23 * License along with this program; if not, write to the
24 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25 * Boston, MA 021110-1307, USA.
26 */
27
28#ifndef _OCFS1_FS_COMPAT_H
29#define _OCFS1_FS_COMPAT_H
30
31#define OCFS1_MAX_VOL_SIGNATURE_LEN 128
32#define OCFS1_MAX_MOUNT_POINT_LEN 128
33#define OCFS1_MAX_VOL_ID_LENGTH 16
34#define OCFS1_MAX_VOL_LABEL_LEN 64
35#define OCFS1_MAX_CLUSTER_NAME_LEN 64
36
37#define OCFS1_MAJOR_VERSION (2)
38#define OCFS1_MINOR_VERSION (0)
39#define OCFS1_VOLUME_SIGNATURE "OracleCFS"
40
41/*
42 * OCFS1 superblock. Lives at sector 0.
43 */
44struct ocfs1_vol_disk_hdr
45{
46/*00*/ __u32 minor_version;
47 __u32 major_version;
48/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
49/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
50/*108*/ __u64 serial_num;
51/*110*/ __u64 device_size;
52 __u64 start_off;
53/*120*/ __u64 bitmap_off;
54 __u64 publ_off;
55/*130*/ __u64 vote_off;
56 __u64 root_bitmap_off;
57/*140*/ __u64 data_start_off;
58 __u64 root_bitmap_size;
59/*150*/ __u64 root_off;
60 __u64 root_size;
61/*160*/ __u64 cluster_size;
62 __u64 num_nodes;
63/*170*/ __u64 num_clusters;
64 __u64 dir_node_size;
65/*180*/ __u64 file_node_size;
66 __u64 internal_off;
67/*190*/ __u64 node_cfg_off;
68 __u64 node_cfg_size;
69/*1A0*/ __u64 new_cfg_off;
70 __u32 prot_bits;
71 __s32 excl_mount;
72/*1B0*/
73};
74
75
76struct ocfs1_disk_lock
77{
78/*00*/ __u32 curr_master;
79 __u8 file_lock;
80	__u8 compat_pad[3]; /* Not in original definition. Used to
81 make the already existing alignment
82 explicit */
83 __u64 last_write_time;
84/*10*/ __u64 last_read_time;
85 __u32 writer_node_num;
86 __u32 reader_node_num;
87/*20*/ __u64 oin_node_map;
88 __u64 dlock_seq_num;
89/*30*/
90};
91
92/*
93 * OCFS1 volume label. Lives at sector 1.
94 */
95struct ocfs1_vol_label
96{
97/*00*/ struct ocfs1_disk_lock disk_lock;
98/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN];
99/*70*/ __u16 label_len;
100/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
101/*82*/ __u16 vol_id_len;
102/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
103/*A4*/ __u16 cluster_name_len;
104/*A6*/
105};
106
107
108#endif /* _OCFS1_FS_COMPAT_H */
109
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
new file mode 100644
index 00000000000..f468c600cf9
--- /dev/null
+++ b/fs/ocfs2/ocfs2.h
@@ -0,0 +1,464 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2.h
5 *
6 * Defines macros and structures used in OCFS2
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_H
27#define OCFS2_H
28
29#include <linux/spinlock.h>
30#include <linux/sched.h>
31#include <linux/wait.h>
32#include <linux/list.h>
33#include <linux/rbtree.h>
34#include <linux/workqueue.h>
35#include <linux/kref.h>
36
37#include "cluster/nodemanager.h"
38#include "cluster/heartbeat.h"
39#include "cluster/tcp.h"
40
41#include "dlm/dlmapi.h"
42
43#include "ocfs2_fs.h"
44#include "endian.h"
45#include "ocfs2_lockid.h"
46
47struct ocfs2_extent_map {
48 u32 em_clusters;
49 struct rb_root em_extents;
50};
51
52/* Most user visible OCFS2 inodes will have very few pieces of
53 * metadata, but larger files (including bitmaps, etc) must be taken
54 * into account when designing an access scheme. We allow a small
55 * number of inlined blocks to be stored in an array and grow the
56 * structure into an rb tree when necessary. */
57#define OCFS2_INODE_MAX_CACHE_ARRAY 2
58
59struct ocfs2_caching_info {
60 unsigned int ci_num_cached;
61 union {
62 sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
63 struct rb_root ci_tree;
64 } ci_cache;
65};
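/* Illustration of the scheme described above: the first
 * OCFS2_INODE_MAX_CACHE_ARRAY (2) cached block numbers live in
 * ci_array; an inode that caches more is presumably migrated to the
 * ci_tree rb-tree, with ci_num_cached tracking the count either way. */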
66
67/* this limits us to 256 nodes
68 * if we need more, we can do a kmalloc for the map */
69#define OCFS2_NODE_MAP_MAX_NODES 256
70struct ocfs2_node_map {
71 u16 num_nodes;
72 unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
73};
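/* e.g. (illustrative): BITS_TO_LONGS(256) is 4 longs on 64-bit and
 * 8 longs on 32-bit kernels -- 32 bytes of bitmap either way. */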
74
75enum ocfs2_ast_action {
76 OCFS2_AST_INVALID = 0,
77 OCFS2_AST_ATTACH,
78 OCFS2_AST_CONVERT,
79 OCFS2_AST_DOWNCONVERT,
80};
81
82/* actions for an unlockast function to take. */
83enum ocfs2_unlock_action {
84 OCFS2_UNLOCK_INVALID = 0,
85 OCFS2_UNLOCK_CANCEL_CONVERT,
86 OCFS2_UNLOCK_DROP_LOCK,
87};
88
89/* ocfs2_lock_res->l_flags flags. */
90#define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized
91 * the lvb */
92#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in
93 * dlm_lock */
94#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to
95 * downconvert*/
96#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */
97#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
98#define OCFS2_LOCK_REFRESHING (0x00000020)
99#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization
100 * for shutdown paths */
101#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track
102 * when to skip queueing
103 * a lock because it's
104 * about to be
105 * dropped. */
106#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
107
108struct ocfs2_lock_res_ops;
109
110typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
111
112struct ocfs2_lock_res {
113 void *l_priv;
114 struct ocfs2_lock_res_ops *l_ops;
115 spinlock_t l_lock;
116
117 struct list_head l_blocked_list;
118 struct list_head l_mask_waiters;
119
120 enum ocfs2_lock_type l_type;
121 unsigned long l_flags;
122 char l_name[OCFS2_LOCK_ID_MAX_LEN];
123 int l_level;
124 unsigned int l_ro_holders;
125 unsigned int l_ex_holders;
126 struct dlm_lockstatus l_lksb;
127
128 /* used from AST/BAST funcs. */
129 enum ocfs2_ast_action l_action;
130 enum ocfs2_unlock_action l_unlock_action;
131 int l_requested;
132 int l_blocking;
133
134 wait_queue_head_t l_event;
135
136 struct list_head l_debug_list;
137};
138
139struct ocfs2_dlm_debug {
140 struct kref d_refcnt;
141 struct dentry *d_locking_state;
142 struct list_head d_lockres_tracking;
143};
144
145enum ocfs2_vol_state
146{
147 VOLUME_INIT = 0,
148 VOLUME_MOUNTED,
149 VOLUME_DISMOUNTED,
150 VOLUME_DISABLED
151};
152
153struct ocfs2_alloc_stats
154{
155 atomic_t moves;
156 atomic_t local_data;
157 atomic_t bitmap_data;
158 atomic_t bg_allocs;
159 atomic_t bg_extends;
160};
161
162enum ocfs2_local_alloc_state
163{
164 OCFS2_LA_UNUSED = 0,
165 OCFS2_LA_ENABLED,
166 OCFS2_LA_DISABLED
167};
168
169enum ocfs2_mount_options
170{
171 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */
172 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
173 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
174 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
175 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
176#ifdef OCFS2_ORACORE_WORKAROUNDS
177 OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
178#endif
179};
180
181#define OCFS2_OSB_SOFT_RO 0x0001
182#define OCFS2_OSB_HARD_RO 0x0002
183#define OCFS2_OSB_ERROR_FS 0x0004
184
185struct ocfs2_journal;
186struct ocfs2_journal_handle;
187struct ocfs2_super
188{
189 u32 osb_id; /* id used by the proc interface */
190 struct task_struct *commit_task;
191 struct super_block *sb;
192 struct inode *root_inode;
193 struct inode *sys_root_inode;
194 struct inode *system_inodes[NUM_SYSTEM_INODES];
195
196 struct ocfs2_slot_info *slot_info;
197
198 spinlock_t node_map_lock;
199 struct ocfs2_node_map mounted_map;
200 struct ocfs2_node_map recovery_map;
201 struct ocfs2_node_map umount_map;
202
203 u32 num_clusters;
204 u64 root_blkno;
205 u64 system_dir_blkno;
206 u64 bitmap_blkno;
207 u32 bitmap_cpg;
208 u8 *uuid;
209 char *uuid_str;
210 u8 *vol_label;
211 u64 first_cluster_group_blkno;
212 u32 fs_generation;
213
214 u32 s_feature_compat;
215 u32 s_feature_incompat;
216 u32 s_feature_ro_compat;
217
218	/* Protects s_next_generation, osb_flags. Could protect more on
219 * osb as it's very short lived. */
220 spinlock_t osb_lock;
221 u32 s_next_generation;
222 unsigned long osb_flags;
223
224 unsigned long s_mount_opt;
225
226 u16 max_slots;
227 u16 num_nodes;
228 s16 node_num;
229 s16 slot_num;
230 int s_sectsize_bits;
231 int s_clustersize;
232 int s_clustersize_bits;
233 struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
234
235 atomic_t vol_state;
236 struct semaphore recovery_lock;
237 struct task_struct *recovery_thread_task;
238 int disable_recovery;
239 wait_queue_head_t checkpoint_event;
240 atomic_t needs_checkpoint;
241 struct ocfs2_journal *journal;
242
243 enum ocfs2_local_alloc_state local_alloc_state;
244 struct buffer_head *local_alloc_bh;
245
246 /* Next two fields are for local node slot recovery during
247 * mount. */
248 int dirty;
249 struct ocfs2_dinode *local_alloc_copy;
250
251 struct ocfs2_alloc_stats alloc_stats;
252 char dev_str[20]; /* "major,minor" of the device */
253
254 struct dlm_ctxt *dlm;
255 struct ocfs2_lock_res osb_super_lockres;
256 struct ocfs2_lock_res osb_rename_lockres;
257 struct dlm_eviction_cb osb_eviction_cb;
258 struct ocfs2_dlm_debug *osb_dlm_debug;
259
260 struct dentry *osb_debug_root;
261
262 wait_queue_head_t recovery_event;
263
264 spinlock_t vote_task_lock;
265 struct task_struct *vote_task;
266 wait_queue_head_t vote_event;
267 unsigned long vote_wake_sequence;
268 unsigned long vote_work_sequence;
269
270 struct list_head blocked_lock_list;
271 unsigned long blocked_lock_count;
272
273 struct list_head vote_list;
274 int vote_count;
275
276 u32 net_key;
277 spinlock_t net_response_lock;
278 unsigned int net_response_ids;
279 struct list_head net_response_list;
280
281 struct o2hb_callback_func osb_hb_up;
282 struct o2hb_callback_func osb_hb_down;
283
284 struct list_head osb_net_handlers;
285
286 wait_queue_head_t osb_mount_event;
287
288 /* Truncate log info */
289 struct inode *osb_tl_inode;
290 struct buffer_head *osb_tl_bh;
291 struct work_struct osb_truncate_log_wq;
292};
293
294#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
295#define OCFS2_MAX_OSB_ID 65536
296
297static inline int ocfs2_should_order_data(struct inode *inode)
298{
299 if (!S_ISREG(inode->i_mode))
300 return 0;
301 if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
302 return 0;
303 return 1;
304}
305
306/* set / clear functions because cluster events can make these happen
307 * in parallel so we want the transitions to be atomic. This also
308 * means that any future flags in osb_flags must be protected by the
309 * spinlock too! */
310static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
311 unsigned long flag)
312{
313 spin_lock(&osb->osb_lock);
314 osb->osb_flags |= flag;
315 spin_unlock(&osb->osb_lock);
316}
317
318static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
319 int hard)
320{
321 spin_lock(&osb->osb_lock);
322 osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
323 if (hard)
324 osb->osb_flags |= OCFS2_OSB_HARD_RO;
325 else
326 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
327 spin_unlock(&osb->osb_lock);
328}
329
330static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
331{
332 int ret;
333
334 spin_lock(&osb->osb_lock);
335 ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
336 spin_unlock(&osb->osb_lock);
337
338 return ret;
339}
340
341static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
342{
343 int ret;
344
345 spin_lock(&osb->osb_lock);
346 ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
347 spin_unlock(&osb->osb_lock);
348
349 return ret;
350}
351
352#define OCFS2_IS_VALID_DINODE(ptr) \
353 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
354
355#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \
356 typeof(__di) ____di = (__di); \
357 ocfs2_error((__sb), \
358 "Dinode # %"MLFu64" has bad signature %.*s", \
359 (____di)->i_blkno, 7, \
360 (____di)->i_signature); \
361} while (0)
362
363#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
364 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
365
366#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \
367 typeof(__eb) ____eb = (__eb); \
368 ocfs2_error((__sb), \
369 "Extent Block # %"MLFu64" has bad signature %.*s", \
370 (____eb)->h_blkno, 7, \
371 (____eb)->h_signature); \
372} while (0)
373
374#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
375 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
376
377#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \
378 typeof(__gd) ____gd = (__gd); \
379 ocfs2_error((__sb), \
380 "Group Descriptor # %"MLFu64" has bad signature %.*s", \
381 (____gd)->bg_blkno, 7, \
382 (____gd)->bg_signature); \
383} while (0)
384
385static inline unsigned long ino_from_blkno(struct super_block *sb,
386 u64 blkno)
387{
388 return (unsigned long)(blkno & (u64)ULONG_MAX);
389}
390
391static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
392 u32 clusters)
393{
394 int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
395 sb->s_blocksize_bits;
396
397 return (u64)clusters << c_to_b_bits;
398}
399
400static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
401 u64 blocks)
402{
403 int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
404 sb->s_blocksize_bits;
405
406 return (u32)(blocks >> b_to_c_bits);
407}
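/*
 * Worked example (illustrative): with 4 KB blocks
 * (s_blocksize_bits = 12) and 32 KB clusters (s_clustersize_bits = 15),
 * c_to_b_bits = 3, so ocfs2_clusters_to_blocks(sb, 5) = 5 << 3 = 40
 * blocks and ocfs2_blocks_to_clusters(sb, 40) = 40 >> 3 = 5 clusters.
 */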
408
409static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
410 u64 bytes)
411{
412 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
413 unsigned int clusters;
414
415 bytes += OCFS2_SB(sb)->s_clustersize - 1;
416 /* OCFS2 just cannot have enough clusters to overflow this */
417 clusters = (unsigned int)(bytes >> cl_bits);
418
419 return clusters;
420}
421
422static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
423 u64 bytes)
424{
425 bytes += sb->s_blocksize - 1;
426 return bytes >> sb->s_blocksize_bits;
427}
428
429static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
430 u32 clusters)
431{
432 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
433}
434
435static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
436 u64 bytes)
437{
438 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
439 unsigned int clusters;
440
441 clusters = ocfs2_clusters_for_bytes(sb, bytes);
442 return (u64)clusters << cl_bits;
443}
444
445static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
446 u64 bytes)
447{
448 u64 blocks;
449
450 blocks = ocfs2_blocks_for_bytes(sb, bytes);
451 return blocks << sb->s_blocksize_bits;
452}
453
454static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
455{
456 return (unsigned long)((bytes + 511) >> 9);
457}
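/* e.g. (illustrative): ocfs2_align_bytes_to_sectors(1000) =
 * (1000 + 511) >> 9 = 2, i.e. two 512-byte sectors. */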
458
459#define ocfs2_set_bit ext2_set_bit
460#define ocfs2_clear_bit ext2_clear_bit
461#define ocfs2_test_bit ext2_test_bit
462#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
463#endif /* OCFS2_H */
464
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
new file mode 100644
index 00000000000..dfb8a5bedfc
--- /dev/null
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -0,0 +1,638 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_fs.h
5 *
6 * On-disk structures for OCFS2.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 021110-1307, USA.
23 */
24
25#ifndef _OCFS2_FS_H
26#define _OCFS2_FS_H
27
28/* Version */
29#define OCFS2_MAJOR_REV_LEVEL 0
30#define OCFS2_MINOR_REV_LEVEL 90
31
32/*
33 * An OCFS2 volume starts this way:
34 * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS.
35 * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS.
36 * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock.
37 *
38 * All other structures are found from the superblock information.
39 *
40 * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. E.g., for a
41 * blocksize of 2K, it is 4096 bytes into the disk.
42 */
43#define OCFS2_SUPER_BLOCK_BLKNO 2
44
45/*
46 * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
47 * grow if needed.
48 */
49#define OCFS2_MIN_CLUSTERSIZE 4096
50#define OCFS2_MAX_CLUSTERSIZE 1048576
51
52/*
53 * Blocks cannot be bigger than clusters, so the maximum blocksize is the
54 * minimum cluster size.
55 */
56#define OCFS2_MIN_BLOCKSIZE 512
57#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE
58
59/* Filesystem magic number */
60#define OCFS2_SUPER_MAGIC 0x7461636f
61
62/* Object signatures */
63#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2"
64#define OCFS2_INODE_SIGNATURE "INODE01"
65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67
68/* Compatibility flags */
69#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
70 ( OCFS2_SB(sb)->s_feature_compat & (mask) )
71#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \
72 ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
73#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \
74 ( OCFS2_SB(sb)->s_feature_incompat & (mask) )
75#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \
76 OCFS2_SB(sb)->s_feature_compat |= (mask)
77#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \
78 OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
79#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \
80 OCFS2_SB(sb)->s_feature_incompat |= (mask)
81#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \
82 OCFS2_SB(sb)->s_feature_compat &= ~(mask)
83#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
84 OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
85#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \
86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
87
88#define OCFS2_FEATURE_COMPAT_SUPP 0
89#define OCFS2_FEATURE_INCOMPAT_SUPP 0
90#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
91
92/*
93 * Heartbeat-only devices are missing journals and other files. The
94 * filesystem driver can't load them, but the library can. Never put
95 * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
96 */
97#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002
98
99
100/*
101 * Flags on ocfs2_dinode.i_flags
102 */
103#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */
104#define OCFS2_UNUSED2_FL (0x00000002)
105#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */
106#define OCFS2_UNUSED3_FL (0x00000008)
107/* System inode flags */
108#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */
109#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */
110#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */
111#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */
112#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */
113#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
114#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
115#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
116
117/*
118 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
119 */
120#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
121
122/*
123 * superblock s_state flags
124 */
125#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */
126
127/* Limit of space in ocfs2_dir_entry */
128#define OCFS2_MAX_FILENAME_LEN 255
129
130/* Maximum slots on an ocfs2 file system */
131#define OCFS2_MAX_SLOTS 255
132
133/* Slot map indicator for an empty slot */
134#define OCFS2_INVALID_SLOT -1
135
136#define OCFS2_VOL_UUID_LEN 16
137#define OCFS2_MAX_VOL_LABEL_LEN 64
138
139/* Journal limits (in bytes) */
140#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
141#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024)
142
143struct ocfs2_system_inode_info {
144 char *si_name;
145 int si_iflags;
146 int si_mode;
147};
148
149/* System file index */
150enum {
151 BAD_BLOCK_SYSTEM_INODE = 0,
152 GLOBAL_INODE_ALLOC_SYSTEM_INODE,
153 SLOT_MAP_SYSTEM_INODE,
154#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
155 HEARTBEAT_SYSTEM_INODE,
156 GLOBAL_BITMAP_SYSTEM_INODE,
157#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
158 ORPHAN_DIR_SYSTEM_INODE,
159 EXTENT_ALLOC_SYSTEM_INODE,
160 INODE_ALLOC_SYSTEM_INODE,
161 JOURNAL_SYSTEM_INODE,
162 LOCAL_ALLOC_SYSTEM_INODE,
163 TRUNCATE_LOG_SYSTEM_INODE,
164 NUM_SYSTEM_INODES
165};
166
167static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
168 /* Global system inodes (single copy) */
169	/* The first two are only used from userspace mkfs/tunefs */
170 [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 },
171 [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
172
173 /* These are used by the running filesystem */
174 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
175 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
176 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
177
178 /* Slot-specific system inodes (one copy per slot) */
179 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
180 [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
181 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
182 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
183 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
184 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
185};
186
187/* Parameter passed from mount.ocfs2 to module */
188#define OCFS2_HB_NONE "heartbeat=none"
189#define OCFS2_HB_LOCAL "heartbeat=local"
190
191/*
192 * OCFS2 directory file types. Only the low 3 bits are used. The
193 * other bits are reserved for now.
194 */
195#define OCFS2_FT_UNKNOWN 0
196#define OCFS2_FT_REG_FILE 1
197#define OCFS2_FT_DIR 2
198#define OCFS2_FT_CHRDEV 3
199#define OCFS2_FT_BLKDEV 4
200#define OCFS2_FT_FIFO 5
201#define OCFS2_FT_SOCK 6
202#define OCFS2_FT_SYMLINK 7
203
204#define OCFS2_FT_MAX 8
205
206/*
207 * OCFS2_DIR_PAD defines the directory entries boundaries
208 *
209 * NOTE: It must be a multiple of 4
210 */
211#define OCFS2_DIR_PAD 4
212#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1)
213#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name)
214#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
215 OCFS2_DIR_ROUND) & \
216 ~OCFS2_DIR_ROUND)
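/*
 * Worked example (illustrative, assuming the usual ext2-style entry
 * layout where OCFS2_DIR_MEMBER_LEN is 12 bytes):
 *
 *	OCFS2_DIR_REC_LEN(1)  = (1  + 12 + 3) & ~3 = 16
 *	OCFS2_DIR_REC_LEN(10) = (10 + 12 + 3) & ~3 = 24
 *
 * so every directory record starts on a 4-byte boundary.
 */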
217
218#define OCFS2_LINK_MAX 32000
219
220#define S_SHIFT 12
221static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
222 [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE,
223 [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR,
224 [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV,
225 [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV,
226 [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO,
227 [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
228 [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK,
229};
230
231
232/*
233 * Convenience casts
234 */
235#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
236
237/*
238 * On disk extent record for OCFS2
239 * It describes a range of clusters on disk.
240 */
241struct ocfs2_extent_rec {
242/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
243 __le32 e_clusters; /* Clusters covered by this extent */
244 __le64 e_blkno; /* Physical disk offset, in blocks */
245/*10*/
246};
247
248struct ocfs2_chain_rec {
249 __le32 c_free; /* Number of free bits in this chain. */
250 __le32 c_total; /* Number of total bits in this chain */
251 __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */
252};
253
254struct ocfs2_truncate_rec {
255 __le32 t_start; /* 1st cluster in this log */
256 __le32 t_clusters; /* Number of total clusters covered */
257};
258
259/*
260 * On disk extent list for OCFS2 (node in the tree). Note that this
261 * is contained inside ocfs2_dinode or ocfs2_extent_block, so the
262 * offsets are relative to ocfs2_dinode.id2.i_list or
263 * ocfs2_extent_block.h_list, respectively.
264 */
265struct ocfs2_extent_list {
266/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
267 point. 0 means data extents
268 hang directly off this
269 header (a leaf) */
270 __le16 l_count; /* Number of extent records */
271 __le16 l_next_free_rec; /* Next unused extent slot */
272 __le16 l_reserved1;
273 __le64 l_reserved2; /* Pad to
274 sizeof(ocfs2_extent_rec) */
275/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */
276};
277
278/*
279 * On disk allocation chain list for OCFS2. Note that this is
280 * contained inside ocfs2_dinode, so the offsets are relative to
281 * ocfs2_dinode.id2.i_chain.
282 */
283struct ocfs2_chain_list {
284/*00*/ __le16 cl_cpg; /* Clusters per Block Group */
285 __le16 cl_bpc; /* Bits per cluster */
286 __le16 cl_count; /* Total chains in this list */
287 __le16 cl_next_free_rec; /* Next unused chain slot */
288 __le64 cl_reserved1;
289/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */
290};
291
292/*
293 * On disk deallocation log for OCFS2. Note that this is
294 * contained inside ocfs2_dinode, so the offsets are relative to
295 * ocfs2_dinode.id2.i_dealloc.
296 */
297struct ocfs2_truncate_log {
298/*00*/ __le16 tl_count; /* Total records in this log */
299 __le16 tl_used; /* Number of records in use */
300 __le32 tl_reserved1;
301/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */
302};
303
304/*
305 * On disk extent block (indirect block) for OCFS2
306 */
307struct ocfs2_extent_block
308{
309/*00*/ __u8 h_signature[8]; /* Signature for verification */
310 __le64 h_reserved1;
311/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
312 extent_header belongs to */
313 __le16 h_suballoc_bit; /* Bit offset in suballocator
314 block group */
315 __le32 h_fs_generation; /* Must match super block */
316 __le64 h_blkno; /* Offset on disk, in blocks */
317/*20*/ __le64 h_reserved3;
318 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
319 of next leaf header pointing
320 to data */
321/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */
322/* Actual on-disk size is one block */
323};
324
325/*
326 * On disk superblock for OCFS2
327 * Note that it is contained inside an ocfs2_dinode, so all offsets
328 * are relative to the start of ocfs2_dinode.id2.
329 */
330struct ocfs2_super_block {
331/*00*/ __le16 s_major_rev_level;
332 __le16 s_minor_rev_level;
333 __le16 s_mnt_count;
334 __le16 s_max_mnt_count;
335 __le16 s_state; /* File system state */
336 __le16 s_errors; /* Behaviour when detecting errors */
337 __le32 s_checkinterval; /* Max time between checks */
338/*10*/ __le64 s_lastcheck; /* Time of last check */
339 __le32 s_creator_os; /* OS */
340 __le32 s_feature_compat; /* Compatible feature set */
341/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */
342 __le32 s_feature_ro_compat; /* Readonly-compatible feature set */
343 __le64 s_root_blkno; /* Offset, in blocks, of root directory
344 dinode */
345/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system
346 directory dinode */
347 __le32 s_blocksize_bits; /* Blocksize for this fs */
348 __le32 s_clustersize_bits; /* Clustersize for this fs */
349/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
350 before tunefs required */
351 __le16 s_reserved1;
352 __le32 s_reserved2;
353 __le64 s_first_cluster_group; /* Block offset of 1st cluster
354 * group header */
355/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
356/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
357/*A0*/
358};
359
360/*
361 * Local allocation bitmap for OCFS2 slots
362 * Note that it exists inside an ocfs2_dinode, so all offsets are
363 * relative to the start of ocfs2_dinode.id2.
364 */
365struct ocfs2_local_alloc
366{
367/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */
368 __le16 la_size; /* Size of included bitmap, in bytes */
369 __le16 la_reserved1;
370 __le64 la_reserved2;
371/*10*/ __u8 la_bitmap[0];
372};
373
374/*
375 * On disk inode for OCFS2
376 */
377struct ocfs2_dinode {
378/*00*/ __u8 i_signature[8]; /* Signature for validation */
379 __le32 i_generation; /* Generation number */
380 __le16 i_suballoc_slot; /* Slot suballocator this inode
381 belongs to */
382 __le16 i_suballoc_bit; /* Bit offset in suballocator
383 block group */
384/*10*/ __le32 i_reserved0;
385 __le32 i_clusters; /* Cluster count */
386 __le32 i_uid; /* Owner UID */
387 __le32 i_gid; /* Owning GID */
388/*20*/ __le64 i_size; /* Size in bytes */
389 __le16 i_mode; /* File mode */
390 __le16 i_links_count; /* Links count */
391 __le32 i_flags; /* File flags */
392/*30*/ __le64 i_atime; /* Access time */
393 __le64 i_ctime; /* Creation time */
394/*40*/ __le64 i_mtime; /* Modification time */
395 __le64 i_dtime; /* Deletion time */
396/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */
397 __le64 i_last_eb_blk; /* Pointer to last extent
398 block */
399/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */
400 __le32 i_atime_nsec;
401 __le32 i_ctime_nsec;
402 __le32 i_mtime_nsec;
403/*70*/ __le64 i_reserved1[9];
404/*B8*/ union {
405 __le64 i_pad1; /* Generic way to refer to this
406 64bit union */
407 struct {
408 __le64 i_rdev; /* Device number */
409 } dev1;
410 struct { /* Info for bitmap system
411 inodes */
412 __le32 i_used; /* Bits (ie, clusters) used */
413 __le32 i_total; /* Total bits (clusters)
414 available */
415 } bitmap1;
416 struct { /* Info for journal system
417 inodes */
418 __le32 ij_flags; /* Mounted, version, etc. */
419 __le32 ij_pad;
420 } journal1;
421 } id1; /* Inode type dependent 1 */
422/*C0*/ union {
423 struct ocfs2_super_block i_super;
424 struct ocfs2_local_alloc i_lab;
425 struct ocfs2_chain_list i_chain;
426 struct ocfs2_extent_list i_list;
427 struct ocfs2_truncate_log i_dealloc;
428 __u8 i_symlink[0];
429 } id2;
430/* Actual on-disk size is one block */
431};
432
433/*
434 * On-disk directory entry structure for OCFS2
435 *
436 * Packed as this structure could be accessed unaligned on 64-bit platforms
437 */
438struct ocfs2_dir_entry {
439/*00*/ __le64 inode; /* Inode number */
440 __le16 rec_len; /* Directory entry length */
441 __u8 name_len; /* Name length */
442 __u8 file_type;
443/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */
444/* Actual on-disk length specified by rec_len */
445} __attribute__ ((packed));
446
447/*
448 * On disk allocator group structure for OCFS2
449 */
450struct ocfs2_group_desc
451{
452/*00*/ __u8 bg_signature[8]; /* Signature for validation */
453 __le16 bg_size; /* Size of included bitmap in
454 bytes. */
455 __le16 bg_bits; /* Bits represented by this
456 group. */
457 __le16 bg_free_bits_count; /* Free bits count */
458 __le16 bg_chain; /* What chain I am in. */
459/*10*/ __le32 bg_generation;
460 __le32 bg_reserved1;
461 __le64 bg_next_group; /* Next group in my list, in
462 blocks */
463/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
464 blocks */
465 __le64 bg_blkno; /* Offset on disk, in blocks */
466/*30*/ __le64 bg_reserved2[2];
467/*40*/ __u8 bg_bitmap[0];
468};
469
470#ifdef __KERNEL__
471static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
472{
473 return sb->s_blocksize -
474 offsetof(struct ocfs2_dinode, id2.i_symlink);
475}
476
477static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
478{
479 int size;
480
481 size = sb->s_blocksize -
482 offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
483
484 return size / sizeof(struct ocfs2_extent_rec);
485}
486
487static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
488{
489 int size;
490
491 size = sb->s_blocksize -
492 offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
493
494 return size / sizeof(struct ocfs2_chain_rec);
495}
496
497static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
498{
499 int size;
500
501 size = sb->s_blocksize -
502 offsetof(struct ocfs2_extent_block, h_list.l_recs);
503
504 return size / sizeof(struct ocfs2_extent_rec);
505}
506
507static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
508{
509 u16 size;
510
511 size = sb->s_blocksize -
512 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
513
514 return size;
515}
516
517static inline int ocfs2_group_bitmap_size(struct super_block *sb)
518{
519 int size;
520
521 size = sb->s_blocksize -
522 offsetof(struct ocfs2_group_desc, bg_bitmap);
523
524 return size;
525}
526
527static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
528{
529 int size;
530
531 size = sb->s_blocksize -
532 offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
533
534 return size / sizeof(struct ocfs2_truncate_rec);
535}
536#else
537static inline int ocfs2_fast_symlink_chars(int blocksize)
538{
539 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
540}
541
542static inline int ocfs2_extent_recs_per_inode(int blocksize)
543{
544 int size;
545
546 size = blocksize -
547 offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
548
549 return size / sizeof(struct ocfs2_extent_rec);
550}
551
552static inline int ocfs2_chain_recs_per_inode(int blocksize)
553{
554 int size;
555
556 size = blocksize -
557 offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
558
559 return size / sizeof(struct ocfs2_chain_rec);
560}
561
562static inline int ocfs2_extent_recs_per_eb(int blocksize)
563{
564 int size;
565
566 size = blocksize -
567 offsetof(struct ocfs2_extent_block, h_list.l_recs);
568
569 return size / sizeof(struct ocfs2_extent_rec);
570}
571
572static inline int ocfs2_local_alloc_size(int blocksize)
573{
574 int size;
575
576 size = blocksize -
577 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
578
579 return size;
580}
581
582static inline int ocfs2_group_bitmap_size(int blocksize)
583{
584 int size;
585
586 size = blocksize -
587 offsetof(struct ocfs2_group_desc, bg_bitmap);
588
589 return size;
590}
591
592static inline int ocfs2_truncate_recs_per_inode(int blocksize)
593{
594 int size;
595
596 size = blocksize -
597 offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
598
599 return size / sizeof(struct ocfs2_truncate_rec);
600}
601#endif /* __KERNEL__ */
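/*
 * Worked numbers (illustrative, assuming a 4096-byte block): id2
 * starts at offset 0xC0 in ocfs2_dinode and l_recs at 0x10 within the
 * extent list, so ocfs2_extent_recs_per_inode() yields
 * (4096 - 0xD0) / sizeof(struct ocfs2_extent_rec) = 3888 / 16 = 243.
 * The same arithmetic gives 243 chain records per inode and
 * (4096 - 0xC8) / 8 = 487 truncate records per inode.
 */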
602
603
604static inline int ocfs2_system_inode_is_global(int type)
605{
606 return ((type >= 0) &&
607 (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE));
608}
609
610static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
611 int type, int slot)
612{
613 int chars;
614
615 /*
616 * Global system inodes can only have one copy. Everything
617 * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode
618 * list has a copy per slot.
619 */
620 if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
621 chars = snprintf(buf, len, "%s",
622 ocfs2_system_inodes[type].si_name);
623 else
624 chars = snprintf(buf, len,
625 ocfs2_system_inodes[type].si_name,
626 slot);
627
628 return chars;
629}
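/*
 * Example (illustrative): slot-specific names embed the slot number
 * via the "%04d" in the table above, so
 *
 *	ocfs2_sprintf_system_inode_name(buf, sizeof(buf),
 *					ORPHAN_DIR_SYSTEM_INODE, 3);
 *
 * writes "orphan_dir:0003", while a global type such as
 * GLOBAL_BITMAP_SYSTEM_INODE ignores the slot argument.
 */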
630
631static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
632 umode_t mode)
633{
634 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
635}
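/*
 * Example (illustrative): for a directory, mode & S_IFMT is S_IFDIR
 * (0x4000), so the table index is 0x4000 >> 12 = 4 and de->file_type
 * becomes OCFS2_FT_DIR:
 *
 *	ocfs2_set_de_type(de, S_IFDIR | 0755);	// de->file_type == 2
 */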
636
637#endif /* _OCFS2_FS_H */
638
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
new file mode 100644
index 00000000000..7dd9e1e705b
--- /dev/null
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -0,0 +1,73 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_lockid.h
5 *
6 * Defines OCFS2 lockid bits.
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_LOCKID_H
27#define OCFS2_LOCKID_H
28
29/* lock ids are made up in the following manner:
30 * name[0] --> type
31 * name[1-6] --> 6 pad characters, reserved for now
32 * name[7-22] --> block number, expressed in hex as 16 chars
33 * name[23-30] --> i_generation, expressed in hex as 8 chars
34 * name[31] --> '\0' */
35#define OCFS2_LOCK_ID_MAX_LEN 32
36#define OCFS2_LOCK_ID_PAD "000000"
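/*
 * A minimal sketch (illustrative only; the real name-building helper
 * lives elsewhere in the fs code) of filling a buffer to this layout:
 *
 *	char name[OCFS2_LOCK_ID_MAX_LEN];
 *	snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
 *		 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
 *		 (unsigned long long)blkno, generation);
 *
 * using ocfs2_lock_type_char() below; '%c' plus 6 pad chars plus
 * 16 + 8 hex digits fill the 31 characters described above, and
 * snprintf() terminates with the '\0' at name[31].
 */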
37
38enum ocfs2_lock_type {
39 OCFS2_LOCK_TYPE_META = 0,
40 OCFS2_LOCK_TYPE_DATA,
41 OCFS2_LOCK_TYPE_SUPER,
42 OCFS2_LOCK_TYPE_RENAME,
43 OCFS2_LOCK_TYPE_RW,
44 OCFS2_NUM_LOCK_TYPES
45};
46
47static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
48{
49 char c;
50 switch (type) {
51 case OCFS2_LOCK_TYPE_META:
52 c = 'M';
53 break;
54 case OCFS2_LOCK_TYPE_DATA:
55 c = 'D';
56 break;
57 case OCFS2_LOCK_TYPE_SUPER:
58 c = 'S';
59 break;
60 case OCFS2_LOCK_TYPE_RENAME:
61 c = 'R';
62 break;
63 case OCFS2_LOCK_TYPE_RW:
64 c = 'W';
65 break;
66 default:
67 c = '\0';
68 }
69
70 return c;
71}
72
73#endif /* OCFS2_LOCKID_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
new file mode 100644
index 00000000000..871627961d6
--- /dev/null
+++ b/fs/ocfs2/slot_map.c
@@ -0,0 +1,303 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * slot_map.c
5 *
6 *
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/smp_lock.h>
30
31#define MLOG_MASK_PREFIX ML_SUPER
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "dlmglue.h"
37#include "extent_map.h"
38#include "heartbeat.h"
39#include "inode.h"
40#include "slot_map.h"
41#include "super.h"
42#include "sysfile.h"
43
44#include "buffer_head_io.h"
45
46static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
47 s16 global);
48static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
49 s16 slot_num,
50 s16 node_num);
51
52/* Use the slot information we've collected to create a map of mounted
53 * nodes. Should be holding an EX on super block. assumes slot info is
54 * up to date. Note that we call this *after* we find a slot, so our
55 * own node should be set in the map too... */
56void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
57{
58 int i;
59 struct ocfs2_slot_info *si = osb->slot_info;
60
61 spin_lock(&si->si_lock);
62
63 for (i = 0; i < si->si_size; i++)
64 if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
65 ocfs2_node_map_set_bit(osb, &osb->mounted_map,
66 si->si_global_node_nums[i]);
67
68 spin_unlock(&si->si_lock);
69}
70
71/* copy the slot information on disk into our slot_info struct. */
72void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
73{
74 int i;
75 __le16 *disk_info;
76
77 /* we don't read the slot block here as ocfs2_super_lock
78 * should've made sure we have the most recent copy. */
79 spin_lock(&si->si_lock);
80 disk_info = (__le16 *) si->si_bh->b_data;
81
82 for (i = 0; i < si->si_size; i++)
83 si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
84
85 spin_unlock(&si->si_lock);
86}
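/*
 * On disk the slot map is nothing more than this array of __le16
 * values, indexed by slot number.  Illustrative picture (assuming
 * OCFS2_INVALID_SLOT marks an empty entry, as the lookups below do):
 *
 *	disk_info[] = { 0, 3, OCFS2_INVALID_SLOT, ... }
 *
 * meaning slot 0 is held by node 0, slot 1 by node 3, and slot 2 is
 * free.
 */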
87
88/* post our slot info into its destination bh and write it
89 * out. */
90int ocfs2_update_disk_slots(struct ocfs2_super *osb,
91 struct ocfs2_slot_info *si)
92{
93 int status, i;
94 __le16 *disk_info = (__le16 *) si->si_bh->b_data;
95
96 spin_lock(&si->si_lock);
97 for (i = 0; i < si->si_size; i++)
98 disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
99 spin_unlock(&si->si_lock);
100
101 status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
102 if (status < 0)
103 mlog_errno(status);
104
105 return status;
106}
107
108/* try to find global node in the slot info. Returns
109 * OCFS2_INVALID_SLOT if nothing is found. */
110static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
111 s16 global)
112{
113 int i;
114 s16 ret = OCFS2_INVALID_SLOT;
115
116 for(i = 0; i < si->si_num_slots; i++) {
117 if (global == si->si_global_node_nums[i]) {
118 ret = (s16) i;
119 break;
120 }
121 }
122 return ret;
123}
124
125static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
126{
127 int i;
128 s16 ret = OCFS2_INVALID_SLOT;
129
130 for(i = 0; i < si->si_num_slots; i++) {
131 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
132 ret = (s16) i;
133 break;
134 }
135 }
136 return ret;
137}
138
139s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
140 s16 global)
141{
142 s16 ret;
143
144 spin_lock(&si->si_lock);
145 ret = __ocfs2_node_num_to_slot(si, global);
146 spin_unlock(&si->si_lock);
147 return ret;
148}
149
150static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
151 s16 slot_num,
152 s16 node_num)
153{
154 BUG_ON(slot_num == OCFS2_INVALID_SLOT);
155 BUG_ON(slot_num >= si->si_num_slots);
156 BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
157 (node_num >= O2NM_MAX_NODES));
158
159 si->si_global_node_nums[slot_num] = node_num;
160}
161
162void ocfs2_clear_slot(struct ocfs2_slot_info *si,
163 s16 slot_num)
164{
165 spin_lock(&si->si_lock);
166 __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
167 spin_unlock(&si->si_lock);
168}
169
170int ocfs2_init_slot_info(struct ocfs2_super *osb)
171{
172 int status, i;
173 u64 blkno;
174 struct inode *inode = NULL;
175 struct buffer_head *bh = NULL;
176 struct ocfs2_slot_info *si;
177
178 si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
179 if (!si) {
180 status = -ENOMEM;
181 mlog_errno(status);
182 goto bail;
183 }
184
185 spin_lock_init(&si->si_lock);
186 si->si_num_slots = osb->max_slots;
187 si->si_size = OCFS2_MAX_SLOTS;
188
189 for(i = 0; i < si->si_num_slots; i++)
190 si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
191
192 inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
193 OCFS2_INVALID_SLOT);
194 if (!inode) {
195 status = -EINVAL;
196 mlog_errno(status);
197 goto bail;
198 }
199
200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
201 if (status < 0) {
202 mlog_errno(status);
203 goto bail;
204 }
205
206 status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
207 if (status < 0) {
208 mlog_errno(status);
209 goto bail;
210 }
211
212 si->si_inode = inode;
213 si->si_bh = bh;
214 osb->slot_info = si;
215bail:
216 if (status < 0 && si)
217 ocfs2_free_slot_info(si);
218
219 return status;
220}
221
222void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
223{
224 if (si->si_inode)
225 iput(si->si_inode);
226 if (si->si_bh)
227 brelse(si->si_bh);
228 kfree(si);
229}
230
231int ocfs2_find_slot(struct ocfs2_super *osb)
232{
233 int status;
234 s16 slot;
235 struct ocfs2_slot_info *si;
236
237 mlog_entry_void();
238
239 si = osb->slot_info;
240
241 ocfs2_update_slot_info(si);
242
243 spin_lock(&si->si_lock);
244 /* search for ourselves first and take the slot if it already
245 * exists. Perhaps we need to mark this in a variable for our
246 * own journal recovery? Possibly not, though we certainly
247 * need to warn the user */
248 slot = __ocfs2_node_num_to_slot(si, osb->node_num);
249 if (slot == OCFS2_INVALID_SLOT) {
250 /* if no slot yet, then just take 1st available
251 * one. */
252 slot = __ocfs2_find_empty_slot(si);
253 if (slot == OCFS2_INVALID_SLOT) {
254 spin_unlock(&si->si_lock);
255 mlog(ML_ERROR, "no free slots available!\n");
256 status = -EINVAL;
257 goto bail;
258 }
259 } else
260 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
261 slot);
262
263 __ocfs2_fill_slot(si, slot, osb->node_num);
264 osb->slot_num = slot;
265 spin_unlock(&si->si_lock);
266
267 mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
268
269 status = ocfs2_update_disk_slots(osb, si);
270 if (status < 0)
271 mlog_errno(status);
272
273bail:
274 mlog_exit(status);
275 return status;
276}
277
278void ocfs2_put_slot(struct ocfs2_super *osb)
279{
280 int status;
281 struct ocfs2_slot_info *si = osb->slot_info;
282
283 if (!si)
284 return;
285
286 ocfs2_update_slot_info(si);
287
288 spin_lock(&si->si_lock);
289 __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
290 osb->slot_num = OCFS2_INVALID_SLOT;
291 spin_unlock(&si->si_lock);
292
293 status = ocfs2_update_disk_slots(osb, si);
294 if (status < 0) {
295 mlog_errno(status);
296 goto bail;
297 }
298
299bail:
300 osb->slot_info = NULL;
301 ocfs2_free_slot_info(si);
302}
303
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
new file mode 100644
index 00000000000..d8c8ceed031
--- /dev/null
+++ b/fs/ocfs2/slot_map.h
@@ -0,0 +1,66 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * slot_map.h
5 *
6 * In-memory slot map structure and function declarations.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26
27#ifndef SLOTMAP_H
28#define SLOTMAP_H
29
30struct ocfs2_slot_info {
31 spinlock_t si_lock;
32
33 struct inode *si_inode;
34 struct buffer_head *si_bh;
35 unsigned int si_num_slots;
36 unsigned int si_size;
37 s16 si_global_node_nums[OCFS2_MAX_SLOTS];
38};
39
40int ocfs2_init_slot_info(struct ocfs2_super *osb);
41void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
42
43int ocfs2_find_slot(struct ocfs2_super *osb);
44void ocfs2_put_slot(struct ocfs2_super *osb);
45
46void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
47int ocfs2_update_disk_slots(struct ocfs2_super *osb,
48 struct ocfs2_slot_info *si);
49
50s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
51 s16 global);
52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num);
54
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
56
57static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
58 int slot_num)
59{
60 BUG_ON(slot_num == OCFS2_INVALID_SLOT);
61 assert_spin_locked(&si->si_lock);
62
63 return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
64}
65
66#endif
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
new file mode 100644
index 00000000000..c46c164aefb
--- /dev/null
+++ b/fs/ocfs2/suballoc.c
@@ -0,0 +1,1651 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * suballoc.c
5 *
6 * metadata alloc and free
7 * Inspired by ext3 block groups.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "super.h"
44#include "sysfile.h"
45#include "uptodate.h"
46
47#include "buffer_head_io.h"
48
49static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
53 struct inode *alloc_inode,
54 struct buffer_head *bg_bh,
55 u64 group_blkno,
56 u16 my_chain,
57 struct ocfs2_chain_list *cl);
58static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59 struct inode *alloc_inode,
60 struct buffer_head *bh);
61
62static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
63 struct ocfs2_alloc_context *ac);
64
65static int ocfs2_cluster_group_search(struct inode *inode,
66 struct buffer_head *group_bh,
67 u32 bits_wanted, u32 min_bits,
68 u16 *bit_off, u16 *bits_found);
69static int ocfs2_block_group_search(struct inode *inode,
70 struct buffer_head *group_bh,
71 u32 bits_wanted, u32 min_bits,
72 u16 *bit_off, u16 *bits_found);
73static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
74 u32 bits_wanted,
75 u32 min_bits,
76 u16 *bit_off,
77 unsigned int *num_bits,
78 u64 *bg_blkno);
79static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
80 struct ocfs2_alloc_context *ac,
81 u32 bits_wanted,
82 u32 min_bits,
83 u16 *bit_off,
84 unsigned int *num_bits,
85 u64 *bg_blkno);
86static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
87 int nr);
88static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
89 struct buffer_head *bg_bh,
90 unsigned int bits_wanted,
91 u16 *bit_off,
92 u16 *bits_found);
93static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
94 struct inode *alloc_inode,
95 struct ocfs2_group_desc *bg,
96 struct buffer_head *group_bh,
97 unsigned int bit_off,
98 unsigned int num_bits);
99static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
100 struct inode *alloc_inode,
101 struct ocfs2_group_desc *bg,
102 struct buffer_head *group_bh,
103 unsigned int bit_off,
104 unsigned int num_bits);
105
106static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
107 struct inode *alloc_inode,
108 struct buffer_head *fe_bh,
109 struct buffer_head *bg_bh,
110 struct buffer_head *prev_bg_bh,
111 u16 chain);
112static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
113 u32 wanted);
114static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
115 struct inode *alloc_inode,
116 struct buffer_head *alloc_bh,
117 unsigned int start_bit,
118 u64 bg_blkno,
119 unsigned int count);
120static inline u64 ocfs2_which_suballoc_group(u64 block,
121 unsigned int bit);
122static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
123 u64 bg_blkno,
124 u16 bg_bit_off);
125static inline u64 ocfs2_which_cluster_group(struct inode *inode,
126 u32 cluster);
127static inline void ocfs2_block_to_cluster_group(struct inode *inode,
128 u64 data_blkno,
129 u64 *bg_blkno,
130 u16 *bg_bit_off);
131
132void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
133{
134 if (ac->ac_inode)
135 iput(ac->ac_inode);
136 if (ac->ac_bh)
137 brelse(ac->ac_bh);
138 kfree(ac);
139}
140
141static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
142{
143 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
144}
145
146static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
147 struct inode *alloc_inode,
148 struct buffer_head *bg_bh,
149 u64 group_blkno,
150 u16 my_chain,
151 struct ocfs2_chain_list *cl)
152{
153 int status = 0;
154 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
155 struct super_block * sb = alloc_inode->i_sb;
156
157 mlog_entry_void();
158
159 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
160 ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") "
161 "!= b_blocknr (%llu)", group_blkno,
162 (unsigned long long) bg_bh->b_blocknr);
163 status = -EIO;
164 goto bail;
165 }
166
167 status = ocfs2_journal_access(handle,
168 alloc_inode,
169 bg_bh,
170 OCFS2_JOURNAL_ACCESS_CREATE);
171 if (status < 0) {
172 mlog_errno(status);
173 goto bail;
174 }
175
176 memset(bg, 0, sb->s_blocksize);
177 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
178 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
179 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
180 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
181 bg->bg_chain = cpu_to_le16(my_chain);
182 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
183 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
184 bg->bg_blkno = cpu_to_le64(group_blkno);
185 /* set the 1st bit in the bitmap to account for the descriptor block */
186 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
187 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
188
189 status = ocfs2_journal_dirty(handle, bg_bh);
190 if (status < 0)
191 mlog_errno(status);
192
193 /* There is no need to zero out or otherwise initialize the
194 * other blocks in a group - All valid FS metadata in a block
195 * group stores the superblock fs_generation value at
196 * allocation time. */
197
198bail:
199 mlog_exit(status);
200 return status;
201}
202
203static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
204{
205 u16 curr, best;
206
207 best = curr = 0;
208 while (curr < le16_to_cpu(cl->cl_count)) {
209 if (le32_to_cpu(cl->cl_recs[best].c_total) >
210 le32_to_cpu(cl->cl_recs[curr].c_total))
211 best = curr;
212 curr++;
213 }
214 return best;
215}
216
217/*
218 * We expect the block group allocator to already be locked.
219 */
220static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
221 struct inode *alloc_inode,
222 struct buffer_head *bh)
223{
224 int status, credits;
225 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
226 struct ocfs2_chain_list *cl;
227 struct ocfs2_alloc_context *ac = NULL;
228 struct ocfs2_journal_handle *handle = NULL;
229 u32 bit_off, num_bits;
230 u16 alloc_rec;
231 u64 bg_blkno;
232 struct buffer_head *bg_bh = NULL;
233 struct ocfs2_group_desc *bg;
234
235 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
236
237 mlog_entry_void();
238
239 handle = ocfs2_alloc_handle(osb);
240 if (!handle) {
241 status = -ENOMEM;
242 mlog_errno(status);
243 goto bail;
244 }
245
246 cl = &fe->id2.i_chain;
247 status = ocfs2_reserve_clusters(osb,
248 handle,
249 le16_to_cpu(cl->cl_cpg),
250 &ac);
251 if (status < 0) {
252 if (status != -ENOSPC)
253 mlog_errno(status);
254 goto bail;
255 }
256
257 credits = ocfs2_calc_group_alloc_credits(osb->sb,
258 le16_to_cpu(cl->cl_cpg));
259 handle = ocfs2_start_trans(osb, handle, credits);
260 if (IS_ERR(handle)) {
261 status = PTR_ERR(handle);
262 handle = NULL;
263 mlog_errno(status);
264 goto bail;
265 }
266
267 status = ocfs2_claim_clusters(osb,
268 handle,
269 ac,
270 le16_to_cpu(cl->cl_cpg),
271 &bit_off,
272 &num_bits);
273 if (status < 0) {
274 if (status != -ENOSPC)
275 mlog_errno(status);
276 goto bail;
277 }
278
279 alloc_rec = ocfs2_find_smallest_chain(cl);
280
281 /* setup the group */
282 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
283 mlog(0, "new descriptor, record %u, at block %"MLFu64"\n",
284 alloc_rec, bg_blkno);
285
286 bg_bh = sb_getblk(osb->sb, bg_blkno);
287 if (!bg_bh) {
288 status = -EIO;
289 mlog_errno(status);
290 goto bail;
291 }
292 ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
293
294 status = ocfs2_block_group_fill(handle,
295 alloc_inode,
296 bg_bh,
297 bg_blkno,
298 alloc_rec,
299 cl);
300 if (status < 0) {
301 mlog_errno(status);
302 goto bail;
303 }
304
305 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
306
307 status = ocfs2_journal_access(handle, alloc_inode,
308 bh, OCFS2_JOURNAL_ACCESS_WRITE);
309 if (status < 0) {
310 mlog_errno(status);
311 goto bail;
312 }
313
314 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
315 le16_to_cpu(bg->bg_free_bits_count));
316 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
317 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
318 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
319 le16_add_cpu(&cl->cl_next_free_rec, 1);
320
321 le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
322 le16_to_cpu(bg->bg_free_bits_count));
323 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
324 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
325
326 status = ocfs2_journal_dirty(handle, bh);
327 if (status < 0) {
328 mlog_errno(status);
329 goto bail;
330 }
331
332 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
333 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
334 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
335 le32_to_cpu(fe->i_clusters)));
336 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
337 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
338 alloc_inode->i_blocks =
339 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
340
341 status = 0;
342bail:
343 if (handle)
344 ocfs2_commit_trans(handle);
345
346 if (ac)
347 ocfs2_free_alloc_context(ac);
348
349 if (bg_bh)
350 brelse(bg_bh);
351
352 mlog_exit(status);
353 return status;
354}
355
356static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
357 struct ocfs2_alloc_context *ac)
358{
359 int status;
360 u32 bits_wanted = ac->ac_bits_wanted;
361 struct inode *alloc_inode = ac->ac_inode;
362 struct buffer_head *bh = NULL;
363 struct ocfs2_journal_handle *handle = ac->ac_handle;
364 struct ocfs2_dinode *fe;
365 u32 free_bits;
366
367 mlog_entry_void();
368
369 BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
370
371 ocfs2_handle_add_inode(handle, alloc_inode);
372 status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
373 if (status < 0) {
374 mlog_errno(status);
375 goto bail;
376 }
377
378 fe = (struct ocfs2_dinode *) bh->b_data;
379 if (!OCFS2_IS_VALID_DINODE(fe)) {
380 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
381 status = -EIO;
382 goto bail;
383 }
384 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
385 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator "
386 "# %"MLFu64, le64_to_cpu(fe->i_blkno));
387 status = -EIO;
388 goto bail;
389 }
390
391 free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
392 le32_to_cpu(fe->id1.bitmap1.i_used);
393
394 if (bits_wanted > free_bits) {
395 /* cluster bitmap never grows */
396 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
397 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
398 bits_wanted, free_bits);
399 status = -ENOSPC;
400 goto bail;
401 }
402
403 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
404 if (status < 0) {
405 if (status != -ENOSPC)
406 mlog_errno(status);
407 goto bail;
408 }
409 atomic_inc(&osb->alloc_stats.bg_extends);
410
411 /* You should never ask for this much metadata */
412 BUG_ON(bits_wanted >
413 (le32_to_cpu(fe->id1.bitmap1.i_total)
414 - le32_to_cpu(fe->id1.bitmap1.i_used)));
415 }
416
417 get_bh(bh);
418 ac->ac_bh = bh;
419bail:
420 if (bh)
421 brelse(bh);
422
423 mlog_exit(status);
424 return status;
425}
426
427int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
428 struct ocfs2_journal_handle *handle,
429 struct ocfs2_dinode *fe,
430 struct ocfs2_alloc_context **ac)
431{
432 int status;
433 struct inode *alloc_inode = NULL;
434
435 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
436 if (!(*ac)) {
437 status = -ENOMEM;
438 mlog_errno(status);
439 goto bail;
440 }
441
442 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
443 (*ac)->ac_handle = handle;
444 (*ac)->ac_which = OCFS2_AC_USE_META;
445
446#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
447 alloc_inode = ocfs2_get_system_file_inode(osb,
448 EXTENT_ALLOC_SYSTEM_INODE,
449 0);
450#else
451 alloc_inode = ocfs2_get_system_file_inode(osb,
452 EXTENT_ALLOC_SYSTEM_INODE,
453 osb->slot_num);
454#endif
455 if (!alloc_inode) {
456 status = -ENOMEM;
457 mlog_errno(status);
458 goto bail;
459 }
460
461 (*ac)->ac_inode = igrab(alloc_inode);
462 (*ac)->ac_group_search = ocfs2_block_group_search;
463
464 status = ocfs2_reserve_suballoc_bits(osb, (*ac));
465 if (status < 0) {
466 if (status != -ENOSPC)
467 mlog_errno(status);
468 goto bail;
469 }
470
471 status = 0;
472bail:
473 if ((status < 0) && *ac) {
474 ocfs2_free_alloc_context(*ac);
475 *ac = NULL;
476 }
477
478 if (alloc_inode)
479 iput(alloc_inode);
480
481 mlog_exit(status);
482 return status;
483}
484
485int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
486 struct ocfs2_journal_handle *handle,
487 struct ocfs2_alloc_context **ac)
488{
489 int status;
490 struct inode *alloc_inode = NULL;
491
492 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
493 if (!(*ac)) {
494 status = -ENOMEM;
495 mlog_errno(status);
496 goto bail;
497 }
498
499 (*ac)->ac_bits_wanted = 1;
500 (*ac)->ac_handle = handle;
501 (*ac)->ac_which = OCFS2_AC_USE_INODE;
502
503 alloc_inode = ocfs2_get_system_file_inode(osb,
504 INODE_ALLOC_SYSTEM_INODE,
505 osb->slot_num);
506 if (!alloc_inode) {
507 status = -ENOMEM;
508 mlog_errno(status);
509 goto bail;
510 }
511
512 (*ac)->ac_inode = igrab(alloc_inode);
513 (*ac)->ac_group_search = ocfs2_block_group_search;
514
515 status = ocfs2_reserve_suballoc_bits(osb, *ac);
516 if (status < 0) {
517 if (status != -ENOSPC)
518 mlog_errno(status);
519 goto bail;
520 }
521
522 status = 0;
523bail:
524 if ((status < 0) && *ac) {
525 ocfs2_free_alloc_context(*ac);
526 *ac = NULL;
527 }
528
529 if (alloc_inode)
530 iput(alloc_inode);
531
532 mlog_exit(status);
533 return status;
534}
535
536/* The local alloc code has to do the same thing, so rather than do this
537 * twice, both paths share this helper. */
538int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
539 struct ocfs2_alloc_context *ac)
540{
541 int status;
542
543 ac->ac_inode = ocfs2_get_system_file_inode(osb,
544 GLOBAL_BITMAP_SYSTEM_INODE,
545 OCFS2_INVALID_SLOT);
546 if (!ac->ac_inode) {
547 status = -EINVAL;
548 mlog(ML_ERROR, "Could not get bitmap inode!\n");
549 goto bail;
550 }
551 ac->ac_which = OCFS2_AC_USE_MAIN;
552 ac->ac_group_search = ocfs2_cluster_group_search;
553
554 status = ocfs2_reserve_suballoc_bits(osb, ac);
555 if (status < 0 && status != -ENOSPC)
556 mlog_errno(status);
557bail:
558 return status;
559}
560
561/* Callers don't need to care which bitmap (local alloc or main) to
562 * use so we figure it out for them, but unfortunately this clutters
563 * things a bit. */
564int ocfs2_reserve_clusters(struct ocfs2_super *osb,
565 struct ocfs2_journal_handle *handle,
566 u32 bits_wanted,
567 struct ocfs2_alloc_context **ac)
568{
569 int status;
570
571 mlog_entry_void();
572
573 BUG_ON(!handle);
574
575 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
576 if (!(*ac)) {
577 status = -ENOMEM;
578 mlog_errno(status);
579 goto bail;
580 }
581
582 (*ac)->ac_bits_wanted = bits_wanted;
583 (*ac)->ac_handle = handle;
584
585 status = -ENOSPC;
586 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
587 status = ocfs2_reserve_local_alloc_bits(osb,
588 handle,
589 bits_wanted,
590 *ac);
591 if ((status < 0) && (status != -ENOSPC)) {
592 mlog_errno(status);
593 goto bail;
594 } else if (status == -ENOSPC) {
595 /* reserve_local_bits will return enospc with
596 * the local alloc inode still locked, so we
597 * can change this safely here. */
598 mlog(0, "Disabling local alloc\n");
599 /* We set to OCFS2_LA_DISABLED so that umount
600 * can clean up what's left of the local
601 * allocation */
602 osb->local_alloc_state = OCFS2_LA_DISABLED;
603 }
604 }
605
606 if (status == -ENOSPC) {
607 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
608 if (status < 0) {
609 if (status != -ENOSPC)
610 mlog_errno(status);
611 goto bail;
612 }
613 }
614
615 status = 0;
616bail:
617 if ((status < 0) && *ac) {
618 ocfs2_free_alloc_context(*ac);
619 *ac = NULL;
620 }
621
622 mlog_exit(status);
623 return status;
624}
625
626/*
627 * More or less lifted from ext3. I'll leave their description below:
628 *
629 * "For ext3 allocations, we must not reuse any blocks which are
630 * allocated in the bitmap buffer's "last committed data" copy. This
631 * prevents deletes from freeing up the page for reuse until we have
632 * committed the delete transaction.
633 *
634 * If we didn't do this, then deleting something and reallocating it as
635 * data would allow the old block to be overwritten before the
636 * transaction committed (because we force data to disk before commit).
637 * This would lead to corruption if we crashed between overwriting the
638 * data and committing the delete.
639 *
640 * @@@ We may want to make this allocation behaviour conditional on
641 * data-writes at some point, and disable it for metadata allocations or
642 * sync-data inodes."
643 *
644 * Note: OCFS2 already does this differently for metadata vs data
645 * allocations, as those bitmaps are separate and undo access is never
646 * called on a metadata group descriptor.
647 */
648static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
649 int nr)
650{
651 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
652
653 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
654 return 0;
655 if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
656 return 1;
657
658 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
659 return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
660}
661
662static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
663 struct buffer_head *bg_bh,
664 unsigned int bits_wanted,
665 u16 *bit_off,
666 u16 *bits_found)
667{
668 void *bitmap;
669 u16 best_offset, best_size;
670 int offset, start, found, status = 0;
671 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
672
673 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
674 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
675 return -EIO;
676 }
677
678 found = start = best_offset = best_size = 0;
679 bitmap = bg->bg_bitmap;
680
681 while((offset = ocfs2_find_next_zero_bit(bitmap,
682 le16_to_cpu(bg->bg_bits),
683 start)) != -1) {
684 if (offset == le16_to_cpu(bg->bg_bits))
685 break;
686
687 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
688 /* We found a zero, but we can't use it as it
689 * hasn't been put to disk yet! */
690 found = 0;
691 start = offset + 1;
692 } else if (offset == start) {
693 /* we found a zero */
694 found++;
695 /* move start to the next bit to test */
696 start++;
697 } else {
698 /* got a zero after some ones */
699 found = 1;
700 start = offset + 1;
701 }
702 if (found > best_size) {
703 best_size = found;
704 best_offset = start - found;
705 }
706 /* we got everything we needed */
707 if (found == bits_wanted) {
708 /* mlog(0, "Found it all!\n"); */
709 break;
710 }
711 }
712
713 /* XXX: I think the first clause is equivalent to the second
714 * - jlbec */
715 if (found == bits_wanted) {
716 *bit_off = start - found;
717 *bits_found = found;
718 } else if (best_size) {
719 *bit_off = best_offset;
720 *bits_found = best_size;
721 } else {
722 status = -ENOSPC;
723 /* No error log here -- see the comment above
724 * ocfs2_test_bg_bit_allocatable */
725 }
726
727 return status;
728}
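/*
 * Toy model of the scan above (illustrative only): the same
 * longest-run bookkeeping over a plain byte-array bitmap, without the
 * journal committed-data filtering that ocfs2_test_bg_bit_allocatable()
 * adds.  Returns the run length and stores its start in *off, or
 * returns 0 when every bit is set.
 */
static int toy_find_clear_bits(const unsigned char *bitmap, int nbits,
			       int wanted, int *off)
{
	int i, run = 0, best = 0, best_off = 0;

	for (i = 0; i < nbits; i++) {
		if (bitmap[i / 8] & (1 << (i % 8))) {
			run = 0;		/* a set bit ends the run */
			continue;
		}
		run++;				/* extend the current run */
		if (run > best) {
			best = run;
			best_off = i - run + 1;
		}
		if (best == wanted)		/* got everything we needed */
			break;
	}
	*off = best_off;
	return best;
}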
729
730static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
731 struct inode *alloc_inode,
732 struct ocfs2_group_desc *bg,
733 struct buffer_head *group_bh,
734 unsigned int bit_off,
735 unsigned int num_bits)
736{
737 int status;
738 void *bitmap = bg->bg_bitmap;
739 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
740
741 mlog_entry_void();
742
743 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
744 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
745 status = -EIO;
746 goto bail;
747 }
748 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
749
750 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
751 num_bits);
752
753 if (ocfs2_is_cluster_bitmap(alloc_inode))
754 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
755
756 status = ocfs2_journal_access(handle,
757 alloc_inode,
758 group_bh,
759 journal_type);
760 if (status < 0) {
761 mlog_errno(status);
762 goto bail;
763 }
764
765 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
766
767 while(num_bits--)
768 ocfs2_set_bit(bit_off++, bitmap);
769
770 status = ocfs2_journal_dirty(handle,
771 group_bh);
772 if (status < 0) {
773 mlog_errno(status);
774 goto bail;
775 }
776
777bail:
778 mlog_exit(status);
779 return status;
780}
781
782/* find the one with the most empty bits */
783static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
784{
785 u16 curr, best;
786
787 BUG_ON(!cl->cl_next_free_rec);
788
789 best = curr = 0;
790 while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
791 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
792 le32_to_cpu(cl->cl_recs[best].c_free))
793 best = curr;
794 curr++;
795 }
796
797 BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
798 return best;
799}
800
801static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
802 struct inode *alloc_inode,
803 struct buffer_head *fe_bh,
804 struct buffer_head *bg_bh,
805 struct buffer_head *prev_bg_bh,
806 u16 chain)
807{
808 int status;
809 /* there is a really tiny chance the journal calls could fail,
810 * but we wouldn't want inconsistent blocks in *any* case. */
811 u64 fe_ptr, bg_ptr, prev_bg_ptr;
812 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
813 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
814 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
815
816 if (!OCFS2_IS_VALID_DINODE(fe)) {
817 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
818 status = -EIO;
819 goto out;
820 }
821 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
822 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
823 status = -EIO;
824 goto out;
825 }
826 if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
827 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
828 status = -EIO;
829 goto out;
830 }
831
832 mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to "
833 "top, prev = %"MLFu64"\n",
834 fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno);
835
836 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
837 bg_ptr = le64_to_cpu(bg->bg_next_group);
838 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
839
840 status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
841 OCFS2_JOURNAL_ACCESS_WRITE);
842 if (status < 0) {
843 mlog_errno(status);
844 goto out_rollback;
845 }
846
847 prev_bg->bg_next_group = bg->bg_next_group;
848
849 status = ocfs2_journal_dirty(handle, prev_bg_bh);
850 if (status < 0) {
851 mlog_errno(status);
852 goto out_rollback;
853 }
854
855 status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
856 OCFS2_JOURNAL_ACCESS_WRITE);
857 if (status < 0) {
858 mlog_errno(status);
859 goto out_rollback;
860 }
861
862 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
863
864 status = ocfs2_journal_dirty(handle, bg_bh);
865 if (status < 0) {
866 mlog_errno(status);
867 goto out_rollback;
868 }
869
870 status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
871 OCFS2_JOURNAL_ACCESS_WRITE);
872 if (status < 0) {
873 mlog_errno(status);
874 goto out_rollback;
875 }
876
877 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
878
879 status = ocfs2_journal_dirty(handle, fe_bh);
880 if (status < 0) {
881 mlog_errno(status);
882 goto out_rollback;
883 }
884
885 status = 0;
886out_rollback:
887 if (status < 0) {
888 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
889 bg->bg_next_group = cpu_to_le64(bg_ptr);
890 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
891 }
892out:
893 mlog_exit(status);
894 return status;
895}
896
897static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
898 u32 wanted)
899{
900 return le16_to_cpu(bg->bg_free_bits_count) > wanted;
901}
902
903/* return 0 on success, -ENOSPC to keep searching and any other < 0
904 * value on error. */
905static int ocfs2_cluster_group_search(struct inode *inode,
906 struct buffer_head *group_bh,
907 u32 bits_wanted, u32 min_bits,
908 u16 *bit_off, u16 *bits_found)
909{
910 int search = -ENOSPC;
911 int ret;
912 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
913 u16 tmp_off, tmp_found;
914
915 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
916
917 if (bg->bg_free_bits_count) {
918 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
919 group_bh, bits_wanted,
920 &tmp_off, &tmp_found);
921 if (ret)
922 return ret;
923
924 /* ocfs2_block_group_find_clear_bits() might
925 * return success, but we still want to return
926 * -ENOSPC unless it found the minimum number
927 * of bits. */
928 if (min_bits <= tmp_found) {
929 *bit_off = tmp_off;
930 *bits_found = tmp_found;
931 search = 0; /* success */
932 }
933 }
934
935 return search;
936}
937
938static int ocfs2_block_group_search(struct inode *inode,
939 struct buffer_head *group_bh,
940 u32 bits_wanted, u32 min_bits,
941 u16 *bit_off, u16 *bits_found)
942{
943 int ret = -ENOSPC;
944 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
945
946 BUG_ON(min_bits != 1);
947 BUG_ON(ocfs2_is_cluster_bitmap(inode));
948
949 if (bg->bg_free_bits_count)
950 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
951 group_bh, bits_wanted,
952 bit_off, bits_found);
953
954 return ret;
955}
956
957static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
958 u32 bits_wanted,
959 u32 min_bits,
960 u16 *bit_off,
961 unsigned int *num_bits,
962 u64 *bg_blkno)
963{
964 int status;
965 u16 chain, tmp_bits;
966 u32 tmp_used;
967 u64 next_group;
968 struct ocfs2_journal_handle *handle = ac->ac_handle;
969 struct inode *alloc_inode = ac->ac_inode;
970 struct buffer_head *group_bh = NULL;
971 struct buffer_head *prev_group_bh = NULL;
972 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
973 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
974 struct ocfs2_group_desc *bg;
975
976 chain = ac->ac_chain;
977 mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n",
978 bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno);
979
980 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
981 le64_to_cpu(cl->cl_recs[chain].c_blkno),
982 &group_bh, OCFS2_BH_CACHED, alloc_inode);
983 if (status < 0) {
984 mlog_errno(status);
985 goto bail;
986 }
987 bg = (struct ocfs2_group_desc *) group_bh->b_data;
988 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
989 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
990 status = -EIO;
991 goto bail;
992 }
993
994 status = -ENOSPC;
995 /* for now, the chain search is a bit simplistic. We just use
996 * the 1st group with any empty bits. */
997 while ((status = ac->ac_group_search(alloc_inode, group_bh,
998 bits_wanted, min_bits, bit_off,
999 &tmp_bits)) == -ENOSPC) {
1000 if (!bg->bg_next_group)
1001 break;
1002
1003 if (prev_group_bh) {
1004 brelse(prev_group_bh);
1005 prev_group_bh = NULL;
1006 }
1007 next_group = le64_to_cpu(bg->bg_next_group);
1008 prev_group_bh = group_bh;
1009 group_bh = NULL;
1010 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1011 next_group, &group_bh,
1012 OCFS2_BH_CACHED, alloc_inode);
1013 if (status < 0) {
1014 mlog_errno(status);
1015 goto bail;
1016 }
1017 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1018 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1019 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1020 status = -EIO;
1021 goto bail;
1022 }
1023 }
1024 if (status < 0) {
1025 if (status != -ENOSPC)
1026 mlog_errno(status);
1027 goto bail;
1028 }
1029
1030 mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n",
1031 tmp_bits, bg->bg_blkno);
1032
1033 *num_bits = tmp_bits;
1034
1035 BUG_ON(*num_bits == 0);
1036
1037 /*
1038 * Keep track of previous block descriptor read. When
1039 * we find a target, if we have read more than X
1040 * number of descriptors, and the target is reasonably
1041 * empty, relink it to the top of its chain.
1042 *
1043 * We've read 0 extra blocks and only send one more to
1044 * the transaction, yet the next guy to search has a
1045 * much easier time.
1046 *
1047 * Do this *after* figuring out how many bits we're taking out
1048 * of our target group.
1049 */
1050 if (ac->ac_allow_chain_relink &&
1051 (prev_group_bh) &&
1052 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1053 status = ocfs2_relink_block_group(handle, alloc_inode,
1054 ac->ac_bh, group_bh,
1055 prev_group_bh, chain);
1056 if (status < 0) {
1057 mlog_errno(status);
1058 goto bail;
1059 }
1060 }
1061
1062 /* Ok, claim our bits now: set the info on dinode, chainlist
1063 * and then the group */
1064 status = ocfs2_journal_access(handle,
1065 alloc_inode,
1066 ac->ac_bh,
1067 OCFS2_JOURNAL_ACCESS_WRITE);
1068 if (status < 0) {
1069 mlog_errno(status);
1070 goto bail;
1071 }
1072
1073 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1074 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1075 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1076
1077 status = ocfs2_journal_dirty(handle,
1078 ac->ac_bh);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto bail;
1082 }
1083
1084 status = ocfs2_block_group_set_bits(handle,
1085 alloc_inode,
1086 bg,
1087 group_bh,
1088 *bit_off,
1089 *num_bits);
1090 if (status < 0) {
1091 mlog_errno(status);
1092 goto bail;
1093 }
1094
1095 mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n",
1096 *num_bits, fe->i_blkno);
1097
1098 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1099bail:
1100 if (group_bh)
1101 brelse(group_bh);
1102 if (prev_group_bh)
1103 brelse(prev_group_bh);
1104
1105 mlog_exit(status);
1106 return status;
1107}
1108
1109/* will give out up to bits_wanted contiguous bits. */
1110static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1111 struct ocfs2_alloc_context *ac,
1112 u32 bits_wanted,
1113 u32 min_bits,
1114 u16 *bit_off,
1115 unsigned int *num_bits,
1116 u64 *bg_blkno)
1117{
1118 int status;
1119 u16 victim, i;
1120 struct ocfs2_chain_list *cl;
1121 struct ocfs2_dinode *fe;
1122
1123 mlog_entry_void();
1124
1125 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1126 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1127 BUG_ON(!ac->ac_bh);
1128
1129 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1130 if (!OCFS2_IS_VALID_DINODE(fe)) {
1131 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1132 status = -EIO;
1133 goto bail;
1134 }
1135 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1136 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1137 ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u "
1138 "used bits but only %u total.",
1139 le64_to_cpu(fe->i_blkno),
1140 le32_to_cpu(fe->id1.bitmap1.i_used),
1141 le32_to_cpu(fe->id1.bitmap1.i_total));
1142 status = -EIO;
1143 goto bail;
1144 }
1145
1146 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1147
1148 victim = ocfs2_find_victim_chain(cl);
1149 ac->ac_chain = victim;
1150 ac->ac_allow_chain_relink = 1;
1151
1152 status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
1153 num_bits, bg_blkno);
1154 if (!status)
1155 goto bail;
1156 if (status < 0 && status != -ENOSPC) {
1157 mlog_errno(status);
1158 goto bail;
1159 }
1160
1161 mlog(0, "Search of victim chain %u came up with nothing, "
1162 "trying all chains now.\n", victim);
1163
1164 /* If we didn't pick a good victim, then just default to
1165 * searching each chain in order. Don't allow chain relinking
1166 * because we only calculate enough journal credits for one
1167 * relink per alloc. */
1168 ac->ac_allow_chain_relink = 0;
1169 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1170 if (i == victim)
1171 continue;
1172 if (!cl->cl_recs[i].c_free)
1173 continue;
1174
1175 ac->ac_chain = i;
1176 status = ocfs2_search_chain(ac, bits_wanted, min_bits,
1177 bit_off, num_bits,
1178 bg_blkno);
1179 if (!status)
1180 break;
1181 if (status < 0 && status != -ENOSPC) {
1182 mlog_errno(status);
1183 goto bail;
1184 }
1185 }
1186bail:
1187
1188 mlog_exit(status);
1189 return status;
1190}
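/*
 * Strategy sketch for the search above: a single, heuristically
 * picked victim chain is probed first with relinking enabled; only if
 * that returns -ENOSPC do we fall back to walking every chain with a
 * nonzero c_free, this time with relinking disabled because the
 * journal was only credited for one relink per allocation.
 */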
1191
1192int ocfs2_claim_metadata(struct ocfs2_super *osb,
1193 struct ocfs2_journal_handle *handle,
1194 struct ocfs2_alloc_context *ac,
1195 u32 bits_wanted,
1196 u16 *suballoc_bit_start,
1197 unsigned int *num_bits,
1198 u64 *blkno_start)
1199{
1200 int status;
1201 u64 bg_blkno;
1202
1203 BUG_ON(!ac);
1204 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1205 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1206 BUG_ON(ac->ac_handle != handle);
1207
1208 status = ocfs2_claim_suballoc_bits(osb,
1209 ac,
1210 bits_wanted,
1211 1,
1212 suballoc_bit_start,
1213 num_bits,
1214 &bg_blkno);
1215 if (status < 0) {
1216 mlog_errno(status);
1217 goto bail;
1218 }
1219 atomic_inc(&osb->alloc_stats.bg_allocs);
1220
1221 *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1222 ac->ac_bits_given += (*num_bits);
1223 status = 0;
1224bail:
1225 mlog_exit(status);
1226 return status;
1227}
1228
1229int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1230 struct ocfs2_journal_handle *handle,
1231 struct ocfs2_alloc_context *ac,
1232 u16 *suballoc_bit,
1233 u64 *fe_blkno)
1234{
1235 int status;
1236 unsigned int num_bits;
1237 u64 bg_blkno;
1238
1239 mlog_entry_void();
1240
1241 BUG_ON(!ac);
1242 BUG_ON(ac->ac_bits_given != 0);
1243 BUG_ON(ac->ac_bits_wanted != 1);
1244 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1245 BUG_ON(ac->ac_handle != handle);
1246
1247 status = ocfs2_claim_suballoc_bits(osb,
1248 ac,
1249 1,
1250 1,
1251 suballoc_bit,
1252 &num_bits,
1253 &bg_blkno);
1254 if (status < 0) {
1255 mlog_errno(status);
1256 goto bail;
1257 }
1258 atomic_inc(&osb->alloc_stats.bg_allocs);
1259
1260 BUG_ON(num_bits != 1);
1261
1262 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1263 ac->ac_bits_given++;
1264 status = 0;
1265bail:
1266 mlog_exit(status);
1267 return status;
1268}
1269
1270 /* translate a group desc. blkno and its bitmap offset into
1271  * a disk cluster offset. */
1272static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1273 u64 bg_blkno,
1274 u16 bg_bit_off)
1275{
1276 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1277 u32 cluster = 0;
1278
1279 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1280
1281 if (bg_blkno != osb->first_cluster_group_blkno)
1282 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1283 cluster += (u32) bg_bit_off;
1284 return cluster;
1285}
1286
1287/* given a cluster offset, calculate which block group it belongs to
1288 * and return that block offset. */
1289static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1290 u32 cluster)
1291{
1292 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1293 u32 group_no;
1294
1295 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1296
1297 group_no = cluster / osb->bitmap_cpg;
1298 if (!group_no)
1299 return osb->first_cluster_group_blkno;
1300 return ocfs2_clusters_to_blocks(inode->i_sb,
1301 group_no * osb->bitmap_cpg);
1302}
1303
1304/* given the block number of a cluster start, calculate which cluster
1305 * group and descriptor bitmap offset that corresponds to. */
1306static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1307 u64 data_blkno,
1308 u64 *bg_blkno,
1309 u16 *bg_bit_off)
1310{
1311 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1312 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1313
1314 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1315
1316 *bg_blkno = ocfs2_which_cluster_group(inode,
1317 data_cluster);
1318
1319 if (*bg_blkno == osb->first_cluster_group_blkno)
1320 *bg_bit_off = (u16) data_cluster;
1321 else
1322 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1323 data_blkno - *bg_blkno);
1324}
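/*
 * Self-contained sketch of the cluster-group math above, in plain C
 * with hypothetical parameters (illustrative only -- the kernel code
 * uses the superblock helpers instead of raw division). Every group
 * but the first starts exactly on a cluster boundary, so block <->
 * cluster translation is plain multiplication/division by
 * blocks-per-cluster; the first group starts at first_group_blkno and
 * its bit offset is simply the raw cluster number.
 */
#include <stdint.h>

struct demo_layout {
	uint64_t first_group_blkno;	/* block of the 1st group desc */
	uint32_t bitmap_cpg;		/* clusters per group */
	uint32_t blocks_per_cluster;
};

static uint64_t demo_which_cluster_group(const struct demo_layout *l,
					 uint32_t cluster)
{
	uint32_t group_no = cluster / l->bitmap_cpg;

	if (!group_no)
		return l->first_group_blkno;
	return (uint64_t)group_no * l->bitmap_cpg * l->blocks_per_cluster;
}

static void demo_block_to_cluster_group(const struct demo_layout *l,
					uint64_t data_blkno,
					uint64_t *bg_blkno,
					uint16_t *bg_bit_off)
{
	uint32_t data_cluster = (uint32_t)(data_blkno / l->blocks_per_cluster);

	*bg_blkno = demo_which_cluster_group(l, data_cluster);
	if (*bg_blkno == l->first_group_blkno)
		*bg_bit_off = (uint16_t)data_cluster;
	else
		*bg_bit_off = (uint16_t)((data_blkno - *bg_blkno) /
					 l->blocks_per_cluster);
}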
1325
1326 /*
1327  * min_bits - the minimum contiguous chunk from this total allocation
1328  * that we can handle. Set it to what we asked for originally for a
1329  * fully contiguous allocation, or to '1' to indicate that we can
1330  * deal with extents of any size.
1331  */
1332int ocfs2_claim_clusters(struct ocfs2_super *osb,
1333 struct ocfs2_journal_handle *handle,
1334 struct ocfs2_alloc_context *ac,
1335 u32 min_clusters,
1336 u32 *cluster_start,
1337 u32 *num_clusters)
1338{
1339 int status;
1340 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1341 u64 bg_blkno;
1342 u16 bg_bit_off;
1343
1344 mlog_entry_void();
1345
1346 BUG_ON(!ac);
1347 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1348
1349 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1350 && ac->ac_which != OCFS2_AC_USE_MAIN);
1351 BUG_ON(ac->ac_handle != handle);
1352
1353 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1354 status = ocfs2_claim_local_alloc_bits(osb,
1355 handle,
1356 ac,
1357 bits_wanted,
1358 cluster_start,
1359 num_clusters);
1360 if (!status)
1361 atomic_inc(&osb->alloc_stats.local_data);
1362 } else {
1363 if (min_clusters > (osb->bitmap_cpg - 1)) {
1364 			/* The only paths asking for contiguous
1365 			 * allocations should know about this already. */
1366 			mlog(ML_ERROR, "minimum allocation requested exceeds "
1367 			     "group bitmap size!\n");
1368 status = -ENOSPC;
1369 goto bail;
1370 }
1371 /* clamp the current request down to a realistic size. */
1372 if (bits_wanted > (osb->bitmap_cpg - 1))
1373 bits_wanted = osb->bitmap_cpg - 1;
1374
1375 status = ocfs2_claim_suballoc_bits(osb,
1376 ac,
1377 bits_wanted,
1378 min_clusters,
1379 &bg_bit_off,
1380 num_clusters,
1381 &bg_blkno);
1382 if (!status) {
1383 *cluster_start =
1384 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1385 bg_blkno,
1386 bg_bit_off);
1387 atomic_inc(&osb->alloc_stats.bitmap_data);
1388 }
1389 }
1390 if (status < 0) {
1391 if (status != -ENOSPC)
1392 mlog_errno(status);
1393 goto bail;
1394 }
1395
1396 ac->ac_bits_given += *num_clusters;
1397
1398bail:
1399 mlog_exit(status);
1400 return status;
1401}
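/*
 * Worked example for the clamping above, with hypothetical numbers:
 * if a group holds bitmap_cpg == 32256 clusters, the largest request
 * the main bitmap can satisfy in one pass is 32255 bits, so asking
 * for 40000 clusters with min_clusters == 1 is quietly trimmed to
 * 32255 (the caller claims again for the remainder), while
 * min_clusters == 40000 fails with -ENOSPC up front.
 */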
1402
1403static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
1404 struct inode *alloc_inode,
1405 struct ocfs2_group_desc *bg,
1406 struct buffer_head *group_bh,
1407 unsigned int bit_off,
1408 unsigned int num_bits)
1409{
1410 int status;
1411 unsigned int tmp;
1412 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1413 struct ocfs2_group_desc *undo_bg = NULL;
1414
1415 mlog_entry_void();
1416
1417 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1418 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1419 status = -EIO;
1420 goto bail;
1421 }
1422
1423 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1424
1425 if (ocfs2_is_cluster_bitmap(alloc_inode))
1426 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1427
1428 status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1429 journal_type);
1430 if (status < 0) {
1431 mlog_errno(status);
1432 goto bail;
1433 }
1434
1435 if (ocfs2_is_cluster_bitmap(alloc_inode))
1436 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1437
1438 tmp = num_bits;
1439 while(tmp--) {
1440 ocfs2_clear_bit((bit_off + tmp),
1441 (unsigned long *) bg->bg_bitmap);
1442 if (ocfs2_is_cluster_bitmap(alloc_inode))
1443 ocfs2_set_bit(bit_off + tmp,
1444 (unsigned long *) undo_bg->bg_bitmap);
1445 }
1446 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1447
1448 status = ocfs2_journal_dirty(handle, group_bh);
1449 if (status < 0)
1450 mlog_errno(status);
1451bail:
1452 return status;
1453}
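/*
 * A note on the UNDO access above (intent, as read from the code):
 * for the global cluster bitmap the freed bits are also set in the
 * journal's b_committed_data copy, so until the transaction commits
 * other allocators still see those clusters as in use. Handing them
 * out early and then losing the free to a crash would corrupt data.
 */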
1454
1455/*
1456 * expects the suballoc inode to already be locked.
1457 */
1458static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
1459 struct inode *alloc_inode,
1460 struct buffer_head *alloc_bh,
1461 unsigned int start_bit,
1462 u64 bg_blkno,
1463 unsigned int count)
1464{
1465 int status = 0;
1466 u32 tmp_used;
1467 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1468 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1469 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1470 struct buffer_head *group_bh = NULL;
1471 struct ocfs2_group_desc *group;
1472
1473 mlog_entry_void();
1474
1475 if (!OCFS2_IS_VALID_DINODE(fe)) {
1476 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1477 status = -EIO;
1478 goto bail;
1479 }
1480 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1481
1482 mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64
1483 ", starting at %u\n",
1484 OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno,
1485 start_bit);
1486
1487 status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1488 alloc_inode);
1489 if (status < 0) {
1490 mlog_errno(status);
1491 goto bail;
1492 }
1493
1494 group = (struct ocfs2_group_desc *) group_bh->b_data;
1495 if (!OCFS2_IS_VALID_GROUP_DESC(group)) {
1496 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group);
1497 status = -EIO;
1498 goto bail;
1499 }
1500 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1501
1502 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1503 group, group_bh,
1504 start_bit, count);
1505 if (status < 0) {
1506 mlog_errno(status);
1507 goto bail;
1508 }
1509
1510 status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1511 OCFS2_JOURNAL_ACCESS_WRITE);
1512 if (status < 0) {
1513 mlog_errno(status);
1514 goto bail;
1515 }
1516
1517 le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1518 count);
1519 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1520 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1521
1522 status = ocfs2_journal_dirty(handle, alloc_bh);
1523 if (status < 0) {
1524 mlog_errno(status);
1525 goto bail;
1526 }
1527
1528bail:
1529 if (group_bh)
1530 brelse(group_bh);
1531
1532 mlog_exit(status);
1533 return status;
1534}
1535
1536static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1537{
1538 u64 group = block - (u64) bit;
1539
1540 return group;
1541}
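/*
 * Worked example (values hypothetical): an inode allocated as bit 5
 * of the group whose descriptor sits at block 2048 records i_blkno ==
 * 2053 and i_suballoc_bit == 5, so 2053 - 5 recovers the group block.
 * ocfs2_free_dinode() and ocfs2_free_extent_block() below rely on
 * exactly this subtraction.
 */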
1542
1543int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
1544 struct inode *inode_alloc_inode,
1545 struct buffer_head *inode_alloc_bh,
1546 struct ocfs2_dinode *di)
1547{
1548 u64 blk = le64_to_cpu(di->i_blkno);
1549 u16 bit = le16_to_cpu(di->i_suballoc_bit);
1550 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1551
1552 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1553 inode_alloc_bh, bit, bg_blkno, 1);
1554}
1555
1556int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
1557 struct inode *eb_alloc_inode,
1558 struct buffer_head *eb_alloc_bh,
1559 struct ocfs2_extent_block *eb)
1560{
1561 u64 blk = le64_to_cpu(eb->h_blkno);
1562 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1563 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1564
1565 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1566 bit, bg_blkno, 1);
1567}
1568
1569int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
1570 struct inode *bitmap_inode,
1571 struct buffer_head *bitmap_bh,
1572 u64 start_blk,
1573 unsigned int num_clusters)
1574{
1575 int status;
1576 u16 bg_start_bit;
1577 u64 bg_blkno;
1578 struct ocfs2_dinode *fe;
1579
1580 /* You can't ever have a contiguous set of clusters
1581 * bigger than a block group bitmap so we never have to worry
1582 * about looping on them. */
1583
1584 mlog_entry_void();
1585
1586 	/* This is expensive. We can safely remove it once this stuff
1587 	 * has been tested really well. */
1588 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1589
1590 fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1591
1592 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1593 &bg_start_bit);
1594
1595 mlog(0, "want to free %u clusters starting at block %"MLFu64"\n",
1596 num_clusters, start_blk);
1597 mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n",
1598 bg_blkno, bg_start_bit);
1599
1600 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1601 bg_start_bit, bg_blkno,
1602 num_clusters);
1603 if (status < 0)
1604 mlog_errno(status);
1605
1606 mlog_exit(status);
1607 return status;
1608}
1609
1610 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1611 {
1612 	printk("Block Group:\n");
1613 	printk("bg_signature: %s\n", bg->bg_signature);
1614 	printk("bg_size: %u\n", le16_to_cpu(bg->bg_size));
1615 	printk("bg_bits: %u\n", le16_to_cpu(bg->bg_bits));
1616 	printk("bg_free_bits_count: %u\n", le16_to_cpu(bg->bg_free_bits_count));
1617 	printk("bg_chain: %u\n", le16_to_cpu(bg->bg_chain));
1618 	printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
1619 	printk("bg_next_group: %"MLFu64"\n", le64_to_cpu(bg->bg_next_group));
1620 	printk("bg_parent_dinode: %"MLFu64"\n", le64_to_cpu(bg->bg_parent_dinode));
1621 	printk("bg_blkno: %"MLFu64"\n", le64_to_cpu(bg->bg_blkno));
1622 }
1623
1624 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1625 {
1626 	int i;
1627 
1628 	printk("Suballoc Inode %"MLFu64":\n", le64_to_cpu(fe->i_blkno));
1629 	printk("i_signature: %s\n", fe->i_signature);
1630 	printk("i_size: %"MLFu64"\n", le64_to_cpu(fe->i_size));
1631 	printk("i_clusters: %u\n", le32_to_cpu(fe->i_clusters));
1632 	printk("i_generation: %u\n",
1633 	       le32_to_cpu(fe->i_generation));
1634 	printk("id1.bitmap1.i_used: %u\n",
1635 	       le32_to_cpu(fe->id1.bitmap1.i_used));
1636 	printk("id1.bitmap1.i_total: %u\n",
1637 	       le32_to_cpu(fe->id1.bitmap1.i_total));
1638 	printk("id2.i_chain.cl_cpg: %u\n", le16_to_cpu(fe->id2.i_chain.cl_cpg));
1639 	printk("id2.i_chain.cl_bpc: %u\n", le16_to_cpu(fe->id2.i_chain.cl_bpc));
1640 	printk("id2.i_chain.cl_count: %u\n", le16_to_cpu(fe->id2.i_chain.cl_count));
1641 	printk("id2.i_chain.cl_next_free_rec: %u\n",
1642 	       le16_to_cpu(fe->id2.i_chain.cl_next_free_rec));
1643 	for (i = 0; i < le16_to_cpu(fe->id2.i_chain.cl_next_free_rec); i++) {
1644 		printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
1645 		       le32_to_cpu(fe->id2.i_chain.cl_recs[i].c_free));
1646 		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1647 		       le32_to_cpu(fe->id2.i_chain.cl_recs[i].c_total));
1648 		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i,
1649 		       le64_to_cpu(fe->id2.i_chain.cl_recs[i].c_blkno));
1650 	}
1651 }
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
new file mode 100644
index 00000000000..a76c82a7cea
--- /dev/null
+++ b/fs/ocfs2/suballoc.h
@@ -0,0 +1,132 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * suballoc.h
5 *
6 * Defines the suballocator API
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_
28
29typedef int (group_search_t)(struct inode *,
30 struct buffer_head *,
31 u32,
32 u32,
33 u16 *,
34 u16 *);
35
36struct ocfs2_alloc_context {
37 struct inode *ac_inode; /* which bitmap are we allocating from? */
38 struct buffer_head *ac_bh; /* file entry bh */
39 u32 ac_bits_wanted;
40 u32 ac_bits_given;
41#define OCFS2_AC_USE_LOCAL 1
42#define OCFS2_AC_USE_MAIN 2
43#define OCFS2_AC_USE_INODE 3
44#define OCFS2_AC_USE_META 4
45 u32 ac_which;
46 struct ocfs2_journal_handle *ac_handle;
47
48 /* these are used by the chain search */
49 u16 ac_chain;
50 int ac_allow_chain_relink;
51 group_search_t *ac_group_search;
52};
53
54void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
55static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
56{
57 return ac->ac_bits_wanted - ac->ac_bits_given;
58}
59
60int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
61 struct ocfs2_journal_handle *handle,
62 struct ocfs2_dinode *fe,
63 struct ocfs2_alloc_context **ac);
64int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
65 struct ocfs2_journal_handle *handle,
66 struct ocfs2_alloc_context **ac);
67int ocfs2_reserve_clusters(struct ocfs2_super *osb,
68 struct ocfs2_journal_handle *handle,
69 u32 bits_wanted,
70 struct ocfs2_alloc_context **ac);
71
72int ocfs2_claim_metadata(struct ocfs2_super *osb,
73 struct ocfs2_journal_handle *handle,
74 struct ocfs2_alloc_context *ac,
75 u32 bits_wanted,
76 u16 *suballoc_bit_start,
77 u32 *num_bits,
78 u64 *blkno_start);
79int ocfs2_claim_new_inode(struct ocfs2_super *osb,
80 struct ocfs2_journal_handle *handle,
81 struct ocfs2_alloc_context *ac,
82 u16 *suballoc_bit,
83 u64 *fe_blkno);
84int ocfs2_claim_clusters(struct ocfs2_super *osb,
85 struct ocfs2_journal_handle *handle,
86 struct ocfs2_alloc_context *ac,
87 u32 min_clusters,
88 u32 *cluster_start,
89 u32 *num_clusters);
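/*
 * Hedged usage sketch of the reserve/claim pairing declared above
 * (error handling and locking elided; "osb", "handle" and "wanted"
 * are assumed to be set up by the caller):
 *
 *	struct ocfs2_alloc_context *ac = NULL;
 *	u32 start, len;
 *	int status;
 *
 *	status = ocfs2_reserve_clusters(osb, handle, wanted, &ac);
 *	if (!status)
 *		status = ocfs2_claim_clusters(osb, handle, ac, 1,
 *					      &start, &len);
 *	if (ac)
 *		ocfs2_free_alloc_context(ac);
 */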
90
91int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
92 struct inode *inode_alloc_inode,
93 struct buffer_head *inode_alloc_bh,
94 struct ocfs2_dinode *di);
95int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
96 struct inode *eb_alloc_inode,
97 struct buffer_head *eb_alloc_bh,
98 struct ocfs2_extent_block *eb);
99int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
100 struct inode *bitmap_inode,
101 struct buffer_head *bitmap_bh,
102 u64 start_blk,
103 unsigned int num_clusters);
104
105static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
106 u64 bg_blkno)
107{
108 /* This should work for all block group descriptors as only
109 * the 1st group descriptor of the cluster bitmap is
110 * different. */
111
112 if (bg_blkno == osb->first_cluster_group_blkno)
113 return 0;
114
115 /* the rest of the block groups are located at the beginning
116 * of their 1st cluster, so a direct translation just
117 * works. */
118 return ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
119}
120
121static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
122{
123 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
124 return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno;
125}
126
127/* This is for local alloc ONLY. Others should use the task-specific
128 * APIs above. */
129int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
130 struct ocfs2_alloc_context *ac);
131
132#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
new file mode 100644
index 00000000000..364d64bd5f1
--- /dev/null
+++ b/fs/ocfs2/super.c
@@ -0,0 +1,1733 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * super.c
5 *
6 * load/unload driver, mount/dismount volumes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h>
33#include <linux/random.h>
34#include <linux/statfs.h>
35#include <linux/moduleparam.h>
36#include <linux/blkdev.h>
37#include <linux/socket.h>
38#include <linux/inet.h>
39#include <linux/parser.h>
40#include <linux/crc32.h>
41#include <linux/debugfs.h>
42
43#include <cluster/nodemanager.h>
44
45#define MLOG_MASK_PREFIX ML_SUPER
46#include <cluster/masklog.h>
47
48#include "ocfs2.h"
49
50/* this should be the only file to include a version 1 header */
51#include "ocfs1_fs_compat.h"
52
53#include "alloc.h"
54#include "dlmglue.h"
55#include "export.h"
56#include "extent_map.h"
57#include "heartbeat.h"
58#include "inode.h"
59#include "journal.h"
60#include "localalloc.h"
61#include "namei.h"
62#include "slot_map.h"
63#include "super.h"
64#include "sysfile.h"
65#include "uptodate.h"
66#include "ver.h"
67#include "vote.h"
68
69#include "buffer_head_io.h"
70
71/*
72 * Globals
73 */
74static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED;
75
76static u32 osb_id; /* Keeps track of next available OSB Id */
77
78static kmem_cache_t *ocfs2_inode_cachep = NULL;
79
80kmem_cache_t *ocfs2_lock_cache = NULL;
81
82/* OCFS2 needs to schedule several different types of work which
83 * require cluster locking, disk I/O, recovery waits, etc. Since these
84 * types of work tend to be heavy we avoid using the kernel events
85 * workqueue and schedule on our own. */
86struct workqueue_struct *ocfs2_wq = NULL;
87
88static struct dentry *ocfs2_debugfs_root = NULL;
89
90MODULE_AUTHOR("Oracle");
91MODULE_LICENSE("GPL");
92
93static int ocfs2_parse_options(struct super_block *sb, char *options,
94 unsigned long *mount_opt, int is_remount);
95static void ocfs2_put_super(struct super_block *sb);
96static int ocfs2_mount_volume(struct super_block *sb);
97static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
98static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
99static int ocfs2_initialize_mem_caches(void);
100static void ocfs2_free_mem_caches(void);
101static void ocfs2_delete_osb(struct ocfs2_super *osb);
102
103static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf);
104
105static int ocfs2_sync_fs(struct super_block *sb, int wait);
106
107static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
108static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
109static int ocfs2_release_system_inodes(struct ocfs2_super *osb);
110static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
111static int ocfs2_check_volume(struct ocfs2_super *osb);
112static int ocfs2_verify_volume(struct ocfs2_dinode *di,
113 struct buffer_head *bh,
114 u32 sectsize);
115static int ocfs2_initialize_super(struct super_block *sb,
116 struct buffer_head *bh,
117 int sector_size);
118static int ocfs2_get_sector(struct super_block *sb,
119 struct buffer_head **bh,
120 int block,
121 int sect_size);
122static void ocfs2_write_super(struct super_block *sb);
123static struct inode *ocfs2_alloc_inode(struct super_block *sb);
124static void ocfs2_destroy_inode(struct inode *inode);
125
126static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
127
128static struct super_operations ocfs2_sops = {
129 .statfs = ocfs2_statfs,
130 .alloc_inode = ocfs2_alloc_inode,
131 .destroy_inode = ocfs2_destroy_inode,
132 .drop_inode = ocfs2_drop_inode,
133 .clear_inode = ocfs2_clear_inode,
134 .delete_inode = ocfs2_delete_inode,
135 .sync_fs = ocfs2_sync_fs,
136 .write_super = ocfs2_write_super,
137 .put_super = ocfs2_put_super,
138 .remount_fs = ocfs2_remount,
139};
140
141enum {
142 Opt_barrier,
143 Opt_err_panic,
144 Opt_err_ro,
145 Opt_intr,
146 Opt_nointr,
147 Opt_hb_none,
148 Opt_hb_local,
149 Opt_data_ordered,
150 Opt_data_writeback,
151 Opt_err,
152};
153
154static match_table_t tokens = {
155 {Opt_barrier, "barrier=%u"},
156 {Opt_err_panic, "errors=panic"},
157 {Opt_err_ro, "errors=remount-ro"},
158 {Opt_intr, "intr"},
159 {Opt_nointr, "nointr"},
160 {Opt_hb_none, OCFS2_HB_NONE},
161 {Opt_hb_local, OCFS2_HB_LOCAL},
162 {Opt_data_ordered, "data=ordered"},
163 {Opt_data_writeback, "data=writeback"},
164 {Opt_err, NULL}
165};
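/*
 * Illustrative example: a mount string such as
 * "data=writeback,nointr,barrier=1" is tokenized against this table
 * by ocfs2_parse_options() below, setting OCFS2_MOUNT_DATA_WRITEBACK,
 * OCFS2_MOUNT_NOINTR and OCFS2_MOUNT_BARRIER in *mount_opt.
 */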
166
167/*
168 * write_super and sync_fs ripped right out of ext3.
169 */
170static void ocfs2_write_super(struct super_block *sb)
171{
172 if (mutex_trylock(&sb->s_lock) != 0)
173 BUG();
174 sb->s_dirt = 0;
175}
176
177static int ocfs2_sync_fs(struct super_block *sb, int wait)
178{
179 int status = 0;
180 tid_t target;
181 struct ocfs2_super *osb = OCFS2_SB(sb);
182
183 sb->s_dirt = 0;
184
185 if (ocfs2_is_hard_readonly(osb))
186 return -EROFS;
187
188 if (wait) {
189 status = ocfs2_flush_truncate_log(osb);
190 if (status < 0)
191 mlog_errno(status);
192 } else {
193 ocfs2_schedule_truncate_log_flush(osb, 0);
194 }
195
196 if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
197 if (wait)
198 log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
199 target);
200 }
201 return 0;
202}
203
204static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
205{
206 struct inode *new = NULL;
207 int status = 0;
208 int i;
209
210 mlog_entry_void();
211
212 new = ocfs2_iget(osb, osb->root_blkno);
213 if (IS_ERR(new)) {
214 status = PTR_ERR(new);
215 mlog_errno(status);
216 goto bail;
217 }
218 osb->root_inode = new;
219
220 new = ocfs2_iget(osb, osb->system_dir_blkno);
221 if (IS_ERR(new)) {
222 status = PTR_ERR(new);
223 mlog_errno(status);
224 goto bail;
225 }
226 osb->sys_root_inode = new;
227
228 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
229 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
230 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
231 if (!new) {
232 ocfs2_release_system_inodes(osb);
233 status = -EINVAL;
234 mlog_errno(status);
235 /* FIXME: Should ERROR_RO_FS */
236 mlog(ML_ERROR, "Unable to load system inode %d, "
237 "possibly corrupt fs?", i);
238 goto bail;
239 }
240 // the array now has one ref, so drop this one
241 iput(new);
242 }
243
244bail:
245 mlog_exit(status);
246 return status;
247}
248
249static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
250{
251 struct inode *new = NULL;
252 int status = 0;
253 int i;
254
255 mlog_entry_void();
256
257 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
258 i < NUM_SYSTEM_INODES;
259 i++) {
260 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
261 if (!new) {
262 ocfs2_release_system_inodes(osb);
263 status = -EINVAL;
264 mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
265 status, i, osb->slot_num);
266 goto bail;
267 }
268 /* the array now has one ref, so drop this one */
269 iput(new);
270 }
271
272bail:
273 mlog_exit(status);
274 return status;
275}
276
277static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
278{
279 int status = 0, i;
280 struct inode *inode;
281
282 mlog_entry_void();
283
284 for (i = 0; i < NUM_SYSTEM_INODES; i++) {
285 inode = osb->system_inodes[i];
286 if (inode) {
287 iput(inode);
288 osb->system_inodes[i] = NULL;
289 }
290 }
291
292 inode = osb->sys_root_inode;
293 if (inode) {
294 iput(inode);
295 osb->sys_root_inode = NULL;
296 }
297
298 inode = osb->root_inode;
299 if (inode) {
300 iput(inode);
301 osb->root_inode = NULL;
302 }
303
304 mlog_exit(status);
305 return status;
306}
307
308/* We're allocating fs objects, use GFP_NOFS */
309static struct inode *ocfs2_alloc_inode(struct super_block *sb)
310{
311 struct ocfs2_inode_info *oi;
312
313 oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS);
314 if (!oi)
315 return NULL;
316
317 return &oi->vfs_inode;
318}
319
320static void ocfs2_destroy_inode(struct inode *inode)
321{
322 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
323}
324
325/* From xfs_super.c:xfs_max_file_offset
326 * Copyright (c) 2000-2004 Silicon Graphics, Inc.
327 */
328static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
329{
330 unsigned int pagefactor = 1;
331 unsigned int bitshift = BITS_PER_LONG - 1;
332
333 /* Figure out maximum filesize, on Linux this can depend on
334 * the filesystem blocksize (on 32 bit platforms).
335 * __block_prepare_write does this in an [unsigned] long...
336 * page->index << (PAGE_CACHE_SHIFT - bbits)
337 * So, for page sized blocks (4K on 32 bit platforms),
338 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
339 * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
340 * but for smaller blocksizes it is less (bbits = log2 bsize).
341 * Note1: get_block_t takes a long (implicit cast from above)
342 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
343 * can optionally convert the [unsigned] long from above into
344 * an [unsigned] long long.
345 */
346
347#if BITS_PER_LONG == 32
348# if defined(CONFIG_LBD)
349 BUG_ON(sizeof(sector_t) != 8);
350 pagefactor = PAGE_CACHE_SIZE;
351 bitshift = BITS_PER_LONG;
352# else
353 pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
354# endif
355#endif
356
357 return (((unsigned long long)pagefactor) << bitshift) - 1;
358}
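/*
 * Worked numbers for the function above, assuming 4KB pages
 * (PAGE_CACHE_SHIFT == 12): on 64-bit, pagefactor == 1 and
 * bitshift == 63, so the limit is 2^63 - 1. On 32-bit without
 * CONFIG_LBD and 4KB blocks, pagefactor == 4096 and bitshift == 31,
 * giving 2^43 - 1 (~8TB); with 1KB blocks pagefactor drops to 1024
 * and the limit to 2^41 - 1 (~2TB).
 */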
359
360static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
361{
362 int incompat_features;
363 int ret = 0;
364 unsigned long parsed_options;
365 struct ocfs2_super *osb = OCFS2_SB(sb);
366
367 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
368 ret = -EINVAL;
369 goto out;
370 }
371
372 if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
373 (parsed_options & OCFS2_MOUNT_HB_LOCAL)) {
374 ret = -EINVAL;
375 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
376 goto out;
377 }
378
379 if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
380 (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) {
381 ret = -EINVAL;
382 mlog(ML_ERROR, "Cannot change data mode on remount\n");
383 goto out;
384 }
385
386 /* We're going to/from readonly mode. */
387 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
388 /* Lock here so the check of HARD_RO and the potential
389 * setting of SOFT_RO is atomic. */
390 spin_lock(&osb->osb_lock);
391 if (osb->osb_flags & OCFS2_OSB_HARD_RO) {
392 mlog(ML_ERROR, "Remount on readonly device is forbidden.\n");
393 ret = -EROFS;
394 goto unlock_osb;
395 }
396
397 if (*flags & MS_RDONLY) {
398 mlog(0, "Going to ro mode.\n");
399 sb->s_flags |= MS_RDONLY;
400 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
401 } else {
402 mlog(0, "Making ro filesystem writeable.\n");
403
404 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
405 mlog(ML_ERROR, "Cannot remount RDWR "
406 "filesystem due to previous errors.\n");
407 ret = -EROFS;
408 goto unlock_osb;
409 }
410 incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP);
411 if (incompat_features) {
412 mlog(ML_ERROR, "Cannot remount RDWR because "
413 "of unsupported optional features "
414 "(%x).\n", incompat_features);
415 ret = -EINVAL;
416 goto unlock_osb;
417 }
418 sb->s_flags &= ~MS_RDONLY;
419 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
420 }
421unlock_osb:
422 spin_unlock(&osb->osb_lock);
423 }
424
425 if (!ret) {
426 if (!ocfs2_is_hard_readonly(osb))
427 ocfs2_set_journal_params(osb);
428
429 /* Only save off the new mount options in case of a successful
430 * remount. */
431 osb->s_mount_opt = parsed_options;
432 }
433out:
434 return ret;
435}
436
437static int ocfs2_sb_probe(struct super_block *sb,
438 struct buffer_head **bh,
439 int *sector_size)
440{
441 int status = 0, tmpstat;
442 struct ocfs1_vol_disk_hdr *hdr;
443 struct ocfs2_dinode *di;
444 int blksize;
445
446 *bh = NULL;
447
448 /* may be > 512 */
449 *sector_size = bdev_hardsect_size(sb->s_bdev);
450 if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
451 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
452 *sector_size, OCFS2_MAX_BLOCKSIZE);
453 status = -EINVAL;
454 goto bail;
455 }
456
457 /* Can this really happen? */
458 if (*sector_size < OCFS2_MIN_BLOCKSIZE)
459 *sector_size = OCFS2_MIN_BLOCKSIZE;
460
461 /* check block zero for old format */
462 status = ocfs2_get_sector(sb, bh, 0, *sector_size);
463 if (status < 0) {
464 mlog_errno(status);
465 goto bail;
466 }
467 hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data;
468 if (hdr->major_version == OCFS1_MAJOR_VERSION) {
469 mlog(ML_ERROR, "incompatible version: %u.%u\n",
470 hdr->major_version, hdr->minor_version);
471 status = -EINVAL;
472 }
473 if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
474 strlen(OCFS1_VOLUME_SIGNATURE)) == 0) {
475 		mlog(ML_ERROR, "incompatible volume signature: %.8s\n",
476 hdr->signature);
477 status = -EINVAL;
478 }
479 brelse(*bh);
480 *bh = NULL;
481 if (status < 0) {
482 mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be "
483 "upgraded before mounting with ocfs v2\n");
484 goto bail;
485 }
486
487 /*
488 * Now check at magic offset for 512, 1024, 2048, 4096
489 * blocksizes. 4096 is the maximum blocksize because it is
490 * the minimum clustersize.
491 */
492 status = -EINVAL;
493 for (blksize = *sector_size;
494 blksize <= OCFS2_MAX_BLOCKSIZE;
495 blksize <<= 1) {
496 tmpstat = ocfs2_get_sector(sb, bh,
497 OCFS2_SUPER_BLOCK_BLKNO,
498 blksize);
499 if (tmpstat < 0) {
500 status = tmpstat;
501 mlog_errno(status);
502 goto bail;
503 }
504 di = (struct ocfs2_dinode *) (*bh)->b_data;
505 status = ocfs2_verify_volume(di, *bh, blksize);
506 if (status >= 0)
507 goto bail;
508 brelse(*bh);
509 *bh = NULL;
510 if (status != -EAGAIN)
511 break;
512 }
513
514bail:
515 return status;
516}
517
518static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
519{
520 struct dentry *root;
521 int status, sector_size;
522 unsigned long parsed_opt;
523 struct inode *inode = NULL;
524 struct ocfs2_super *osb = NULL;
525 struct buffer_head *bh = NULL;
526
527 mlog_entry("%p, %p, %i", sb, data, silent);
528
529 /* for now we only have one cluster/node, make sure we see it
530 * in the heartbeat universe */
531 if (!o2hb_check_local_node_heartbeating()) {
532 status = -EINVAL;
533 goto read_super_error;
534 }
535
536 /* probe for superblock */
537 status = ocfs2_sb_probe(sb, &bh, &sector_size);
538 if (status < 0) {
539 mlog(ML_ERROR, "superblock probe failed!\n");
540 goto read_super_error;
541 }
542
543 status = ocfs2_initialize_super(sb, bh, sector_size);
544 osb = OCFS2_SB(sb);
545 if (status < 0) {
546 mlog_errno(status);
547 goto read_super_error;
548 }
549 brelse(bh);
550 bh = NULL;
551
552 if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
553 status = -EINVAL;
554 goto read_super_error;
555 }
556 osb->s_mount_opt = parsed_opt;
557
558 sb->s_magic = OCFS2_SUPER_MAGIC;
559
560 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
561 * heartbeat=none */
562 if (bdev_read_only(sb->s_bdev)) {
563 if (!(sb->s_flags & MS_RDONLY)) {
564 status = -EACCES;
565 mlog(ML_ERROR, "Readonly device detected but readonly "
566 "mount was not specified.\n");
567 goto read_super_error;
568 }
569
570 /* You should not be able to start a local heartbeat
571 * on a readonly device. */
572 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
573 status = -EROFS;
574 mlog(ML_ERROR, "Local heartbeat specified on readonly "
575 "device.\n");
576 goto read_super_error;
577 }
578
579 status = ocfs2_check_journals_nolocks(osb);
580 if (status < 0) {
581 if (status == -EROFS)
582 mlog(ML_ERROR, "Recovery required on readonly "
583 "file system, but write access is "
584 "unavailable.\n");
585 else
586 mlog_errno(status);
587 goto read_super_error;
588 }
589
590 ocfs2_set_ro_flag(osb, 1);
591
592 printk(KERN_NOTICE "Readonly device detected. No cluster "
593 "services will be utilized for this mount. Recovery "
594 "will be skipped.\n");
595 }
596
597 if (!ocfs2_is_hard_readonly(osb)) {
598 /* If this isn't a hard readonly mount, then we need
599 * to make sure that heartbeat is in a valid state,
600 		 * and that we mark ourselves soft readonly if -o ro
601 * was specified. */
602 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
603 mlog(ML_ERROR, "No heartbeat for device (%s)\n",
604 sb->s_id);
605 status = -EINVAL;
606 goto read_super_error;
607 }
608
609 if (sb->s_flags & MS_RDONLY)
610 ocfs2_set_ro_flag(osb, 0);
611 }
612
613 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
614 ocfs2_debugfs_root);
615 if (!osb->osb_debug_root) {
616 status = -EINVAL;
617 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
618 goto read_super_error;
619 }
620
621 status = ocfs2_mount_volume(sb);
622 if (osb->root_inode)
623 inode = igrab(osb->root_inode);
624
625 if (status < 0)
626 goto read_super_error;
627
628 if (!inode) {
629 status = -EIO;
630 mlog_errno(status);
631 goto read_super_error;
632 }
633
634 root = d_alloc_root(inode);
635 if (!root) {
636 status = -ENOMEM;
637 mlog_errno(status);
638 goto read_super_error;
639 }
640
641 sb->s_root = root;
642
643 ocfs2_complete_mount_recovery(osb);
644
645 printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
646 "data mode.\n",
647 MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
648 osb->slot_num,
649 osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
650 "ordered");
651
652 atomic_set(&osb->vol_state, VOLUME_MOUNTED);
653 wake_up(&osb->osb_mount_event);
654
655 mlog_exit(status);
656 return status;
657
658read_super_error:
659 if (bh != NULL)
660 brelse(bh);
661
662 if (inode)
663 iput(inode);
664
665 if (osb) {
666 atomic_set(&osb->vol_state, VOLUME_DISABLED);
667 wake_up(&osb->osb_mount_event);
668 ocfs2_dismount_volume(sb, 1);
669 }
670
671 mlog_exit(status);
672 return status;
673}
674
675static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type,
676 int flags,
677 const char *dev_name,
678 void *data)
679{
680 return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
681}
682
683static struct file_system_type ocfs2_fs_type = {
684 .owner = THIS_MODULE,
685 .name = "ocfs2",
686 .get_sb = ocfs2_get_sb, /* is this called when we mount
687 * the fs? */
688 .kill_sb = kill_block_super, /* set to the generic one
689 * right now, but do we
690 * need to change that? */
691 .fs_flags = FS_REQUIRES_DEV,
692 .next = NULL
693};
694
695static int ocfs2_parse_options(struct super_block *sb,
696 char *options,
697 unsigned long *mount_opt,
698 int is_remount)
699{
700 int status;
701 char *p;
702
703 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
704 options ? options : "(none)");
705
706 *mount_opt = 0;
707
708 if (!options) {
709 status = 1;
710 goto bail;
711 }
712
713 while ((p = strsep(&options, ",")) != NULL) {
714 int token, option;
715 substring_t args[MAX_OPT_ARGS];
716
717 if (!*p)
718 continue;
719
720 token = match_token(p, tokens, args);
721 switch (token) {
722 case Opt_hb_local:
723 *mount_opt |= OCFS2_MOUNT_HB_LOCAL;
724 break;
725 case Opt_hb_none:
726 *mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
727 break;
728 case Opt_barrier:
729 if (match_int(&args[0], &option)) {
730 status = 0;
731 goto bail;
732 }
733 if (option)
734 *mount_opt |= OCFS2_MOUNT_BARRIER;
735 else
736 *mount_opt &= ~OCFS2_MOUNT_BARRIER;
737 break;
738 case Opt_intr:
739 *mount_opt &= ~OCFS2_MOUNT_NOINTR;
740 break;
741 case Opt_nointr:
742 *mount_opt |= OCFS2_MOUNT_NOINTR;
743 break;
744 case Opt_err_panic:
745 *mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
746 break;
747 case Opt_err_ro:
748 *mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
749 break;
750 case Opt_data_ordered:
751 *mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
752 break;
753 case Opt_data_writeback:
754 *mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
755 break;
756 default:
757 mlog(ML_ERROR,
758 "Unrecognized mount option \"%s\" "
759 "or missing value\n", p);
760 status = 0;
761 goto bail;
762 }
763 }
764
765 status = 1;
766
767bail:
768 mlog_exit(status);
769 return status;
770}
771
772static int __init ocfs2_init(void)
773{
774 int status;
775
776 mlog_entry_void();
777
778 ocfs2_print_version();
779
780 if (init_ocfs2_extent_maps())
781 return -ENOMEM;
782
783 status = init_ocfs2_uptodate_cache();
784 if (status < 0) {
785 mlog_errno(status);
786 goto leave;
787 }
788
789 status = ocfs2_initialize_mem_caches();
790 if (status < 0) {
791 mlog_errno(status);
792 goto leave;
793 }
794
795 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
796 if (!ocfs2_wq) {
797 status = -ENOMEM;
798 goto leave;
799 }
800
801 spin_lock(&ocfs2_globals_lock);
802 osb_id = 0;
803 spin_unlock(&ocfs2_globals_lock);
804
805 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
806 if (!ocfs2_debugfs_root) {
807 status = -EFAULT;
808 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
809 }
810
811leave:
812 if (status < 0) {
813 ocfs2_free_mem_caches();
814 exit_ocfs2_uptodate_cache();
815 exit_ocfs2_extent_maps();
816 }
817
818 mlog_exit(status);
819
820 if (status >= 0) {
821 return register_filesystem(&ocfs2_fs_type);
822 } else
823 		return status;
824}
825
826static void __exit ocfs2_exit(void)
827{
828 mlog_entry_void();
829
830 if (ocfs2_wq) {
831 flush_workqueue(ocfs2_wq);
832 destroy_workqueue(ocfs2_wq);
833 }
834
835 debugfs_remove(ocfs2_debugfs_root);
836
837 ocfs2_free_mem_caches();
838
839 unregister_filesystem(&ocfs2_fs_type);
840
841 exit_ocfs2_extent_maps();
842
843 exit_ocfs2_uptodate_cache();
844
845 mlog_exit_void();
846}
847
848static void ocfs2_put_super(struct super_block *sb)
849{
850 mlog_entry("(0x%p)\n", sb);
851
852 ocfs2_sync_blockdev(sb);
853 ocfs2_dismount_volume(sb, 0);
854
855 mlog_exit_void();
856}
857
858static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
859{
860 struct ocfs2_super *osb;
861 u32 numbits, freebits;
862 int status;
863 struct ocfs2_dinode *bm_lock;
864 struct buffer_head *bh = NULL;
865 struct inode *inode = NULL;
866
867 mlog_entry("(%p, %p)\n", sb, buf);
868
869 osb = OCFS2_SB(sb);
870
871 inode = ocfs2_get_system_file_inode(osb,
872 GLOBAL_BITMAP_SYSTEM_INODE,
873 OCFS2_INVALID_SLOT);
874 if (!inode) {
875 mlog(ML_ERROR, "failed to get bitmap inode\n");
876 status = -EIO;
877 goto bail;
878 }
879
880 status = ocfs2_meta_lock(inode, NULL, &bh, 0);
881 if (status < 0) {
882 mlog_errno(status);
883 goto bail;
884 }
885
886 bm_lock = (struct ocfs2_dinode *) bh->b_data;
887
888 numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total);
889 freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
890
891 buf->f_type = OCFS2_SUPER_MAGIC;
892 buf->f_bsize = sb->s_blocksize;
893 buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
894 buf->f_blocks = ((sector_t) numbits) *
895 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
896 buf->f_bfree = ((sector_t) freebits) *
897 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
898 buf->f_bavail = buf->f_bfree;
899 buf->f_files = numbits;
900 buf->f_ffree = freebits;
901
902 brelse(bh);
903
904 ocfs2_meta_unlock(inode, 0);
905 status = 0;
906bail:
907 if (inode)
908 iput(inode);
909
910 mlog_exit(status);
911
912 return status;
913}
914
915static void ocfs2_inode_init_once(void *data,
916 kmem_cache_t *cachep,
917 unsigned long flags)
918{
919 struct ocfs2_inode_info *oi = data;
920
921 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
922 SLAB_CTOR_CONSTRUCTOR) {
923 oi->ip_flags = 0;
924 oi->ip_open_count = 0;
925 spin_lock_init(&oi->ip_lock);
926 ocfs2_extent_map_init(&oi->vfs_inode);
927 INIT_LIST_HEAD(&oi->ip_handle_list);
928 INIT_LIST_HEAD(&oi->ip_io_markers);
929 oi->ip_handle = NULL;
930 oi->ip_created_trans = 0;
931 oi->ip_last_trans = 0;
932 oi->ip_dir_start_lookup = 0;
933
934 init_rwsem(&oi->ip_alloc_sem);
935 init_MUTEX(&(oi->ip_io_sem));
936
937 oi->ip_blkno = 0ULL;
938 oi->ip_clusters = 0;
939
940 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
941 ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
942 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
943
944 ocfs2_metadata_cache_init(&oi->vfs_inode);
945
946 inode_init_once(&oi->vfs_inode);
947 }
948}
949
950static int ocfs2_initialize_mem_caches(void)
951{
952 ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
953 sizeof(struct ocfs2_inode_info),
954 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
955 ocfs2_inode_init_once, NULL);
956 if (!ocfs2_inode_cachep)
957 return -ENOMEM;
958
959 ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
960 sizeof(struct ocfs2_journal_lock),
961 0,
962 SLAB_NO_REAP|SLAB_HWCACHE_ALIGN,
963 NULL, NULL);
964 if (!ocfs2_lock_cache)
965 return -ENOMEM;
966
967 return 0;
968}
969
970static void ocfs2_free_mem_caches(void)
971{
972 if (ocfs2_inode_cachep)
973 kmem_cache_destroy(ocfs2_inode_cachep);
974 if (ocfs2_lock_cache)
975 kmem_cache_destroy(ocfs2_lock_cache);
976
977 ocfs2_inode_cachep = NULL;
978 ocfs2_lock_cache = NULL;
979}
980
981static int ocfs2_get_sector(struct super_block *sb,
982 struct buffer_head **bh,
983 int block,
984 int sect_size)
985{
986 if (!sb_set_blocksize(sb, sect_size)) {
987 mlog(ML_ERROR, "unable to set blocksize\n");
988 return -EIO;
989 }
990
991 *bh = sb_getblk(sb, block);
992 if (!*bh) {
993 mlog_errno(-EIO);
994 return -EIO;
995 }
996 lock_buffer(*bh);
997 if (!buffer_dirty(*bh))
998 clear_buffer_uptodate(*bh);
999 unlock_buffer(*bh);
1000 ll_rw_block(READ, 1, bh);
1001 wait_on_buffer(*bh);
1002 return 0;
1003}
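/*
 * A note on the buffer dance above (intent, as read from the code):
 * clearing the uptodate bit on a clean buffer forces ll_rw_block() to
 * go to disk instead of trusting a cached copy, which matters while
 * probing for a superblock with the blocksize still being guessed;
 * dirty buffers are left alone so pending writes aren't lost.
 */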
1004
1005/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
1006static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
1007{
1008 int status;
1009
1010 	/* XXX hold a ref on the node while mounted? easy enough, if
1011 * desirable. */
1012 osb->node_num = o2nm_this_node();
1013 if (osb->node_num == O2NM_MAX_NODES) {
1014 mlog(ML_ERROR, "could not find this host's node number\n");
1015 status = -ENOENT;
1016 goto bail;
1017 }
1018
1019 mlog(ML_NOTICE, "I am node %d\n", osb->node_num);
1020
1021 status = 0;
1022bail:
1023 return status;
1024}
1025
1026static int ocfs2_mount_volume(struct super_block *sb)
1027{
1028 int status = 0;
1029 int unlock_super = 0;
1030 struct ocfs2_super *osb = OCFS2_SB(sb);
1031
1032 mlog_entry_void();
1033
1034 if (ocfs2_is_hard_readonly(osb))
1035 goto leave;
1036
1037 status = ocfs2_fill_local_node_info(osb);
1038 if (status < 0) {
1039 mlog_errno(status);
1040 goto leave;
1041 }
1042
1043 status = ocfs2_register_hb_callbacks(osb);
1044 if (status < 0) {
1045 mlog_errno(status);
1046 goto leave;
1047 }
1048
1049 status = ocfs2_dlm_init(osb);
1050 if (status < 0) {
1051 mlog_errno(status);
1052 goto leave;
1053 }
1054
1055 /* requires vote_thread to be running. */
1056 status = ocfs2_register_net_handlers(osb);
1057 if (status < 0) {
1058 mlog_errno(status);
1059 goto leave;
1060 }
1061
1062 status = ocfs2_super_lock(osb, 1);
1063 if (status < 0) {
1064 mlog_errno(status);
1065 goto leave;
1066 }
1067 unlock_super = 1;
1068
1069 /* This will load up the node map and add ourselves to it. */
1070 status = ocfs2_find_slot(osb);
1071 if (status < 0) {
1072 mlog_errno(status);
1073 goto leave;
1074 }
1075
1076 ocfs2_populate_mounted_map(osb);
1077
1078 /* load all node-local system inodes */
1079 status = ocfs2_init_local_system_inodes(osb);
1080 if (status < 0) {
1081 mlog_errno(status);
1082 goto leave;
1083 }
1084
1085 status = ocfs2_check_volume(osb);
1086 if (status < 0) {
1087 mlog_errno(status);
1088 goto leave;
1089 }
1090
1091 status = ocfs2_truncate_log_init(osb);
1092 if (status < 0) {
1093 mlog_errno(status);
1094 goto leave;
1095 }
1096
1097 	/* This should be sent *after* we've recovered our journal as it
1098 * will cause other nodes to unmark us as needing
1099 * recovery. However, we need to send it *before* dropping the
1100 * super block lock as otherwise their recovery threads might
1101 * try to clean us up while we're live! */
1102 status = ocfs2_request_mount_vote(osb);
1103 if (status < 0)
1104 mlog_errno(status);
1105
1106leave:
1107 if (unlock_super)
1108 ocfs2_super_unlock(osb, 1);
1109
1110 mlog_exit(status);
1111 return status;
1112}
1113
1114/* we can't grab the goofy sem lock from inside wait_event, so we use
1115 * memory barriers to make sure that we'll see the null task before
1116 * being woken up */
1117static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
1118{
1119 mb();
1120 return osb->recovery_thread_task != NULL;
1121}
1122
1123static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1124{
1125 int tmp;
1126 struct ocfs2_super *osb = NULL;
1127
1128 mlog_entry("(0x%p)\n", sb);
1129
1130 BUG_ON(!sb);
1131 osb = OCFS2_SB(sb);
1132 BUG_ON(!osb);
1133
1134 ocfs2_shutdown_local_alloc(osb);
1135
1136 ocfs2_truncate_log_shutdown(osb);
1137
1138 /* disable any new recovery threads and wait for any currently
1139 * running ones to exit. Do this before setting the vol_state. */
1140 down(&osb->recovery_lock);
1141 osb->disable_recovery = 1;
1142 up(&osb->recovery_lock);
1143 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
1144
1145 /* At this point, we know that no more recovery threads can be
1146 * launched, so wait for any recovery completion work to
1147 * complete. */
1148 flush_workqueue(ocfs2_wq);
1149
1150 ocfs2_journal_shutdown(osb);
1151
1152 ocfs2_sync_blockdev(sb);
1153
1154 	/* No dlm means we've failed during mount, so skip all the
1155 	 * steps that depended on it. */
1156 if (osb->dlm) {
1157 tmp = ocfs2_super_lock(osb, 1);
1158 if (tmp < 0) {
1159 mlog_errno(tmp);
1160 return;
1161 }
1162
1163 tmp = ocfs2_request_umount_vote(osb);
1164 if (tmp < 0)
1165 mlog_errno(tmp);
1166
1167 if (osb->slot_num != OCFS2_INVALID_SLOT)
1168 ocfs2_put_slot(osb);
1169
1170 ocfs2_super_unlock(osb, 1);
1171 }
1172
1173 ocfs2_release_system_inodes(osb);
1174
1175 if (osb->dlm) {
1176 ocfs2_unregister_net_handlers(osb);
1177
1178 ocfs2_dlm_shutdown(osb);
1179 }
1180
1181 ocfs2_clear_hb_callbacks(osb);
1182
1183 debugfs_remove(osb->osb_debug_root);
1184
1185 if (!mnt_err)
1186 ocfs2_stop_heartbeat(osb);
1187
1188 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
1189
1190 printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n",
1191 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num);
1192
1193 ocfs2_delete_osb(osb);
1194 kfree(osb);
1195 sb->s_dev = 0;
1196 sb->s_fs_info = NULL;
1197}
1198
1199static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid,
1200 unsigned uuid_bytes)
1201{
1202 int i, ret;
1203 char *ptr;
1204
1205 BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
1206
1207 osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
1208 if (osb->uuid_str == NULL)
1209 return -ENOMEM;
1210
1211 memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN);
1212
1213 for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
1214 /* print with null */
1215 ret = snprintf(ptr, 3, "%02X", uuid[i]);
1216 if (ret != 2) /* drop super cleans up */
1217 return -EINVAL;
1218 /* then only advance past the last char */
1219 ptr += 2;
1220 }
1221
1222 return 0;
1223}
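/*
 * Self-contained sketch of the same hex expansion (userspace C,
 * illustrative only): each snprintf() writes two hex digits plus a
 * NUL, and advancing the cursor by only 2 lets the next pair
 * overwrite that NUL, leaving a single terminator after the final
 * byte. The caller provides a 33-byte output buffer for a 16-byte
 * uuid.
 */
#include <stdio.h>

static void demo_uuid_to_str(const unsigned char uuid[16], char out[33])
{
	char *ptr = out;
	int i;

	for (i = 0; i < 16; i++, ptr += 2)
		snprintf(ptr, 3, "%02X", uuid[i]);
}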
1224
1225static int ocfs2_initialize_super(struct super_block *sb,
1226 struct buffer_head *bh,
1227 int sector_size)
1228{
1229 int status = 0;
1230 int i;
1231 struct ocfs2_dinode *di = NULL;
1232 struct inode *inode = NULL;
1233 struct buffer_head *bitmap_bh = NULL;
1234 struct ocfs2_journal *journal;
1235 __le32 uuid_net_key;
1236 struct ocfs2_super *osb;
1237
1238 mlog_entry_void();
1239
1240 osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL);
1241 if (!osb) {
1242 status = -ENOMEM;
1243 mlog_errno(status);
1244 goto bail;
1245 }
1246
1247 sb->s_fs_info = osb;
1248 sb->s_op = &ocfs2_sops;
1249 sb->s_export_op = &ocfs2_export_ops;
1250 sb->s_flags |= MS_NOATIME;
1251 /* this is needed to support O_LARGEFILE */
1252 sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits);
1253
1254 osb->sb = sb;
1255 /* Save off for ocfs2_rw_direct */
1256 osb->s_sectsize_bits = blksize_bits(sector_size);
1257 if (!osb->s_sectsize_bits)
1258 BUG();
1259
1260 osb->net_response_ids = 0;
1261 spin_lock_init(&osb->net_response_lock);
1262 INIT_LIST_HEAD(&osb->net_response_list);
1263
1264 INIT_LIST_HEAD(&osb->osb_net_handlers);
1265 init_waitqueue_head(&osb->recovery_event);
1266 spin_lock_init(&osb->vote_task_lock);
1267 init_waitqueue_head(&osb->vote_event);
1268 osb->vote_work_sequence = 0;
1269 osb->vote_wake_sequence = 0;
1270 INIT_LIST_HEAD(&osb->blocked_lock_list);
1271 osb->blocked_lock_count = 0;
1272 INIT_LIST_HEAD(&osb->vote_list);
1273 spin_lock_init(&osb->osb_lock);
1274
1275 atomic_set(&osb->alloc_stats.moves, 0);
1276 atomic_set(&osb->alloc_stats.local_data, 0);
1277 atomic_set(&osb->alloc_stats.bitmap_data, 0);
1278 atomic_set(&osb->alloc_stats.bg_allocs, 0);
1279 atomic_set(&osb->alloc_stats.bg_extends, 0);
1280
1281 ocfs2_init_node_maps(osb);
1282
1283 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
1284 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1285
1286 init_MUTEX(&osb->recovery_lock);
1287
1288 osb->disable_recovery = 0;
1289 osb->recovery_thread_task = NULL;
1290
1291 init_waitqueue_head(&osb->checkpoint_event);
1292 atomic_set(&osb->needs_checkpoint, 0);
1293
1294 osb->node_num = O2NM_INVALID_NODE_NUM;
1295 osb->slot_num = OCFS2_INVALID_SLOT;
1296
1297 osb->local_alloc_state = OCFS2_LA_UNUSED;
1298 osb->local_alloc_bh = NULL;
1299
1300 ocfs2_setup_hb_callbacks(osb);
1301
1302 init_waitqueue_head(&osb->osb_mount_event);
1303
1304 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
1305 if (!osb->vol_label) {
1306 mlog(ML_ERROR, "unable to alloc vol label\n");
1307 status = -ENOMEM;
1308 goto bail;
1309 }
1310
1311 osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL);
1312 if (!osb->uuid) {
1313 mlog(ML_ERROR, "unable to alloc uuid\n");
1314 status = -ENOMEM;
1315 goto bail;
1316 }
1317
1318 di = (struct ocfs2_dinode *)bh->b_data;
1319
1320 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
1321 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
1322 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
1323 osb->max_slots);
1324 status = -EINVAL;
1325 goto bail;
1326 }
1327 mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
1328
1329 osb->s_feature_compat =
1330 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
1331 osb->s_feature_ro_compat =
1332 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat);
1333 osb->s_feature_incompat =
1334 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat);
1335
1336 if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
1337 mlog(ML_ERROR, "couldn't mount because of unsupported "
1338 "optional features (%x).\n", i);
1339 status = -EINVAL;
1340 goto bail;
1341 }
1342 if (!(osb->sb->s_flags & MS_RDONLY) &&
1343 (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
1344 mlog(ML_ERROR, "couldn't mount RDWR because of "
1345 "unsupported optional features (%x).\n", i);
1346 status = -EINVAL;
1347 goto bail;
1348 }
1349
1350 get_random_bytes(&osb->s_next_generation, sizeof(u32));
1351
1352 /* FIXME
1353 * This should be done in ocfs2_journal_init(), but unknown
1354 * ordering issues will cause the filesystem to crash.
1355 * If anyone wants to figure out what part of the code
1356 * refers to osb->journal before ocfs2_journal_init() is run,
1357 * be my guest.
1358 */
1359 /* initialize our journal structure */
1360
1361 journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL);
1362 if (!journal) {
1363 mlog(ML_ERROR, "unable to alloc journal\n");
1364 status = -ENOMEM;
1365 goto bail;
1366 }
1367 osb->journal = journal;
1368 journal->j_osb = osb;
1369
1370 atomic_set(&journal->j_num_trans, 0);
1371 init_rwsem(&journal->j_trans_barrier);
1372 init_waitqueue_head(&journal->j_checkpointed);
1373 spin_lock_init(&journal->j_lock);
1374 journal->j_trans_id = (unsigned long) 1;
1375 INIT_LIST_HEAD(&journal->j_la_cleanups);
1376 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb);
1377 journal->j_state = OCFS2_JOURNAL_FREE;
1378
1379 /* get some pseudo constants for clustersize bits */
1380 osb->s_clustersize_bits =
1381 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
1382 osb->s_clustersize = 1 << osb->s_clustersize_bits;
1383 mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
1384
1385 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
1386 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
1387 mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
1388 osb->s_clustersize);
1389 status = -EINVAL;
1390 goto bail;
1391 }
1392
1393 if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
1394 > (u32)~0UL) {
1395 mlog(ML_ERROR, "Volume might try to write to blocks beyond "
1396 "what jbd can address in 32 bits.\n");
1397 status = -EINVAL;
1398 goto bail;
1399 }
1400
1401 if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
1402 sizeof(di->id2.i_super.s_uuid))) {
1403 mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
1404 status = -ENOMEM;
1405 goto bail;
1406 }
1407
1408 	memcpy(&uuid_net_key, osb->uuid, sizeof(uuid_net_key));
1409 osb->net_key = le32_to_cpu(uuid_net_key);
1410
1411 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
1412 osb->vol_label[63] = '\0';
1413 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
1414 osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
1415 osb->first_cluster_group_blkno =
1416 le64_to_cpu(di->id2.i_super.s_first_cluster_group);
1417 osb->fs_generation = le32_to_cpu(di->i_fs_generation);
1418 mlog(0, "vol_label: %s\n", osb->vol_label);
1419 mlog(0, "uuid: %s\n", osb->uuid_str);
1420 mlog(0, "root_blkno=%"MLFu64", system_dir_blkno=%"MLFu64"\n",
1421 osb->root_blkno, osb->system_dir_blkno);
1422
1423 osb->osb_dlm_debug = ocfs2_new_dlm_debug();
1424 if (!osb->osb_dlm_debug) {
1425 status = -ENOMEM;
1426 mlog_errno(status);
1427 goto bail;
1428 }
1429
1430 atomic_set(&osb->vol_state, VOLUME_INIT);
1431
1432 /* load root, system_dir, and all global system inodes */
1433 status = ocfs2_init_global_system_inodes(osb);
1434 if (status < 0) {
1435 mlog_errno(status);
1436 goto bail;
1437 }
1438
1439 /*
1440 * global bitmap
1441 */
1442 inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
1443 OCFS2_INVALID_SLOT);
1444 if (!inode) {
1445 status = -EINVAL;
1446 mlog_errno(status);
1447 goto bail;
1448 }
1449
1450 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
1451
1452 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
1453 inode);
1454 iput(inode);
1455 if (status < 0) {
1456 mlog_errno(status);
1457 goto bail;
1458 }
1459
1460 di = (struct ocfs2_dinode *) bitmap_bh->b_data;
1461 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
1462 osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total);
1463 brelse(bitmap_bh);
1464 mlog(0, "cluster bitmap inode: %"MLFu64", clusters per group: %u\n",
1465 osb->bitmap_blkno, osb->bitmap_cpg);
1466
1467 status = ocfs2_init_slot_info(osb);
1468 if (status < 0) {
1469 mlog_errno(status);
1470 goto bail;
1471 }
1472
1473 /* Link this osb onto the global linked list of all osb structures. */
1474 /* The global list is maintained for the whole driver. */
1475 spin_lock(&ocfs2_globals_lock);
1476 osb->osb_id = osb_id;
1477 if (osb_id < OCFS2_MAX_OSB_ID)
1478 osb_id++;
1479 else {
1480 mlog(ML_ERROR, "Too many volumes mounted\n");
1481 status = -ENOMEM;
1482 }
1483 spin_unlock(&ocfs2_globals_lock);
1484
1485bail:
1486 mlog_exit(status);
1487 return status;
1488}
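/*
 * For reference, a minimal sketch of the cluster-to-block conversion that
 * the 32-bit jbd check above depends on. The real helper is
 * ocfs2_clusters_to_blocks() in ocfs2.h; this illustrative copy assumes
 * its usual shift-based form and is not the authoritative definition.
 */
static inline u64 example_clusters_to_blocks(struct super_block *sb,
					     u32 clusters)
{
	/* a cluster is always at least one block, so the shift is
	 * non-negative */
	int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
		sb->s_blocksize_bits;

	return (u64)clusters << c_to_b_bits;
}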
1489
1490/*
1491 * will return: -EAGAIN if it is ok to keep searching for superblocks
1492 * -EINVAL if there is a bad superblock
1493 * 0 on success
1494 */
1495static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1496 struct buffer_head *bh,
1497 u32 blksz)
1498{
1499 int status = -EAGAIN;
1500
1501 mlog_entry_void();
1502
1503 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
1504 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
1505 status = -EINVAL;
1506 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
1507 mlog(ML_ERROR, "found superblock with incorrect block "
1508 "size: found %u, should be %u\n",
1509 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
1510 blksz);
1511 } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
1512 OCFS2_MAJOR_REV_LEVEL ||
1513 le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
1514 OCFS2_MINOR_REV_LEVEL) {
1515 mlog(ML_ERROR, "found superblock with bad version: "
1516 "found %u.%u, should be %u.%u\n",
1517 le16_to_cpu(di->id2.i_super.s_major_rev_level),
1518 le16_to_cpu(di->id2.i_super.s_minor_rev_level),
1519 OCFS2_MAJOR_REV_LEVEL,
1520 OCFS2_MINOR_REV_LEVEL);
1521 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
1522 mlog(ML_ERROR, "bad block number on superblock: "
1523 "found %"MLFu64", should be %llu\n",
1524 di->i_blkno, (unsigned long long)bh->b_blocknr);
1525 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
1526 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
1527 mlog(ML_ERROR, "bad cluster size found: %u\n",
1528 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
1529 } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
1530 mlog(ML_ERROR, "bad root_blkno: 0\n");
1531 } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
1532 mlog(ML_ERROR, "bad system_dir_blkno: 0\n");
1533 } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) {
1534 mlog(ML_ERROR,
1535 "Superblock slots found greater than file system "
1536 "maximum: found %u, max %u\n",
1537 le16_to_cpu(di->id2.i_super.s_max_slots),
1538 OCFS2_MAX_SLOTS);
1539 } else {
1540 /* found it! */
1541 status = 0;
1542 }
1543 }
1544
1545 mlog_exit(status);
1546 return status;
1547}
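/*
 * A hedged sketch of how a probe loop can consume the tri-state return
 * above: retry with a larger block size on -EAGAIN, give up on -EINVAL,
 * and keep the buffer on success. The helper name is hypothetical and
 * the read primitive is an assumption, not the driver's actual probe.
 */
static inline int example_sb_probe(struct super_block *sb,
				   struct buffer_head **bh,
				   struct ocfs2_dinode **di)
{
	int status = -EAGAIN;
	unsigned long blksize;

	for (blksize = 512;
	     blksize <= 4096 && status == -EAGAIN;
	     blksize <<= 1) {
		if (!sb_set_blocksize(sb, blksize))
			return -EIO;

		/* hypothetical superblock read at the fixed on-disk
		 * location */
		*bh = sb_bread(sb, OCFS2_SUPER_BLOCK_BLKNO);
		if (!*bh)
			return -EIO;

		*di = (struct ocfs2_dinode *) (*bh)->b_data;
		status = ocfs2_verify_volume(*di, *bh, blksize);
		if (status != 0) {
			/* -EAGAIN: wrong block size, try the next one;
			 * -EINVAL: corrupt superblock, give up */
			brelse(*bh);
			*bh = NULL;
		}
	}

	return status;
}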
1548
1549static int ocfs2_check_volume(struct ocfs2_super *osb)
1550{
1551 int status = 0;
1552 int dirty;
1553 struct ocfs2_dinode *local_alloc = NULL; /* only used if we
1554 * recover
1555 * ourselves. */
1556
1557 mlog_entry_void();
1558
1559 /* Init our journal object. */
1560 status = ocfs2_journal_init(osb->journal, &dirty);
1561 if (status < 0) {
1562 mlog(ML_ERROR, "Could not initialize journal!\n");
1563 goto finally;
1564 }
1565
1566 /* If the journal was unmounted cleanly then we don't want to
1567 * recover anything. Otherwise, journal_load will do that
1568 * dirty work for us :) */
1569 if (!dirty) {
1570 status = ocfs2_journal_wipe(osb->journal, 0);
1571 if (status < 0) {
1572 mlog_errno(status);
1573 goto finally;
1574 }
1575 } else {
1576 mlog(ML_NOTICE, "File system was not unmounted cleanly, "
1577 "recovering volume.\n");
1578 }
1579
1580 /* will play back anything left in the journal. */
1581 ocfs2_journal_load(osb->journal);
1582
1583 if (dirty) {
1584 /* recover my local alloc if we didn't unmount cleanly. */
1585 status = ocfs2_begin_local_alloc_recovery(osb,
1586 osb->slot_num,
1587 &local_alloc);
1588 if (status < 0) {
1589 mlog_errno(status);
1590 goto finally;
1591 }
1592 /* we complete the recovery process after we've marked
1593 * ourselves as mounted. */
1594 }
1595
1596 mlog(0, "Journal loaded.\n");
1597
1598 status = ocfs2_load_local_alloc(osb);
1599 if (status < 0) {
1600 mlog_errno(status);
1601 goto finally;
1602 }
1603
1604 if (dirty) {
1605 /* Recovery will be completed after we've mounted the
1606 * rest of the volume. */
1607 osb->dirty = 1;
1608 osb->local_alloc_copy = local_alloc;
1609 local_alloc = NULL;
1610 }
1611
1612 /* go through each journal, trylock it and if you get the
1613 * lock, and it's marked as dirty, set the bit in the recover
1614 * map and launch a recovery thread for it. */
1615 status = ocfs2_mark_dead_nodes(osb);
1616 if (status < 0)
1617 mlog_errno(status);
1618
1619finally:
1620 /* kfree(NULL) is a no-op, so no check is needed */
1621 kfree(local_alloc);
1622
1623 mlog_exit(status);
1624 return status;
1625}
1626
1627/*
1628 * This routine is called from dismount or close whenever a dismount of
1629 * the volume is requested and the osb open count becomes 1.
1630 * It removes the osb from the global list and frees all of the
1631 * initialized resources and file objects.
1632 */
1633static void ocfs2_delete_osb(struct ocfs2_super *osb)
1634{
1635 mlog_entry_void();
1636
1637 /* This function assumes that the caller has the main osb resource */
1638
1639 if (osb->slot_info)
1640 ocfs2_free_slot_info(osb->slot_info);
1641
1642 /* FIXME
1643 * This belongs in journal shutdown, but because we have to
1644 * allocate osb->journal at the start of ocfs2_initialize_osb(),
1645 * we free it here.
1646 */
1647 kfree(osb->journal);
1648 /* kfree(NULL) is a no-op, so no check is needed */
1649 kfree(osb->local_alloc_copy);
1650 kfree(osb->uuid_str);
1651 ocfs2_put_dlm_debug(osb->osb_dlm_debug);
1652 memset(osb, 0, sizeof(struct ocfs2_super));
1653
1654 mlog_exit_void();
1655}
1656
1657/* Put OCFS2 into a readonly state, or (if the user specifies it),
1658 * panic(). We do not support continue-on-error operation. */
1659static void ocfs2_handle_error(struct super_block *sb)
1660{
1661 struct ocfs2_super *osb = OCFS2_SB(sb);
1662
1663 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
1664 panic("OCFS2: (device %s): panic forced after error\n",
1665 sb->s_id);
1666
1667 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
1668
1669 if (sb->s_flags & MS_RDONLY &&
1670 (ocfs2_is_soft_readonly(osb) ||
1671 ocfs2_is_hard_readonly(osb)))
1672 return;
1673
1674 printk(KERN_CRIT "File system is now read-only due to the potential "
1675 "of on-disk corruption. Please run fsck.ocfs2 once the file "
1676 "system is unmounted.\n");
1677 sb->s_flags |= MS_RDONLY;
1678 ocfs2_set_ro_flag(osb, 0);
1679}
1680
1681static char error_buf[1024];
1682
1683void __ocfs2_error(struct super_block *sb,
1684 const char *function,
1685 const char *fmt, ...)
1686{
1687 va_list args;
1688
1689 va_start(args, fmt);
1690 vsnprintf(error_buf, sizeof(error_buf), fmt, args);
1691 va_end(args);
1692
1693 /* Not using mlog here because we want to show the actual
1694 * function the error came from. */
1695 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
1696 sb->s_id, function, error_buf);
1697
1698 ocfs2_handle_error(sb);
1699}
1700
1701/* Handle critical errors. This is intentionally more drastic than
1702 * ocfs2_handle_error, so we only use for things like journal errors,
1703 * etc. */
1704void __ocfs2_abort(struct super_block* sb,
1705 const char *function,
1706 const char *fmt, ...)
1707{
1708 va_list args;
1709
1710 va_start(args, fmt);
1711 vsnprintf(error_buf, sizeof(error_buf), fmt, args);
1712 va_end(args);
1713
1714 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
1715 sb->s_id, function, error_buf);
1716
1717 /* We don't have the cluster support yet to go straight to
1718 * hard readonly in here. Until then, we want to keep
1719 * ocfs2_abort() so that we can at least mark critical
1720 * errors.
1721 *
1722 * TODO: This should abort the journal and alert other nodes
1723 * that our slot needs recovery. */
1724
1725 /* Force a panic(). This stinks, but it's better than letting
1726 * things continue without having a proper hard readonly
1727 * here. */
1728 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
1729 ocfs2_handle_error(sb);
1730}
1731
1732module_init(ocfs2_init);
1733module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
new file mode 100644
index 00000000000..c564177dfbd
--- /dev/null
+++ b/fs/ocfs2/super.h
@@ -0,0 +1,44 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * super.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_SUPER_H
27#define OCFS2_SUPER_H
28
29extern struct workqueue_struct *ocfs2_wq;
30
31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num);
33
34void __ocfs2_error(struct super_block *sb,
35 const char *function,
36 const char *fmt, ...);
37#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
38
39void __ocfs2_abort(struct super_block *sb,
40 const char *function,
41 const char *fmt, ...);
42#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
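/*
 * Typical use (illustrative):
 *	ocfs2_error(sb, "extent block %llu is corrupt",
 *		    (unsigned long long)blkno);
 * The macros pass __PRETTY_FUNCTION__ so the printed report names the
 * function that detected the problem.
 */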
43
44#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
new file mode 100644
index 00000000000..f6986bd79e7
--- /dev/null
+++ b/fs/ocfs2/symlink.c
@@ -0,0 +1,180 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * linux/cluster/ssi/cfs/symlink.c
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of
9 * the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE
14 * or NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net
22 *
23 * Copyright (C) 1992 Rick Sladkey
24 *
25 * Optimization changes Copyright (C) 1994 Florian La Roche
26 *
27 * Jun 7 1999, cache symlink lookups in the page cache. -DaveM
28 *
29 * Portions Copyright (C) 2001 Compaq Computer Corporation
30 *
31 * ocfs2 symlink handling code.
32 *
33 * Copyright (C) 2004, 2005 Oracle.
34 *
35 */
36
37#include <linux/fs.h>
38#include <linux/types.h>
39#include <linux/slab.h>
40#include <linux/pagemap.h>
41#include <linux/utsname.h>
42
43#define MLOG_MASK_PREFIX ML_NAMEI
44#include <cluster/masklog.h>
45
46#include "ocfs2.h"
47
48#include "alloc.h"
49#include "file.h"
50#include "inode.h"
51#include "journal.h"
52#include "symlink.h"
53
54#include "buffer_head_io.h"
55
56static char *ocfs2_page_getlink(struct dentry * dentry,
57 struct page **ppage);
58static char *ocfs2_fast_symlink_getlink(struct inode *inode,
59 struct buffer_head **bh);
60
61/* get the link contents into pagecache */
62static char *ocfs2_page_getlink(struct dentry * dentry,
63 struct page **ppage)
64{
65 struct page * page;
66 struct address_space *mapping = dentry->d_inode->i_mapping;
67 page = read_cache_page(mapping, 0,
68 (filler_t *)mapping->a_ops->readpage, NULL);
69 if (IS_ERR(page))
70 goto sync_fail;
71 wait_on_page_locked(page);
72 if (!PageUptodate(page))
73 goto async_fail;
74 *ppage = page;
75 return kmap(page);
76
77async_fail:
78 page_cache_release(page);
79 return ERR_PTR(-EIO);
80
81sync_fail:
82 return (char*)page;
83}
84
85static char *ocfs2_fast_symlink_getlink(struct inode *inode,
86 struct buffer_head **bh)
87{
88 int status;
89 char *link = NULL;
90 struct ocfs2_dinode *fe;
91
92 mlog_entry_void();
93
94 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
95 OCFS2_I(inode)->ip_blkno,
96 bh,
97 OCFS2_BH_CACHED,
98 inode);
99 if (status < 0) {
100 mlog_errno(status);
101 link = ERR_PTR(status);
102 goto bail;
103 }
104
105 fe = (struct ocfs2_dinode *) (*bh)->b_data;
106 link = (char *) fe->id2.i_symlink;
107bail:
108 mlog_exit(status);
109
110 return link;
111}
112
113static int ocfs2_readlink(struct dentry *dentry,
114 char __user *buffer,
115 int buflen)
116{
117 int ret;
118 char *link;
119 struct buffer_head *bh = NULL;
120 struct inode *inode = dentry->d_inode;
121
122 mlog_entry_void();
123
124 link = ocfs2_fast_symlink_getlink(inode, &bh);
125 if (IS_ERR(link)) {
126 ret = PTR_ERR(link);
127 goto out;
128 }
129
130 ret = vfs_readlink(dentry, buffer, buflen, link);
131
132 brelse(bh);
133out:
134 mlog_exit(ret);
135 return ret;
136}
137
138static void *ocfs2_follow_link(struct dentry *dentry,
139 struct nameidata *nd)
140{
141 int status;
142 char *link;
143 struct inode *inode = dentry->d_inode;
144 struct page *page = NULL;
145 struct buffer_head *bh = NULL;
146
147 if (ocfs2_inode_is_fast_symlink(inode))
148 link = ocfs2_fast_symlink_getlink(inode, &bh);
149 else
150 link = ocfs2_page_getlink(dentry, &page);
151 if (IS_ERR(link)) {
152 status = PTR_ERR(link);
153 mlog_errno(status);
154 goto bail;
155 }
156
157 status = vfs_follow_link(nd, link);
158 if (status)
159 mlog_errno(status);
160bail:
161 if (page) {
162 kunmap(page);
163 page_cache_release(page);
164 }
165 if (bh)
166 brelse(bh);
167
168 return ERR_PTR(status);
169}
170
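/*
 * Two flavors of symlink ops: fast symlinks keep the target inline in
 * the dinode and read it via ocfs2_fast_symlink_getlink(); regular
 * symlinks store it in data blocks and go through the page cache.
 */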
171struct inode_operations ocfs2_symlink_inode_operations = {
172 .readlink = page_readlink,
173 .follow_link = ocfs2_follow_link,
174 .getattr = ocfs2_getattr,
175};
176struct inode_operations ocfs2_fast_symlink_inode_operations = {
177 .readlink = ocfs2_readlink,
178 .follow_link = ocfs2_follow_link,
179 .getattr = ocfs2_getattr,
180};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
new file mode 100644
index 00000000000..1ea9e4d9e9e
--- /dev/null
+++ b/fs/ocfs2/symlink.h
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * symlink.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_SYMLINK_H
27#define OCFS2_SYMLINK_H
28
29extern struct inode_operations ocfs2_symlink_inode_operations;
30extern struct inode_operations ocfs2_fast_symlink_inode_operations;
31
32/*
33 * Test whether an inode is a fast symlink.
34 */
35static inline int ocfs2_inode_is_fast_symlink(struct inode *inode)
36{
37 return (S_ISLNK(inode->i_mode) &&
38 inode->i_blocks == 0);
39}
40
41
42#endif /* OCFS2_SYMLINK_H */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
new file mode 100644
index 00000000000..600a8bc5b54
--- /dev/null
+++ b/fs/ocfs2/sysfile.c
@@ -0,0 +1,131 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sysfile.c
5 *
6 * Initialize, read, write, etc. system files.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#include "ocfs2.h"
32
33#define MLOG_MASK_PREFIX ML_INODE
34#include <cluster/masklog.h>
35
36#include "alloc.h"
37#include "dir.h"
38#include "inode.h"
39#include "journal.h"
40#include "sysfile.h"
41
42#include "buffer_head_io.h"
43
44static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
45 int type,
46 u32 slot);
47
48static inline int is_global_system_inode(int type);
49static inline int is_in_system_inode_array(struct ocfs2_super *osb,
50 int type,
51 u32 slot);
52
53static inline int is_global_system_inode(int type)
54{
55 return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE &&
56 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
57}
58
59static inline int is_in_system_inode_array(struct ocfs2_super *osb,
60 int type,
61 u32 slot)
62{
63 return slot == osb->slot_num || is_global_system_inode(type);
64}
65
66struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
67 int type,
68 u32 slot)
69{
70 struct inode *inode = NULL;
71 struct inode **arr = NULL;
72
73 /* avoid the lookup if cached in local system file array */
74 if (is_in_system_inode_array(osb, type, slot))
75 arr = &(osb->system_inodes[type]);
76
77 if (arr && ((inode = *arr) != NULL)) {
78 /* get a ref in addition to the array ref */
79 inode = igrab(inode);
80 if (!inode)
81 BUG();
82
83 return inode;
84 }
85
86 /* this gets one ref thru iget */
87 inode = _ocfs2_get_system_file_inode(osb, type, slot);
88
89 /* add one more if putting into array for first time */
90 if (arr && inode) {
91 *arr = igrab(inode);
92 if (!*arr)
93 BUG();
94 }
95 return inode;
96}
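/*
 * Usage sketch (hypothetical helper, mirroring the global-bitmap lookup
 * in super.c): each successful call hands back one inode reference,
 * which the caller must drop with iput().
 */
static inline u64 example_bitmap_blkno(struct ocfs2_super *osb)
{
	struct inode *inode;
	u64 blkno = 0;

	inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
					    OCFS2_INVALID_SLOT);
	if (inode) {
		blkno = OCFS2_I(inode)->ip_blkno;
		iput(inode);	/* drop the reference we were handed */
	}
	return blkno;
}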
97
98static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
99 int type,
100 u32 slot)
101{
102 char namebuf[40];
103 struct inode *inode = NULL;
104 u64 blkno;
105 struct buffer_head *dirent_bh = NULL;
106 struct ocfs2_dir_entry *de = NULL;
107 int status = 0;
108
109 ocfs2_sprintf_system_inode_name(namebuf,
110 sizeof(namebuf),
111 type, slot);
112
113 status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf),
114 &blkno, osb->sys_root_inode,
115 &dirent_bh, &de);
116 if (status < 0) {
117 goto bail;
118 }
119
120 inode = ocfs2_iget(osb, blkno);
121 if (IS_ERR(inode)) {
122 mlog_errno(PTR_ERR(inode));
123 inode = NULL;
124 goto bail;
125 }
126bail:
127 if (dirent_bh)
128 brelse(dirent_bh);
129 return inode;
130}
131
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h
new file mode 100644
index 00000000000..cc9ea661ffc
--- /dev/null
+++ b/fs/ocfs2/sysfile.h
@@ -0,0 +1,33 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sysfile.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_SYSFILE_H
27#define OCFS2_SYSFILE_H
28
29struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb,
30 int type,
31 u32 slot);
32
33#endif /* OCFS2_SYSFILE_H */
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
new file mode 100644
index 00000000000..3a0458fd3e1
--- /dev/null
+++ b/fs/ocfs2/uptodate.c
@@ -0,0 +1,544 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * uptodate.c
5 *
6 * Tracking the up-to-date-ness of a local buffer_head with respect to
7 * the cluster.
8 *
9 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 *
26 * Standard buffer head caching flags (uptodate, etc) are insufficient
27 * in a clustered environment - a buffer may be marked up to date on
28 * our local node but could have been modified by another cluster
29 * member. As a result an additional (and performant) caching scheme
30 * is required. A further requirement is that we consume as little
31 * memory as possible - we never pin buffer_head structures in order
32 * to cache them.
33 *
34 * We track the existence of up to date buffers on the inodes which
35 * are associated with them. Because we don't want to pin
36 * buffer_heads, this is only a (strong) hint and several other checks
37 * are made in the I/O path to ensure that we don't use a stale or
38 * invalid buffer without going to disk:
39 * - buffer_jbd is used liberally - if a bh is in the journal on
40 * this node then it *must* be up to date.
41 * - the standard buffer_uptodate() macro is used to detect buffers
42 * which may be invalid (even if we have an up to date tracking
43 * item for them)
44 *
45 * For a full understanding of how this code works together, one
46 * should read the callers in dlmglue.c, the I/O functions in
47 * buffer_head_io.c and ocfs2_journal_access in journal.c
48 */
49
50#include <linux/fs.h>
51#include <linux/types.h>
52#include <linux/slab.h>
53#include <linux/highmem.h>
54#include <linux/buffer_head.h>
55#include <linux/rbtree.h>
56#include <linux/jbd.h>
57
58#define MLOG_MASK_PREFIX ML_UPTODATE
59
60#include <cluster/masklog.h>
61
62#include "ocfs2.h"
63
64#include "inode.h"
65#include "uptodate.h"
66
67struct ocfs2_meta_cache_item {
68 struct rb_node c_node;
69 sector_t c_block;
70};
71
72static kmem_cache_t *ocfs2_uptodate_cachep = NULL;
73
74void ocfs2_metadata_cache_init(struct inode *inode)
75{
76 struct ocfs2_inode_info *oi = OCFS2_I(inode);
77 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
78
79 oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
80 ci->ci_num_cached = 0;
81}
82
83/* No lock taken here as 'root' is not expected to be visible to other
84 * processes. */
85static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
86{
87 unsigned int purged = 0;
88 struct rb_node *node;
89 struct ocfs2_meta_cache_item *item;
90
91 while ((node = rb_last(root)) != NULL) {
92 item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
93
94 mlog(0, "Purge item %llu\n",
95 (unsigned long long) item->c_block);
96
97 rb_erase(&item->c_node, root);
98 kmem_cache_free(ocfs2_uptodate_cachep, item);
99
100 purged++;
101 }
102 return purged;
103}
104
105/* Called from locking and called from ocfs2_clear_inode. Dump the
106 * cache for a given inode.
107 *
108 * This function is a few more lines longer than necessary due to some
109 * accounting done here, but I think it's worth tracking down those
110 * bugs sooner -- Mark */
111void ocfs2_metadata_cache_purge(struct inode *inode)
112{
113 struct ocfs2_inode_info *oi = OCFS2_I(inode);
114 unsigned int tree, to_purge, purged;
115 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
116 struct rb_root root = RB_ROOT;
117
118 spin_lock(&oi->ip_lock);
119 tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
120 to_purge = ci->ci_num_cached;
121
122 mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge,
123 tree ? "tree" : "array", oi->ip_blkno);
124
125 /* If we're a tree, save off the root so that we can safely
126 * initialize the cache. We do the work to free tree members
127 * without the spinlock. */
128 if (tree)
129 root = ci->ci_cache.ci_tree;
130
131 ocfs2_metadata_cache_init(inode);
132 spin_unlock(&oi->ip_lock);
133
134 purged = ocfs2_purge_copied_metadata_tree(&root);
135 /* If possible, track the number wiped so that we can more
136 * easily detect counting errors. Unfortunately, this is only
137 * meaningful for trees. */
138 if (tree && purged != to_purge)
139 mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n",
140 oi->ip_blkno, to_purge, purged);
141}
142
143/* Returns the index in the cache array, -1 if not found.
144 * Requires ip_lock. */
145static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
146 sector_t item)
147{
148 int i;
149
150 for (i = 0; i < ci->ci_num_cached; i++) {
151 if (item == ci->ci_cache.ci_array[i])
152 return i;
153 }
154
155 return -1;
156}
157
158/* Returns the cache item if found, otherwise NULL.
159 * Requires ip_lock. */
160static struct ocfs2_meta_cache_item *
161ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
162 sector_t block)
163{
164 struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
165 struct ocfs2_meta_cache_item *item = NULL;
166
167 while (n) {
168 item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
169
170 if (block < item->c_block)
171 n = n->rb_left;
172 else if (block > item->c_block)
173 n = n->rb_right;
174 else
175 return item;
176 }
177
178 return NULL;
179}
180
181static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
182 struct buffer_head *bh)
183{
184 int index = -1;
185 struct ocfs2_meta_cache_item *item = NULL;
186
187 spin_lock(&oi->ip_lock);
188
189 mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n",
190 oi->ip_blkno, (unsigned long long) bh->b_blocknr,
191 !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
192
193 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
194 index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
195 bh->b_blocknr);
196 else
197 item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
198 bh->b_blocknr);
199
200 spin_unlock(&oi->ip_lock);
201
202 mlog(0, "index = %d, item = %p\n", index, item);
203
204 return (index != -1) || (item != NULL);
205}
206
207/* Warning: even if it returns true, this does *not* guarantee that
208 * the block is stored in our inode metadata cache. */
209int ocfs2_buffer_uptodate(struct inode *inode,
210 struct buffer_head *bh)
211{
212 /* Doesn't matter if the bh is in our cache or not -- if it's
213 * not marked uptodate then we know it can't have correct
214 * data. */
215 if (!buffer_uptodate(bh))
216 return 0;
217
218 /* OCFS2 does not allow multiple nodes to be changing the same
219 * block at the same time. */
220 if (buffer_jbd(bh))
221 return 1;
222
223 /* Ok, locally the buffer is marked as up to date, now search
224 * our cache to see if we can trust that. */
225 return ocfs2_buffer_cached(OCFS2_I(inode), bh);
226}
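/*
 * A hedged sketch of the intended read path (the real logic lives in
 * buffer_head_io.c; this helper name is hypothetical): only hit the
 * disk when the cluster-aware check above says the buffer can't be
 * trusted.
 */
static inline void example_read_if_stale(struct inode *inode,
					 struct buffer_head *bh)
{
	if (!ocfs2_buffer_uptodate(inode, bh)) {
		/* locally uptodate is not enough -- another node may
		 * have changed this block, so re-read it. */
		lock_buffer(bh);
		clear_buffer_uptodate(bh);
		get_bh(bh);	/* end_buffer_read_sync() drops this ref */
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh);
		wait_on_buffer(bh);
	}
}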
227
228/* Requires ip_lock */
229static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
230 sector_t block)
231{
232 BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
233
234 mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
235 ci->ci_num_cached);
236
237 ci->ci_cache.ci_array[ci->ci_num_cached] = block;
238 ci->ci_num_cached++;
239}
240
241/* By now the caller should have checked that the item does *not*
242 * exist in the tree.
243 * Requires ip_lock. */
244static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
245 struct ocfs2_meta_cache_item *new)
246{
247 sector_t block = new->c_block;
248 struct rb_node *parent = NULL;
249 struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
250 struct ocfs2_meta_cache_item *tmp;
251
252 mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
253 ci->ci_num_cached);
254
255 while(*p) {
256 parent = *p;
257
258 tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
259
260 if (block < tmp->c_block)
261 p = &(*p)->rb_left;
262 else if (block > tmp->c_block)
263 p = &(*p)->rb_right;
264 else {
265 /* This should never happen! */
266 mlog(ML_ERROR, "Duplicate block %llu cached!\n",
267 (unsigned long long) block);
268 BUG();
269 }
270 }
271
272 rb_link_node(&new->c_node, parent, p);
273 rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
274 ci->ci_num_cached++;
275}
276
277static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
278 struct ocfs2_caching_info *ci)
279{
280 assert_spin_locked(&oi->ip_lock);
281
282 return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
283 (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
284}
285
286/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
287 * pointers in tree after we use them - this allows caller to detect
288 * when to free in case of error. */
289static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
290 struct ocfs2_meta_cache_item **tree)
291{
292 int i;
293 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
294
295 mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
296 "Inode %"MLFu64", num cached = %u, should be %u\n",
297 oi->ip_blkno, ci->ci_num_cached,
298 OCFS2_INODE_MAX_CACHE_ARRAY);
299 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
300 "Inode %"MLFu64" not marked as inline anymore!\n",
301 oi->ip_blkno);
302 assert_spin_locked(&oi->ip_lock);
303
304 /* Be careful to initialize the tree members *first* because
305 * once the ci_tree is used, the array is junk... */
306 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
307 tree[i]->c_block = ci->ci_cache.ci_array[i];
308
309 oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
310 ci->ci_cache.ci_tree = RB_ROOT;
311 /* this will be set again by __ocfs2_insert_cache_tree */
312 ci->ci_num_cached = 0;
313
314 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
315 __ocfs2_insert_cache_tree(ci, tree[i]);
316 tree[i] = NULL;
317 }
318
319 mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n",
320 oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
321}
322
323/* Slow path function - memory allocation is necessary. See the
324 * comment above ocfs2_set_buffer_uptodate for more information. */
325static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
326 sector_t block,
327 int expand_tree)
328{
329 int i;
330 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
331 struct ocfs2_meta_cache_item *new = NULL;
332 struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
333 { NULL, };
334
335 mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n",
336 oi->ip_blkno, (unsigned long long) block, expand_tree);
337
338 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL);
339 if (!new) {
340 mlog_errno(-ENOMEM);
341 return;
342 }
343 new->c_block = block;
344
345 if (expand_tree) {
346 /* Do *not* allocate an array here - the removal code
347 * has no way of tracking that. */
348 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
349 tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
350 GFP_KERNEL);
351 if (!tree[i]) {
352 mlog_errno(-ENOMEM);
353 goto out_free;
354 }
355
356 /* These are initialized in ocfs2_expand_cache! */
357 }
358 }
359
360 spin_lock(&oi->ip_lock);
361 if (ocfs2_insert_can_use_array(oi, ci)) {
362 mlog(0, "Someone cleared the tree underneath us\n");
363 /* Ok, items were removed from the cache in between
364 * locks. Detect this and revert to the fast path. */
365 ocfs2_append_cache_array(ci, block);
366 spin_unlock(&oi->ip_lock);
367 goto out_free;
368 }
369
370 if (expand_tree)
371 ocfs2_expand_cache(oi, tree);
372
373 __ocfs2_insert_cache_tree(ci, new);
374 spin_unlock(&oi->ip_lock);
375
376 new = NULL;
377out_free:
378 if (new)
379 kmem_cache_free(ocfs2_uptodate_cachep, new);
380
381 /* If these were used, then ocfs2_expand_cache re-set them to
382 * NULL for us. */
383 if (tree[0]) {
384 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
385 if (tree[i])
386 kmem_cache_free(ocfs2_uptodate_cachep,
387 tree[i]);
388 }
389}
390
391/* Item insertion is guarded by ip_io_sem, so the insertion path takes
392 * advantage of this by not rechecking for a duplicate insert during
393 * the slow case. Additionally, if the cache needs to be bumped up to
394 * a tree, the code will not recheck after acquiring the lock --
395 * multiple paths cannot be expanding to a tree at the same time.
396 *
397 * The slow path takes into account that items can be removed
398 * (including the whole tree wiped and reset) while this process is
399 * out allocating memory. In those cases, it reverts to the fast
400 * path.
401 *
402 * Note that this function may actually fail to insert the block if
403 * memory cannot be allocated. This is not fatal however (but may
404 * result in a performance penalty) */
405void ocfs2_set_buffer_uptodate(struct inode *inode,
406 struct buffer_head *bh)
407{
408 int expand;
409 struct ocfs2_inode_info *oi = OCFS2_I(inode);
410 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
411
412 /* The block may very well exist in our cache already, so avoid
413 * doing any more work in that case. */
414 if (ocfs2_buffer_cached(oi, bh))
415 return;
416
417 mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno,
418 (unsigned long long) bh->b_blocknr);
419
420 /* No need to recheck under spinlock - insertion is guarded by
421 * ip_io_sem */
422 spin_lock(&oi->ip_lock);
423 if (ocfs2_insert_can_use_array(oi, ci)) {
424 /* Fast case - it's an array and there's a free
425 * spot. */
426 ocfs2_append_cache_array(ci, bh->b_blocknr);
427 spin_unlock(&oi->ip_lock);
428 return;
429 }
430
431 expand = 0;
432 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
433 /* We need to bump things up to a tree. */
434 expand = 1;
435 }
436 spin_unlock(&oi->ip_lock);
437
438 __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
439}
440
441/* Called against a newly allocated buffer. Most likely nobody should
442 * be able to read this sort of metadata while it's still being
443 * allocated, but this is careful to take ip_io_sem anyway. */
444void ocfs2_set_new_buffer_uptodate(struct inode *inode,
445 struct buffer_head *bh)
446{
447 struct ocfs2_inode_info *oi = OCFS2_I(inode);
448
449 /* This should definitely *not* exist in our cache */
450 BUG_ON(ocfs2_buffer_cached(oi, bh));
451
452 set_buffer_uptodate(bh);
453
454 down(&oi->ip_io_sem);
455 ocfs2_set_buffer_uptodate(inode, bh);
456 up(&oi->ip_io_sem);
457}
458
459/* Requires ip_lock. */
460static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
461 int index)
462{
463 sector_t *array = ci->ci_cache.ci_array;
464 int bytes;
465
466 BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
467 BUG_ON(index >= ci->ci_num_cached);
468 BUG_ON(!ci->ci_num_cached);
469
470 mlog(0, "remove index %d (num_cached = %u\n", index,
471 ci->ci_num_cached);
472
473 ci->ci_num_cached--;
474
475 /* don't need to copy if the array is now empty, or if we
476 * removed at the tail */
477 if (ci->ci_num_cached && index < ci->ci_num_cached) {
478 bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
479 memmove(&array[index], &array[index + 1], bytes);
480 }
481}
482
483/* Requires ip_lock. */
484static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
485 struct ocfs2_meta_cache_item *item)
486{
487 mlog(0, "remove block %llu from tree\n",
488 (unsigned long long) item->c_block);
489
490 rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
491 ci->ci_num_cached--;
492}
493
494/* Called when we remove a chunk of metadata from an inode. We don't
495 * bother reverting things to an inlined array in the case of a remove
496 * which moves us back under the limit. */
497void ocfs2_remove_from_cache(struct inode *inode,
498 struct buffer_head *bh)
499{
500 int index;
501 sector_t block = bh->b_blocknr;
502 struct ocfs2_meta_cache_item *item = NULL;
503 struct ocfs2_inode_info *oi = OCFS2_I(inode);
504 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
505
506 spin_lock(&oi->ip_lock);
507 mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n",
508 oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached,
509 oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
510
511 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
512 index = ocfs2_search_cache_array(ci, block);
513 if (index != -1)
514 ocfs2_remove_metadata_array(ci, index);
515 } else {
516 item = ocfs2_search_cache_tree(ci, block);
517 if (item)
518 ocfs2_remove_metadata_tree(ci, item);
519 }
520 spin_unlock(&oi->ip_lock);
521
522 if (item)
523 kmem_cache_free(ocfs2_uptodate_cachep, item);
524}
525
526int __init init_ocfs2_uptodate_cache(void)
527{
528 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
529 sizeof(struct ocfs2_meta_cache_item),
530 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
531 if (!ocfs2_uptodate_cachep)
532 return -ENOMEM;
533
534 mlog(0, "%u inlined cache items per inode.\n",
535 OCFS2_INODE_MAX_CACHE_ARRAY);
536
537 return 0;
538}
539
540void __exit exit_ocfs2_uptodate_cache(void)
541{
542 if (ocfs2_uptodate_cachep)
543 kmem_cache_destroy(ocfs2_uptodate_cachep);
544}
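/*
 * Illustrative note: the slab above is created once at module load and
 * destroyed at unload. A caller -- ocfs2_init() in super.c plays this
 * role -- would do roughly:
 *
 *	status = init_ocfs2_uptodate_cache();
 *	if (status < 0)
 *		goto leave;		(-ENOMEM: slab creation failed)
 *	...
 *	exit_ocfs2_uptodate_cache();	(on module unload)
 */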
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
new file mode 100644
index 00000000000..e5aacdf4eab
--- /dev/null
+++ b/fs/ocfs2/uptodate.h
@@ -0,0 +1,44 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * uptodate.h
5 *
6 * Cluster uptodate tracking
7 *
8 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_UPTODATE_H
27#define OCFS2_UPTODATE_H
28
29int __init init_ocfs2_uptodate_cache(void);
30void __exit exit_ocfs2_uptodate_cache(void);
31
32void ocfs2_metadata_cache_init(struct inode *inode);
33void ocfs2_metadata_cache_purge(struct inode *inode);
34
35int ocfs2_buffer_uptodate(struct inode *inode,
36 struct buffer_head *bh);
37void ocfs2_set_buffer_uptodate(struct inode *inode,
38 struct buffer_head *bh);
39void ocfs2_set_new_buffer_uptodate(struct inode *inode,
40 struct buffer_head *bh);
41void ocfs2_remove_from_cache(struct inode *inode,
42 struct buffer_head *bh);
43
44#endif /* OCFS2_UPTODATE_H */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
new file mode 100644
index 00000000000..5405ce121c9
--- /dev/null
+++ b/fs/ocfs2/ver.c
@@ -0,0 +1,43 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/string.h>
28#include <linux/kernel.h>
29
30#include "ver.h"
31
32#define OCFS2_BUILD_VERSION "1.3.3"
33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35
36void ocfs2_print_version(void)
37{
38 printk(KERN_INFO "%s\n", VERSION_STR);
39}
40
41MODULE_DESCRIPTION(VERSION_STR);
42
43MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
new file mode 100644
index 00000000000..d7395cb91d2
--- /dev/null
+++ b/fs/ocfs2/ver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_VER_H
27#define OCFS2_VER_H
28
29void ocfs2_print_version(void);
30
31#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
new file mode 100644
index 00000000000..021978e0576
--- /dev/null
+++ b/fs/ocfs2/vote.c
@@ -0,0 +1,1202 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.c
5 *
6 * Inter-node vote handling: mount/umount, delete, unlink and rename.
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/smp_lock.h>
30#include <linux/kthread.h>
31
32#include <cluster/heartbeat.h>
33#include <cluster/nodemanager.h>
34#include <cluster/tcp.h>
35
36#include <dlm/dlmapi.h>
37
38#define MLOG_MASK_PREFIX ML_VOTE
39#include <cluster/masklog.h>
40
41#include "ocfs2.h"
42
43#include "alloc.h"
44#include "dlmglue.h"
45#include "extent_map.h"
46#include "heartbeat.h"
47#include "inode.h"
48#include "journal.h"
49#include "slot_map.h"
50#include "vote.h"
51
52#include "buffer_head_io.h"
53
54#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
55#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
56struct ocfs2_msg_hdr
57{
58 __be32 h_response_id; /* used to lookup message handle on sending
59 * node. */
60 __be32 h_request;
61 __be64 h_blkno;
62 __be32 h_generation;
63 __be32 h_node_num; /* node sending this particular message. */
64};
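/*
 * All header fields cross the wire in big-endian byte order. A hedged
 * sketch of filling one on the sending side (hypothetical helper; the
 * response path below does the same conversions when replying):
 */
static inline void example_fill_hdr(struct ocfs2_msg_hdr *hdr,
				    u32 response_id, u32 request,
				    u64 blkno, u32 generation,
				    u32 node_num)
{
	hdr->h_response_id = cpu_to_be32(response_id);
	hdr->h_request = cpu_to_be32(request);
	hdr->h_blkno = cpu_to_be64(blkno);
	hdr->h_generation = cpu_to_be32(generation);
	hdr->h_node_num = cpu_to_be32(node_num);
}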
65
66/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
67 * for the network. */
68#define OCFS2_VOTE_FILENAME_LEN 256
69struct ocfs2_vote_msg
70{
71 struct ocfs2_msg_hdr v_hdr;
72 union {
73 __be32 v_generic1;
74 __be32 v_orphaned_slot; /* Used during delete votes */
75 __be32 v_nlink; /* Used during unlink votes */
76 } md1; /* Message type dependent 1 */
77 __be32 v_unlink_namelen;
78 __be64 v_unlink_parent;
79 u8 v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN];
80};
81
82/* Responses are given these values to maintain backwards
83 * compatibility with older ocfs2 versions */
84#define OCFS2_RESPONSE_OK (0)
85#define OCFS2_RESPONSE_BUSY (-16)
86#define OCFS2_RESPONSE_BAD_MSG (-22)
87
88struct ocfs2_response_msg
89{
90 struct ocfs2_msg_hdr r_hdr;
91 __be32 r_response;
92 __be32 r_orphaned_slot;
93};
94
95struct ocfs2_vote_work {
96 struct list_head w_list;
97 struct ocfs2_vote_msg w_msg;
98};
99
100enum ocfs2_vote_request {
101 OCFS2_VOTE_REQ_INVALID = 0,
102 OCFS2_VOTE_REQ_DELETE,
103 OCFS2_VOTE_REQ_UNLINK,
104 OCFS2_VOTE_REQ_RENAME,
105 OCFS2_VOTE_REQ_MOUNT,
106 OCFS2_VOTE_REQ_UMOUNT,
107 OCFS2_VOTE_REQ_LAST
108};
109
110static inline int ocfs2_is_valid_vote_request(int request)
111{
112 return OCFS2_VOTE_REQ_INVALID < request &&
113 request < OCFS2_VOTE_REQ_LAST;
114}
115
116typedef void (*ocfs2_net_response_callback)(void *priv,
117 struct ocfs2_response_msg *resp);
118struct ocfs2_net_response_cb {
119 ocfs2_net_response_callback rc_cb;
120 void *rc_priv;
121};
122
123struct ocfs2_net_wait_ctxt {
124 struct list_head n_list;
125 u32 n_response_id;
126 wait_queue_head_t n_event;
127 struct ocfs2_node_map n_node_map;
128 int n_response; /* an aggregate response. 0 if
129 * all nodes are go, < 0 on any
130 * negative response from any
131 * node or network error. */
132 struct ocfs2_net_response_cb *n_callback;
133};
134
135static void ocfs2_process_mount_request(struct ocfs2_super *osb,
136 unsigned int node_num)
137{
138 mlog(0, "MOUNT vote from node %u\n", node_num);
139 /* The other node only sends us this message when it has an EX
140 * lock on the superblock, so our recovery threads (if any have
141 * been launched) are waiting on it. */
142 ocfs2_recovery_map_clear(osb, node_num);
143 ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
144
145 /* We clear the umount map here because a node may have been
146 * previously mounted, safely unmounted but never stopped
147 * heartbeating - in which case we'd have a stale entry. */
148 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
149}
150
151static void ocfs2_process_umount_request(struct ocfs2_super *osb,
152 unsigned int node_num)
153{
154 mlog(0, "UMOUNT vote from node %u\n", node_num);
155 ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
156 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
157}
158
159void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
160{
161 struct ocfs2_inode_info *oi = OCFS2_I(inode);
162
163 assert_spin_locked(&oi->ip_lock);
164 /* We set the SKIP_DELETE flag on the inode so we don't try to
165 * delete it in delete_inode ourselves, thus avoiding
166 * unnecessary lock pinging. If the other node failed to wipe
167 * the inode as a result of a crash, then recovery will pick
168 * up the slack. */
169 oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
170}
171
172static int ocfs2_process_delete_request(struct inode *inode,
173 int *orphaned_slot)
174{
175 int response = OCFS2_RESPONSE_BUSY;
176
177 mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
178 inode->i_ino, inode->i_nlink, *orphaned_slot);
179
180 spin_lock(&OCFS2_I(inode)->ip_lock);
181
182 /* Whatever our vote response is, we want to make sure that
183 * the orphaned slot is recorded properly on this node *and*
184 * on the requesting node. Technically, if the requesting node
185 * did not know which slot the inode is orphaned in but we
186 * respond with BUSY it doesn't actually need the orphaned
187 * slot, but it doesn't hurt to do it here anyway. */
188 if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
189 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
190 OCFS2_INVALID_SLOT &&
191 OCFS2_I(inode)->ip_orphaned_slot !=
192 (*orphaned_slot),
193 "Inode %"MLFu64": This node thinks it's "
194 "orphaned in slot %d, messaged it's in %d\n",
195 OCFS2_I(inode)->ip_blkno,
196 OCFS2_I(inode)->ip_orphaned_slot,
197 *orphaned_slot);
198
199 mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n",
200 OCFS2_I(inode)->ip_blkno, *orphaned_slot);
201
202 OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
203 } else {
204 mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n",
205 OCFS2_I(inode)->ip_orphaned_slot,
206 OCFS2_I(inode)->ip_blkno);
207
208 *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
209 }
210
211 /* vote no if the file is still open. */
212 if (OCFS2_I(inode)->ip_open_count) {
213 mlog(0, "open count = %u\n",
214 OCFS2_I(inode)->ip_open_count);
215 spin_unlock(&OCFS2_I(inode)->ip_lock);
216 goto done;
217 }
218 spin_unlock(&OCFS2_I(inode)->ip_lock);
219
220 /* directories are a bit ugly... What if someone is sitting in
221 * it? We want to make sure the inode is removed completely as
222 * a result of the iput in process_vote. */
223 if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
224 mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
225 goto done;
226 }
227
228 if (filemap_fdatawrite(inode->i_mapping)) {
229 mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n",
230 OCFS2_I(inode)->ip_blkno);
231 goto done;
232 }
233 sync_mapping_buffers(inode->i_mapping);
234 truncate_inode_pages(inode->i_mapping, 0);
235 ocfs2_extent_map_trunc(inode, 0);
236
237 spin_lock(&OCFS2_I(inode)->ip_lock);
238 /* double check open count - someone might have raced this
239 * thread into ocfs2_file_open while we were writing out
240 * data. If we're to allow a wipe of this inode now, we *must*
241 * hold the spinlock until we've marked it. */
242 if (OCFS2_I(inode)->ip_open_count) {
243 mlog(0, "Raced to wipe! open count = %u\n",
244 OCFS2_I(inode)->ip_open_count);
245 spin_unlock(&OCFS2_I(inode)->ip_lock);
246 goto done;
247 }
248
249 /* Mark the inode as being wiped from disk. */
250 ocfs2_mark_inode_remotely_deleted(inode);
251 spin_unlock(&OCFS2_I(inode)->ip_lock);
252
253 /* Not sure this is necessary anymore. */
254 d_prune_aliases(inode);
255
256 /* If we get here, then we're voting 'yes', so commit the
257 * delete on our side. */
258 response = OCFS2_RESPONSE_OK;
259done:
260 return response;
261}
262
263static int ocfs2_match_dentry(struct dentry *dentry,
264 u64 parent_blkno,
265 unsigned int namelen,
266 const char *name)
267{
268 struct inode *parent;
269
270 if (!dentry->d_parent) {
271 mlog(0, "Detached from parent.\n");
272 return 0;
273 }
274
275 parent = dentry->d_parent->d_inode;
276 /* Negative parent dentry? */
277 if (!parent)
278 return 0;
279
280 /* Name is in a different directory. */
281 if (OCFS2_I(parent)->ip_blkno != parent_blkno)
282 return 0;
283
284 if (dentry->d_name.len != namelen)
285 return 0;
286
287 /* comparison above guarantees this is safe. */
288 if (memcmp(dentry->d_name.name, name, namelen))
289 return 0;
290
291 return 1;
292}
293
294static void ocfs2_process_dentry_request(struct inode *inode,
295 int rename,
296 unsigned int new_nlink,
297 u64 parent_blkno,
298 unsigned int namelen,
299 const char *name)
300{
301 struct dentry *dentry = NULL;
302 struct list_head *p;
303 struct ocfs2_inode_info *oi = OCFS2_I(inode);
304
305 mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno,
306 namelen, namelen, name);
307
308 spin_lock(&dcache_lock);
309
310 /* Another node is removing this name from the system. It is
311 * up to us to find the corresponding dentry and if it exists,
312 * unhash it from the dcache. */
313 list_for_each(p, &inode->i_dentry) {
314 dentry = list_entry(p, struct dentry, d_alias);
315
316 if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) {
317 mlog(0, "dentry found: %.*s\n",
318 dentry->d_name.len, dentry->d_name.name);
319
320 dget_locked(dentry);
321 break;
322 }
323
324 dentry = NULL;
325 }
326
327 spin_unlock(&dcache_lock);
328
329 if (dentry) {
330 d_delete(dentry);
331 dput(dentry);
332 }
333
334 /* rename votes don't send link counts */
335 if (!rename) {
336 mlog(0, "new_nlink = %u\n", new_nlink);
337
338 /* We don't have the proper locks here to directly
339 * change i_nlink and besides, the vote is sent
340 * *before* the operation so it may have failed on the
341 * other node. This passes a hint to ocfs2_drop_inode
342 * to force ocfs2_delete_inode, who will take the
343 * proper cluster locks to sort things out. */
344 if (new_nlink == 0) {
345 spin_lock(&oi->ip_lock);
346 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
347 spin_unlock(&OCFS2_I(inode)->ip_lock);
348 }
349 }
350}
351
352static void ocfs2_process_vote(struct ocfs2_super *osb,
353 struct ocfs2_vote_msg *msg)
354{
355 int net_status, vote_response;
356 int orphaned_slot = 0;
357 int rename = 0;
358 unsigned int node_num, generation, new_nlink, namelen;
359 u64 blkno, parent_blkno;
360 enum ocfs2_vote_request request;
361 struct inode *inode = NULL;
362 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
363 struct ocfs2_response_msg response;
364
365 /* decode the network mumbo jumbo into local variables. */
366 request = be32_to_cpu(hdr->h_request);
367 blkno = be64_to_cpu(hdr->h_blkno);
368 generation = be32_to_cpu(hdr->h_generation);
369 node_num = be32_to_cpu(hdr->h_node_num);
370 if (request == OCFS2_VOTE_REQ_DELETE)
371 orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
372
373 mlog(0, "processing vote: request = %u, blkno = %"MLFu64", "
374 "generation = %u, node_num = %u, priv1 = %u\n", request,
375 blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1));
376
377 if (!ocfs2_is_valid_vote_request(request)) {
378 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
379 request, node_num);
380 vote_response = OCFS2_RESPONSE_BAD_MSG;
381 goto respond;
382 }
383
384 vote_response = OCFS2_RESPONSE_OK;
385
386 switch (request) {
387 case OCFS2_VOTE_REQ_UMOUNT:
388 ocfs2_process_umount_request(osb, node_num);
389 goto respond;
390 case OCFS2_VOTE_REQ_MOUNT:
391 ocfs2_process_mount_request(osb, node_num);
392 goto respond;
393 default:
394 /* avoids a gcc warning */
395 break;
396 }
397
398 /* We cannot process the remaining message types before we're
399 * fully mounted. It's perfectly safe however to send a 'yes'
400 * response as we can't possibly have any of the state they're
401 * asking us to modify yet. */
402 if (atomic_read(&osb->vol_state) == VOLUME_INIT)
403 goto respond;
404
405 /* If we get here, then the request is against an inode. */
406 inode = ocfs2_ilookup_for_vote(osb, blkno,
407 request == OCFS2_VOTE_REQ_DELETE);
408
409 /* Not finding the inode is perfectly valid - it means we're
410	 * not interested in what the other node is about to do to it,
411	 * so in those cases we automatically respond with an
412 * affirmative. Cluster locking ensures that we won't race
413 * interest in the inode with this vote request. */
414 if (!inode)
415 goto respond;
416
417 /* Check generation values. It's possible for us to get a
418 * request against a stale inode. If so then we proceed as if
419 * we had not found an inode in the first place. */
420 if (inode->i_generation != generation) {
421 mlog(0, "generation passed %u != inode generation = %u, "
422 "ip_flags = %x, ip_blkno = %"MLFu64", msg %"MLFu64", "
423 "i_count = %u, message type = %u\n",
424 generation, inode->i_generation, OCFS2_I(inode)->ip_flags,
425 OCFS2_I(inode)->ip_blkno, blkno,
426 atomic_read(&inode->i_count), request);
427 iput(inode);
428 inode = NULL;
429 goto respond;
430 }
431
432 switch (request) {
433 case OCFS2_VOTE_REQ_DELETE:
434 vote_response = ocfs2_process_delete_request(inode,
435 &orphaned_slot);
436 break;
437 case OCFS2_VOTE_REQ_RENAME:
438 rename = 1;
439 /* fall through */
440 case OCFS2_VOTE_REQ_UNLINK:
441 parent_blkno = be64_to_cpu(msg->v_unlink_parent);
442 namelen = be32_to_cpu(msg->v_unlink_namelen);
443 /* new_nlink will be ignored in case of a rename vote */
444 new_nlink = be32_to_cpu(msg->md1.v_nlink);
445 ocfs2_process_dentry_request(inode, rename, new_nlink,
446 parent_blkno, namelen,
447 msg->v_unlink_dirent);
448 break;
449 default:
450 mlog(ML_ERROR, "node %u, invalid request: %u\n",
451 node_num, request);
452 vote_response = OCFS2_RESPONSE_BAD_MSG;
453 }
454
455respond:
456	/* Response structure is small so we just put it on the stack
457 * and stuff it inline. */
458 memset(&response, 0, sizeof(struct ocfs2_response_msg));
459 response.r_hdr.h_response_id = hdr->h_response_id;
460 response.r_hdr.h_blkno = hdr->h_blkno;
461 response.r_hdr.h_generation = hdr->h_generation;
462 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
463 response.r_response = cpu_to_be32(vote_response);
464 response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
465
466 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
467 osb->net_key,
468 &response,
469 sizeof(struct ocfs2_response_msg),
470 node_num,
471 NULL);
472	/* We still want to print an error for ENOPROTOOPT here. The
473	 * sending node shouldn't have unregistered its net handler
474	 * without sending an unmount vote first. */
475 if (net_status < 0
476 && net_status != -ETIMEDOUT
477 && net_status != -ENOTCONN)
478 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
479 node_num, net_status);
480
481 if (inode)
482 iput(inode);
483}
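
/*
 * Aside, not from this commit: the decode block above follows the
 * usual wire-format idiom, fixed-width big-endian fields converted
 * exactly once at the message boundary. A minimal, self-contained
 * sketch (struct and names are illustrative):
 */
struct example_hdr {
	__be32 h_request;
	__be64 h_blkno;
};

static void example_decode(const struct example_hdr *hdr,
			   u32 *request, u64 *blkno)
{
	*request = be32_to_cpu(hdr->h_request);	/* host order from here on */
	*blkno = be64_to_cpu(hdr->h_blkno);
}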
484
485static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
486{
487 unsigned long processed;
488 struct ocfs2_lock_res *lockres;
489 struct ocfs2_vote_work *work;
490
491 mlog_entry_void();
492
493 spin_lock(&osb->vote_task_lock);
494 /* grab this early so we know to try again if a state change and
495 * wake happens part-way through our work */
496 osb->vote_work_sequence = osb->vote_wake_sequence;
497
498 processed = osb->blocked_lock_count;
499 while (processed) {
500 BUG_ON(list_empty(&osb->blocked_lock_list));
501
502 lockres = list_entry(osb->blocked_lock_list.next,
503 struct ocfs2_lock_res, l_blocked_list);
504 list_del_init(&lockres->l_blocked_list);
505 osb->blocked_lock_count--;
506 spin_unlock(&osb->vote_task_lock);
507
508 BUG_ON(!processed);
509 processed--;
510
511 ocfs2_process_blocked_lock(osb, lockres);
512
513 spin_lock(&osb->vote_task_lock);
514 }
515
516 while (osb->vote_count) {
517 BUG_ON(list_empty(&osb->vote_list));
518 work = list_entry(osb->vote_list.next,
519 struct ocfs2_vote_work, w_list);
520 list_del(&work->w_list);
521 osb->vote_count--;
522 spin_unlock(&osb->vote_task_lock);
523
524 ocfs2_process_vote(osb, &work->w_msg);
525 kfree(work);
526
527 spin_lock(&osb->vote_task_lock);
528 }
529 spin_unlock(&osb->vote_task_lock);
530
531 mlog_exit_void();
532}
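
/*
 * Aside, not from this commit: both loops above pop one item while
 * holding vote_task_lock, drop the lock for the actual work, then
 * retake it (the first loop additionally bounds itself to a snapshot
 * of the count). The drain shape in isolation, with all names
 * illustrative:
 */
static void example_drain(spinlock_t *lock, struct list_head *list,
			  unsigned long *count,
			  void (*process)(struct list_head *))
{
	struct list_head *item;

	spin_lock(lock);
	while (*count) {
		BUG_ON(list_empty(list));
		item = list->next;
		list_del_init(item);
		(*count)--;
		spin_unlock(lock);

		process(item);	/* may block; the lock is not held */

		spin_lock(lock);
	}
	spin_unlock(lock);
}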
533
534static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
535{
536 int empty = 0;
537
538 spin_lock(&osb->vote_task_lock);
539 if (list_empty(&osb->blocked_lock_list) &&
540 list_empty(&osb->vote_list))
541 empty = 1;
542
543 spin_unlock(&osb->vote_task_lock);
544 return empty;
545}
546
547static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
548{
549 int should_wake = 0;
550
551 spin_lock(&osb->vote_task_lock);
552 if (osb->vote_work_sequence != osb->vote_wake_sequence)
553 should_wake = 1;
554 spin_unlock(&osb->vote_task_lock);
555
556 return should_wake;
557}
558
559int ocfs2_vote_thread(void *arg)
560{
561 int status = 0;
562 struct ocfs2_super *osb = arg;
563
564 /* only quit once we've been asked to stop and there is no more
565 * work available */
566 while (!(kthread_should_stop() &&
567 ocfs2_vote_thread_lists_empty(osb))) {
568
569 wait_event_interruptible(osb->vote_event,
570 ocfs2_vote_thread_should_wake(osb) ||
571 kthread_should_stop());
572
573 mlog(0, "vote_thread: awoken\n");
574
575 ocfs2_vote_thread_do_work(osb);
576 }
577
578 osb->vote_task = NULL;
579 return status;
580}
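
/*
 * Aside, not from this commit: ocfs2_vote_thread() is a standard
 * kthread loop. The start side lives elsewhere in this series and
 * would look roughly like the sketch below; the thread name and the
 * error handling here are assumptions.
 */
static int example_start_vote_thread(struct ocfs2_super *osb)
{
	struct task_struct *t;

	t = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
			osb->node_num);
	if (IS_ERR(t))
		return PTR_ERR(t);

	osb->vote_task = t;
	return 0;
}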
581
582static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
583{
584 struct ocfs2_net_wait_ctxt *w;
585
586 w = kcalloc(1, sizeof(*w), GFP_KERNEL);
587 if (!w) {
588 mlog_errno(-ENOMEM);
589 goto bail;
590 }
591
592 INIT_LIST_HEAD(&w->n_list);
593 init_waitqueue_head(&w->n_event);
594 ocfs2_node_map_init(&w->n_node_map);
595 w->n_response_id = response_id;
596 w->n_callback = NULL;
597bail:
598 return w;
599}
600
601static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
602{
603 unsigned int ret;
604
605 spin_lock(&osb->net_response_lock);
606 ret = ++osb->net_response_ids;
607 spin_unlock(&osb->net_response_lock);
608
609 return ret;
610}
611
612static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
613 struct ocfs2_net_wait_ctxt *w)
614{
615 spin_lock(&osb->net_response_lock);
616 list_del(&w->n_list);
617 spin_unlock(&osb->net_response_lock);
618}
619
620static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
621 struct ocfs2_net_wait_ctxt *w)
622{
623 spin_lock(&osb->net_response_lock);
624 list_add_tail(&w->n_list,
625 &osb->net_response_list);
626 spin_unlock(&osb->net_response_lock);
627}
628
629static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
630 struct ocfs2_net_wait_ctxt *w,
631 int node_num)
632{
633 assert_spin_locked(&osb->net_response_lock);
634
635 ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
636 if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
637 wake_up(&w->n_event);
638}
639
640/* Intended to be called from the node-down callback; we remove the
641 * node from all our response contexts as though it had responded. */
642void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
643 int node_num)
644{
645 struct list_head *p;
646 struct ocfs2_net_wait_ctxt *w = NULL;
647
648 spin_lock(&osb->net_response_lock);
649
650 list_for_each(p, &osb->net_response_list) {
651 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
652
653 __ocfs2_mark_node_responded(osb, w, node_num);
654 }
655
656 spin_unlock(&osb->net_response_lock);
657}
658
659static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
660 struct ocfs2_vote_msg *request,
661 unsigned int response_id,
662 int *response,
663 struct ocfs2_net_response_cb *callback)
664{
665 int status, i, remote_err;
666 struct ocfs2_net_wait_ctxt *w = NULL;
667 int dequeued = 0;
668
669 mlog_entry_void();
670
671 w = ocfs2_new_net_wait_ctxt(response_id);
672 if (!w) {
673 status = -ENOMEM;
674 mlog_errno(status);
675 goto bail;
676 }
677 w->n_callback = callback;
678
679 /* we're pretty much ready to go at this point, and this fills
680 * in n_response which we need anyway... */
681 ocfs2_queue_net_wait_ctxt(osb, w);
682
683 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
684
685 while (i != O2NM_INVALID_NODE_NUM) {
686 if (i != osb->node_num) {
687 mlog(0, "trying to send request to node %i\n", i);
688 ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
689
690 remote_err = 0;
691 status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
692 osb->net_key,
693 request,
694 sizeof(*request),
695 i,
696 &remote_err);
697 if (status == -ETIMEDOUT) {
698 mlog(0, "remote node %d timed out!\n", i);
699 status = -EAGAIN;
700 goto bail;
701 }
702 if (remote_err < 0) {
703 status = remote_err;
704 mlog(0, "remote error %d on node %d!\n",
705 remote_err, i);
706 mlog_errno(status);
707 goto bail;
708 }
709 if (status < 0) {
710 mlog_errno(status);
711 goto bail;
712 }
713 }
714 i++;
715 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
716 mlog(0, "next is %d, i am %d\n", i, osb->node_num);
717 }
718 mlog(0, "done sending, now waiting on responses...\n");
719
720 wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
721
722 ocfs2_dequeue_net_wait_ctxt(osb, w);
723 dequeued = 1;
724
725 *response = w->n_response;
726 status = 0;
727bail:
728 if (w) {
729 if (!dequeued)
730 ocfs2_dequeue_net_wait_ctxt(osb, w);
731 kfree(w);
732 }
733
734 mlog_exit(status);
735 return status;
736}
737
738static struct ocfs2_vote_msg *ocfs2_new_vote_request(struct ocfs2_super *osb,
739 u64 blkno,
740 unsigned int generation,
741 enum ocfs2_vote_request type,
742 u32 priv)
743{
744 struct ocfs2_vote_msg *request;
745 struct ocfs2_msg_hdr *hdr;
746
747 BUG_ON(!ocfs2_is_valid_vote_request(type));
748
749 request = kcalloc(1, sizeof(*request), GFP_KERNEL);
750 if (!request) {
751 mlog_errno(-ENOMEM);
752 } else {
753 hdr = &request->v_hdr;
754 hdr->h_node_num = cpu_to_be32(osb->node_num);
755 hdr->h_request = cpu_to_be32(type);
756 hdr->h_blkno = cpu_to_be64(blkno);
757 hdr->h_generation = cpu_to_be32(generation);
758
759 request->md1.v_generic1 = cpu_to_be32(priv);
760 }
761
762 return request;
763}
764
765/* Complete the buildup of a new vote request and process the
766 * broadcast return value. */
767static int ocfs2_do_request_vote(struct ocfs2_super *osb,
768 struct ocfs2_vote_msg *request,
769 struct ocfs2_net_response_cb *callback)
770{
771 int status, response;
772 unsigned int response_id;
773 struct ocfs2_msg_hdr *hdr;
774
775 response_id = ocfs2_new_response_id(osb);
776
777 hdr = &request->v_hdr;
778 hdr->h_response_id = cpu_to_be32(response_id);
779
780 status = ocfs2_broadcast_vote(osb, request, response_id, &response,
781 callback);
782 if (status < 0) {
783 mlog_errno(status);
784 goto bail;
785 }
786
787 status = response;
788bail:
789
790 return status;
791}
792
793static int ocfs2_request_vote(struct inode *inode,
794 struct ocfs2_vote_msg *request,
795 struct ocfs2_net_response_cb *callback)
796{
797 int status;
798 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
799
800 if (ocfs2_inode_is_new(inode))
801 return 0;
802
803 status = -EAGAIN;
804 while (status == -EAGAIN) {
805 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
806 signal_pending(current))
807 return -ERESTARTSYS;
808
809 status = ocfs2_super_lock(osb, 0);
810 if (status < 0) {
811 mlog_errno(status);
812 break;
813 }
814
815 status = 0;
816 if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
817 osb->node_num))
818 status = ocfs2_do_request_vote(osb, request, callback);
819
820 ocfs2_super_unlock(osb, 0);
821 }
822 return status;
823}
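
/*
 * Aside, not from this commit: the retry shape above (loop while
 * -EAGAIN, bail early on a pending signal unless mounted "nointr")
 * recurs in the mount and umount request paths below. In isolation,
 * with the attempt callback illustrative:
 */
static int example_retry(struct ocfs2_super *osb,
			 int (*attempt)(struct ocfs2_super *))
{
	int status = -EAGAIN;

	while (status == -EAGAIN) {
		if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
		    signal_pending(current))
			return -ERESTARTSYS;

		status = attempt(osb);
	}
	return status;
}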
824
825static void ocfs2_delete_response_cb(void *priv,
826 struct ocfs2_response_msg *resp)
827{
828 int orphaned_slot, node;
829 struct inode *inode = priv;
830
831 orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
832 node = be32_to_cpu(resp->r_hdr.h_node_num);
833 mlog(0, "node %d tells us that inode %"MLFu64" is orphaned in slot "
834 "%d\n", node, OCFS2_I(inode)->ip_blkno, orphaned_slot);
835
836 /* The other node may not actually know which slot the inode
837 * is orphaned in. */
838 if (orphaned_slot == OCFS2_INVALID_SLOT)
839 return;
840
841 /* Ok, the responding node knows which slot this inode is
842 * orphaned in. We verify that the information is correct and
843 * then record this in the inode. ocfs2_delete_inode will use
844 * this information to determine which lock to take. */
845 spin_lock(&OCFS2_I(inode)->ip_lock);
846 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
847 OCFS2_I(inode)->ip_orphaned_slot
848 != OCFS2_INVALID_SLOT, "Inode %"MLFu64": Node %d "
849 "says it's orphaned in slot %d, we think it's in %d\n",
850 OCFS2_I(inode)->ip_blkno,
851 be32_to_cpu(resp->r_hdr.h_node_num),
852 orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
853
854 OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
855 spin_unlock(&OCFS2_I(inode)->ip_lock);
856}
857
858int ocfs2_request_delete_vote(struct inode *inode)
859{
860 int orphaned_slot, status;
861 struct ocfs2_net_response_cb delete_cb;
862 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
863 struct ocfs2_vote_msg *request;
864
865 spin_lock(&OCFS2_I(inode)->ip_lock);
866 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
867 spin_unlock(&OCFS2_I(inode)->ip_lock);
868
869 delete_cb.rc_cb = ocfs2_delete_response_cb;
870 delete_cb.rc_priv = inode;
871
872 mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n",
873 OCFS2_I(inode)->ip_blkno, orphaned_slot);
874
875 status = -ENOMEM;
876 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
877 inode->i_generation,
878 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
879 if (request) {
880 status = ocfs2_request_vote(inode, request, &delete_cb);
881
882 kfree(request);
883 }
884
885 return status;
886}
887
888static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request,
889 struct dentry *dentry)
890{
891 struct inode *parent = dentry->d_parent->d_inode;
892
893 /* We need some values which will uniquely identify a dentry
894 * on the other nodes so that they can find it and run
895 * d_delete against it. Parent directory block and full name
896 * should suffice. */
897
898 mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n",
899 OCFS2_I(parent)->ip_blkno, dentry->d_name.len,
900 dentry->d_name.name);
901
902 request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno);
903 request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len);
904 memcpy(request->v_unlink_dirent, dentry->d_name.name,
905 dentry->d_name.len);
906}
907
908int ocfs2_request_unlink_vote(struct inode *inode,
909 struct dentry *dentry,
910 unsigned int nlink)
911{
912 int status;
913 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
914 struct ocfs2_vote_msg *request;
915
916 if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
917 return -ENAMETOOLONG;
918
919 status = -ENOMEM;
920 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
921 inode->i_generation,
922 OCFS2_VOTE_REQ_UNLINK, nlink);
923 if (request) {
924 ocfs2_setup_unlink_vote(request, dentry);
925
926 status = ocfs2_request_vote(inode, request, NULL);
927
928 kfree(request);
929 }
930 return status;
931}
932
933int ocfs2_request_rename_vote(struct inode *inode,
934 struct dentry *dentry)
935{
936 int status;
937 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
938 struct ocfs2_vote_msg *request;
939
940 if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
941 return -ENAMETOOLONG;
942
943 status = -ENOMEM;
944 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
945 inode->i_generation,
946 OCFS2_VOTE_REQ_RENAME, 0);
947 if (request) {
948 ocfs2_setup_unlink_vote(request, dentry);
949
950 status = ocfs2_request_vote(inode, request, NULL);
951
952 kfree(request);
953 }
954 return status;
955}
956
957int ocfs2_request_mount_vote(struct ocfs2_super *osb)
958{
959 int status;
960 struct ocfs2_vote_msg *request = NULL;
961
962 request = ocfs2_new_vote_request(osb, 0ULL, 0,
963 OCFS2_VOTE_REQ_MOUNT, 0);
964 if (!request) {
965 status = -ENOMEM;
966 goto bail;
967 }
968
969 status = -EAGAIN;
970 while (status == -EAGAIN) {
971 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
972 signal_pending(current)) {
973 status = -ERESTARTSYS;
974 goto bail;
975 }
976
977 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
978 osb->node_num)) {
979 status = 0;
980 goto bail;
981 }
982
983 status = ocfs2_do_request_vote(osb, request, NULL);
984 }
985
986bail:
987 if (request)
988 kfree(request);
989
990 return status;
991}
992
993int ocfs2_request_umount_vote(struct ocfs2_super *osb)
994{
995 int status;
996 struct ocfs2_vote_msg *request = NULL;
997
998 request = ocfs2_new_vote_request(osb, 0ULL, 0,
999 OCFS2_VOTE_REQ_UMOUNT, 0);
1000 if (!request) {
1001 status = -ENOMEM;
1002 goto bail;
1003 }
1004
1005 status = -EAGAIN;
1006 while (status == -EAGAIN) {
1007 /* Do not check signals on this vote... We really want
1008 * this one to go all the way through. */
1009
1010 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
1011 osb->node_num)) {
1012 status = 0;
1013 goto bail;
1014 }
1015
1016 status = ocfs2_do_request_vote(osb, request, NULL);
1017 }
1018
1019bail:
1020 if (request)
1021 kfree(request);
1022
1023 return status;
1024}
1025
1026/* TODO: This should eventually be a hash table! */
1027static struct ocfs2_net_wait_ctxt *__ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
1028 u32 response_id)
1029{
1030 struct list_head *p;
1031 struct ocfs2_net_wait_ctxt *w = NULL;
1032
1033 list_for_each(p, &osb->net_response_list) {
1034 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
1035 if (response_id == w->n_response_id)
1036 break;
1037 w = NULL;
1038 }
1039
1040 return w;
1041}
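
/*
 * Aside, not from this commit: on the TODO above, a hash table would
 * keep the same external shape. A hedged sketch using the 2.6-era
 * hlist API; the bucket count and the n_hash member are assumptions,
 * not fields this commit defines:
 */
#define EXAMPLE_RESP_HASH_SIZE 64

static struct hlist_head example_resp_hash[EXAMPLE_RESP_HASH_SIZE];

static struct ocfs2_net_wait_ctxt *example_find(u32 response_id)
{
	struct hlist_head *bucket =
		&example_resp_hash[response_id & (EXAMPLE_RESP_HASH_SIZE - 1)];
	struct hlist_node *n;
	struct ocfs2_net_wait_ctxt *w;

	/* assumes a struct hlist_node n_hash member in the ctxt */
	hlist_for_each_entry(w, n, bucket, n_hash)
		if (w->n_response_id == response_id)
			return w;

	return NULL;
}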
1042
1043/* Translate response codes into local node errno values */
1044static inline int ocfs2_translate_response(int response)
1045{
1046 int ret;
1047
1048 switch (response) {
1049 case OCFS2_RESPONSE_OK:
1050 ret = 0;
1051 break;
1052
1053 case OCFS2_RESPONSE_BUSY:
1054 ret = -EBUSY;
1055 break;
1056
1057 default:
1058 ret = -EINVAL;
1059 }
1060
1061 return ret;
1062}
1063
1064static int ocfs2_handle_response_message(struct o2net_msg *msg,
1065 u32 len,
1066 void *data)
1067{
1068 unsigned int response_id, node_num;
1069 int response_status;
1070 struct ocfs2_super *osb = data;
1071 struct ocfs2_response_msg *resp;
1072	struct ocfs2_net_wait_ctxt *w;
1073 struct ocfs2_net_response_cb *resp_cb;
1074
1075 resp = (struct ocfs2_response_msg *) msg->buf;
1076
1077 response_id = be32_to_cpu(resp->r_hdr.h_response_id);
1078 node_num = be32_to_cpu(resp->r_hdr.h_node_num);
1079 response_status =
1080 ocfs2_translate_response(be32_to_cpu(resp->r_response));
1081
1082 mlog(0, "received response message:\n");
1083 mlog(0, "h_response_id = %u\n", response_id);
1084 mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
1085 mlog(0, "h_blkno = %"MLFu64"\n", be64_to_cpu(resp->r_hdr.h_blkno));
1086 mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
1087 mlog(0, "h_node_num = %u\n", node_num);
1088 mlog(0, "r_response = %d\n", response_status);
1089
1090 spin_lock(&osb->net_response_lock);
1091 w = __ocfs2_find_net_wait_ctxt(osb, response_id);
1092 if (!w) {
1093 mlog(0, "request not found!\n");
1094 goto bail;
1095 }
1096 resp_cb = w->n_callback;
1097
1098 if (response_status && (!w->n_response)) {
1099 /* we only really need one negative response so don't
1100 * set it twice. */
1101 w->n_response = response_status;
1102 }
1103
1104 if (resp_cb) {
1105 spin_unlock(&osb->net_response_lock);
1106
1107 resp_cb->rc_cb(resp_cb->rc_priv, resp);
1108
1109 spin_lock(&osb->net_response_lock);
1110 }
1111
1112 __ocfs2_mark_node_responded(osb, w, node_num);
1113bail:
1114 spin_unlock(&osb->net_response_lock);
1115
1116 return 0;
1117}
1118
1119static int ocfs2_handle_vote_message(struct o2net_msg *msg,
1120 u32 len,
1121 void *data)
1122{
1123 int status;
1124 struct ocfs2_super *osb = data;
1125 struct ocfs2_vote_work *work;
1126
1127 work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL);
1128 if (!work) {
1129 status = -ENOMEM;
1130 mlog_errno(status);
1131 goto bail;
1132 }
1133
1134 INIT_LIST_HEAD(&work->w_list);
1135 memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
1136
1137 mlog(0, "scheduling vote request:\n");
1138 mlog(0, "h_response_id = %u\n",
1139 be32_to_cpu(work->w_msg.v_hdr.h_response_id));
1140 mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
1141 mlog(0, "h_blkno = %"MLFu64"\n",
1142 be64_to_cpu(work->w_msg.v_hdr.h_blkno));
1143 mlog(0, "h_generation = %u\n",
1144 be32_to_cpu(work->w_msg.v_hdr.h_generation));
1145 mlog(0, "h_node_num = %u\n",
1146 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
1147 mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
1148
1149 spin_lock(&osb->vote_task_lock);
1150 list_add_tail(&work->w_list, &osb->vote_list);
1151 osb->vote_count++;
1152 spin_unlock(&osb->vote_task_lock);
1153
1154 ocfs2_kick_vote_thread(osb);
1155
1156 status = 0;
1157bail:
1158 return status;
1159}
1160
1161void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
1162{
1163 if (!osb->net_key)
1164 return;
1165
1166 o2net_unregister_handler_list(&osb->osb_net_handlers);
1167
1168 if (!list_empty(&osb->net_response_list))
1169 mlog(ML_ERROR, "net response list not empty!\n");
1170
1171 osb->net_key = 0;
1172}
1173
1174int ocfs2_register_net_handlers(struct ocfs2_super *osb)
1175{
1176 int status = 0;
1177
1178 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
1179 osb->net_key,
1180 sizeof(struct ocfs2_response_msg),
1181 ocfs2_handle_response_message,
1182 osb, &osb->osb_net_handlers);
1183 if (status) {
1184 mlog_errno(status);
1185 goto bail;
1186 }
1187
1188 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
1189 osb->net_key,
1190 sizeof(struct ocfs2_vote_msg),
1191 ocfs2_handle_vote_message,
1192 osb, &osb->osb_net_handlers);
1193 if (status) {
1194 mlog_errno(status);
1195 goto bail;
1196 }
1197bail:
1198 if (status < 0)
1199 ocfs2_unregister_net_handlers(osb);
1200
1201 return status;
1202}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
new file mode 100644
index 00000000000..9cce6070346
--- /dev/null
+++ b/fs/ocfs2/vote.h
@@ -0,0 +1,56 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.h
5 *
6 * Cluster vote thread and vote request declarations.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26
27#ifndef VOTE_H
28#define VOTE_H
29
30int ocfs2_vote_thread(void *arg);
31static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
32{
33 spin_lock(&osb->vote_task_lock);
34 /* make sure the voting thread gets a swipe at whatever changes
35 * the caller may have made to the voting state */
36 osb->vote_wake_sequence++;
37 spin_unlock(&osb->vote_task_lock);
38 wake_up(&osb->vote_event);
39}
40
41int ocfs2_request_delete_vote(struct inode *inode);
42int ocfs2_request_unlink_vote(struct inode *inode,
43 struct dentry *dentry,
44 unsigned int nlink);
45int ocfs2_request_rename_vote(struct inode *inode,
46 struct dentry *dentry);
47int ocfs2_request_mount_vote(struct ocfs2_super *osb);
48int ocfs2_request_umount_vote(struct ocfs2_super *osb);
49int ocfs2_register_net_handlers(struct ocfs2_super *osb);
50void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
51
52void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
53
54void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
55 int node_num);
56#endif
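
/*
 * Aside, not from this commit: a hedged sketch of how a caller drives
 * the vote.h interface from an unlink path. ocfs2's real unlink is in
 * namei.c and differs in detail; everything here beyond the
 * prototypes above is an assumption.
 */
static int example_unlink_notify(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;
	int status;

	/* ask the other nodes to prune their dentry; pass the link
	 * count the inode will have once this unlink succeeds */
	status = ocfs2_request_unlink_vote(inode, dentry,
					   inode->i_nlink - 1);
	if (status < 0)
		mlog_errno(status);

	return status;
}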
diff --git a/fs/open.c b/fs/open.c
index f53a5b9ffb7..70e0230d8e7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -16,9 +16,11 @@
16#include <linux/tty.h> 16#include <linux/tty.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/backing-dev.h> 18#include <linux/backing-dev.h>
19#include <linux/capability.h>
19#include <linux/security.h> 20#include <linux/security.h>
20#include <linux/mount.h> 21#include <linux/mount.h>
21#include <linux/vfs.h> 22#include <linux/vfs.h>
23#include <linux/fcntl.h>
22#include <asm/uaccess.h> 24#include <asm/uaccess.h>
23#include <linux/fs.h> 25#include <linux/fs.h>
24#include <linux/personality.h> 26#include <linux/personality.h>
@@ -194,7 +196,8 @@ out:
194 return error; 196 return error;
195} 197}
196 198
197int do_truncate(struct dentry *dentry, loff_t length, struct file *filp) 199int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
200 struct file *filp)
198{ 201{
199 int err; 202 int err;
200 struct iattr newattrs; 203 struct iattr newattrs;
@@ -204,19 +207,19 @@ int do_truncate(struct dentry *dentry, loff_t length, struct file *filp)
204 return -EINVAL; 207 return -EINVAL;
205 208
206 newattrs.ia_size = length; 209 newattrs.ia_size = length;
207 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; 210 newattrs.ia_valid = ATTR_SIZE | time_attrs;
208 if (filp) { 211 if (filp) {
209 newattrs.ia_file = filp; 212 newattrs.ia_file = filp;
210 newattrs.ia_valid |= ATTR_FILE; 213 newattrs.ia_valid |= ATTR_FILE;
211 } 214 }
212 215
213 down(&dentry->d_inode->i_sem); 216 mutex_lock(&dentry->d_inode->i_mutex);
214 err = notify_change(dentry, &newattrs); 217 err = notify_change(dentry, &newattrs);
215 up(&dentry->d_inode->i_sem); 218 mutex_unlock(&dentry->d_inode->i_mutex);
216 return err; 219 return err;
217} 220}
218 221
219static inline long do_sys_truncate(const char __user * path, loff_t length) 222static long do_sys_truncate(const char __user * path, loff_t length)
220{ 223{
221 struct nameidata nd; 224 struct nameidata nd;
222 struct inode * inode; 225 struct inode * inode;
@@ -266,7 +269,7 @@ static inline long do_sys_truncate(const char __user * path, loff_t length)
266 error = locks_verify_truncate(inode, NULL, length); 269 error = locks_verify_truncate(inode, NULL, length);
267 if (!error) { 270 if (!error) {
268 DQUOT_INIT(inode); 271 DQUOT_INIT(inode);
269 error = do_truncate(nd.dentry, length, NULL); 272 error = do_truncate(nd.dentry, length, 0, NULL);
270 } 273 }
271 put_write_access(inode); 274 put_write_access(inode);
272 275
@@ -282,7 +285,7 @@ asmlinkage long sys_truncate(const char __user * path, unsigned long length)
282 return do_sys_truncate(path, (long)length); 285 return do_sys_truncate(path, (long)length);
283} 286}
284 287
285static inline long do_sys_ftruncate(unsigned int fd, loff_t length, int small) 288static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
286{ 289{
287 struct inode * inode; 290 struct inode * inode;
288 struct dentry *dentry; 291 struct dentry *dentry;
@@ -318,7 +321,7 @@ static inline long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
318 321
319 error = locks_verify_truncate(inode, file, length); 322 error = locks_verify_truncate(inode, file, length);
320 if (!error) 323 if (!error)
321 error = do_truncate(dentry, length, file); 324 error = do_truncate(dentry, length, 0, file);
322out_putf: 325out_putf:
323 fput(file); 326 fput(file);
324out: 327out:
@@ -381,7 +384,7 @@ asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times)
381 384
382 error = get_user(newattrs.ia_atime.tv_sec, &times->actime); 385 error = get_user(newattrs.ia_atime.tv_sec, &times->actime);
383 newattrs.ia_atime.tv_nsec = 0; 386 newattrs.ia_atime.tv_nsec = 0;
384 if (!error) 387 if (!error)
385 error = get_user(newattrs.ia_mtime.tv_sec, &times->modtime); 388 error = get_user(newattrs.ia_mtime.tv_sec, &times->modtime);
386 newattrs.ia_mtime.tv_nsec = 0; 389 newattrs.ia_mtime.tv_nsec = 0;
387 if (error) 390 if (error)
@@ -397,9 +400,9 @@ asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times)
397 (error = vfs_permission(&nd, MAY_WRITE)) != 0) 400 (error = vfs_permission(&nd, MAY_WRITE)) != 0)
398 goto dput_and_out; 401 goto dput_and_out;
399 } 402 }
400 down(&inode->i_sem); 403 mutex_lock(&inode->i_mutex);
401 error = notify_change(nd.dentry, &newattrs); 404 error = notify_change(nd.dentry, &newattrs);
402 up(&inode->i_sem); 405 mutex_unlock(&inode->i_mutex);
403dput_and_out: 406dput_and_out:
404 path_release(&nd); 407 path_release(&nd);
405out: 408out:
@@ -412,14 +415,14 @@ out:
412 * must be owner or have write permission. 415 * must be owner or have write permission.
413 * Else, update from *times, must be owner or super user. 416 * Else, update from *times, must be owner or super user.
414 */ 417 */
415long do_utimes(char __user * filename, struct timeval * times) 418long do_utimes(int dfd, char __user *filename, struct timeval *times)
416{ 419{
417 int error; 420 int error;
418 struct nameidata nd; 421 struct nameidata nd;
419 struct inode * inode; 422 struct inode * inode;
420 struct iattr newattrs; 423 struct iattr newattrs;
421 424
422 error = user_path_walk(filename, &nd); 425 error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd);
423 426
424 if (error) 427 if (error)
425 goto out; 428 goto out;
@@ -450,22 +453,27 @@ long do_utimes(char __user * filename, struct timeval * times)
450 (error = vfs_permission(&nd, MAY_WRITE)) != 0) 453 (error = vfs_permission(&nd, MAY_WRITE)) != 0)
451 goto dput_and_out; 454 goto dput_and_out;
452 } 455 }
453 down(&inode->i_sem); 456 mutex_lock(&inode->i_mutex);
454 error = notify_change(nd.dentry, &newattrs); 457 error = notify_change(nd.dentry, &newattrs);
455 up(&inode->i_sem); 458 mutex_unlock(&inode->i_mutex);
456dput_and_out: 459dput_and_out:
457 path_release(&nd); 460 path_release(&nd);
458out: 461out:
459 return error; 462 return error;
460} 463}
461 464
462asmlinkage long sys_utimes(char __user * filename, struct timeval __user * utimes) 465asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __user *utimes)
463{ 466{
464 struct timeval times[2]; 467 struct timeval times[2];
465 468
466 if (utimes && copy_from_user(&times, utimes, sizeof(times))) 469 if (utimes && copy_from_user(&times, utimes, sizeof(times)))
467 return -EFAULT; 470 return -EFAULT;
468 return do_utimes(filename, utimes ? times : NULL); 471 return do_utimes(dfd, filename, utimes ? times : NULL);
472}
473
474asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes)
475{
476 return sys_futimesat(AT_FDCWD, filename, utimes);
469} 477}
470 478
471 479
@@ -474,7 +482,7 @@ asmlinkage long sys_utimes(char __user * filename, struct timeval __user * utime
474 * We do this by temporarily clearing all FS-related capabilities and 482 * We do this by temporarily clearing all FS-related capabilities and
475 * switching the fsuid/fsgid around to the real ones. 483 * switching the fsuid/fsgid around to the real ones.
476 */ 484 */
477asmlinkage long sys_access(const char __user * filename, int mode) 485asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
478{ 486{
479 struct nameidata nd; 487 struct nameidata nd;
480 int old_fsuid, old_fsgid; 488 int old_fsuid, old_fsgid;
@@ -504,7 +512,7 @@ asmlinkage long sys_access(const char __user * filename, int mode)
504 else 512 else
505 current->cap_effective = current->cap_permitted; 513 current->cap_effective = current->cap_permitted;
506 514
507 res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); 515 res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
508 if (!res) { 516 if (!res) {
509 res = vfs_permission(&nd, mode); 517 res = vfs_permission(&nd, mode);
510 /* SuS v2 requires we report a read only fs too */ 518 /* SuS v2 requires we report a read only fs too */
@@ -521,6 +529,11 @@ asmlinkage long sys_access(const char __user * filename, int mode)
521 return res; 529 return res;
522} 530}
523 531
532asmlinkage long sys_access(const char __user *filename, int mode)
533{
534 return sys_faccessat(AT_FDCWD, filename, mode);
535}
536
524asmlinkage long sys_chdir(const char __user * filename) 537asmlinkage long sys_chdir(const char __user * filename)
525{ 538{
526 struct nameidata nd; 539 struct nameidata nd;
@@ -619,13 +632,13 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
619 err = -EPERM; 632 err = -EPERM;
620 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 633 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
621 goto out_putf; 634 goto out_putf;
622 down(&inode->i_sem); 635 mutex_lock(&inode->i_mutex);
623 if (mode == (mode_t) -1) 636 if (mode == (mode_t) -1)
624 mode = inode->i_mode; 637 mode = inode->i_mode;
625 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); 638 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
626 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 639 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
627 err = notify_change(dentry, &newattrs); 640 err = notify_change(dentry, &newattrs);
628 up(&inode->i_sem); 641 mutex_unlock(&inode->i_mutex);
629 642
630out_putf: 643out_putf:
631 fput(file); 644 fput(file);
@@ -633,14 +646,15 @@ out:
633 return err; 646 return err;
634} 647}
635 648
636asmlinkage long sys_chmod(const char __user * filename, mode_t mode) 649asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
650 mode_t mode)
637{ 651{
638 struct nameidata nd; 652 struct nameidata nd;
639 struct inode * inode; 653 struct inode * inode;
640 int error; 654 int error;
641 struct iattr newattrs; 655 struct iattr newattrs;
642 656
643 error = user_path_walk(filename, &nd); 657 error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd);
644 if (error) 658 if (error)
645 goto out; 659 goto out;
646 inode = nd.dentry->d_inode; 660 inode = nd.dentry->d_inode;
@@ -653,13 +667,13 @@ asmlinkage long sys_chmod(const char __user * filename, mode_t mode)
653 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 667 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
654 goto dput_and_out; 668 goto dput_and_out;
655 669
656 down(&inode->i_sem); 670 mutex_lock(&inode->i_mutex);
657 if (mode == (mode_t) -1) 671 if (mode == (mode_t) -1)
658 mode = inode->i_mode; 672 mode = inode->i_mode;
659 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); 673 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
660 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 674 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
661 error = notify_change(nd.dentry, &newattrs); 675 error = notify_change(nd.dentry, &newattrs);
662 up(&inode->i_sem); 676 mutex_unlock(&inode->i_mutex);
663 677
664dput_and_out: 678dput_and_out:
665 path_release(&nd); 679 path_release(&nd);
@@ -667,6 +681,11 @@ out:
667 return error; 681 return error;
668} 682}
669 683
684asmlinkage long sys_chmod(const char __user *filename, mode_t mode)
685{
686 return sys_fchmodat(AT_FDCWD, filename, mode);
687}
688
670static int chown_common(struct dentry * dentry, uid_t user, gid_t group) 689static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
671{ 690{
672 struct inode * inode; 691 struct inode * inode;
@@ -695,9 +714,9 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
695 } 714 }
696 if (!S_ISDIR(inode->i_mode)) 715 if (!S_ISDIR(inode->i_mode))
697 newattrs.ia_valid |= ATTR_KILL_SUID|ATTR_KILL_SGID; 716 newattrs.ia_valid |= ATTR_KILL_SUID|ATTR_KILL_SGID;
698 down(&inode->i_sem); 717 mutex_lock(&inode->i_mutex);
699 error = notify_change(dentry, &newattrs); 718 error = notify_change(dentry, &newattrs);
700 up(&inode->i_sem); 719 mutex_unlock(&inode->i_mutex);
701out: 720out:
702 return error; 721 return error;
703} 722}
@@ -715,6 +734,26 @@ asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
715 return error; 734 return error;
716} 735}
717 736
737asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
738 gid_t group, int flag)
739{
740 struct nameidata nd;
741 int error = -EINVAL;
742 int follow;
743
744 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
745 goto out;
746
747 follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
748 error = __user_walk_fd(dfd, filename, follow, &nd);
749 if (!error) {
750 error = chown_common(nd.dentry, user, group);
751 path_release(&nd);
752 }
753out:
754 return error;
755}
756
718asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group) 757asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group)
719{ 758{
720 struct nameidata nd; 759 struct nameidata nd;
@@ -818,7 +857,8 @@ cleanup_file:
818 * for the internal routines (ie open_namei()/follow_link() etc). 00 is 857 * for the internal routines (ie open_namei()/follow_link() etc). 00 is
819 * used by symlinks. 858 * used by symlinks.
820 */ 859 */
821struct file *filp_open(const char * filename, int flags, int mode) 860static struct file *do_filp_open(int dfd, const char *filename, int flags,
861 int mode)
822{ 862{
823 int namei_flags, error; 863 int namei_flags, error;
824 struct nameidata nd; 864 struct nameidata nd;
@@ -827,12 +867,17 @@ struct file *filp_open(const char * filename, int flags, int mode)
827 if ((namei_flags+1) & O_ACCMODE) 867 if ((namei_flags+1) & O_ACCMODE)
828 namei_flags++; 868 namei_flags++;
829 869
830 error = open_namei(filename, namei_flags, mode, &nd); 870 error = open_namei(dfd, filename, namei_flags, mode, &nd);
831 if (!error) 871 if (!error)
832 return nameidata_to_filp(&nd, flags); 872 return nameidata_to_filp(&nd, flags);
833 873
834 return ERR_PTR(error); 874 return ERR_PTR(error);
835} 875}
876
877struct file *filp_open(const char *filename, int flags, int mode)
878{
879 return do_filp_open(AT_FDCWD, filename, flags, mode);
880}
836EXPORT_SYMBOL(filp_open); 881EXPORT_SYMBOL(filp_open);
837 882
838/** 883/**
@@ -970,7 +1015,7 @@ out:
970 1015
971EXPORT_SYMBOL(get_unused_fd); 1016EXPORT_SYMBOL(get_unused_fd);
972 1017
973static inline void __put_unused_fd(struct files_struct *files, unsigned int fd) 1018static void __put_unused_fd(struct files_struct *files, unsigned int fd)
974{ 1019{
975 struct fdtable *fdt = files_fdtable(files); 1020 struct fdtable *fdt = files_fdtable(files);
976 __FD_CLR(fd, fdt->open_fds); 1021 __FD_CLR(fd, fdt->open_fds);
@@ -989,7 +1034,7 @@ void fastcall put_unused_fd(unsigned int fd)
989EXPORT_SYMBOL(put_unused_fd); 1034EXPORT_SYMBOL(put_unused_fd);
990 1035
991/* 1036/*
992 * Install a file pointer in the fd array. 1037 * Install a file pointer in the fd array.
993 * 1038 *
994 * The VFS is full of places where we drop the files lock between 1039 * The VFS is full of places where we drop the files lock between
995 * setting the open_fds bitmap and installing the file in the file 1040 * setting the open_fds bitmap and installing the file in the file
@@ -1014,7 +1059,7 @@ void fastcall fd_install(unsigned int fd, struct file * file)
1014 1059
1015EXPORT_SYMBOL(fd_install); 1060EXPORT_SYMBOL(fd_install);
1016 1061
1017long do_sys_open(const char __user *filename, int flags, int mode) 1062long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
1018{ 1063{
1019 char *tmp = getname(filename); 1064 char *tmp = getname(filename);
1020 int fd = PTR_ERR(tmp); 1065 int fd = PTR_ERR(tmp);
@@ -1022,7 +1067,7 @@ long do_sys_open(const char __user *filename, int flags, int mode)
1022 if (!IS_ERR(tmp)) { 1067 if (!IS_ERR(tmp)) {
1023 fd = get_unused_fd(); 1068 fd = get_unused_fd();
1024 if (fd >= 0) { 1069 if (fd >= 0) {
1025 struct file *f = filp_open(tmp, flags, mode); 1070 struct file *f = do_filp_open(dfd, tmp, flags, mode);
1026 if (IS_ERR(f)) { 1071 if (IS_ERR(f)) {
1027 put_unused_fd(fd); 1072 put_unused_fd(fd);
1028 fd = PTR_ERR(f); 1073 fd = PTR_ERR(f);
@@ -1041,10 +1086,20 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode)
1041 if (force_o_largefile()) 1086 if (force_o_largefile())
1042 flags |= O_LARGEFILE; 1087 flags |= O_LARGEFILE;
1043 1088
1044 return do_sys_open(filename, flags, mode); 1089 return do_sys_open(AT_FDCWD, filename, flags, mode);
1045} 1090}
1046EXPORT_SYMBOL_GPL(sys_open); 1091EXPORT_SYMBOL_GPL(sys_open);
1047 1092
1093asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
1094 int mode)
1095{
1096 if (force_o_largefile())
1097 flags |= O_LARGEFILE;
1098
1099 return do_sys_open(dfd, filename, flags, mode);
1100}
1101EXPORT_SYMBOL_GPL(sys_openat);
1102
1048#ifndef __alpha__ 1103#ifndef __alpha__
1049 1104
1050/* 1105/*
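
/*
 * Aside, not from this commit: every new *at() entry point above
 * follows one pattern -- the classic syscall becomes a wrapper that
 * passes AT_FDCWD. From userspace the relative-to-directory behaviour
 * looks like this (illustrative only):
 */
#include <fcntl.h>

int example_open_in_dir(int dirfd, const char *name)
{
	/* resolves "name" relative to dirfd instead of the current
	 * working directory; dirfd == AT_FDCWD restores the old
	 * sys_open behaviour */
	return openat(dirfd, name, O_RDONLY);
}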
diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig
index deb25b661f0..c9a47809928 100644
--- a/fs/partitions/Kconfig
+++ b/fs/partitions/Kconfig
@@ -21,26 +21,30 @@ config ACORN_PARTITION
21 Support hard disks partitioned under Acorn operating systems. 21 Support hard disks partitioned under Acorn operating systems.
22 22
23config ACORN_PARTITION_CUMANA 23config ACORN_PARTITION_CUMANA
24 bool "Cumana partition support" if PARTITION_ADVANCED && ACORN_PARTITION 24 bool "Cumana partition support" if PARTITION_ADVANCED
25 default y if ARCH_ACORN 25 default y if ARCH_ACORN
26 depends on ACORN_PARTITION
26 help 27 help
27 Say Y here if you would like to use hard disks under Linux which 28 Say Y here if you would like to use hard disks under Linux which
28 were partitioned using the Cumana interface on Acorn machines. 29 were partitioned using the Cumana interface on Acorn machines.
29 30
30config ACORN_PARTITION_EESOX 31config ACORN_PARTITION_EESOX
31 bool "EESOX partition support" if PARTITION_ADVANCED && ACORN_PARTITION 32 bool "EESOX partition support" if PARTITION_ADVANCED
32 default y if ARCH_ACORN 33 default y if ARCH_ACORN
34 depends on ACORN_PARTITION
33 35
34config ACORN_PARTITION_ICS 36config ACORN_PARTITION_ICS
35 bool "ICS partition support" if PARTITION_ADVANCED && ACORN_PARTITION 37 bool "ICS partition support" if PARTITION_ADVANCED
36 default y if ARCH_ACORN 38 default y if ARCH_ACORN
39 depends on ACORN_PARTITION
37 help 40 help
38 Say Y here if you would like to use hard disks under Linux which 41 Say Y here if you would like to use hard disks under Linux which
39 were partitioned using the ICS interface on Acorn machines. 42 were partitioned using the ICS interface on Acorn machines.
40 43
41config ACORN_PARTITION_ADFS 44config ACORN_PARTITION_ADFS
42 bool "Native filecore partition support" if PARTITION_ADVANCED && ACORN_PARTITION 45 bool "Native filecore partition support" if PARTITION_ADVANCED
43 default y if ARCH_ACORN 46 default y if ARCH_ACORN
47 depends on ACORN_PARTITION
44 help 48 help
45 The Acorn Disc Filing System is the standard file system of the 49 The Acorn Disc Filing System is the standard file system of the
46 RiscOS operating system which runs on Acorn's ARM-based Risc PC 50 RiscOS operating system which runs on Acorn's ARM-based Risc PC
@@ -48,15 +52,17 @@ config ACORN_PARTITION_ADFS
48 `Y' here, Linux will support disk partitions created under ADFS. 52 `Y' here, Linux will support disk partitions created under ADFS.
49 53
50config ACORN_PARTITION_POWERTEC 54config ACORN_PARTITION_POWERTEC
51 bool "PowerTec partition support" if PARTITION_ADVANCED && ACORN_PARTITION 55 bool "PowerTec partition support" if PARTITION_ADVANCED
52 default y if ARCH_ACORN 56 default y if ARCH_ACORN
57 depends on ACORN_PARTITION
53 help 58 help
54 Support reading partition tables created on Acorn machines using 59 Support reading partition tables created on Acorn machines using
55 the PowerTec SCSI drive. 60 the PowerTec SCSI drive.
56 61
57config ACORN_PARTITION_RISCIX 62config ACORN_PARTITION_RISCIX
58 bool "RISCiX partition support" if PARTITION_ADVANCED && ACORN_PARTITION 63 bool "RISCiX partition support" if PARTITION_ADVANCED
59 default y if ARCH_ACORN 64 default y if ARCH_ACORN
65 depends on ACORN_PARTITION
60 help 66 help
61 Once upon a time, there was a native Unix port for the Acorn series 67 Once upon a time, there was a native Unix port for the Acorn series
62 of machines called RISCiX. If you say 'Y' here, Linux will be able 68 of machines called RISCiX. If you say 'Y' here, Linux will be able
@@ -85,7 +91,7 @@ config ATARI_PARTITION
85 91
86config IBM_PARTITION 92config IBM_PARTITION
87 bool "IBM disk label and partition support" 93 bool "IBM disk label and partition support"
88 depends on PARTITION_ADVANCED && ARCH_S390 94 depends on PARTITION_ADVANCED && S390
89 help 95 help
90 Say Y here if you would like to be able to read the hard disk 96 Say Y here if you would like to be able to read the hard disk
91 partition table format used by IBM DASD disks operating under CMS. 97 partition table format used by IBM DASD disks operating under CMS.
@@ -203,7 +209,7 @@ config ULTRIX_PARTITION
203 209
204config SUN_PARTITION 210config SUN_PARTITION
205 bool "Sun partition tables support" if PARTITION_ADVANCED 211 bool "Sun partition tables support" if PARTITION_ADVANCED
206 default y if (SPARC32 || SPARC64 || SUN3 || SUN3X) 212 default y if (SPARC || SUN3 || SUN3X)
207 ---help--- 213 ---help---
208 Like most systems, SunOS uses its own hard disk partition table 214 Like most systems, SunOS uses its own hard disk partition table
209 format, incompatible with all others. Saying Y here allows you to 215 format, incompatible with all others. Saying Y here allows you to
@@ -216,6 +222,13 @@ config SUN_PARTITION
216 given by the tar program ("man tar" or preferably "info tar"). If 222 given by the tar program ("man tar" or preferably "info tar"). If
217 you don't know what all this is about, say N. 223 you don't know what all this is about, say N.
218 224
225config KARMA_PARTITION
226 bool "Karma Partition support"
227 depends on PARTITION_ADVANCED
228 help
229 Say Y here if you would like to mount the Rio Karma MP3 player, as it
230 uses a proprietary partition table.
231
219config EFI_PARTITION 232config EFI_PARTITION
220 bool "EFI GUID Partition support" 233 bool "EFI GUID Partition support"
221 depends on PARTITION_ADVANCED 234 depends on PARTITION_ADVANCED
@@ -224,5 +237,3 @@ config EFI_PARTITION
224 Say Y here if you would like to use hard disks under Linux which 237 Say Y here if you would like to use hard disks under Linux which
225 were partitioned using EFI GPT. Presently only useful on the 238 were partitioned using EFI GPT. Presently only useful on the
226 IA-64 platform. 239 IA-64 platform.
227
228# define_bool CONFIG_ACORN_PARTITION_CUMANA y
diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile
index 66d5cc26faf..42c7d3878ed 100644
--- a/fs/partitions/Makefile
+++ b/fs/partitions/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_SUN_PARTITION) += sun.o
17obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o 17obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o
18obj-$(CONFIG_IBM_PARTITION) += ibm.o 18obj-$(CONFIG_IBM_PARTITION) += ibm.o
19obj-$(CONFIG_EFI_PARTITION) += efi.o 19obj-$(CONFIG_EFI_PARTITION) += efi.o
20obj-$(CONFIG_KARMA_PARTITION) += karma.o
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8dc1822a702..f924f459bdb 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -35,6 +35,7 @@
35#include "ibm.h" 35#include "ibm.h"
36#include "ultrix.h" 36#include "ultrix.h"
37#include "efi.h" 37#include "efi.h"
38#include "karma.h"
38 39
39#ifdef CONFIG_BLK_DEV_MD 40#ifdef CONFIG_BLK_DEV_MD
40extern void md_autodetect_dev(dev_t dev); 41extern void md_autodetect_dev(dev_t dev);
@@ -103,6 +104,9 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) =
103#ifdef CONFIG_IBM_PARTITION 104#ifdef CONFIG_IBM_PARTITION
104 ibm_partition, 105 ibm_partition,
105#endif 106#endif
107#ifdef CONFIG_KARMA_PARTITION
108 karma_partition,
109#endif
106 NULL 110 NULL
107}; 111};
108 112
@@ -226,7 +230,7 @@ static struct sysfs_ops part_sysfs_ops = {
226static ssize_t part_uevent_store(struct hd_struct * p, 230static ssize_t part_uevent_store(struct hd_struct * p,
227 const char *page, size_t count) 231 const char *page, size_t count)
228{ 232{
229 kobject_hotplug(&p->kobj, KOBJ_ADD); 233 kobject_uevent(&p->kobj, KOBJ_ADD);
230 return count; 234 return count;
231} 235}
232static ssize_t part_dev_read(struct hd_struct * p, char *page) 236static ssize_t part_dev_read(struct hd_struct * p, char *page)
@@ -336,12 +340,31 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
336 disk->part[part-1] = p; 340 disk->part[part-1] = p;
337} 341}
338 342
343static char *make_block_name(struct gendisk *disk)
344{
345 char *name;
346 static char *block_str = "block:";
347 int size;
348
349 size = strlen(block_str) + strlen(disk->disk_name) + 1;
350 name = kmalloc(size, GFP_KERNEL);
351 if (!name)
352 return NULL;
353 strcpy(name, block_str);
354 strcat(name, disk->disk_name);
355 return name;
356}
357
339static void disk_sysfs_symlinks(struct gendisk *disk) 358static void disk_sysfs_symlinks(struct gendisk *disk)
340{ 359{
341 struct device *target = get_device(disk->driverfs_dev); 360 struct device *target = get_device(disk->driverfs_dev);
342 if (target) { 361 if (target) {
362 char *disk_name = make_block_name(disk);
343 sysfs_create_link(&disk->kobj,&target->kobj,"device"); 363 sysfs_create_link(&disk->kobj,&target->kobj,"device");
344 sysfs_create_link(&target->kobj,&disk->kobj,"block"); 364 if (disk_name) {
365 sysfs_create_link(&target->kobj,&disk->kobj,disk_name);
366 kfree(disk_name);
367 }
345 } 368 }
346} 369}
347 370
@@ -360,7 +383,7 @@ void register_disk(struct gendisk *disk)
360 if ((err = kobject_add(&disk->kobj))) 383 if ((err = kobject_add(&disk->kobj)))
361 return; 384 return;
362 disk_sysfs_symlinks(disk); 385 disk_sysfs_symlinks(disk);
363 kobject_hotplug(&disk->kobj, KOBJ_ADD); 386 kobject_uevent(&disk->kobj, KOBJ_ADD);
364 387
365 /* No minors to use for partitions */ 388 /* No minors to use for partitions */
366 if (disk->minors == 1) { 389 if (disk->minors == 1) {
@@ -461,10 +484,14 @@ void del_gendisk(struct gendisk *disk)
461 devfs_remove_disk(disk); 484 devfs_remove_disk(disk);
462 485
463 if (disk->driverfs_dev) { 486 if (disk->driverfs_dev) {
487 char *disk_name = make_block_name(disk);
464 sysfs_remove_link(&disk->kobj, "device"); 488 sysfs_remove_link(&disk->kobj, "device");
465 sysfs_remove_link(&disk->driverfs_dev->kobj, "block"); 489 if (disk_name) {
490 sysfs_remove_link(&disk->driverfs_dev->kobj, disk_name);
491 kfree(disk_name);
492 }
466 put_device(disk->driverfs_dev); 493 put_device(disk->driverfs_dev);
467 } 494 }
468 kobject_hotplug(&disk->kobj, KOBJ_REMOVE); 495 kobject_uevent(&disk->kobj, KOBJ_REMOVE);
469 kobject_del(&disk->kobj); 496 kobject_del(&disk->kobj);
470} 497}
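
/*
 * Aside, not from this commit: make_block_name() above could
 * equivalently use snprintf, which keeps the "+ 1 for the NUL"
 * accounting in one place. Same behaviour, illustrative only:
 */
static char *example_block_name(struct gendisk *disk)
{
	int size = strlen("block:") + strlen(disk->disk_name) + 1;
	char *name = kmalloc(size, GFP_KERNEL);

	if (name)
		snprintf(name, size, "block:%s", disk->disk_name);
	return name;
}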
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 6327bcb2d73..78010ad60e4 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -56,7 +56,10 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
56 struct hd_geometry *geo; 56 struct hd_geometry *geo;
57 char type[5] = {0,}; 57 char type[5] = {0,};
58 char name[7] = {0,}; 58 char name[7] = {0,};
59 struct vtoc_volume_label *vlabel; 59 union label_t {
60 struct vtoc_volume_label vol;
61 struct vtoc_cms_label cms;
62 } *label;
60 unsigned char *data; 63 unsigned char *data;
61 Sector sect; 64 Sector sect;
62 65
@@ -64,9 +67,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
64 goto out_noinfo; 67 goto out_noinfo;
65 if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL) 68 if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL)
66 goto out_nogeo; 69 goto out_nogeo;
67 if ((vlabel = kmalloc(sizeof(struct vtoc_volume_label), 70 if ((label = kmalloc(sizeof(union label_t), GFP_KERNEL)) == NULL)
68 GFP_KERNEL)) == NULL) 71 goto out_nolab;
69 goto out_novlab;
70 72
71 if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 || 73 if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 ||
72 ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) 74 ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
@@ -87,7 +89,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
87 strncpy(name, data + 8, 6); 89 strncpy(name, data + 8, 6);
88 else 90 else
89 strncpy(name, data + 4, 6); 91 strncpy(name, data + 4, 6);
90 memcpy (vlabel, data, sizeof(struct vtoc_volume_label)); 92 memcpy(label, data, sizeof(union label_t));
91 put_dev_sector(sect); 93 put_dev_sector(sect);
92 94
93 EBCASC(type, 4); 95 EBCASC(type, 4);
@@ -100,14 +102,12 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
100 /* 102 /*
101 * VM style CMS1 labeled disk 103 * VM style CMS1 labeled disk
102 */ 104 */
103 int *label = (int *) vlabel; 105 if (label->cms.disk_offset != 0) {
104
105 if (label[13] != 0) {
106 printk("CMS1/%8s(MDSK):", name); 106 printk("CMS1/%8s(MDSK):", name);
107 /* disk is reserved minidisk */ 107 /* disk is reserved minidisk */
108 blocksize = label[3]; 108 blocksize = label->cms.block_size;
109 offset = label[13]; 109 offset = label->cms.disk_offset;
110 size = (label[7] - 1)*(blocksize >> 9); 110 size = (label->cms.block_count - 1) * (blocksize >> 9);
111 } else { 111 } else {
112 printk("CMS1/%8s:", name); 112 printk("CMS1/%8s:", name);
113 offset = (info->label_block + 1); 113 offset = (info->label_block + 1);
@@ -126,7 +126,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
126 printk("VOL1/%8s:", name); 126 printk("VOL1/%8s:", name);
127 127
128 /* get block number and read then go through format1 labels */ 128 /* get block number and read then go through format1 labels */
129 blk = cchhb2blk(&vlabel->vtoc, geo) + 1; 129 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
130 counter = 0; 130 counter = 0;
131 while ((data = read_dev_sector(bdev, blk*(blocksize/512), 131 while ((data = read_dev_sector(bdev, blk*(blocksize/512),
132 &sect)) != NULL) { 132 &sect)) != NULL) {
@@ -174,7 +174,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
174 } 174 }
175 175
176 printk("\n"); 176 printk("\n");
177 kfree(vlabel); 177 kfree(label);
178 kfree(geo); 178 kfree(geo);
179 kfree(info); 179 kfree(info);
180 return 1; 180 return 1;
@@ -182,8 +182,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
182out_readerr: 182out_readerr:
183out_badsect: 183out_badsect:
184out_noioctl: 184out_noioctl:
185 kfree(vlabel); 185 kfree(label);
186out_novlab: 186out_nolab:
187 kfree(geo); 187 kfree(geo);
188out_nogeo: 188out_nogeo:
189 kfree(info); 189 kfree(info);
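
/*
 * Aside, not from this commit: the union introduced above is the
 * standard trick for a sector that can carry either of two label
 * layouts -- read it once, then pick the view after checking the
 * magic. Stripped to its shape (names illustrative):
 */
union example_label {
	struct vtoc_volume_label vol;	/* "VOL1" layout */
	struct vtoc_cms_label cms;	/* "CMS1" layout */
};

static int example_is_minidisk(const union example_label *label)
{
	/* in the CMS view, a nonzero disk_offset marks a reserved
	 * minidisk */
	return label->cms.disk_offset != 0;
}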
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
new file mode 100644
index 00000000000..176d89bcf12
--- /dev/null
+++ b/fs/partitions/karma.c
@@ -0,0 +1,57 @@
1/*
2 * fs/partitions/karma.c
3 * Rio Karma partition info.
4 *
5 * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com)
6 * based on osf.c
7 */
8
9#include "check.h"
10#include "karma.h"
11
12int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
13{
14 int i;
15 int slot = 1;
16 Sector sect;
17 unsigned char *data;
18 struct disklabel {
19 u8 d_reserved[270];
20 struct d_partition {
21 __le32 p_res;
22 u8 p_fstype;
23 u8 p_res2[3];
24 __le32 p_offset;
25 __le32 p_size;
26 } d_partitions[2];
27 u8 d_blank[208];
28 __le16 d_magic;
29 } __attribute__((packed)) *label;
30 struct d_partition *p;
31
32 data = read_dev_sector(bdev, 0, &sect);
33 if (!data)
34 return -1;
35
36 label = (struct disklabel *)data;
37 if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) {
38 put_dev_sector(sect);
39 return 0;
40 }
41
42 p = label->d_partitions;
43	for (i = 0; i < 2; i++, p++) {
44 if (slot == state->limit)
45 break;
46
47 if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) {
48 put_partition(state, slot, le32_to_cpu(p->p_offset),
49 le32_to_cpu(p->p_size));
50 }
51 slot++;
52 }
53 printk("\n");
54 put_dev_sector(sect);
55 return 1;
56}
57
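
/*
 * Aside, not from this commit: the packed struct above is the usual
 * on-disk-layout idiom: fixed-width __le fields,
 * __attribute__((packed)) so the compiler inserts no padding, and
 * le*_to_cpu() at every read. The field sizes (270 + 2*16 + 208 + 2)
 * pin the label at exactly one 512-byte sector, which a compile-time
 * assert can enforce (illustrative only):
 */
static inline void example_karma_layout_check(void)
{
	BUILD_BUG_ON(270 + 2 * 16 + 208 + 2 != 512);
}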
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
new file mode 100644
index 00000000000..ecf7d3f2a3d
--- /dev/null
+++ b/fs/partitions/karma.h
@@ -0,0 +1,8 @@
1/*
2 * fs/partitions/karma.h
3 */
4
5#define KARMA_LABEL_MAGIC 0xAB56
6
7int karma_partition(struct parsed_partitions *state, struct block_device *bdev);
8
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index 8a8d4d9db31..ec852c11dce 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include "check.h" 9#include "check.h"
10#include "ultrix.h"
10 11
11int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) 12int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
12{ 13{
diff --git a/fs/pipe.c b/fs/pipe.c
index 66aa0b938d6..d722579df79 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -44,13 +44,13 @@ void pipe_wait(struct inode * inode)
44 * is considered a noninteractive wait: 44 * is considered a noninteractive wait:
45 */ 45 */
46 prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); 46 prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE);
47 up(PIPE_SEM(*inode)); 47 mutex_unlock(PIPE_MUTEX(*inode));
48 schedule(); 48 schedule();
49 finish_wait(PIPE_WAIT(*inode), &wait); 49 finish_wait(PIPE_WAIT(*inode), &wait);
50 down(PIPE_SEM(*inode)); 50 mutex_lock(PIPE_MUTEX(*inode));
51} 51}
52 52
53static inline int 53static int
54pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) 54pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
55{ 55{
56 unsigned long copy; 56 unsigned long copy;
@@ -70,7 +70,7 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
70 return 0; 70 return 0;
71} 71}
72 72
73static inline int 73static int
74pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) 74pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
75{ 75{
76 unsigned long copy; 76 unsigned long copy;
@@ -136,7 +136,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
136 136
137 do_wakeup = 0; 137 do_wakeup = 0;
138 ret = 0; 138 ret = 0;
139 down(PIPE_SEM(*inode)); 139 mutex_lock(PIPE_MUTEX(*inode));
140 info = inode->i_pipe; 140 info = inode->i_pipe;
141 for (;;) { 141 for (;;) {
142 int bufs = info->nrbufs; 142 int bufs = info->nrbufs;
@@ -200,7 +200,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
200 } 200 }
201 pipe_wait(inode); 201 pipe_wait(inode);
202 } 202 }
203 up(PIPE_SEM(*inode)); 203 mutex_unlock(PIPE_MUTEX(*inode));
204 /* Signal writers asynchronously that there is more room. */ 204 /* Signal writers asynchronously that there is more room. */
205 if (do_wakeup) { 205 if (do_wakeup) {
206 wake_up_interruptible(PIPE_WAIT(*inode)); 206 wake_up_interruptible(PIPE_WAIT(*inode));
@@ -237,7 +237,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
237 237
238 do_wakeup = 0; 238 do_wakeup = 0;
239 ret = 0; 239 ret = 0;
240 down(PIPE_SEM(*inode)); 240 mutex_lock(PIPE_MUTEX(*inode));
241 info = inode->i_pipe; 241 info = inode->i_pipe;
242 242
243 if (!PIPE_READERS(*inode)) { 243 if (!PIPE_READERS(*inode)) {
@@ -341,13 +341,13 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
341 PIPE_WAITING_WRITERS(*inode)--; 341 PIPE_WAITING_WRITERS(*inode)--;
342 } 342 }
343out: 343out:
344 up(PIPE_SEM(*inode)); 344 mutex_unlock(PIPE_MUTEX(*inode));
345 if (do_wakeup) { 345 if (do_wakeup) {
346 wake_up_interruptible(PIPE_WAIT(*inode)); 346 wake_up_interruptible(PIPE_WAIT(*inode));
347 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); 347 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
348 } 348 }
349 if (ret > 0) 349 if (ret > 0)
350 inode_update_time(inode, 1); /* mtime and ctime */ 350 file_update_time(filp);
351 return ret; 351 return ret;
352} 352}
353 353
@@ -381,7 +381,7 @@ pipe_ioctl(struct inode *pino, struct file *filp,
381 381
382 switch (cmd) { 382 switch (cmd) {
383 case FIONREAD: 383 case FIONREAD:
384 down(PIPE_SEM(*inode)); 384 mutex_lock(PIPE_MUTEX(*inode));
385 info = inode->i_pipe; 385 info = inode->i_pipe;
386 count = 0; 386 count = 0;
387 buf = info->curbuf; 387 buf = info->curbuf;
@@ -390,7 +390,7 @@ pipe_ioctl(struct inode *pino, struct file *filp,
390 count += info->bufs[buf].len; 390 count += info->bufs[buf].len;
391 buf = (buf+1) & (PIPE_BUFFERS-1); 391 buf = (buf+1) & (PIPE_BUFFERS-1);
392 } 392 }
393 up(PIPE_SEM(*inode)); 393 mutex_unlock(PIPE_MUTEX(*inode));
394 return put_user(count, (int __user *)arg); 394 return put_user(count, (int __user *)arg);
395 default: 395 default:
396 return -EINVAL; 396 return -EINVAL;
@@ -433,7 +433,7 @@ pipe_poll(struct file *filp, poll_table *wait)
433static int 433static int
434pipe_release(struct inode *inode, int decr, int decw) 434pipe_release(struct inode *inode, int decr, int decw)
435{ 435{
436 down(PIPE_SEM(*inode)); 436 mutex_lock(PIPE_MUTEX(*inode));
437 PIPE_READERS(*inode) -= decr; 437 PIPE_READERS(*inode) -= decr;
438 PIPE_WRITERS(*inode) -= decw; 438 PIPE_WRITERS(*inode) -= decw;
439 if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { 439 if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) {
@@ -443,7 +443,7 @@ pipe_release(struct inode *inode, int decr, int decw)
443 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); 443 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
444 kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); 444 kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
445 } 445 }
446 up(PIPE_SEM(*inode)); 446 mutex_unlock(PIPE_MUTEX(*inode));
447 447
448 return 0; 448 return 0;
449} 449}
@@ -454,9 +454,9 @@ pipe_read_fasync(int fd, struct file *filp, int on)
454 struct inode *inode = filp->f_dentry->d_inode; 454 struct inode *inode = filp->f_dentry->d_inode;
455 int retval; 455 int retval;
456 456
457 down(PIPE_SEM(*inode)); 457 mutex_lock(PIPE_MUTEX(*inode));
458 retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); 458 retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
459 up(PIPE_SEM(*inode)); 459 mutex_unlock(PIPE_MUTEX(*inode));
460 460
461 if (retval < 0) 461 if (retval < 0)
462 return retval; 462 return retval;
@@ -471,9 +471,9 @@ pipe_write_fasync(int fd, struct file *filp, int on)
471 struct inode *inode = filp->f_dentry->d_inode; 471 struct inode *inode = filp->f_dentry->d_inode;
472 int retval; 472 int retval;
473 473
474 down(PIPE_SEM(*inode)); 474 mutex_lock(PIPE_MUTEX(*inode));
475 retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); 475 retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
476 up(PIPE_SEM(*inode)); 476 mutex_unlock(PIPE_MUTEX(*inode));
477 477
478 if (retval < 0) 478 if (retval < 0)
479 return retval; 479 return retval;
@@ -488,14 +488,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
488 struct inode *inode = filp->f_dentry->d_inode; 488 struct inode *inode = filp->f_dentry->d_inode;
489 int retval; 489 int retval;
490 490
491 down(PIPE_SEM(*inode)); 491 mutex_lock(PIPE_MUTEX(*inode));
492 492
493 retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); 493 retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
494 494
495 if (retval >= 0) 495 if (retval >= 0)
496 retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); 496 retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
497 497
498 up(PIPE_SEM(*inode)); 498 mutex_unlock(PIPE_MUTEX(*inode));
499 499
500 if (retval < 0) 500 if (retval < 0)
501 return retval; 501 return retval;
@@ -534,9 +534,9 @@ pipe_read_open(struct inode *inode, struct file *filp)
534{ 534{
535 /* We could have perhaps used atomic_t, but this and friends 535 /* We could have perhaps used atomic_t, but this and friends
536 below are the only places. So it doesn't seem worthwhile. */ 536 below are the only places. So it doesn't seem worthwhile. */
537 down(PIPE_SEM(*inode)); 537 mutex_lock(PIPE_MUTEX(*inode));
538 PIPE_READERS(*inode)++; 538 PIPE_READERS(*inode)++;
539 up(PIPE_SEM(*inode)); 539 mutex_unlock(PIPE_MUTEX(*inode));
540 540
541 return 0; 541 return 0;
542} 542}
@@ -544,9 +544,9 @@ pipe_read_open(struct inode *inode, struct file *filp)
544static int 544static int
545pipe_write_open(struct inode *inode, struct file *filp) 545pipe_write_open(struct inode *inode, struct file *filp)
546{ 546{
547 down(PIPE_SEM(*inode)); 547 mutex_lock(PIPE_MUTEX(*inode));
548 PIPE_WRITERS(*inode)++; 548 PIPE_WRITERS(*inode)++;
549 up(PIPE_SEM(*inode)); 549 mutex_unlock(PIPE_MUTEX(*inode));
550 550
551 return 0; 551 return 0;
552} 552}
@@ -554,12 +554,12 @@ pipe_write_open(struct inode *inode, struct file *filp)
554static int 554static int
555pipe_rdwr_open(struct inode *inode, struct file *filp) 555pipe_rdwr_open(struct inode *inode, struct file *filp)
556{ 556{
557 down(PIPE_SEM(*inode)); 557 mutex_lock(PIPE_MUTEX(*inode));
558 if (filp->f_mode & FMODE_READ) 558 if (filp->f_mode & FMODE_READ)
559 PIPE_READERS(*inode)++; 559 PIPE_READERS(*inode)++;
560 if (filp->f_mode & FMODE_WRITE) 560 if (filp->f_mode & FMODE_WRITE)
561 PIPE_WRITERS(*inode)++; 561 PIPE_WRITERS(*inode)++;
562 up(PIPE_SEM(*inode)); 562 mutex_unlock(PIPE_MUTEX(*inode));
563 563
564 return 0; 564 return 0;
565} 565}
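Every pipe.c hunk above is the same mechanical substitution: the binary semaphore guarding the pipe (down/up on the inode's i_sem via PIPE_SEM) becomes a mutex (mutex_lock/mutex_unlock via PIPE_MUTEX), which is lighter and gains strict owner-checking in debug builds. A minimal sketch of the pattern using a hypothetical counter rather than the real pipe structures:

    #include <linux/mutex.h>

    static DEFINE_MUTEX(demo_lock);    /* was: static DECLARE_MUTEX(demo_sem); */
    static int demo_readers;

    static void demo_open_reader(void)
    {
        mutex_lock(&demo_lock);        /* was: down(&demo_sem); */
        demo_readers++;
        mutex_unlock(&demo_lock);      /* was: up(&demo_sem);   */
    }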
diff --git a/fs/pnode.c b/fs/pnode.c
index aeeec8ba8dd..f1871f773f6 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -103,7 +103,7 @@ static struct vfsmount *propagation_next(struct vfsmount *m,
103 struct vfsmount *next; 103 struct vfsmount *next;
104 struct vfsmount *master = m->mnt_master; 104 struct vfsmount *master = m->mnt_master;
105 105
106 if ( master == origin->mnt_master ) { 106 if (master == origin->mnt_master) {
107 next = next_peer(m); 107 next = next_peer(m);
108 return ((next == origin) ? NULL : next); 108 return ((next == origin) ? NULL : next);
109 } else if (m->mnt_slave.next != &master->mnt_slave_list) 109 } else if (m->mnt_slave.next != &master->mnt_slave_list)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3e1239e4b30..7eb1bd7f800 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,7 @@ int proc_pid_status(struct task_struct *task, char * buffer)
308 buffer = task_sig(task, buffer); 308 buffer = task_sig(task, buffer);
309 buffer = task_cap(task, buffer); 309 buffer = task_cap(task, buffer);
310 buffer = cpuset_task_status_allowed(task, buffer); 310 buffer = cpuset_task_status_allowed(task, buffer);
311#if defined(CONFIG_ARCH_S390) 311#if defined(CONFIG_S390)
312 buffer = task_show_regs(task, buffer); 312 buffer = task_show_regs(task, buffer);
313#endif 313#endif
314 return buffer - orig; 314 return buffer - orig;
@@ -330,7 +330,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
330 unsigned long min_flt = 0, maj_flt = 0; 330 unsigned long min_flt = 0, maj_flt = 0;
331 cputime_t cutime, cstime, utime, stime; 331 cputime_t cutime, cstime, utime, stime;
332 unsigned long rsslim = 0; 332 unsigned long rsslim = 0;
333 unsigned long it_real_value = 0; 333 DEFINE_KTIME(it_real_value);
334 struct task_struct *t; 334 struct task_struct *t;
335 char tcomm[sizeof(task->comm)]; 335 char tcomm[sizeof(task->comm)];
336 336
@@ -386,7 +386,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
386 utime = cputime_add(utime, task->signal->utime); 386 utime = cputime_add(utime, task->signal->utime);
387 stime = cputime_add(stime, task->signal->stime); 387 stime = cputime_add(stime, task->signal->stime);
388 } 388 }
389 it_real_value = task->signal->it_real_value; 389 it_real_value = task->signal->real_timer.expires;
390 } 390 }
391 ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; 391 ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0;
392 read_unlock(&tasklist_lock); 392 read_unlock(&tasklist_lock);
@@ -435,7 +435,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
435 priority, 435 priority,
436 nice, 436 nice,
437 num_threads, 437 num_threads,
438 jiffies_to_clock_t(it_real_value), 438 (long) ktime_to_clock_t(it_real_value),
439 start_time, 439 start_time,
440 vsize, 440 vsize,
441 mm ? get_mm_rss(mm) : 0, 441 mm ? get_mm_rss(mm) : 0,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 634355e1698..20feb7568de 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -55,6 +55,7 @@
55#include <linux/proc_fs.h> 55#include <linux/proc_fs.h>
56#include <linux/stat.h> 56#include <linux/stat.h>
57#include <linux/init.h> 57#include <linux/init.h>
58#include <linux/capability.h>
58#include <linux/file.h> 59#include <linux/file.h>
59#include <linux/string.h> 60#include <linux/string.h>
60#include <linux/seq_file.h> 61#include <linux/seq_file.h>
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index b638fb50074..20e5c4509a4 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -21,6 +21,8 @@
21#include <linux/bitops.h> 21#include <linux/bitops.h>
22#include <asm/uaccess.h> 22#include <asm/uaccess.h>
23 23
24#include "internal.h"
25
24static ssize_t proc_file_read(struct file *file, char __user *buf, 26static ssize_t proc_file_read(struct file *file, char __user *buf,
25 size_t nbytes, loff_t *ppos); 27 size_t nbytes, loff_t *ppos);
26static ssize_t proc_file_write(struct file *file, const char __user *buffer, 28static ssize_t proc_file_write(struct file *file, const char __user *buffer,
@@ -54,6 +56,18 @@ proc_file_read(struct file *file, char __user *buf, size_t nbytes,
54 ssize_t n, count; 56 ssize_t n, count;
55 char *start; 57 char *start;
56 struct proc_dir_entry * dp; 58 struct proc_dir_entry * dp;
59 unsigned long long pos;
60
61 /*
62 * Gaah, please just use "seq_file" instead. The legacy /proc
63 * interfaces cut loff_t down to off_t for reads, and ignore
64 * the offset entirely for writes..
65 */
66 pos = *ppos;
67 if (pos > MAX_NON_LFS)
68 return 0;
69 if (nbytes > MAX_NON_LFS - pos)
70 nbytes = MAX_NON_LFS - pos;
57 71
58 dp = PDE(inode); 72 dp = PDE(inode);
59 if (!(page = (char*) __get_free_page(GFP_KERNEL))) 73 if (!(page = (char*) __get_free_page(GFP_KERNEL)))
@@ -202,30 +216,17 @@ proc_file_write(struct file *file, const char __user *buffer,
202static loff_t 216static loff_t
203proc_file_lseek(struct file *file, loff_t offset, int orig) 217proc_file_lseek(struct file *file, loff_t offset, int orig)
204{ 218{
205 lock_kernel(); 219 loff_t retval = -EINVAL;
206 220 switch (orig) {
207 switch (orig) { 221 case 1:
208 case 0: 222 offset += file->f_pos;
209 if (offset < 0) 223 /* fallthrough */
210 goto out; 224 case 0:
211 file->f_pos = offset; 225 if (offset < 0 || offset > MAX_NON_LFS)
212 unlock_kernel(); 226 break;
213 return(file->f_pos); 227 file->f_pos = retval = offset;
214 case 1: 228 }
215 if (offset + file->f_pos < 0) 229 return retval;
216 goto out;
217 file->f_pos += offset;
218 unlock_kernel();
219 return(file->f_pos);
220 case 2:
221 goto out;
222 default:
223 goto out;
224 }
225
226out:
227 unlock_kernel();
228 return -EINVAL;
229} 230}
230 231
231static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) 232static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
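The rewritten proc_file_lseek drops the BKL entirely: nothing shared beyond f_pos is touched, so the old lock_kernel()/unlock_kernel() pairs bought nothing. SEEK_CUR is folded into SEEK_SET by the fallthrough, SEEK_END stays unsupported, and offsets are capped at MAX_NON_LFS, which linux/fs.h defines as ((1UL<<31) - 1), the largest offset a non-LFS off_t can carry. The same shape as a self-contained sketch, with long long standing in for loff_t and -22 for -EINVAL:

    #define DEMO_MAX_NON_LFS ((1UL << 31) - 1)

    long long demo_lseek(long long *fpos, long long offset, int whence)
    {
        long long retval = -22;               /* -EINVAL */

        switch (whence) {
        case 1:                               /* SEEK_CUR */
            offset += *fpos;
            /* fallthrough */
        case 0:                               /* SEEK_SET */
            if (offset < 0 || offset > (long long)DEMO_MAX_NON_LFS)
                break;
            *fpos = retval = offset;
        }
        return retval;
    }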
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index e6a818a93f3..6573f31f1fd 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -19,7 +19,7 @@
19#include <asm/system.h> 19#include <asm/system.h>
20#include <asm/uaccess.h> 20#include <asm/uaccess.h>
21 21
22extern void free_proc_entry(struct proc_dir_entry *); 22#include "internal.h"
23 23
24static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) 24static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de)
25{ 25{
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3e55198f980..95a1cf32b83 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -37,6 +37,10 @@ extern int proc_tgid_stat(struct task_struct *, char *);
37extern int proc_pid_status(struct task_struct *, char *); 37extern int proc_pid_status(struct task_struct *, char *);
38extern int proc_pid_statm(struct task_struct *, char *); 38extern int proc_pid_statm(struct task_struct *, char *);
39 39
40void free_proc_entry(struct proc_dir_entry *de);
41
42int proc_init_inodecache(void);
43
40static inline struct task_struct *proc_task(struct inode *inode) 44static inline struct task_struct *proc_task(struct inode *inode)
41{ 45{
42 return PROC_I(inode)->task; 46 return PROC_I(inode)->task;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 1c7da988fcc..adc2cd95169 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -14,6 +14,7 @@
14#include <linux/proc_fs.h> 14#include <linux/proc_fs.h>
15#include <linux/user.h> 15#include <linux/user.h>
16#include <linux/a.out.h> 16#include <linux/a.out.h>
17#include <linux/capability.h>
17#include <linux/elf.h> 18#include <linux/elf.h>
18#include <linux/elfcore.h> 19#include <linux/elfcore.h>
19#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index fb117b74809..9bdd077d6f5 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -81,6 +81,30 @@ void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop
81 __proc_device_tree_add_prop(pde, prop); 81 __proc_device_tree_add_prop(pde, prop);
82} 82}
83 83
84void proc_device_tree_remove_prop(struct proc_dir_entry *pde,
85 struct property *prop)
86{
87 remove_proc_entry(prop->name, pde);
88}
89
90void proc_device_tree_update_prop(struct proc_dir_entry *pde,
91 struct property *newprop,
92 struct property *oldprop)
93{
94 struct proc_dir_entry *ent;
95
96 for (ent = pde->subdir; ent != NULL; ent = ent->next)
97 if (ent->data == oldprop)
98 break;
99 if (ent == NULL) {
100 printk(KERN_WARNING "device-tree: property \"%s\" "
101 " does not exist\n", oldprop->name);
102 } else {
103 ent->data = newprop;
104 ent->size = newprop->length;
105 }
106}
107
84/* 108/*
85 * Process a node, adding entries for its children and its properties. 109 * Process a node, adding entries for its children and its properties.
86 */ 110 */
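proc_device_tree_update_prop works because each property's proc entry stashed the property pointer in ->data when it was created; updating a property is then a linear scan of the directory's singly linked subdir list for that cookie. The scan, reduced to a standalone sketch with a hypothetical node type:

    struct demo_node {
        struct demo_node *next;
        void *data;                 /* cookie set at creation time */
        unsigned int size;
    };

    /* returns the entry whose ->data matches the cookie, or NULL */
    static struct demo_node *find_by_data(struct demo_node *head,
                                          const void *cookie)
    {
        struct demo_node *n;

        for (n = head; n != NULL; n = n->next)
            if (n->data == cookie)
                break;
        return n;
    }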
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5b6b0b6038a..8f8014285a3 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -20,6 +20,7 @@
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/kernel_stat.h> 22#include <linux/kernel_stat.h>
23#include <linux/fs.h>
23#include <linux/tty.h> 24#include <linux/tty.h>
24#include <linux/string.h> 25#include <linux/string.h>
25#include <linux/mman.h> 26#include <linux/mman.h>
@@ -62,7 +63,6 @@
62 */ 63 */
63extern int get_hardware_list(char *); 64extern int get_hardware_list(char *);
64extern int get_stram_list(char *); 65extern int get_stram_list(char *);
65extern int get_chrdev_list(char *);
66extern int get_filesystem_list(char *); 66extern int get_filesystem_list(char *);
67extern int get_exec_domain_list(char *); 67extern int get_exec_domain_list(char *);
68extern int get_dma_list(char *); 68extern int get_dma_list(char *);
@@ -248,6 +248,154 @@ static int cpuinfo_open(struct inode *inode, struct file *file)
248{ 248{
249 return seq_open(file, &cpuinfo_op); 249 return seq_open(file, &cpuinfo_op);
250} 250}
251
252enum devinfo_states {
253 CHR_HDR,
254 CHR_LIST,
255 BLK_HDR,
256 BLK_LIST,
257 DEVINFO_DONE
258};
259
260struct devinfo_state {
261 void *chrdev;
262 void *blkdev;
263 unsigned int num_records;
264 unsigned int cur_record;
265 enum devinfo_states state;
266};
267
268static void *devinfo_start(struct seq_file *f, loff_t *pos)
269{
270 struct devinfo_state *info = f->private;
271
272 if (*pos) {
273 if ((info) && (*pos <= info->num_records))
274 return info;
275 return NULL;
276 }
277 info = kmalloc(sizeof(*info), GFP_KERNEL);
278 f->private = info;
279 info->chrdev = acquire_chrdev_list();
280 info->blkdev = acquire_blkdev_list();
281 info->state = CHR_HDR;
282 info->num_records = count_chrdev_list();
283 info->num_records += count_blkdev_list();
284 info->num_records += 2; /* Character and Block headers */
285 *pos = 1;
286 info->cur_record = *pos;
287 return info;
288}
289
290static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
291{
292 int idummy;
293 char *ndummy;
294 struct devinfo_state *info = f->private;
295
296 switch (info->state) {
297 case CHR_HDR:
298 info->state = CHR_LIST;
299 (*pos)++;
300 /*fallthrough*/
301 case CHR_LIST:
302 if (get_chrdev_info(info->chrdev,&idummy,&ndummy)) {
303 /*
304 * The character dev list is complete
305 */
306 info->state = BLK_HDR;
307 } else {
308 info->chrdev = get_next_chrdev(info->chrdev);
309 }
310 (*pos)++;
311 break;
312 case BLK_HDR:
313 info->state = BLK_LIST;
314 (*pos)++;
315 break;
316 case BLK_LIST:
317 if (get_blkdev_info(info->blkdev,&idummy,&ndummy)) {
318 /*
319 * The block dev list is complete
320 */
321 info->state = DEVINFO_DONE;
322 } else {
323 info->blkdev = get_next_blkdev(info->blkdev);
324 }
325 (*pos)++;
326 break;
327 case DEVINFO_DONE:
328 (*pos)++;
329 info->cur_record = *pos;
330 info = NULL;
331 break;
332 default:
333 break;
334 }
335 if (info)
336 info->cur_record = *pos;
337 return info;
338}
339
340static void devinfo_stop(struct seq_file *f, void *v)
341{
342 struct devinfo_state *info = f->private;
343
344 if (info) {
345 release_chrdev_list(info->chrdev);
346 release_blkdev_list(info->blkdev);
347 f->private = NULL;
348 kfree(info);
349 }
350}
351
352static int devinfo_show(struct seq_file *f, void *arg)
353{
354 int major;
355 char *name;
356 struct devinfo_state *info = f->private;
357
358 switch(info->state) {
359 case CHR_HDR:
360 seq_printf(f,"Character devices:\n");
361 /* fallthrough */
362 case CHR_LIST:
363 if (!get_chrdev_info(info->chrdev,&major,&name))
364 seq_printf(f,"%3d %s\n",major,name);
365 break;
366 case BLK_HDR:
367 seq_printf(f,"\nBlock devices:\n");
368 /* fallthrough */
369 case BLK_LIST:
370 if (!get_blkdev_info(info->blkdev,&major,&name))
371 seq_printf(f,"%3d %s\n",major,name);
372 break;
373 default:
374 break;
375 }
376
377 return 0;
378}
379
380static struct seq_operations devinfo_op = {
381 .start = devinfo_start,
382 .next = devinfo_next,
383 .stop = devinfo_stop,
384 .show = devinfo_show,
385};
386
387static int devinfo_open(struct inode *inode, struct file *file)
388{
389 return seq_open(file, &devinfo_op);
390}
391
392static struct file_operations proc_devinfo_operations = {
393 .open = devinfo_open,
394 .read = seq_read,
395 .llseek = seq_lseek,
396 .release = seq_release,
397};
398
251static struct file_operations proc_cpuinfo_operations = { 399static struct file_operations proc_cpuinfo_operations = {
252 .open = cpuinfo_open, 400 .open = cpuinfo_open,
253 .read = seq_read, 401 .read = seq_read,
@@ -323,6 +471,7 @@ static struct file_operations proc_modules_operations = {
323}; 471};
324#endif 472#endif
325 473
474#ifdef CONFIG_SLAB
326extern struct seq_operations slabinfo_op; 475extern struct seq_operations slabinfo_op;
327extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *); 476extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *);
328static int slabinfo_open(struct inode *inode, struct file *file) 477static int slabinfo_open(struct inode *inode, struct file *file)
@@ -336,6 +485,7 @@ static struct file_operations proc_slabinfo_operations = {
336 .llseek = seq_lseek, 485 .llseek = seq_lseek,
337 .release = seq_release, 486 .release = seq_release,
338}; 487};
488#endif
339 489
340static int show_stat(struct seq_file *p, void *v) 490static int show_stat(struct seq_file *p, void *v)
341{ 491{
@@ -448,14 +598,6 @@ static struct file_operations proc_stat_operations = {
448 .release = single_release, 598 .release = single_release,
449}; 599};
450 600
451static int devices_read_proc(char *page, char **start, off_t off,
452 int count, int *eof, void *data)
453{
454 int len = get_chrdev_list(page);
455 len += get_blkdev_list(page+len, len);
456 return proc_calc_metrics(page, start, off, count, eof, len);
457}
458
459/* 601/*
460 * /proc/interrupts 602 * /proc/interrupts
461 */ 603 */
@@ -580,7 +722,6 @@ void __init proc_misc_init(void)
580#ifdef CONFIG_STRAM_PROC 722#ifdef CONFIG_STRAM_PROC
581 {"stram", stram_read_proc}, 723 {"stram", stram_read_proc},
582#endif 724#endif
583 {"devices", devices_read_proc},
584 {"filesystems", filesystems_read_proc}, 725 {"filesystems", filesystems_read_proc},
585 {"cmdline", cmdline_read_proc}, 726 {"cmdline", cmdline_read_proc},
586 {"locks", locks_read_proc}, 727 {"locks", locks_read_proc},
@@ -596,11 +737,14 @@ void __init proc_misc_init(void)
596 entry = create_proc_entry("kmsg", S_IRUSR, &proc_root); 737 entry = create_proc_entry("kmsg", S_IRUSR, &proc_root);
597 if (entry) 738 if (entry)
598 entry->proc_fops = &proc_kmsg_operations; 739 entry->proc_fops = &proc_kmsg_operations;
740 create_seq_entry("devices", 0, &proc_devinfo_operations);
599 create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations); 741 create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
600 create_seq_entry("partitions", 0, &proc_partitions_operations); 742 create_seq_entry("partitions", 0, &proc_partitions_operations);
601 create_seq_entry("stat", 0, &proc_stat_operations); 743 create_seq_entry("stat", 0, &proc_stat_operations);
602 create_seq_entry("interrupts", 0, &proc_interrupts_operations); 744 create_seq_entry("interrupts", 0, &proc_interrupts_operations);
745#ifdef CONFIG_SLAB
603 create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); 746 create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
747#endif
604 create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations); 748 create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
605 create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); 749 create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
606 create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations); 750 create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations);
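The devinfo code above replaces the one-page devices_read_proc with the seq_file contract that the new proc_file_read comment recommends: ->start maps *pos to an iterator (NULL terminates the walk), ->show prints one record, ->next advances both the iterator and *pos, ->stop releases resources, and seq_read/seq_lseek handle buffering and offsets. A minimal static-table instance of the same contract, offered as an illustration rather than kernel code:

    #include <linux/kernel.h>
    #include <linux/seq_file.h>

    static const char *demo_names[] = { "mem", "null", "zero" };

    static void *demo_start(struct seq_file *f, loff_t *pos)
    {
        return *pos < (loff_t)ARRAY_SIZE(demo_names)
            ? &demo_names[*pos] : NULL;
    }

    static void *demo_next(struct seq_file *f, void *v, loff_t *pos)
    {
        (*pos)++;
        return demo_start(f, pos);
    }

    static void demo_stop(struct seq_file *f, void *v)
    {
        /* nothing to release for a static table */
    }

    static int demo_show(struct seq_file *f, void *v)
    {
        seq_printf(f, "%s\n", *(const char **)v);
        return 0;
    }

    static struct seq_operations demo_op = {
        .start = demo_start,
        .next  = demo_next,
        .stop  = demo_stop,
        .show  = demo_show,
    };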
diff --git a/fs/proc/root.c b/fs/proc/root.c
index aef148f099a..68896283c8a 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -18,6 +18,8 @@
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
20 20
21#include "internal.h"
22
21struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver; 23struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver;
22 24
23#ifdef CONFIG_SYSCTL 25#ifdef CONFIG_SYSCTL
@@ -36,7 +38,6 @@ static struct file_system_type proc_fs_type = {
36 .kill_sb = kill_anon_super, 38 .kill_sb = kill_anon_super,
37}; 39};
38 40
39extern int __init proc_init_inodecache(void);
40void __init proc_root_init(void) 41void __init proc_root_init(void)
41{ 42{
42 int err = proc_init_inodecache(); 43 int err = proc_init_inodecache();
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index d2fa42006d8..0eaad41f465 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -195,7 +195,7 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
195 195
196static int show_map(struct seq_file *m, void *v) 196static int show_map(struct seq_file *m, void *v)
197{ 197{
198 return show_map_internal(m, v, 0); 198 return show_map_internal(m, v, NULL);
199} 199}
200 200
201static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 201static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -390,130 +390,12 @@ struct seq_operations proc_pid_smaps_op = {
390}; 390};
391 391
392#ifdef CONFIG_NUMA 392#ifdef CONFIG_NUMA
393 393extern int show_numa_map(struct seq_file *m, void *v);
394struct numa_maps {
395 unsigned long pages;
396 unsigned long anon;
397 unsigned long mapped;
398 unsigned long mapcount_max;
399 unsigned long node[MAX_NUMNODES];
400};
401
402/*
403 * Calculate numa node maps for a vma
404 */
405static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
406{
407 struct page *page;
408 unsigned long vaddr;
409 struct mm_struct *mm = vma->vm_mm;
410 int i;
411 struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL);
412
413 if (!md)
414 return NULL;
415 md->pages = 0;
416 md->anon = 0;
417 md->mapped = 0;
418 md->mapcount_max = 0;
419 for_each_node(i)
420 md->node[i] =0;
421
422 for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
423 page = follow_page(mm, vaddr, 0);
424 if (page) {
425 int count = page_mapcount(page);
426
427 if (count)
428 md->mapped++;
429 if (count > md->mapcount_max)
430 md->mapcount_max = count;
431 md->pages++;
432 if (PageAnon(page))
433 md->anon++;
434 md->node[page_to_nid(page)]++;
435 }
436 cond_resched();
437 }
438 return md;
439}
440
441static int show_numa_map(struct seq_file *m, void *v)
442{
443 struct task_struct *task = m->private;
444 struct vm_area_struct *vma = v;
445 struct mempolicy *pol;
446 struct numa_maps *md;
447 struct zone **z;
448 int n;
449 int first;
450
451 if (!vma->vm_mm)
452 return 0;
453
454 md = get_numa_maps(vma);
455 if (!md)
456 return 0;
457
458 seq_printf(m, "%08lx", vma->vm_start);
459 pol = get_vma_policy(task, vma, vma->vm_start);
460 /* Print policy */
461 switch (pol->policy) {
462 case MPOL_PREFERRED:
463 seq_printf(m, " prefer=%d", pol->v.preferred_node);
464 break;
465 case MPOL_BIND:
466 seq_printf(m, " bind={");
467 first = 1;
468 for (z = pol->v.zonelist->zones; *z; z++) {
469
470 if (!first)
471 seq_putc(m, ',');
472 else
473 first = 0;
474 seq_printf(m, "%d/%s", (*z)->zone_pgdat->node_id,
475 (*z)->name);
476 }
477 seq_putc(m, '}');
478 break;
479 case MPOL_INTERLEAVE:
480 seq_printf(m, " interleave={");
481 first = 1;
482 for_each_node(n) {
483 if (node_isset(n, pol->v.nodes)) {
484 if (!first)
485 seq_putc(m,',');
486 else
487 first = 0;
488 seq_printf(m, "%d",n);
489 }
490 }
491 seq_putc(m, '}');
492 break;
493 default:
494 seq_printf(m," default");
495 break;
496 }
497 seq_printf(m, " MaxRef=%lu Pages=%lu Mapped=%lu",
498 md->mapcount_max, md->pages, md->mapped);
499 if (md->anon)
500 seq_printf(m," Anon=%lu",md->anon);
501
502 for_each_online_node(n) {
503 if (md->node[n])
504 seq_printf(m, " N%d=%lu", n, md->node[n]);
505 }
506 seq_putc(m, '\n');
507 kfree(md);
508 if (m->count < m->size) /* vma is copied successfully */
509 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
510 return 0;
511}
512 394
513struct seq_operations proc_pid_numa_maps_op = { 395struct seq_operations proc_pid_numa_maps_op = {
514 .start = m_start, 396 .start = m_start,
515 .next = m_next, 397 .next = m_next,
516 .stop = m_stop, 398 .stop = m_stop,
517 .show = show_numa_map 399 .show = show_numa_map
518}; 400};
519#endif 401#endif
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 3b2e7b69e63..4063fb32f78 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -14,7 +14,6 @@
14#include <linux/a.out.h> 14#include <linux/a.out.h>
15#include <linux/elf.h> 15#include <linux/elf.h>
16#include <linux/elfcore.h> 16#include <linux/elfcore.h>
17#include <linux/proc_fs.h>
18#include <linux/highmem.h> 17#include <linux/highmem.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
20#include <linux/init.h> 19#include <linux/init.h>
@@ -35,11 +34,14 @@ static size_t elfcorebuf_sz;
35/* Total size of vmcore file. */ 34/* Total size of vmcore file. */
36static u64 vmcore_size; 35static u64 vmcore_size;
37 36
37/* Stores the physical address of elf header of crash image. */
38unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
39
38struct proc_dir_entry *proc_vmcore = NULL; 40struct proc_dir_entry *proc_vmcore = NULL;
39 41
40/* Reads a page from the oldmem device from given offset. */ 42/* Reads a page from the oldmem device from given offset. */
41static ssize_t read_from_oldmem(char *buf, size_t count, 43static ssize_t read_from_oldmem(char *buf, size_t count,
42 loff_t *ppos, int userbuf) 44 u64 *ppos, int userbuf)
43{ 45{
44 unsigned long pfn, offset; 46 unsigned long pfn, offset;
45 size_t nr_bytes; 47 size_t nr_bytes;
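read_from_oldmem now takes its position as a plain u64 rather than a loff_t, since it indexes the crashed kernel's physical address space, not a file; elfcorehdr_addr, the physical location of the crash image's ELF header handed over by the capture kernel, moves here as the anchor for that space. Splitting such a position into a page frame number and an in-page offset, per the pfn/offset locals declared above, presumably follows the conventional shift-and-mask form, sketched here as an assumption:

    #include <linux/types.h>
    #include <asm/page.h>

    static void demo_split(u64 pos, unsigned long *pfn, unsigned long *offset)
    {
        *pfn    = pos >> PAGE_SHIFT;         /* which physical page   */
        *offset = pos & (PAGE_SIZE - 1);     /* byte within that page */
    }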
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 99125392765..46efbf52cbe 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -23,10 +23,12 @@
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/bitops.h> 24#include <linux/bitops.h>
25 25
26#if 0
26int qnx4_new_block(struct super_block *sb) 27int qnx4_new_block(struct super_block *sb)
27{ 28{
28 return 0; 29 return 0;
29} 30}
31#endif /* 0 */
30 32
31static void count_bits(register const char *bmPart, register int size, 33static void count_bits(register const char *bmPart, register int size,
32 int *const tf) 34 int *const tf)
diff --git a/fs/quota.c b/fs/quota.c
index 612e04db4b9..ba9e0bf32f6 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -15,6 +15,7 @@
15#include <linux/security.h> 15#include <linux/security.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
18#include <linux/capability.h>
18#include <linux/quotaops.h> 19#include <linux/quotaops.h>
19 20
20/* Check validity of generic quotactl commands */ 21/* Check validity of generic quotactl commands */
@@ -168,7 +169,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
168 sync_blockdev(sb->s_bdev); 169 sync_blockdev(sb->s_bdev);
169 170
170 /* Now when everything is written we can discard the pagecache so 171 /* Now when everything is written we can discard the pagecache so
171 * that userspace sees the changes. We need i_sem and so we could 172 * that userspace sees the changes. We need i_mutex and so we could
172 * not do it inside dqonoff_sem. Moreover we need to be careful 173 * not do it inside dqonoff_sem. Moreover we need to be careful
173 * about races with quotaoff() (that is the reason why we have own 174 * about races with quotaoff() (that is the reason why we have own
174 * reference to inode). */ 175 * reference to inode). */
@@ -184,9 +185,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
184 up(&sb_dqopt(sb)->dqonoff_sem); 185 up(&sb_dqopt(sb)->dqonoff_sem);
185 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 186 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
186 if (discard[cnt]) { 187 if (discard[cnt]) {
187 down(&discard[cnt]->i_sem); 188 mutex_lock(&discard[cnt]->i_mutex);
188 truncate_inode_pages(&discard[cnt]->i_data, 0); 189 truncate_inode_pages(&discard[cnt]->i_data, 0);
189 up(&discard[cnt]->i_sem); 190 mutex_unlock(&discard[cnt]->i_mutex);
190 iput(discard[cnt]); 191 iput(discard[cnt]);
191 } 192 }
192 } 193 }
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index 7afcbb1b937..a4ef91bb4f3 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -35,7 +35,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
35 35
36 size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); 36 size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0);
37 if (size != sizeof(struct v2_disk_dqheader)) { 37 if (size != sizeof(struct v2_disk_dqheader)) {
38 printk("failed read\n"); 38 printk("quota_v2: failed read expected=%d got=%d\n",
39 sizeof(struct v2_disk_dqheader), size);
39 return 0; 40 return 0;
40 } 41 }
41 if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || 42 if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
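One hedged aside on the new message: sizeof yields a size_t, so the %d conversions lean on an implicit width assumption. The kernel's vsnprintf understands the z length modifier, so a cleaner form, assuming size carries the ssize_t returned by quota_read, would be:

    printk("quota_v2: failed read expected=%zu got=%zd\n",
           sizeof(struct v2_disk_dqheader), size);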
diff --git a/fs/ramfs/Makefile b/fs/ramfs/Makefile
index f096f300709..5a0236e02ee 100644
--- a/fs/ramfs/Makefile
+++ b/fs/ramfs/Makefile
@@ -4,4 +4,6 @@
4 4
5obj-$(CONFIG_RAMFS) += ramfs.o 5obj-$(CONFIG_RAMFS) += ramfs.o
6 6
7ramfs-objs := inode.o 7file-mmu-y := file-nommu.o
8file-mmu-$(CONFIG_MMU) := file-mmu.o
9ramfs-objs += inode.o $(file-mmu-y)
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
new file mode 100644
index 00000000000..2115383dcc8
--- /dev/null
+++ b/fs/ramfs/file-mmu.c
@@ -0,0 +1,57 @@
1/* file-mmu.c: ramfs MMU-based file operations
2 *
3 * Resizable simple ram filesystem for Linux.
4 *
5 * Copyright (C) 2000 Linus Torvalds.
6 * 2000 Transmeta Corp.
7 *
8 * Usage limits added by David Gibson, Linuxcare Australia.
9 * This file is released under the GPL.
10 */
11
12/*
13 * NOTE! This filesystem is probably most useful
14 * not as a real filesystem, but as an example of
15 * how virtual filesystems can be written.
16 *
17 * It doesn't get much simpler than this. Consider
18 * that this file implements the full semantics of
19 * a POSIX-compliant read-write filesystem.
20 *
21 * Note in particular how the filesystem does not
22 * need to implement any data structures of its own
23 * to keep track of the virtual data: using the VFS
24 * caches is sufficient.
25 */
26
27#include <linux/module.h>
28#include <linux/fs.h>
29#include <linux/pagemap.h>
30#include <linux/highmem.h>
31#include <linux/init.h>
32#include <linux/string.h>
33#include <linux/smp_lock.h>
34#include <linux/backing-dev.h>
35#include <linux/ramfs.h>
36
37#include <asm/uaccess.h>
38#include "internal.h"
39
40struct address_space_operations ramfs_aops = {
41 .readpage = simple_readpage,
42 .prepare_write = simple_prepare_write,
43 .commit_write = simple_commit_write
44};
45
46struct file_operations ramfs_file_operations = {
47 .read = generic_file_read,
48 .write = generic_file_write,
49 .mmap = generic_file_mmap,
50 .fsync = simple_sync_file,
51 .sendfile = generic_file_sendfile,
52 .llseek = generic_file_llseek,
53};
54
55struct inode_operations ramfs_file_inode_operations = {
56 .getattr = simple_getattr,
57};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
new file mode 100644
index 00000000000..3f810acd0bf
--- /dev/null
+++ b/fs/ramfs/file-nommu.c
@@ -0,0 +1,292 @@
1/* file-nommu.c: no-MMU version of ramfs
2 *
3 * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/fs.h>
14#include <linux/pagemap.h>
15#include <linux/highmem.h>
16#include <linux/init.h>
17#include <linux/string.h>
18#include <linux/smp_lock.h>
19#include <linux/backing-dev.h>
20#include <linux/ramfs.h>
21#include <linux/quotaops.h>
22#include <linux/pagevec.h>
23#include <linux/mman.h>
24
25#include <asm/uaccess.h>
26#include "internal.h"
27
28static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
29
30struct address_space_operations ramfs_aops = {
31 .readpage = simple_readpage,
32 .prepare_write = simple_prepare_write,
33 .commit_write = simple_commit_write
34};
35
36struct file_operations ramfs_file_operations = {
37 .mmap = ramfs_nommu_mmap,
38 .get_unmapped_area = ramfs_nommu_get_unmapped_area,
39 .read = generic_file_read,
40 .write = generic_file_write,
41 .fsync = simple_sync_file,
42 .sendfile = generic_file_sendfile,
43 .llseek = generic_file_llseek,
44};
45
46struct inode_operations ramfs_file_inode_operations = {
47 .setattr = ramfs_nommu_setattr,
48 .getattr = simple_getattr,
49};
50
51/*****************************************************************************/
52/*
53 * add a contiguous set of pages into a ramfs inode when it's truncated from
54 * size 0 on the assumption that it's going to be used for an mmap of shared
55 * memory
56 */
57static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
58{
59 struct pagevec lru_pvec;
60 unsigned long npages, xpages, loop, limit;
61 struct page *pages;
62 unsigned order;
63 void *data;
64 int ret;
65
66 /* make various checks */
67 order = get_order(newsize);
68 if (unlikely(order >= MAX_ORDER))
69 goto too_big;
70
71 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
72 if (limit != RLIM_INFINITY && newsize > limit)
73 goto fsize_exceeded;
74
75 if (newsize > inode->i_sb->s_maxbytes)
76 goto too_big;
77
78 i_size_write(inode, newsize);
79
80 /* allocate enough contiguous pages to be able to satisfy the
81 * request */
82 pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order);
83 if (!pages)
84 return -ENOMEM;
85
86 /* split the high-order page into an array of single pages */
87 xpages = 1UL << order;
88 npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
89
90 for (loop = 0; loop < npages; loop++)
91 set_page_count(pages + loop, 1);
92
93 /* trim off any pages we don't actually require */
94 for (loop = npages; loop < xpages; loop++)
95 __free_page(pages + loop);
96
97 /* clear the memory we allocated */
98 newsize = PAGE_SIZE * npages;
99 data = page_address(pages);
100 memset(data, 0, newsize);
101
102 /* attach all the pages to the inode's address space */
103 pagevec_init(&lru_pvec, 0);
104 for (loop = 0; loop < npages; loop++) {
105 struct page *page = pages + loop;
106
107 ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL);
108 if (ret < 0)
109 goto add_error;
110
111 if (!pagevec_add(&lru_pvec, page))
112 __pagevec_lru_add(&lru_pvec);
113
114 unlock_page(page);
115 }
116
117 pagevec_lru_add(&lru_pvec);
118 return 0;
119
120 fsize_exceeded:
121 send_sig(SIGXFSZ, current, 0);
122 too_big:
123 return -EFBIG;
124
125 add_error:
126 page_cache_release(pages + loop);
127 for (loop++; loop < npages; loop++)
128 __free_page(pages + loop);
129 return ret;
130}
131
132/*****************************************************************************/
133/*
134 * check that file shrinkage doesn't leave any VMAs dangling in midair
135 */
136static int ramfs_nommu_check_mappings(struct inode *inode,
137 size_t newsize, size_t size)
138{
139 struct vm_area_struct *vma;
140 struct prio_tree_iter iter;
141
142 /* search for VMAs that fall within the dead zone */
143 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
144 newsize >> PAGE_SHIFT,
145 (size + PAGE_SIZE - 1) >> PAGE_SHIFT
146 ) {
147 /* found one - only interested if it's shared out of the page
148 * cache */
149 if (vma->vm_flags & VM_SHARED)
150 return -ETXTBSY; /* not quite true, but near enough */
151 }
152
153 return 0;
154}
155
156/*****************************************************************************/
157/*
158 *
159 */
160static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
161{
162 int ret;
163
164 /* assume a truncate from zero size is going to be for the purposes of
165 * shared mmap */
166 if (size == 0) {
167 if (unlikely(newsize >> 32))
168 return -EFBIG;
169
170 return ramfs_nommu_expand_for_mapping(inode, newsize);
171 }
172
173 /* check that a decrease in size doesn't cut off any shared mappings */
174 if (newsize < size) {
175 ret = ramfs_nommu_check_mappings(inode, newsize, size);
176 if (ret < 0)
177 return ret;
178 }
179
180 ret = vmtruncate(inode, size);
181
182 return ret;
183}
184
185/*****************************************************************************/
186/*
187 * handle a change of attributes
188 * - we're specifically interested in a change of size
189 */
190static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
191{
192 struct inode *inode = dentry->d_inode;
193 unsigned int old_ia_valid = ia->ia_valid;
194 int ret = 0;
195
196 /* by providing our own setattr() method, we skip this quotaism */
197 if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
198 (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid))
199 ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0;
200
201 /* pick out size-changing events */
202 if (ia->ia_valid & ATTR_SIZE) {
203 loff_t size = i_size_read(inode);
204 if (ia->ia_size != size) {
205 ret = ramfs_nommu_resize(inode, ia->ia_size, size);
206 if (ret < 0 || ia->ia_valid == ATTR_SIZE)
207 goto out;
208 } else {
209 /* we skipped the truncate but must still update
210 * timestamps
211 */
212 ia->ia_valid |= ATTR_MTIME|ATTR_CTIME;
213 }
214 }
215
216 ret = inode_setattr(inode, ia);
217 out:
218 ia->ia_valid = old_ia_valid;
219 return ret;
220}
221
222/*****************************************************************************/
223/*
224 * try to determine where a shared mapping can be made
225 * - we require that:
226 * - the pages to be mapped must exist
227 * - the pages be physically contiguous in sequence
228 */
229unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
230 unsigned long addr, unsigned long len,
231 unsigned long pgoff, unsigned long flags)
232{
233 unsigned long maxpages, lpages, nr, loop, ret;
234 struct inode *inode = file->f_dentry->d_inode;
235 struct page **pages = NULL, **ptr, *page;
236 loff_t isize;
237
238 if (!(flags & MAP_SHARED))
239 return addr;
240
241 /* the mapping mustn't extend beyond the EOF */
242 lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
243 isize = i_size_read(inode);
244
245 ret = -EINVAL;
246 maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
247 if (pgoff >= maxpages)
248 goto out;
249
250 if (maxpages - pgoff < lpages)
251 goto out;
252
253 /* gang-find the pages */
254 ret = -ENOMEM;
255 pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
256 if (!pages)
257 goto out;
258
259 nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
260 if (nr != lpages)
261 goto out; /* leave if some pages were missing */
262
263 /* check the pages for physical adjacency */
264 ptr = pages;
265 page = *ptr++;
266 page++;
267 for (loop = lpages; loop > 1; loop--)
268 if (*ptr++ != page++)
269 goto out;
270
271 /* okay - all conditions fulfilled */
272 ret = (unsigned long) page_address(pages[0]);
273
274 out:
275 if (pages) {
276 ptr = pages;
277 for (loop = lpages; loop > 0; loop--)
278 put_page(*ptr++);
279 kfree(pages);
280 }
281
282 return ret;
283}
284
285/*****************************************************************************/
286/*
287 * set up a mapping
288 */
289int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
290{
291 return 0;
292}
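The adjacency test near the end of ramfs_nommu_get_unmapped_area leans on struct page pointers being array elements: physically consecutive pages have consecutive struct page addresses, so a single pointer walk proves the region can be mapped directly without an MMU. That walk, isolated into a sketch:

    /* 1 if pages[0..n-1] describe physically consecutive pages, else 0;
     * mirrors the ptr/page walk above (pointer arithmetic on struct page) */
    static int demo_pages_contiguous(struct page **pages, unsigned long n)
    {
        struct page *expect = pages[0];
        unsigned long i;

        for (i = 0; i < n; i++)
            if (pages[i] != expect++)
                return 0;
        return 1;
    }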
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0a88917605a..c66bd5e4c05 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -34,13 +34,12 @@
34#include <linux/ramfs.h> 34#include <linux/ramfs.h>
35 35
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37#include "internal.h"
37 38
38/* some random number */ 39/* some random number */
39#define RAMFS_MAGIC 0x858458f6 40#define RAMFS_MAGIC 0x858458f6
40 41
41static struct super_operations ramfs_ops; 42static struct super_operations ramfs_ops;
42static struct address_space_operations ramfs_aops;
43static struct inode_operations ramfs_file_inode_operations;
44static struct inode_operations ramfs_dir_inode_operations; 43static struct inode_operations ramfs_dir_inode_operations;
45 44
46static struct backing_dev_info ramfs_backing_dev_info = { 45static struct backing_dev_info ramfs_backing_dev_info = {
@@ -142,25 +141,6 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
142 return error; 141 return error;
143} 142}
144 143
145static struct address_space_operations ramfs_aops = {
146 .readpage = simple_readpage,
147 .prepare_write = simple_prepare_write,
148 .commit_write = simple_commit_write
149};
150
151struct file_operations ramfs_file_operations = {
152 .read = generic_file_read,
153 .write = generic_file_write,
154 .mmap = generic_file_mmap,
155 .fsync = simple_sync_file,
156 .sendfile = generic_file_sendfile,
157 .llseek = generic_file_llseek,
158};
159
160static struct inode_operations ramfs_file_inode_operations = {
161 .getattr = simple_getattr,
162};
163
164static struct inode_operations ramfs_dir_inode_operations = { 144static struct inode_operations ramfs_dir_inode_operations = {
165 .create = ramfs_create, 145 .create = ramfs_create,
166 .lookup = simple_lookup, 146 .lookup = simple_lookup,
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
new file mode 100644
index 00000000000..272c8a7120b
--- /dev/null
+++ b/fs/ramfs/internal.h
@@ -0,0 +1,15 @@
1/* internal.h: ramfs internal definitions
2 *
3 * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12
13extern struct address_space_operations ramfs_aops;
14extern struct file_operations ramfs_file_operations;
15extern struct inode_operations ramfs_file_inode_operations;
diff --git a/fs/read_write.c b/fs/read_write.c
index a091ee4f430..3f7a1a62165 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -14,6 +14,7 @@
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/pagemap.h>
17 18
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include <asm/unistd.h> 20#include <asm/unistd.h>
@@ -32,7 +33,7 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
32 long long retval; 33 long long retval;
33 struct inode *inode = file->f_mapping->host; 34 struct inode *inode = file->f_mapping->host;
34 35
35 down(&inode->i_sem); 36 mutex_lock(&inode->i_mutex);
36 switch (origin) { 37 switch (origin) {
37 case 2: 38 case 2:
38 offset += inode->i_size; 39 offset += inode->i_size;
@@ -48,7 +49,7 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
48 } 49 }
49 retval = offset; 50 retval = offset;
50 } 51 }
51 up(&inode->i_sem); 52 mutex_unlock(&inode->i_mutex);
52 return retval; 53 return retval;
53} 54}
54 55
@@ -182,22 +183,33 @@ bad:
182} 183}
183#endif 184#endif
184 185
186/*
187 * rw_verify_area doesn't like huge counts. We limit
188 * them to something that fits in "int" so that others
189 * won't have to do range checks all the time.
190 */
191#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
185 192
186int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) 193int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
187{ 194{
188 struct inode *inode; 195 struct inode *inode;
189 loff_t pos; 196 loff_t pos;
190 197
191 if (unlikely(count > INT_MAX)) 198 if (unlikely((ssize_t) count < 0))
192 goto Einval; 199 goto Einval;
193 pos = *ppos; 200 pos = *ppos;
194 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) 201 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
195 goto Einval; 202 goto Einval;
196 203
197 inode = file->f_dentry->d_inode; 204 inode = file->f_dentry->d_inode;
198 if (inode->i_flock && MANDATORY_LOCK(inode)) 205 if (inode->i_flock && MANDATORY_LOCK(inode)) {
199 return locks_mandatory_area(read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); 206 int retval = locks_mandatory_area(
200 return 0; 207 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
208 inode, file, pos, count);
209 if (retval < 0)
210 return retval;
211 }
212 return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
201 213
202Einval: 214Einval:
203 return -EINVAL; 215 return -EINVAL;
@@ -244,7 +256,8 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
244 return -EFAULT; 256 return -EFAULT;
245 257
246 ret = rw_verify_area(READ, file, pos, count); 258 ret = rw_verify_area(READ, file, pos, count);
247 if (!ret) { 259 if (ret >= 0) {
260 count = ret;
248 ret = security_file_permission (file, MAY_READ); 261 ret = security_file_permission (file, MAY_READ);
249 if (!ret) { 262 if (!ret) {
250 if (file->f_op->read) 263 if (file->f_op->read)
@@ -295,7 +308,8 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
295 return -EFAULT; 308 return -EFAULT;
296 309
297 ret = rw_verify_area(WRITE, file, pos, count); 310 ret = rw_verify_area(WRITE, file, pos, count);
298 if (!ret) { 311 if (ret >= 0) {
312 count = ret;
299 ret = security_file_permission (file, MAY_WRITE); 313 ret = security_file_permission (file, MAY_WRITE);
300 if (!ret) { 314 if (!ret) {
301 if (file->f_op->write) 315 if (file->f_op->write)
@@ -497,7 +511,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
497 } 511 }
498 512
499 ret = rw_verify_area(type, file, pos, tot_len); 513 ret = rw_verify_area(type, file, pos, tot_len);
500 if (ret) 514 if (ret < 0)
501 goto out; 515 goto out;
502 ret = security_file_permission(file, type == READ ? MAY_READ : MAY_WRITE); 516 ret = security_file_permission(file, type == READ ? MAY_READ : MAY_WRITE);
503 if (ret) 517 if (ret)
@@ -653,8 +667,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
653 if (!(in_file->f_mode & FMODE_PREAD)) 667 if (!(in_file->f_mode & FMODE_PREAD))
654 goto fput_in; 668 goto fput_in;
655 retval = rw_verify_area(READ, in_file, ppos, count); 669 retval = rw_verify_area(READ, in_file, ppos, count);
656 if (retval) 670 if (retval < 0)
657 goto fput_in; 671 goto fput_in;
672 count = retval;
658 673
659 retval = security_file_permission (in_file, MAY_READ); 674 retval = security_file_permission (in_file, MAY_READ);
660 if (retval) 675 if (retval)
@@ -674,8 +689,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
674 goto fput_out; 689 goto fput_out;
675 out_inode = out_file->f_dentry->d_inode; 690 out_inode = out_file->f_dentry->d_inode;
676 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); 691 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
677 if (retval) 692 if (retval < 0)
678 goto fput_out; 693 goto fput_out;
694 count = retval;
679 695
680 retval = security_file_permission (out_file, MAY_WRITE); 696 retval = security_file_permission (out_file, MAY_WRITE);
681 if (retval) 697 if (retval)
diff --git a/fs/readdir.c b/fs/readdir.c
index b03579bc021..b6109329b60 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -30,13 +30,13 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
30 if (res) 30 if (res)
31 goto out; 31 goto out;
32 32
33 down(&inode->i_sem); 33 mutex_lock(&inode->i_mutex);
34 res = -ENOENT; 34 res = -ENOENT;
35 if (!IS_DEADDIR(inode)) { 35 if (!IS_DEADDIR(inode)) {
36 res = file->f_op->readdir(file, buf, filler); 36 res = file->f_op->readdir(file, buf, filler);
37 file_accessed(file); 37 file_accessed(file);
38 } 38 }
39 up(&inode->i_sem); 39 mutex_unlock(&inode->i_mutex);
40out: 40out:
41 return res; 41 return res;
42} 42}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 7892a865b58..ad6fa964b0e 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -49,7 +49,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
49 } 49 }
50 50
51 reiserfs_write_lock(inode->i_sb); 51 reiserfs_write_lock(inode->i_sb);
52 down(&inode->i_sem); 52 mutex_lock(&inode->i_mutex);
53 /* freeing preallocation only involves relogging blocks that 53 /* freeing preallocation only involves relogging blocks that
54 * are already in the current transaction. preallocation gets 54 * are already in the current transaction. preallocation gets
55 * freed at the end of each transaction, so it is impossible for 55 * freed at the end of each transaction, so it is impossible for
@@ -100,7 +100,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
100 err = reiserfs_truncate_file(inode, 0); 100 err = reiserfs_truncate_file(inode, 0);
101 } 101 }
102 out: 102 out:
103 up(&inode->i_sem); 103 mutex_unlock(&inode->i_mutex);
104 reiserfs_write_unlock(inode->i_sb); 104 reiserfs_write_unlock(inode->i_sb);
105 return err; 105 return err;
106} 106}
@@ -1342,7 +1342,7 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
1342 if (unlikely(!access_ok(VERIFY_READ, buf, count))) 1342 if (unlikely(!access_ok(VERIFY_READ, buf, count)))
1343 return -EFAULT; 1343 return -EFAULT;
1344 1344
1345 down(&inode->i_sem); // locks the entire file for just us 1345 mutex_lock(&inode->i_mutex); // locks the entire file for just us
1346 1346
1347 pos = *ppos; 1347 pos = *ppos;
1348 1348
@@ -1360,7 +1360,7 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
1360 if (res) 1360 if (res)
1361 goto out; 1361 goto out;
1362 1362
1363 inode_update_time(inode, 1); /* Both mtime and ctime */ 1363 file_update_time(file);
1364 1364
1365 // Ok, we are done with all the checks. 1365 // Ok, we are done with all the checks.
1366 1366
@@ -1532,12 +1532,12 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
1532 generic_osync_inode(inode, file->f_mapping, 1532 generic_osync_inode(inode, file->f_mapping,
1533 OSYNC_METADATA | OSYNC_DATA); 1533 OSYNC_METADATA | OSYNC_DATA);
1534 1534
1535 up(&inode->i_sem); 1535 mutex_unlock(&inode->i_mutex);
1536 reiserfs_async_progress_wait(inode->i_sb); 1536 reiserfs_async_progress_wait(inode->i_sb);
1537 return (already_written != 0) ? already_written : res; 1537 return (already_written != 0) ? already_written : res;
1538 1538
1539 out: 1539 out:
1540 up(&inode->i_sem); // unlock the file on exit. 1540 mutex_unlock(&inode->i_mutex); // unlock the file on exit.
1541 return res; 1541 return res;
1542} 1542}
1543 1543
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
index 37c1306eb9b..a3ec238fd9e 100644
--- a/fs/reiserfs/hashes.c
+++ b/fs/reiserfs/hashes.c
@@ -19,6 +19,7 @@
19// 19//
20 20
21#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/reiserfs_fs.h>
22#include <asm/types.h> 23#include <asm/types.h>
23#include <asm/bug.h> 24#include <asm/bug.h>
24 25
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 5f82352b97e..ffa34b861bd 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -32,6 +32,7 @@ void reiserfs_delete_inode(struct inode *inode)
32 JOURNAL_PER_BALANCE_CNT * 2 + 32 JOURNAL_PER_BALANCE_CNT * 2 +
33 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); 33 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
34 struct reiserfs_transaction_handle th; 34 struct reiserfs_transaction_handle th;
35 int err;
35 36
36 truncate_inode_pages(&inode->i_data, 0); 37 truncate_inode_pages(&inode->i_data, 0);
37 38
@@ -39,32 +40,36 @@ void reiserfs_delete_inode(struct inode *inode)
39 40
40 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ 41 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
41 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ 42 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
42 down(&inode->i_sem); 43 mutex_lock(&inode->i_mutex);
43 44
44 reiserfs_delete_xattrs(inode); 45 reiserfs_delete_xattrs(inode);
45 46
46 if (journal_begin(&th, inode->i_sb, jbegin_count)) { 47 if (journal_begin(&th, inode->i_sb, jbegin_count)) {
47 up(&inode->i_sem); 48 mutex_unlock(&inode->i_mutex);
48 goto out; 49 goto out;
49 } 50 }
50 reiserfs_update_inode_transaction(inode); 51 reiserfs_update_inode_transaction(inode);
51 52
52 if (reiserfs_delete_object(&th, inode)) { 53 err = reiserfs_delete_object(&th, inode);
53 up(&inode->i_sem);
54 goto out;
55 }
56 54
57 /* Do quota update inside a transaction for journaled quotas. We must do that 55 /* Do quota update inside a transaction for journaled quotas. We must do that
58 * after delete_object so that quota updates go into the same transaction as 56 * after delete_object so that quota updates go into the same transaction as
59 * stat data deletion */ 57 * stat data deletion */
60 DQUOT_FREE_INODE(inode); 58 if (!err)
59 DQUOT_FREE_INODE(inode);
61 60
62 if (journal_end(&th, inode->i_sb, jbegin_count)) { 61 if (journal_end(&th, inode->i_sb, jbegin_count)) {
63 up(&inode->i_sem); 62 mutex_unlock(&inode->i_mutex);
64 goto out; 63 goto out;
65 } 64 }
66 65
67 up(&inode->i_sem); 66 mutex_unlock(&inode->i_mutex);
67
68 /* check return value from reiserfs_delete_object after
69 * ending the transaction
70 */
71 if (err)
72 goto out;
68 73
69 /* all items of file are deleted, so we can remove "save" link */ 74 /* all items of file are deleted, so we can remove "save" link */
70 remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything 75 remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything
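This hunk and the reiserfs_truncate_file change below follow the same rule: record the operation's error, always close the transaction, and only act on the failure afterwards, so an error can never leave a transaction open. A sketch of the shape, using the reiserfs calls with illustrative error handling (not the literal patch):

    static int delete_with_transaction(struct inode *inode, int jbegin_count)
    {
            struct reiserfs_transaction_handle th;
            int err;

            if (journal_begin(&th, inode->i_sb, jbegin_count))
                    return -EIO;

            err = reiserfs_delete_object(&th, inode);   /* may fail */

            if (!err)
                    DQUOT_FREE_INODE(inode);  /* same transaction as the delete */

            /* Close the transaction unconditionally... */
            if (journal_end(&th, inode->i_sb, jbegin_count))
                    return -EIO;

            /* ...and only then report the earlier failure. */
            return err;
    }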
@@ -546,7 +551,7 @@ static int convert_tail_for_hole(struct inode *inode,
546 551
547 /* we don't have to make sure the conversion did not happen while 552 /* we don't have to make sure the conversion did not happen while
548 ** we were locking the page because anyone that could convert 553 ** we were locking the page because anyone that could convert
549 ** must first take i_sem. 554 ** must first take i_mutex.
550 ** 555 **
551 ** We must fix the tail page for writing because it might have buffers 556 ** We must fix the tail page for writing because it might have buffers
552 ** that are mapped, but have a block number of 0. This indicates tail 557 ** that are mapped, but have a block number of 0. This indicates tail
@@ -581,7 +586,7 @@ static inline int _allocate_block(struct reiserfs_transaction_handle *th,
581 BUG_ON(!th->t_trans_id); 586 BUG_ON(!th->t_trans_id);
582 587
583#ifdef REISERFS_PREALLOCATE 588#ifdef REISERFS_PREALLOCATE
584 if (!(flags & GET_BLOCK_NO_ISEM)) { 589 if (!(flags & GET_BLOCK_NO_IMUX)) {
585 return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, 590 return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
586 path, block); 591 path, block);
587 } 592 }
@@ -2099,6 +2104,7 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
2099 struct page *page = NULL; 2104 struct page *page = NULL;
2100 int error; 2105 int error;
2101 struct buffer_head *bh = NULL; 2106 struct buffer_head *bh = NULL;
2107 int err2;
2102 2108
2103 reiserfs_write_lock(p_s_inode->i_sb); 2109 reiserfs_write_lock(p_s_inode->i_sb);
2104 2110
@@ -2136,14 +2142,18 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
2136 transaction of truncating gets committed - on reboot the file 2142 transaction of truncating gets committed - on reboot the file
2137 either appears truncated properly or not truncated at all */ 2143 either appears truncated properly or not truncated at all */
2138 add_save_link(&th, p_s_inode, 1); 2144 add_save_link(&th, p_s_inode, 1);
2139 error = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps); 2145 err2 = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps);
2140 if (error)
2141 goto out;
2142 error = 2146 error =
2143 journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); 2147 journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
2144 if (error) 2148 if (error)
2145 goto out; 2149 goto out;
2146 2150
2151 /* check reiserfs_do_truncate after ending the transaction */
2152 if (err2) {
2153 error = err2;
2154 goto out;
2155 }
2156
2147 if (update_timestamps) { 2157 if (update_timestamps) {
2148 error = remove_save_link(p_s_inode, 1 /* truncate */ ); 2158 error = remove_save_link(p_s_inode, 1 /* truncate */ );
2149 if (error) 2159 if (error)
@@ -2194,7 +2204,7 @@ static int map_block_for_writepage(struct inode *inode,
2194 INITIALIZE_PATH(path); 2204 INITIALIZE_PATH(path);
2195 int pos_in_item; 2205 int pos_in_item;
2196 int jbegin_count = JOURNAL_PER_BALANCE_CNT; 2206 int jbegin_count = JOURNAL_PER_BALANCE_CNT;
2197 loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1; 2207 loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
2198 int retval; 2208 int retval;
2199 int use_get_block = 0; 2209 int use_get_block = 0;
2200 int bytes_copied = 0; 2210 int bytes_copied = 0;
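The byte_offset change is a classic 32-bit truncation fix: block is a 32-bit value, so block << s_blocksize_bits is evaluated in 32 bits and wraps for offsets at or beyond 4GB; casting to loff_t first forces a 64-bit shift. The romfs_readpage hunk near the end of this diff fixes the same class of bug. A small userspace demonstration, assuming a 4KB block size (12 bits):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t block = 0x00200000;    /* byte offset 8GB at 4KB blocks */
            int blocksize_bits = 12;

            /* Buggy: the shift happens in 32 bits and wraps to 0. */
            int64_t wrong = block << blocksize_bits;

            /* Fixed: widen first, then shift in 64 bits. */
            int64_t right = (int64_t)block << blocksize_bits;

            printf("wrong = %lld\n", (long long)wrong);  /* prints 0 */
            printf("right = %lld\n", (long long)right);  /* prints 8589934592 */
            return 0;
    }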
@@ -2308,7 +2318,7 @@ static int map_block_for_writepage(struct inode *inode,
2308 /* this is where we fill in holes in the file. */ 2318 /* this is where we fill in holes in the file. */
2309 if (use_get_block) { 2319 if (use_get_block) {
2310 retval = reiserfs_get_block(inode, block, bh_result, 2320 retval = reiserfs_get_block(inode, block, bh_result,
2311 GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM 2321 GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
2312 | GET_BLOCK_NO_DANGLE); 2322 | GET_BLOCK_NO_DANGLE);
2313 if (!retval) { 2323 if (!retval) {
2314 if (!buffer_mapped(bh_result) 2324 if (!buffer_mapped(bh_result)
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 81fc00285f6..745c8810089 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -2,6 +2,7 @@
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README 2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */ 3 */
4 4
5#include <linux/capability.h>
5#include <linux/fs.h> 6#include <linux/fs.h>
6#include <linux/reiserfs_fs.h> 7#include <linux/reiserfs_fs.h>
7#include <linux/time.h> 8#include <linux/time.h>
@@ -120,7 +121,7 @@ static int reiserfs_unpack(struct inode *inode, struct file *filp)
120 /* we need to make sure nobody is changing the file size beneath 121 /* we need to make sure nobody is changing the file size beneath
121 ** us 122 ** us
122 */ 123 */
123 down(&inode->i_sem); 124 mutex_lock(&inode->i_mutex);
124 125
125 write_from = inode->i_size & (blocksize - 1); 126 write_from = inode->i_size & (blocksize - 1);
126 /* if we are on a block boundary, we are already unpacked. */ 127 /* if we are on a block boundary, we are already unpacked. */
@@ -156,7 +157,7 @@ static int reiserfs_unpack(struct inode *inode, struct file *filp)
156 page_cache_release(page); 157 page_cache_release(page);
157 158
158 out: 159 out:
159 up(&inode->i_sem); 160 mutex_unlock(&inode->i_mutex);
160 reiserfs_write_unlock(inode->i_sb); 161 reiserfs_write_unlock(inode->i_sb);
161 return retval; 162 return retval;
162} 163}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 4b15761434b..4491fcf2a0e 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1039,6 +1039,10 @@ static int flush_commit_list(struct super_block *s,
1039 } 1039 }
1040 atomic_dec(&journal->j_async_throttle); 1040 atomic_dec(&journal->j_async_throttle);
1041 1041
1042 /* We're skipping the commit if there's an error */
1043 if (retval || reiserfs_is_journal_aborted(journal))
1044 barrier = 0;
1045
1042 /* wait on everything written so far before writing the commit 1046 /* wait on everything written so far before writing the commit
1043 * if we are in barrier mode, send the commit down now 1047 * if we are in barrier mode, send the commit down now
1044 */ 1048 */
@@ -1077,10 +1081,16 @@ static int flush_commit_list(struct super_block *s,
1077 BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); 1081 BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
1078 1082
1079 if (!barrier) { 1083 if (!barrier) {
1080 if (buffer_dirty(jl->j_commit_bh)) 1084 /* If there was a write error in the journal - we can't commit
1081 BUG(); 1085 * this transaction - it will be invalid and, if successful,
1082 mark_buffer_dirty(jl->j_commit_bh); 1086 * will just end up propagating the write error out to
1083 sync_dirty_buffer(jl->j_commit_bh); 1087 * the file system. */
1088 if (likely(!retval && !reiserfs_is_journal_aborted(journal))) {
1089 if (buffer_dirty(jl->j_commit_bh))
1090 BUG();
1091 mark_buffer_dirty(jl->j_commit_bh);
1092 sync_dirty_buffer(jl->j_commit_bh);
1093 }
1084 } else 1094 } else
1085 wait_on_buffer(jl->j_commit_bh); 1095 wait_on_buffer(jl->j_commit_bh);
1086 1096
@@ -2757,6 +2767,15 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
2757 journal->j_cnode_used = 0; 2767 journal->j_cnode_used = 0;
2758 journal->j_must_wait = 0; 2768 journal->j_must_wait = 0;
2759 2769
2770 if (journal->j_cnode_free == 0) {
2771 reiserfs_warning(p_s_sb, "journal-2004: Journal cnode memory "
2772 "allocation failed (%ld bytes). Journal is "
2773 "too large for available memory. Usually "
2774 "this is due to a journal that is too large.",
2775 sizeof (struct reiserfs_journal_cnode) * num_cnodes);
2776 goto free_and_return;
2777 }
2778
2760 init_journal_hash(p_s_sb); 2779 init_journal_hash(p_s_sb);
2761 jl = journal->j_current_jl; 2780 jl = journal->j_current_jl;
2762 jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); 2781 jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl);
@@ -3906,10 +3925,13 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
3906 flush = 1; 3925 flush = 1;
3907 } 3926 }
3908#ifdef REISERFS_PREALLOCATE 3927#ifdef REISERFS_PREALLOCATE
3909 /* quota ops might need to nest, setup the journal_info pointer for them */ 3928 /* quota ops might need to nest, setup the journal_info pointer for them
3929 * and raise the refcount so that it is > 0. */
3910 current->journal_info = th; 3930 current->journal_info = th;
3931 th->t_refcount++;
3911 reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into 3932 reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into
3912 * the transaction */ 3933 * the transaction */
3934 th->t_refcount--;
3913 current->journal_info = th->t_handle_save; 3935 current->journal_info = th->t_handle_save;
3914#endif 3936#endif
3915 3937
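The t_refcount raise matters because the quota operations reachable from reiserfs_discard_all_prealloc() may re-enter the journal; with current->journal_info published but a refcount of zero, a nested journal_begin() would try to open a brand-new transaction instead of attaching to the one in flight. A runnable userspace analogue of the publish-and-bump pattern (names mirror the kernel code, but this is not kernel code):

    #include <stdio.h>

    struct handle { int refcount; };

    static struct handle *journal_info;  /* stands in for current->journal_info */

    static void nested_op(void)
    {
            if (journal_info && journal_info->refcount > 0)
                    printf("attached to the transaction in flight\n");
            else
                    printf("would wrongly start a new transaction\n");
    }

    int main(void)
    {
            struct handle th = { .refcount = 0 };

            journal_info = &th;
            th.refcount++;      /* raise so nested callers see an active handle */
            nested_op();        /* e.g. quota work inside the prealloc discard */
            th.refcount--;
            journal_info = NULL;
            return 0;
    }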
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 3549067c42d..8f8d8d01107 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -375,11 +375,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
375 return ERR_PTR(-EIO); 375 return ERR_PTR(-EIO);
376 } 376 }
377 377
378 if (inode) 378 return d_splice_alias(inode, dentry);
379 return d_splice_alias(inode, dentry);
380
381 d_add(dentry, inode);
382 return NULL;
383} 379}
384 380
385/* 381/*
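The lookup simplification is safe because d_splice_alias() already covers the negative case: passed a NULL inode it behaves like d_add(dentry, NULL) and returns NULL, so the removed branch duplicated work the helper does anyway. A hedged sketch of a lookup op leaning on that behavior (the function and its elided body are illustrative):

    #include <linux/dcache.h>
    #include <linux/fs.h>

    static struct dentry *example_lookup(struct inode *dir,
                                         struct dentry *dentry,
                                         struct nameidata *nd)
    {
            struct inode *inode = NULL;

            /* ... look up the name; leave inode NULL if it does not exist ... */

            /* One call handles both outcomes: splice an alias for a real
             * inode, or instantiate a negative dentry when inode is NULL. */
            return d_splice_alias(inode, dentry);
    }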
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 42afb5bef11..397d9590c8f 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2211,7 +2211,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
2211 size_t towrite = len; 2211 size_t towrite = len;
2212 struct buffer_head tmp_bh, *bh; 2212 struct buffer_head tmp_bh, *bh;
2213 2213
2214 down(&inode->i_sem); 2214 mutex_lock(&inode->i_mutex);
2215 while (towrite > 0) { 2215 while (towrite > 0) {
2216 tocopy = sb->s_blocksize - offset < towrite ? 2216 tocopy = sb->s_blocksize - offset < towrite ?
2217 sb->s_blocksize - offset : towrite; 2217 sb->s_blocksize - offset : towrite;
@@ -2250,7 +2250,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
2250 inode->i_version++; 2250 inode->i_version++;
2251 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2251 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2252 mark_inode_dirty(inode); 2252 mark_inode_dirty(inode);
2253 up(&inode->i_sem); 2253 mutex_unlock(&inode->i_mutex);
2254 return len - towrite; 2254 return len - towrite;
2255} 2255}
2256 2256
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index c92e124f628..196e971c03c 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -205,7 +205,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
205 1) * p_s_sb->s_blocksize; 205 1) * p_s_sb->s_blocksize;
206 pos1 = pos; 206 pos1 = pos;
207 207
208 // we are protected by i_sem. The tail can not disappear, nor 208 // we are protected by i_mutex. The tail can not disappear, nor
209 // can an append be done either 209 // can an append be done either
210 // we are in truncate or packing tail in file_release 210 // we are in truncate or packing tail in file_release
211 211
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 72e12079867..cc061bfd437 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -30,6 +30,7 @@
30 */ 30 */
31 31
32#include <linux/reiserfs_fs.h> 32#include <linux/reiserfs_fs.h>
33#include <linux/capability.h>
33#include <linux/dcache.h> 34#include <linux/dcache.h>
34#include <linux/namei.h> 35#include <linux/namei.h>
35#include <linux/errno.h> 36#include <linux/errno.h>
@@ -67,11 +68,11 @@ static struct dentry *create_xa_root(struct super_block *sb)
67 goto out; 68 goto out;
68 } else if (!xaroot->d_inode) { 69 } else if (!xaroot->d_inode) {
69 int err; 70 int err;
70 down(&privroot->d_inode->i_sem); 71 mutex_lock(&privroot->d_inode->i_mutex);
71 err = 72 err =
72 privroot->d_inode->i_op->mkdir(privroot->d_inode, xaroot, 73 privroot->d_inode->i_op->mkdir(privroot->d_inode, xaroot,
73 0700); 74 0700);
74 up(&privroot->d_inode->i_sem); 75 mutex_unlock(&privroot->d_inode->i_mutex);
75 76
76 if (err) { 77 if (err) {
77 dput(xaroot); 78 dput(xaroot);
@@ -115,8 +116,8 @@ static struct dentry *__get_xa_root(struct super_block *s)
115} 116}
116 117
117/* Returns the dentry (or NULL) referring to the root of the extended 118/* Returns the dentry (or NULL) referring to the root of the extended
118 * attribute directory tree. If it has already been retreived, it is used. 119 * attribute directory tree. If it has already been retrieved, it is used.
119 * Otherwise, we attempt to retreive it from disk. It may also return 120 * Otherwise, we attempt to retrieve it from disk. It may also return
120 * a pointer-encoded error. 121 * a pointer-encoded error.
121 */ 122 */
122static inline struct dentry *get_xa_root(struct super_block *s) 123static inline struct dentry *get_xa_root(struct super_block *s)
@@ -219,7 +220,7 @@ static struct dentry *get_xa_file_dentry(const struct inode *inode,
219 } else if (flags & XATTR_REPLACE || flags & FL_READONLY) { 220 } else if (flags & XATTR_REPLACE || flags & FL_READONLY) {
220 goto out; 221 goto out;
221 } else { 222 } else {
222 /* inode->i_sem is down, so nothing else can try to create 223 /* inode->i_mutex is down, so nothing else can try to create
223 * the same xattr */ 224 * the same xattr */
224 err = xadir->d_inode->i_op->create(xadir->d_inode, xafile, 225 err = xadir->d_inode->i_op->create(xadir->d_inode, xafile,
225 0700 | S_IFREG, NULL); 226 0700 | S_IFREG, NULL);
@@ -268,7 +269,7 @@ static struct file *open_xa_file(const struct inode *inode, const char *name,
268 * and don't mess with f->f_pos, but the idea is the same. Do some 269 * and don't mess with f->f_pos, but the idea is the same. Do some
269 * action on each and every entry in the directory. 270 * action on each and every entry in the directory.
270 * 271 *
271 * we're called with i_sem held, so there are no worries about the directory 272 * we're called with i_mutex held, so there are no worries about the directory
272 * changing underneath us. 273 * changing underneath us.
273 */ 274 */
274static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) 275static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir)
@@ -426,7 +427,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf)
426 int res = -ENOTDIR; 427 int res = -ENOTDIR;
427 if (!file->f_op || !file->f_op->readdir) 428 if (!file->f_op || !file->f_op->readdir)
428 goto out; 429 goto out;
429 down(&inode->i_sem); 430 mutex_lock(&inode->i_mutex);
430// down(&inode->i_zombie); 431// down(&inode->i_zombie);
431 res = -ENOENT; 432 res = -ENOENT;
432 if (!IS_DEADDIR(inode)) { 433 if (!IS_DEADDIR(inode)) {
@@ -435,7 +436,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf)
435 unlock_kernel(); 436 unlock_kernel();
436 } 437 }
437// up(&inode->i_zombie); 438// up(&inode->i_zombie);
438 up(&inode->i_sem); 439 mutex_unlock(&inode->i_mutex);
439 out: 440 out:
440 return res; 441 return res;
441} 442}
@@ -480,7 +481,7 @@ static inline __u32 xattr_hash(const char *msg, int len)
480/* Generic extended attribute operations that can be used by xa plugins */ 481/* Generic extended attribute operations that can be used by xa plugins */
481 482
482/* 483/*
483 * inode->i_sem: down 484 * inode->i_mutex: down
484 */ 485 */
485int 486int
486reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, 487reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
@@ -497,12 +498,6 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
497 struct iattr newattrs; 498 struct iattr newattrs;
498 __u32 xahash = 0; 499 __u32 xahash = 0;
499 500
500 if (IS_RDONLY(inode))
501 return -EROFS;
502
503 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
504 return -EPERM;
505
506 if (get_inode_sd_version(inode) == STAT_DATA_V1) 501 if (get_inode_sd_version(inode) == STAT_DATA_V1)
507 return -EOPNOTSUPP; 502 return -EOPNOTSUPP;
508 503
@@ -535,7 +530,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
535 /* Resize it so we're ok to write there */ 530 /* Resize it so we're ok to write there */
536 newattrs.ia_size = buffer_size; 531 newattrs.ia_size = buffer_size;
537 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; 532 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
538 down(&xinode->i_sem); 533 mutex_lock(&xinode->i_mutex);
539 err = notify_change(fp->f_dentry, &newattrs); 534 err = notify_change(fp->f_dentry, &newattrs);
540 if (err) 535 if (err)
541 goto out_filp; 536 goto out_filp;
@@ -598,7 +593,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
598 } 593 }
599 594
600 out_filp: 595 out_filp:
601 up(&xinode->i_sem); 596 mutex_unlock(&xinode->i_mutex);
602 fput(fp); 597 fput(fp);
603 598
604 out: 599 out:
@@ -606,7 +601,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
606} 601}
607 602
608/* 603/*
609 * inode->i_sem: down 604 * inode->i_mutex: down
610 */ 605 */
611int 606int
612reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, 607reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
@@ -758,9 +753,6 @@ int reiserfs_xattr_del(struct inode *inode, const char *name)
758 struct dentry *dir; 753 struct dentry *dir;
759 int err; 754 int err;
760 755
761 if (IS_RDONLY(inode))
762 return -EROFS;
763
764 dir = open_xa_dir(inode, FL_READONLY); 756 dir = open_xa_dir(inode, FL_READONLY);
765 if (IS_ERR(dir)) { 757 if (IS_ERR(dir)) {
766 err = PTR_ERR(dir); 758 err = PTR_ERR(dir);
@@ -793,7 +785,7 @@ reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen,
793 785
794} 786}
795 787
796/* This is called w/ inode->i_sem downed */ 788/* This is called w/ inode->i_mutex downed */
797int reiserfs_delete_xattrs(struct inode *inode) 789int reiserfs_delete_xattrs(struct inode *inode)
798{ 790{
799 struct file *fp; 791 struct file *fp;
@@ -946,7 +938,7 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
946 938
947/* 939/*
948 * Inode operation getxattr() 940 * Inode operation getxattr()
949 * Preliminary locking: we down dentry->d_inode->i_sem 941 * Preliminary locking: we take dentry->d_inode->i_mutex
950 */ 942 */
951ssize_t 943ssize_t
952reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, 944reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
@@ -970,7 +962,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
970/* 962/*
971 * Inode operation setxattr() 963 * Inode operation setxattr()
972 * 964 *
973 * dentry->d_inode->i_sem down 965 * dentry->d_inode->i_mutex down
974 */ 966 */
975int 967int
976reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, 968reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
@@ -984,12 +976,6 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
984 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 976 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
985 return -EOPNOTSUPP; 977 return -EOPNOTSUPP;
986 978
987 if (IS_RDONLY(dentry->d_inode))
988 return -EROFS;
989
990 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode))
991 return -EROFS;
992
993 reiserfs_write_lock_xattr_i(dentry->d_inode); 979 reiserfs_write_lock_xattr_i(dentry->d_inode);
994 lock = !has_xattr_dir(dentry->d_inode); 980 lock = !has_xattr_dir(dentry->d_inode);
995 if (lock) 981 if (lock)
@@ -1008,7 +994,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1008/* 994/*
1009 * Inode operation removexattr() 995 * Inode operation removexattr()
1010 * 996 *
1011 * dentry->d_inode->i_sem down 997 * dentry->d_inode->i_mutex down
1012 */ 998 */
1013int reiserfs_removexattr(struct dentry *dentry, const char *name) 999int reiserfs_removexattr(struct dentry *dentry, const char *name)
1014{ 1000{
@@ -1019,12 +1005,6 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
1019 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 1005 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
1020 return -EOPNOTSUPP; 1006 return -EOPNOTSUPP;
1021 1007
1022 if (IS_RDONLY(dentry->d_inode))
1023 return -EROFS;
1024
1025 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode))
1026 return -EPERM;
1027
1028 reiserfs_write_lock_xattr_i(dentry->d_inode); 1008 reiserfs_write_lock_xattr_i(dentry->d_inode);
1029 reiserfs_read_lock_xattrs(dentry->d_sb); 1009 reiserfs_read_lock_xattrs(dentry->d_sb);
1030 1010
@@ -1091,7 +1071,7 @@ reiserfs_listxattr_filler(void *buf, const char *name, int namelen,
1091/* 1071/*
1092 * Inode operation listxattr() 1072 * Inode operation listxattr()
1093 * 1073 *
1094 * Preliminary locking: we down dentry->d_inode->i_sem 1074 * Preliminary locking: we take dentry->d_inode->i_mutex
1095 */ 1075 */
1096ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) 1076ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
1097{ 1077{
@@ -1289,9 +1269,9 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
1289 if (!IS_ERR(dentry)) { 1269 if (!IS_ERR(dentry)) {
1290 if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { 1270 if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) {
1291 struct inode *inode = dentry->d_parent->d_inode; 1271 struct inode *inode = dentry->d_parent->d_inode;
1292 down(&inode->i_sem); 1272 mutex_lock(&inode->i_mutex);
1293 err = inode->i_op->mkdir(inode, dentry, 0700); 1273 err = inode->i_op->mkdir(inode, dentry, 0700);
1294 up(&inode->i_sem); 1274 mutex_unlock(&inode->i_mutex);
1295 if (err) { 1275 if (err) {
1296 dput(dentry); 1276 dput(dentry);
1297 dentry = NULL; 1277 dentry = NULL;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index a47ac9aac8b..43de3ba8333 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -1,3 +1,4 @@
1#include <linux/capability.h>
1#include <linux/fs.h> 2#include <linux/fs.h>
2#include <linux/posix_acl.h> 3#include <linux/posix_acl.h>
3#include <linux/reiserfs_fs.h> 4#include <linux/reiserfs_fs.h>
@@ -174,7 +175,7 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
174/* 175/*
175 * Inode operation get_posix_acl(). 176 * Inode operation get_posix_acl().
176 * 177 *
177 * inode->i_sem: down 178 * inode->i_mutex: down
178 * BKL held [before 2.5.x] 179 * BKL held [before 2.5.x]
179 */ 180 */
180struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) 181struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
@@ -237,7 +238,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
237/* 238/*
238 * Inode operation set_posix_acl(). 239 * Inode operation set_posix_acl().
239 * 240 *
240 * inode->i_sem: down 241 * inode->i_mutex: down
241 * BKL held [before 2.5.x] 242 * BKL held [before 2.5.x]
242 */ 243 */
243static int 244static int
@@ -312,7 +313,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
312 return error; 313 return error;
313} 314}
314 315
315/* dir->i_sem: down, 316/* dir->i_mutex: locked,
316 * inode is new and not released into the wild yet */ 317 * inode is new and not released into the wild yet */
317int 318int
318reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, 319reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 2501f7e66ab..024a938ca60 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -1,4 +1,5 @@
1#include <linux/reiserfs_fs.h> 1#include <linux/reiserfs_fs.h>
2#include <linux/capability.h>
2#include <linux/errno.h> 3#include <linux/errno.h>
3#include <linux/fs.h> 4#include <linux/fs.h>
4#include <linux/pagemap.h> 5#include <linux/pagemap.h>
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 51458048ca6..073f39364b1 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -16,18 +16,10 @@ static int
16user_get(struct inode *inode, const char *name, void *buffer, size_t size) 16user_get(struct inode *inode, const char *name, void *buffer, size_t size)
17{ 17{
18 18
19 int error;
20
21 if (strlen(name) < sizeof(XATTR_USER_PREFIX)) 19 if (strlen(name) < sizeof(XATTR_USER_PREFIX))
22 return -EINVAL; 20 return -EINVAL;
23
24 if (!reiserfs_xattrs_user(inode->i_sb)) 21 if (!reiserfs_xattrs_user(inode->i_sb))
25 return -EOPNOTSUPP; 22 return -EOPNOTSUPP;
26
27 error = reiserfs_permission_locked(inode, MAY_READ, NULL);
28 if (error)
29 return error;
30
31 return reiserfs_xattr_get(inode, name, buffer, size); 23 return reiserfs_xattr_get(inode, name, buffer, size);
32} 24}
33 25
@@ -36,43 +28,21 @@ user_set(struct inode *inode, const char *name, const void *buffer,
36 size_t size, int flags) 28 size_t size, int flags)
37{ 29{
38 30
39 int error;
40
41 if (strlen(name) < sizeof(XATTR_USER_PREFIX)) 31 if (strlen(name) < sizeof(XATTR_USER_PREFIX))
42 return -EINVAL; 32 return -EINVAL;
43 33
44 if (!reiserfs_xattrs_user(inode->i_sb)) 34 if (!reiserfs_xattrs_user(inode->i_sb))
45 return -EOPNOTSUPP; 35 return -EOPNOTSUPP;
46
47 if (!S_ISREG(inode->i_mode) &&
48 (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
49 return -EPERM;
50
51 error = reiserfs_permission_locked(inode, MAY_WRITE, NULL);
52 if (error)
53 return error;
54
55 return reiserfs_xattr_set(inode, name, buffer, size, flags); 36 return reiserfs_xattr_set(inode, name, buffer, size, flags);
56} 37}
57 38
58static int user_del(struct inode *inode, const char *name) 39static int user_del(struct inode *inode, const char *name)
59{ 40{
60 int error;
61
62 if (strlen(name) < sizeof(XATTR_USER_PREFIX)) 41 if (strlen(name) < sizeof(XATTR_USER_PREFIX))
63 return -EINVAL; 42 return -EINVAL;
64 43
65 if (!reiserfs_xattrs_user(inode->i_sb)) 44 if (!reiserfs_xattrs_user(inode->i_sb))
66 return -EOPNOTSUPP; 45 return -EOPNOTSUPP;
67
68 if (!S_ISREG(inode->i_mode) &&
69 (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
70 return -EPERM;
71
72 error = reiserfs_permission_locked(inode, MAY_WRITE, NULL);
73 if (error)
74 return error;
75
76 return 0; 46 return 0;
77} 47}
78 48
diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c
index 84e21ffa5ca..10187812771 100644
--- a/fs/relayfs/buffers.c
+++ b/fs/relayfs/buffers.c
@@ -185,5 +185,6 @@ void relay_destroy_buf(struct rchan_buf *buf)
185void relay_remove_buf(struct kref *kref) 185void relay_remove_buf(struct kref *kref)
186{ 186{
187 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); 187 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
188 relayfs_remove(buf->dentry); 188 buf->chan->cb->remove_buf_file(buf->dentry);
189 relay_destroy_buf(buf);
189} 190}
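relay_remove_buf() is a kref release function: kref_put() invokes it exactly once, when the last reference drops, and it now removes the file through the channel's remove_buf_file() callback before freeing the buffer itself. The general kref idiom, sketched with an illustrative type:

    #include <linux/kref.h>
    #include <linux/slab.h>

    struct thing {
            struct kref kref;
            /* payload ... */
    };

    static void thing_release(struct kref *kref)
    {
            struct thing *t = container_of(kref, struct thing, kref);

            kfree(t);               /* runs once, on the final put */
    }

    static struct thing *thing_alloc(void)
    {
            struct thing *t = kzalloc(sizeof(*t), GFP_KERNEL);

            if (t)
                    kref_init(&t->kref);    /* refcount starts at 1 */
            return t;
    }

    /* Users pair every kref_get(&t->kref) with
     * kref_put(&t->kref, thing_release). */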
diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
index 0f7f88d067a..383523011aa 100644
--- a/fs/relayfs/inode.c
+++ b/fs/relayfs/inode.c
@@ -26,31 +26,22 @@
26 26
27static struct vfsmount * relayfs_mount; 27static struct vfsmount * relayfs_mount;
28static int relayfs_mount_count; 28static int relayfs_mount_count;
29static kmem_cache_t * relayfs_inode_cachep;
30 29
31static struct backing_dev_info relayfs_backing_dev_info = { 30static struct backing_dev_info relayfs_backing_dev_info = {
32 .ra_pages = 0, /* No readahead */ 31 .ra_pages = 0, /* No readahead */
33 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, 32 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
34}; 33};
35 34
36static struct inode *relayfs_get_inode(struct super_block *sb, int mode, 35static struct inode *relayfs_get_inode(struct super_block *sb,
37 struct rchan *chan) 36 int mode,
37 struct file_operations *fops,
38 void *data)
38{ 39{
39 struct rchan_buf *buf = NULL;
40 struct inode *inode; 40 struct inode *inode;
41 41
42 if (S_ISREG(mode)) {
43 BUG_ON(!chan);
44 buf = relay_create_buf(chan);
45 if (!buf)
46 return NULL;
47 }
48
49 inode = new_inode(sb); 42 inode = new_inode(sb);
50 if (!inode) { 43 if (!inode)
51 relay_destroy_buf(buf);
52 return NULL; 44 return NULL;
53 }
54 45
55 inode->i_mode = mode; 46 inode->i_mode = mode;
56 inode->i_uid = 0; 47 inode->i_uid = 0;
@@ -61,8 +52,9 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
61 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 52 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
62 switch (mode & S_IFMT) { 53 switch (mode & S_IFMT) {
63 case S_IFREG: 54 case S_IFREG:
64 inode->i_fop = &relayfs_file_operations; 55 inode->i_fop = fops;
65 RELAYFS_I(inode)->buf = buf; 56 if (data)
57 inode->u.generic_ip = data;
66 break; 58 break;
67 case S_IFDIR: 59 case S_IFDIR:
68 inode->i_op = &simple_dir_inode_operations; 60 inode->i_op = &simple_dir_inode_operations;
@@ -83,7 +75,8 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
83 * @name: the name of the file to create 75 * @name: the name of the file to create
84 * @parent: parent directory 76 * @parent: parent directory
85 * @mode: mode 77 * @mode: mode
86 * @chan: relay channel associated with the file 78 * @fops: file operations to use for the file
79 * @data: user-associated data for this file
87 * 80 *
88 * Returns the new dentry, NULL on failure 81 * Returns the new dentry, NULL on failure
89 * 82 *
@@ -92,7 +85,8 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
92static struct dentry *relayfs_create_entry(const char *name, 85static struct dentry *relayfs_create_entry(const char *name,
93 struct dentry *parent, 86 struct dentry *parent,
94 int mode, 87 int mode,
95 struct rchan *chan) 88 struct file_operations *fops,
89 void *data)
96{ 90{
97 struct dentry *d; 91 struct dentry *d;
98 struct inode *inode; 92 struct inode *inode;
@@ -115,7 +109,7 @@ static struct dentry *relayfs_create_entry(const char *name,
115 } 109 }
116 110
117 parent = dget(parent); 111 parent = dget(parent);
118 down(&parent->d_inode->i_sem); 112 mutex_lock(&parent->d_inode->i_mutex);
119 d = lookup_one_len(name, parent, strlen(name)); 113 d = lookup_one_len(name, parent, strlen(name));
120 if (IS_ERR(d)) { 114 if (IS_ERR(d)) {
121 d = NULL; 115 d = NULL;
@@ -127,7 +121,7 @@ static struct dentry *relayfs_create_entry(const char *name,
127 goto release_mount; 121 goto release_mount;
128 } 122 }
129 123
130 inode = relayfs_get_inode(parent->d_inode->i_sb, mode, chan); 124 inode = relayfs_get_inode(parent->d_inode->i_sb, mode, fops, data);
131 if (!inode) { 125 if (!inode) {
132 d = NULL; 126 d = NULL;
133 goto release_mount; 127 goto release_mount;
@@ -145,7 +139,7 @@ release_mount:
145 simple_release_fs(&relayfs_mount, &relayfs_mount_count); 139 simple_release_fs(&relayfs_mount, &relayfs_mount_count);
146 140
147exit: 141exit:
148 up(&parent->d_inode->i_sem); 142 mutex_unlock(&parent->d_inode->i_mutex);
149 dput(parent); 143 dput(parent);
150 return d; 144 return d;
151} 145}
@@ -155,20 +149,26 @@ exit:
155 * @name: the name of the file to create 149 * @name: the name of the file to create
156 * @parent: parent directory 150 * @parent: parent directory
157 * @mode: mode, if not specified the default perms are used 151 * @mode: mode, if not specified the default perms are used
158 * @chan: channel associated with the file 152 * @fops: file operations to use for the file
153 * @data: user-associated data for this file
159 * 154 *
160 * Returns file dentry if successful, NULL otherwise. 155 * Returns file dentry if successful, NULL otherwise.
161 * 156 *
162 * The file will be created user-readable on behalf of the current user. 157 * The file will be created user-readable on behalf of the current user.
163 */ 158 */
164struct dentry *relayfs_create_file(const char *name, struct dentry *parent, 159struct dentry *relayfs_create_file(const char *name,
165 int mode, struct rchan *chan) 160 struct dentry *parent,
161 int mode,
162 struct file_operations *fops,
163 void *data)
166{ 164{
165 BUG_ON(!fops);
166
167 if (!mode) 167 if (!mode)
168 mode = S_IRUSR; 168 mode = S_IRUSR;
169 mode = (mode & S_IALLUGO) | S_IFREG; 169 mode = (mode & S_IALLUGO) | S_IFREG;
170 170
171 return relayfs_create_entry(name, parent, mode, chan); 171 return relayfs_create_entry(name, parent, mode, fops, data);
172} 172}
173 173
174/** 174/**
@@ -183,7 +183,7 @@ struct dentry *relayfs_create_file(const char *name, struct dentry *parent,
183struct dentry *relayfs_create_dir(const char *name, struct dentry *parent) 183struct dentry *relayfs_create_dir(const char *name, struct dentry *parent)
184{ 184{
185 int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; 185 int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
186 return relayfs_create_entry(name, parent, mode, NULL); 186 return relayfs_create_entry(name, parent, mode, NULL, NULL);
187} 187}
188 188
189/** 189/**
@@ -204,7 +204,7 @@ int relayfs_remove(struct dentry *dentry)
204 return -EINVAL; 204 return -EINVAL;
205 205
206 parent = dget(parent); 206 parent = dget(parent);
207 down(&parent->d_inode->i_sem); 207 mutex_lock(&parent->d_inode->i_mutex);
208 if (dentry->d_inode) { 208 if (dentry->d_inode) {
209 if (S_ISDIR(dentry->d_inode->i_mode)) 209 if (S_ISDIR(dentry->d_inode->i_mode))
210 error = simple_rmdir(parent->d_inode, dentry); 210 error = simple_rmdir(parent->d_inode, dentry);
@@ -215,7 +215,7 @@ int relayfs_remove(struct dentry *dentry)
215 } 215 }
216 if (!error) 216 if (!error)
217 dput(dentry); 217 dput(dentry);
218 up(&parent->d_inode->i_sem); 218 mutex_unlock(&parent->d_inode->i_mutex);
219 dput(parent); 219 dput(parent);
220 220
221 if (!error) 221 if (!error)
@@ -225,6 +225,17 @@ int relayfs_remove(struct dentry *dentry)
225} 225}
226 226
227/** 227/**
228 * relayfs_remove_file - remove a file from relay filesystem
229 * @dentry: directory dentry
230 *
231 * Returns 0 if successful, negative otherwise.
232 */
233int relayfs_remove_file(struct dentry *dentry)
234{
235 return relayfs_remove(dentry);
236}
237
238/**
228 * relayfs_remove_dir - remove a directory in the relay filesystem 239 * relayfs_remove_dir - remove a directory in the relay filesystem
229 * @dentry: directory dentry 240 * @dentry: directory dentry
230 * 241 *
@@ -236,45 +247,45 @@ int relayfs_remove_dir(struct dentry *dentry)
236} 247}
237 248
238/** 249/**
239 * relayfs_open - open file op for relayfs files 250 * relay_file_open - open file op for relay files
240 * @inode: the inode 251 * @inode: the inode
241 * @filp: the file 252 * @filp: the file
242 * 253 *
243 * Increments the channel buffer refcount. 254 * Increments the channel buffer refcount.
244 */ 255 */
245static int relayfs_open(struct inode *inode, struct file *filp) 256static int relay_file_open(struct inode *inode, struct file *filp)
246{ 257{
247 struct rchan_buf *buf = RELAYFS_I(inode)->buf; 258 struct rchan_buf *buf = inode->u.generic_ip;
248 kref_get(&buf->kref); 259 kref_get(&buf->kref);
260 filp->private_data = buf;
249 261
250 return 0; 262 return 0;
251} 263}
252 264
253/** 265/**
254 * relayfs_mmap - mmap file op for relayfs files 266 * relay_file_mmap - mmap file op for relay files
255 * @filp: the file 267 * @filp: the file
256 * @vma: the vma describing what to map 268 * @vma: the vma describing what to map
257 * 269 *
258 * Calls upon relay_mmap_buf to map the file into user space. 270 * Calls upon relay_mmap_buf to map the file into user space.
259 */ 271 */
260static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma) 272static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
261{ 273{
262 struct inode *inode = filp->f_dentry->d_inode; 274 struct rchan_buf *buf = filp->private_data;
263 return relay_mmap_buf(RELAYFS_I(inode)->buf, vma); 275 return relay_mmap_buf(buf, vma);
264} 276}
265 277
266/** 278/**
267 * relayfs_poll - poll file op for relayfs files 279 * relay_file_poll - poll file op for relay files
268 * @filp: the file 280 * @filp: the file
269 * @wait: poll table 281 * @wait: poll table
270 * 282 *
271 * Poll implementation. 283 * Poll implementation.
272 */ 284 */
273static unsigned int relayfs_poll(struct file *filp, poll_table *wait) 285static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
274{ 286{
275 unsigned int mask = 0; 287 unsigned int mask = 0;
276 struct inode *inode = filp->f_dentry->d_inode; 288 struct rchan_buf *buf = filp->private_data;
277 struct rchan_buf *buf = RELAYFS_I(inode)->buf;
278 289
279 if (buf->finalized) 290 if (buf->finalized)
280 return POLLERR; 291 return POLLERR;
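Stashing the buffer in filp->private_data at open time is the standard way to avoid re-deriving per-file state from the inode on every operation; the mmap, poll, read and release hunks here all just read it back. A sketch of the idiom (the ops are illustrative, not from the patch):

    #include <linux/fs.h>

    static int example_open(struct inode *inode, struct file *filp)
    {
            /* Resolve the per-file object once... */
            filp->private_data = inode->u.generic_ip;
            return 0;
    }

    static ssize_t example_read(struct file *filp, char __user *ubuf,
                                size_t count, loff_t *ppos)
    {
            /* ...and recover it cheaply in every later operation. */
            void *obj = filp->private_data;

            (void)obj;
            return 0;
    }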
@@ -289,27 +300,27 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
289} 300}
290 301
291/** 302/**
292 * relayfs_release - release file op for relayfs files 303 * relay_file_release - release file op for relay files
293 * @inode: the inode 304 * @inode: the inode
294 * @filp: the file 305 * @filp: the file
295 * 306 *
296 * Decrements the channel refcount, as the filesystem is 307 * Decrements the channel refcount, as the filesystem is
297 * no longer using it. 308 * no longer using it.
298 */ 309 */
299static int relayfs_release(struct inode *inode, struct file *filp) 310static int relay_file_release(struct inode *inode, struct file *filp)
300{ 311{
301 struct rchan_buf *buf = RELAYFS_I(inode)->buf; 312 struct rchan_buf *buf = filp->private_data;
302 kref_put(&buf->kref, relay_remove_buf); 313 kref_put(&buf->kref, relay_remove_buf);
303 314
304 return 0; 315 return 0;
305} 316}
306 317
307/** 318/**
308 * relayfs_read_consume - update the consumed count for the buffer 319 * relay_file_read_consume - update the consumed count for the buffer
309 */ 320 */
310static void relayfs_read_consume(struct rchan_buf *buf, 321static void relay_file_read_consume(struct rchan_buf *buf,
311 size_t read_pos, 322 size_t read_pos,
312 size_t bytes_consumed) 323 size_t bytes_consumed)
313{ 324{
314 size_t subbuf_size = buf->chan->subbuf_size; 325 size_t subbuf_size = buf->chan->subbuf_size;
315 size_t n_subbufs = buf->chan->n_subbufs; 326 size_t n_subbufs = buf->chan->n_subbufs;
@@ -332,9 +343,9 @@ static void relayfs_read_consume(struct rchan_buf *buf,
332} 343}
333 344
334/** 345/**
335 * relayfs_read_avail - boolean, are there unconsumed bytes available? 346 * relay_file_read_avail - boolean, are there unconsumed bytes available?
336 */ 347 */
337static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos) 348static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
338{ 349{
339 size_t bytes_produced, bytes_consumed, write_offset; 350 size_t bytes_produced, bytes_consumed, write_offset;
340 size_t subbuf_size = buf->chan->subbuf_size; 351 size_t subbuf_size = buf->chan->subbuf_size;
@@ -365,16 +376,16 @@ static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos)
365 if (bytes_produced == bytes_consumed) 376 if (bytes_produced == bytes_consumed)
366 return 0; 377 return 0;
367 378
368 relayfs_read_consume(buf, read_pos, 0); 379 relay_file_read_consume(buf, read_pos, 0);
369 380
370 return 1; 381 return 1;
371} 382}
372 383
373/** 384/**
374 * relayfs_read_subbuf_avail - return bytes available in sub-buffer 385 * relay_file_read_subbuf_avail - return bytes available in sub-buffer
375 */ 386 */
376static size_t relayfs_read_subbuf_avail(size_t read_pos, 387static size_t relay_file_read_subbuf_avail(size_t read_pos,
377 struct rchan_buf *buf) 388 struct rchan_buf *buf)
378{ 389{
379 size_t padding, avail = 0; 390 size_t padding, avail = 0;
380 size_t read_subbuf, read_offset, write_subbuf, write_offset; 391 size_t read_subbuf, read_offset, write_subbuf, write_offset;
@@ -396,14 +407,14 @@ static size_t relayfs_read_subbuf_avail(size_t read_pos,
396} 407}
397 408
398/** 409/**
399 * relayfs_read_start_pos - find the first available byte to read 410 * relay_file_read_start_pos - find the first available byte to read
400 * 411 *
401 * If the read_pos is in the middle of padding, return the 412 * If the read_pos is in the middle of padding, return the
402 * position of the first actually available byte, otherwise 413 * position of the first actually available byte, otherwise
403 * return the original value. 414 * return the original value.
404 */ 415 */
405static size_t relayfs_read_start_pos(size_t read_pos, 416static size_t relay_file_read_start_pos(size_t read_pos,
406 struct rchan_buf *buf) 417 struct rchan_buf *buf)
407{ 418{
408 size_t read_subbuf, padding, padding_start, padding_end; 419 size_t read_subbuf, padding, padding_start, padding_end;
409 size_t subbuf_size = buf->chan->subbuf_size; 420 size_t subbuf_size = buf->chan->subbuf_size;
@@ -422,11 +433,11 @@ static size_t relayfs_read_start_pos(size_t read_pos,
422} 433}
423 434
424/** 435/**
425 * relayfs_read_end_pos - return the new read position 436 * relay_file_read_end_pos - return the new read position
426 */ 437 */
427static size_t relayfs_read_end_pos(struct rchan_buf *buf, 438static size_t relay_file_read_end_pos(struct rchan_buf *buf,
428 size_t read_pos, 439 size_t read_pos,
429 size_t count) 440 size_t count)
430{ 441{
431 size_t read_subbuf, padding, end_pos; 442 size_t read_subbuf, padding, end_pos;
432 size_t subbuf_size = buf->chan->subbuf_size; 443 size_t subbuf_size = buf->chan->subbuf_size;
@@ -445,7 +456,7 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf,
445} 456}
446 457
447/** 458/**
448 * relayfs_read - read file op for relayfs files 459 * relay_file_read - read file op for relay files
449 * @filp: the file 460 * @filp: the file
450 * @buffer: the userspace buffer 461 * @buffer: the userspace buffer
451 * @count: number of bytes to read 462 * @count: number of bytes to read
@@ -454,23 +465,23 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf,
454 * Reads count bytes or the number of bytes available in the 465 * Reads count bytes or the number of bytes available in the
455 * current sub-buffer being read, whichever is smaller. 466 * current sub-buffer being read, whichever is smaller.
456 */ 467 */
457static ssize_t relayfs_read(struct file *filp, 468static ssize_t relay_file_read(struct file *filp,
458 char __user *buffer, 469 char __user *buffer,
459 size_t count, 470 size_t count,
460 loff_t *ppos) 471 loff_t *ppos)
461{ 472{
473 struct rchan_buf *buf = filp->private_data;
462 struct inode *inode = filp->f_dentry->d_inode; 474 struct inode *inode = filp->f_dentry->d_inode;
463 struct rchan_buf *buf = RELAYFS_I(inode)->buf;
464 size_t read_start, avail; 475 size_t read_start, avail;
465 ssize_t ret = 0; 476 ssize_t ret = 0;
466 void *from; 477 void *from;
467 478
468 down(&inode->i_sem); 479 mutex_lock(&inode->i_mutex);
469 if(!relayfs_read_avail(buf, *ppos)) 480 if(!relay_file_read_avail(buf, *ppos))
470 goto out; 481 goto out;
471 482
472 read_start = relayfs_read_start_pos(*ppos, buf); 483 read_start = relay_file_read_start_pos(*ppos, buf);
473 avail = relayfs_read_subbuf_avail(read_start, buf); 484 avail = relay_file_read_subbuf_avail(read_start, buf);
474 if (!avail) 485 if (!avail)
475 goto out; 486 goto out;
476 487
@@ -480,58 +491,25 @@ static ssize_t relayfs_read(struct file *filp,
480 ret = -EFAULT; 491 ret = -EFAULT;
481 goto out; 492 goto out;
482 } 493 }
483 relayfs_read_consume(buf, read_start, count); 494 relay_file_read_consume(buf, read_start, count);
484 *ppos = relayfs_read_end_pos(buf, read_start, count); 495 *ppos = relay_file_read_end_pos(buf, read_start, count);
485out: 496out:
486 up(&inode->i_sem); 497 mutex_unlock(&inode->i_mutex);
487 return ret; 498 return ret;
488} 499}
489 500
490/** 501struct file_operations relay_file_operations = {
491 * relayfs alloc_inode() implementation 502 .open = relay_file_open,
492 */ 503 .poll = relay_file_poll,
493static struct inode *relayfs_alloc_inode(struct super_block *sb) 504 .mmap = relay_file_mmap,
494{ 505 .read = relay_file_read,
495 struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL);
496 if (!p)
497 return NULL;
498 p->buf = NULL;
499
500 return &p->vfs_inode;
501}
502
503/**
504 * relayfs destroy_inode() implementation
505 */
506static void relayfs_destroy_inode(struct inode *inode)
507{
508 if (RELAYFS_I(inode)->buf)
509 relay_destroy_buf(RELAYFS_I(inode)->buf);
510
511 kmem_cache_free(relayfs_inode_cachep, RELAYFS_I(inode));
512}
513
514static void init_once(void *p, kmem_cache_t *cachep, unsigned long flags)
515{
516 struct relayfs_inode_info *i = p;
517 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR)
518 inode_init_once(&i->vfs_inode);
519}
520
521struct file_operations relayfs_file_operations = {
522 .open = relayfs_open,
523 .poll = relayfs_poll,
524 .mmap = relayfs_mmap,
525 .read = relayfs_read,
526 .llseek = no_llseek, 506 .llseek = no_llseek,
527 .release = relayfs_release, 507 .release = relay_file_release,
528}; 508};
529 509
530static struct super_operations relayfs_ops = { 510static struct super_operations relayfs_ops = {
531 .statfs = simple_statfs, 511 .statfs = simple_statfs,
532 .drop_inode = generic_delete_inode, 512 .drop_inode = generic_delete_inode,
533 .alloc_inode = relayfs_alloc_inode,
534 .destroy_inode = relayfs_destroy_inode,
535}; 513};
536 514
537static int relayfs_fill_super(struct super_block * sb, void * data, int silent) 515static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
@@ -544,7 +522,7 @@ static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
544 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 522 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
545 sb->s_magic = RELAYFS_MAGIC; 523 sb->s_magic = RELAYFS_MAGIC;
546 sb->s_op = &relayfs_ops; 524 sb->s_op = &relayfs_ops;
547 inode = relayfs_get_inode(sb, mode, NULL); 525 inode = relayfs_get_inode(sb, mode, NULL, NULL);
548 526
549 if (!inode) 527 if (!inode)
550 return -ENOMEM; 528 return -ENOMEM;
@@ -575,33 +553,27 @@ static struct file_system_type relayfs_fs_type = {
575 553
576static int __init init_relayfs_fs(void) 554static int __init init_relayfs_fs(void)
577{ 555{
578 int err; 556 return register_filesystem(&relayfs_fs_type);
579
580 relayfs_inode_cachep = kmem_cache_create("relayfs_inode_cache",
581 sizeof(struct relayfs_inode_info), 0,
582 0, init_once, NULL);
583 if (!relayfs_inode_cachep)
584 return -ENOMEM;
585
586 err = register_filesystem(&relayfs_fs_type);
587 if (err)
588 kmem_cache_destroy(relayfs_inode_cachep);
589
590 return err;
591} 557}
592 558
593static void __exit exit_relayfs_fs(void) 559static void __exit exit_relayfs_fs(void)
594{ 560{
561
562
563
564
565
595 unregister_filesystem(&relayfs_fs_type); 566 unregister_filesystem(&relayfs_fs_type);
596 kmem_cache_destroy(relayfs_inode_cachep);
597} 567}
598 568
599module_init(init_relayfs_fs) 569module_init(init_relayfs_fs)
600module_exit(exit_relayfs_fs) 570module_exit(exit_relayfs_fs)
601 571
602EXPORT_SYMBOL_GPL(relayfs_file_operations); 572EXPORT_SYMBOL_GPL(relay_file_operations);
603EXPORT_SYMBOL_GPL(relayfs_create_dir); 573EXPORT_SYMBOL_GPL(relayfs_create_dir);
604EXPORT_SYMBOL_GPL(relayfs_remove_dir); 574EXPORT_SYMBOL_GPL(relayfs_remove_dir);
575EXPORT_SYMBOL_GPL(relayfs_create_file);
576EXPORT_SYMBOL_GPL(relayfs_remove_file);
605 577
606MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>"); 578MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
607MODULE_DESCRIPTION("Relay Filesystem"); 579MODULE_DESCRIPTION("Relay Filesystem");
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
index 16446a15c96..abf3ceaace4 100644
--- a/fs/relayfs/relay.c
+++ b/fs/relayfs/relay.c
@@ -80,11 +80,34 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
80{ 80{
81} 81}
82 82
83/*
84 * create_buf_file_create() default callback. Creates file to represent buf.
85 */
86static struct dentry *create_buf_file_default_callback(const char *filename,
87 struct dentry *parent,
88 int mode,
89 struct rchan_buf *buf,
90 int *is_global)
91{
92 return relayfs_create_file(filename, parent, mode,
93 &relay_file_operations, buf);
94}
95
96/*
97 * remove_buf_file() default callback. Removes file representing relay buffer.
98 */
99static int remove_buf_file_default_callback(struct dentry *dentry)
100{
101 return relayfs_remove(dentry);
102}
103
83/* relay channel default callbacks */ 104/* relay channel default callbacks */
84static struct rchan_callbacks default_channel_callbacks = { 105static struct rchan_callbacks default_channel_callbacks = {
85 .subbuf_start = subbuf_start_default_callback, 106 .subbuf_start = subbuf_start_default_callback,
86 .buf_mapped = buf_mapped_default_callback, 107 .buf_mapped = buf_mapped_default_callback,
87 .buf_unmapped = buf_unmapped_default_callback, 108 .buf_unmapped = buf_unmapped_default_callback,
109 .create_buf_file = create_buf_file_default_callback,
110 .remove_buf_file = remove_buf_file_default_callback,
88}; 111};
89 112
90/** 113/**
@@ -148,14 +171,16 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
148void relay_reset(struct rchan *chan) 171void relay_reset(struct rchan *chan)
149{ 172{
150 unsigned int i; 173 unsigned int i;
174 struct rchan_buf *prev = NULL;
151 175
152 if (!chan) 176 if (!chan)
153 return; 177 return;
154 178
155 for (i = 0; i < NR_CPUS; i++) { 179 for (i = 0; i < NR_CPUS; i++) {
156 if (!chan->buf[i]) 180 if (!chan->buf[i] || chan->buf[i] == prev)
157 continue; 181 break;
158 __relay_reset(chan->buf[i], 0); 182 __relay_reset(chan->buf[i], 0);
183 prev = chan->buf[i];
159 } 184 }
160} 185}
161 186
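With a shared global buffer every chan->buf[i] slot can point at the same rchan_buf, so relay_reset(), relay_close() and relay_flush() now remember the previous pointer and stop as soon as it repeats, visiting the shared buffer exactly once. A runnable sketch of the guard (NR_CPUS shrunk for illustration):

    #include <stdio.h>

    #define NR_CPUS 4

    int main(void)
    {
            int global_buf = 42;
            int *buf[NR_CPUS];
            int *prev = NULL;
            int i;

            for (i = 0; i < NR_CPUS; i++)
                    buf[i] = &global_buf;   /* global mode: all slots alias */

            for (i = 0; i < NR_CPUS; i++) {
                    if (!buf[i] || buf[i] == prev)
                            break;          /* same buffer again: done */
                    printf("visiting buffer %p once\n", (void *)buf[i]);
                    prev = buf[i];
            }
            return 0;
    }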
@@ -166,17 +191,27 @@ void relay_reset(struct rchan *chan)
166 */ 191 */
167static struct rchan_buf *relay_open_buf(struct rchan *chan, 192static struct rchan_buf *relay_open_buf(struct rchan *chan,
168 const char *filename, 193 const char *filename,
169 struct dentry *parent) 194 struct dentry *parent,
195 int *is_global)
170{ 196{
171 struct rchan_buf *buf; 197 struct rchan_buf *buf;
172 struct dentry *dentry; 198 struct dentry *dentry;
173 199
200 if (*is_global)
201 return chan->buf[0];
202
203 buf = relay_create_buf(chan);
204 if (!buf)
205 return NULL;
206
174 /* Create file in fs */ 207 /* Create file in fs */
175 dentry = relayfs_create_file(filename, parent, S_IRUSR, chan); 208 dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
176 if (!dentry) 209 buf, is_global);
210 if (!dentry) {
211 relay_destroy_buf(buf);
177 return NULL; 212 return NULL;
213 }
178 214
179 buf = RELAYFS_I(dentry->d_inode)->buf;
180 buf->dentry = dentry; 215 buf->dentry = dentry;
181 __relay_reset(buf, 1); 216 __relay_reset(buf, 1);
182 217
@@ -214,6 +249,10 @@ static inline void setup_callbacks(struct rchan *chan,
214 cb->buf_mapped = buf_mapped_default_callback; 249 cb->buf_mapped = buf_mapped_default_callback;
215 if (!cb->buf_unmapped) 250 if (!cb->buf_unmapped)
216 cb->buf_unmapped = buf_unmapped_default_callback; 251 cb->buf_unmapped = buf_unmapped_default_callback;
252 if (!cb->create_buf_file)
253 cb->create_buf_file = create_buf_file_default_callback;
254 if (!cb->remove_buf_file)
255 cb->remove_buf_file = remove_buf_file_default_callback;
217 chan->cb = cb; 256 chan->cb = cb;
218} 257}
219 258
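setup_callbacks() lets a client pass a partially filled rchan_callbacks: every NULL slot is pointed at the default before the table is installed, so call sites like relay_open_buf() can invoke chan->cb->create_buf_file() without NULL checks. A runnable userspace sketch of the same default-filling technique (names invented for illustration):

    #include <stdio.h>

    struct callbacks {
            void (*on_create)(const char *name);
            void (*on_remove)(const char *name);
    };

    static void default_create(const char *name)
    {
            printf("default create: %s\n", name);
    }

    static void default_remove(const char *name)
    {
            printf("default remove: %s\n", name);
    }

    /* Point every NULL slot at the default so callers never check. */
    static void setup_callbacks(struct callbacks *cb)
    {
            if (!cb->on_create)
                    cb->on_create = default_create;
            if (!cb->on_remove)
                    cb->on_remove = default_remove;
    }

    int main(void)
    {
            struct callbacks cb = { .on_remove = default_remove };  /* partial */

            setup_callbacks(&cb);
            cb.on_create("cpu0");   /* safe even though the client left it NULL */
            return 0;
    }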
@@ -241,6 +280,7 @@ struct rchan *relay_open(const char *base_filename,
241 unsigned int i; 280 unsigned int i;
242 struct rchan *chan; 281 struct rchan *chan;
243 char *tmpname; 282 char *tmpname;
283 int is_global = 0;
244 284
245 if (!base_filename) 285 if (!base_filename)
246 return NULL; 286 return NULL;
@@ -265,7 +305,8 @@ struct rchan *relay_open(const char *base_filename,
265 305
266 for_each_online_cpu(i) { 306 for_each_online_cpu(i) {
267 sprintf(tmpname, "%s%d", base_filename, i); 307 sprintf(tmpname, "%s%d", base_filename, i);
268 chan->buf[i] = relay_open_buf(chan, tmpname, parent); 308 chan->buf[i] = relay_open_buf(chan, tmpname, parent,
309 &is_global);
269 chan->buf[i]->cpu = i; 310 chan->buf[i]->cpu = i;
270 if (!chan->buf[i]) 311 if (!chan->buf[i])
271 goto free_bufs; 312 goto free_bufs;
@@ -279,6 +320,8 @@ free_bufs:
279 if (!chan->buf[i]) 320 if (!chan->buf[i])
280 break; 321 break;
281 relay_close_buf(chan->buf[i]); 322 relay_close_buf(chan->buf[i]);
323 if (is_global)
324 break;
282 } 325 }
283 kfree(tmpname); 326 kfree(tmpname);
284 327
@@ -333,8 +376,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
333 return length; 376 return length;
334 377
335toobig: 378toobig:
336 printk(KERN_WARNING "relayfs: event too large (%Zd)\n", length); 379 buf->chan->last_toobig = length;
337 WARN_ON(1);
338 return 0; 380 return 0;
339} 381}
340 382
@@ -389,16 +431,23 @@ void relay_destroy_channel(struct kref *kref)
389void relay_close(struct rchan *chan) 431void relay_close(struct rchan *chan)
390{ 432{
391 unsigned int i; 433 unsigned int i;
434 struct rchan_buf *prev = NULL;
392 435
393 if (!chan) 436 if (!chan)
394 return; 437 return;
395 438
396 for (i = 0; i < NR_CPUS; i++) { 439 for (i = 0; i < NR_CPUS; i++) {
397 if (!chan->buf[i]) 440 if (!chan->buf[i] || chan->buf[i] == prev)
398 continue; 441 break;
399 relay_close_buf(chan->buf[i]); 442 relay_close_buf(chan->buf[i]);
443 prev = chan->buf[i];
400 } 444 }
401 445
446 if (chan->last_toobig)
447 printk(KERN_WARNING "relayfs: one or more items not logged "
448 "[item size (%Zd) > sub-buffer size (%Zd)]\n",
449 chan->last_toobig, chan->subbuf_size);
450
402 kref_put(&chan->kref, relay_destroy_channel); 451 kref_put(&chan->kref, relay_destroy_channel);
403} 452}
404 453
@@ -411,14 +460,16 @@ void relay_close(struct rchan *chan)
411void relay_flush(struct rchan *chan) 460void relay_flush(struct rchan *chan)
412{ 461{
413 unsigned int i; 462 unsigned int i;
463 struct rchan_buf *prev = NULL;
414 464
415 if (!chan) 465 if (!chan)
416 return; 466 return;
417 467
418 for (i = 0; i < NR_CPUS; i++) { 468 for (i = 0; i < NR_CPUS; i++) {
419 if (!chan->buf[i]) 469 if (!chan->buf[i] || chan->buf[i] == prev)
420 continue; 470 break;
421 relay_switch_subbuf(chan->buf[i], 0); 471 relay_switch_subbuf(chan->buf[i], 0);
472 prev = chan->buf[i];
422 } 473 }
423} 474}
424 475
diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h
index 703503fa22b..0993d3e5753 100644
--- a/fs/relayfs/relay.h
+++ b/fs/relayfs/relay.h
@@ -1,10 +1,6 @@
1#ifndef _RELAY_H 1#ifndef _RELAY_H
2#define _RELAY_H 2#define _RELAY_H
3 3
4struct dentry *relayfs_create_file(const char *name,
5 struct dentry *parent,
6 int mode,
7 struct rchan *chan);
8extern int relayfs_remove(struct dentry *dentry); 4extern int relayfs_remove(struct dentry *dentry);
9extern int relay_buf_empty(struct rchan_buf *buf); 5extern int relay_buf_empty(struct rchan_buf *buf);
10extern void relay_destroy_channel(struct kref *kref); 6extern void relay_destroy_channel(struct kref *kref);
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index c74f382dabb..0a13859fd57 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -418,7 +418,7 @@ static int
418romfs_readpage(struct file *file, struct page * page) 418romfs_readpage(struct file *file, struct page * page)
419{ 419{
420 struct inode *inode = page->mapping->host; 420 struct inode *inode = page->mapping->host;
421 unsigned long offset, avail, readlen; 421 loff_t offset, avail, readlen;
422 void *buf; 422 void *buf;
423 int result = -EIO; 423 int result = -EIO;
424 424
@@ -429,8 +429,8 @@ romfs_readpage(struct file *file, struct page * page)
429 goto err_out; 429 goto err_out;
430 430
431 /* 32 bit warning -- but not for us :) */ 431 /* 32 bit warning -- but not for us :) */
432 offset = page->index << PAGE_CACHE_SHIFT; 432 offset = page_offset(page);
433 if (offset < inode->i_size) { 433 if (offset < i_size_read(inode)) {
434 avail = inode->i_size-offset; 434 avail = inode->i_size-offset;
435 readlen = min_t(unsigned long, avail, PAGE_SIZE); 435 readlen = min_t(unsigned long, avail, PAGE_SIZE);
436 if (romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen) == readlen) { 436 if (romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen) == readlen) {
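
The romfs_readpage() change is a 32-bit overflow fix: page->index is an unsigned long, so page->index << PAGE_CACHE_SHIFT is computed at word width on 32-bit hosts and truncates for offsets at or past 4GB, while page_offset() widens before shifting. Its definition in include/linux/pagemap.h of this era is essentially:

        static inline loff_t page_offset(struct page *page)
        {
                return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
        }
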
diff --git a/fs/select.c b/fs/select.c
index f10a10317d5..c0f02d36c60 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -179,12 +179,11 @@ get_max:
179#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) 179#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
180#define POLLEX_SET (POLLPRI) 180#define POLLEX_SET (POLLPRI)
181 181
182int do_select(int n, fd_set_bits *fds, long *timeout) 182int do_select(int n, fd_set_bits *fds, s64 *timeout)
183{ 183{
184 struct poll_wqueues table; 184 struct poll_wqueues table;
185 poll_table *wait; 185 poll_table *wait;
186 int retval, i; 186 int retval, i;
187 long __timeout = *timeout;
188 187
189 rcu_read_lock(); 188 rcu_read_lock();
190 retval = max_select_fd(n, fds); 189 retval = max_select_fd(n, fds);
@@ -196,11 +195,12 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
196 195
197 poll_initwait(&table); 196 poll_initwait(&table);
198 wait = &table.pt; 197 wait = &table.pt;
199 if (!__timeout) 198 if (!*timeout)
200 wait = NULL; 199 wait = NULL;
201 retval = 0; 200 retval = 0;
202 for (;;) { 201 for (;;) {
203 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; 202 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
203 long __timeout;
204 204
205 set_current_state(TASK_INTERRUPTIBLE); 205 set_current_state(TASK_INTERRUPTIBLE);
206 206
@@ -255,22 +255,32 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
255 *rexp = res_ex; 255 *rexp = res_ex;
256 } 256 }
257 wait = NULL; 257 wait = NULL;
258 if (retval || !__timeout || signal_pending(current)) 258 if (retval || !*timeout || signal_pending(current))
259 break; 259 break;
260 if(table.error) { 260 if(table.error) {
261 retval = table.error; 261 retval = table.error;
262 break; 262 break;
263 } 263 }
264
265 if (*timeout < 0) {
266 /* Wait indefinitely */
267 __timeout = MAX_SCHEDULE_TIMEOUT;
268 } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
269 /* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
270 __timeout = MAX_SCHEDULE_TIMEOUT - 1;
271 *timeout -= __timeout;
272 } else {
273 __timeout = *timeout;
274 *timeout = 0;
275 }
264 __timeout = schedule_timeout(__timeout); 276 __timeout = schedule_timeout(__timeout);
277 if (*timeout >= 0)
278 *timeout += __timeout;
265 } 279 }
266 __set_current_state(TASK_RUNNING); 280 __set_current_state(TASK_RUNNING);
267 281
268 poll_freewait(&table); 282 poll_freewait(&table);
269 283
270 /*
271 * Up-to-date the caller timeout.
272 */
273 *timeout = __timeout;
274 return retval; 284 return retval;
275} 285}
276 286
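
The loop body added above carves a 64-bit jiffies budget into schedule_timeout()-sized slices: a negative budget means wait indefinitely, a budget at or beyond MAX_SCHEDULE_TIMEOUT - 1 is spent one maximal slice per iteration, and anything smaller is consumed whole, with whatever schedule_timeout() returns (the unslept part) credited back afterwards. The same pattern appears again in do_poll() further down; isolated as a sketch, with an invented helper name:

        /* Next slice of a 64-bit jiffies budget; *budget < 0 = forever.
         * The caller adds any unslept jiffies back when *budget >= 0. */
        static long next_timeout_slice(s64 *budget)
        {
                long slice;

                if (*budget < 0)
                        return MAX_SCHEDULE_TIMEOUT;
                if (*budget >= (s64)MAX_SCHEDULE_TIMEOUT - 1) {
                        slice = MAX_SCHEDULE_TIMEOUT - 1;
                        *budget -= slice;
                } else {
                        slice = *budget;
                        *budget = 0;
                }
                return slice;
        }
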
@@ -295,36 +305,14 @@ static void select_bits_free(void *bits, int size)
295#define MAX_SELECT_SECONDS \ 305#define MAX_SELECT_SECONDS \
296 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) 306 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
297 307
298asmlinkage long 308static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
299sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp) 309 fd_set __user *exp, s64 *timeout)
300{ 310{
301 fd_set_bits fds; 311 fd_set_bits fds;
302 char *bits; 312 char *bits;
303 long timeout;
304 int ret, size, max_fdset; 313 int ret, size, max_fdset;
305 struct fdtable *fdt; 314 struct fdtable *fdt;
306 315
307 timeout = MAX_SCHEDULE_TIMEOUT;
308 if (tvp) {
309 time_t sec, usec;
310
311 if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp))
312 || __get_user(sec, &tvp->tv_sec)
313 || __get_user(usec, &tvp->tv_usec)) {
314 ret = -EFAULT;
315 goto out_nofds;
316 }
317
318 ret = -EINVAL;
319 if (sec < 0 || usec < 0)
320 goto out_nofds;
321
322 if ((unsigned long) sec < MAX_SELECT_SECONDS) {
323 timeout = ROUND_UP(usec, 1000000/HZ);
324 timeout += sec * (unsigned long) HZ;
325 }
326 }
327
328 ret = -EINVAL; 316 ret = -EINVAL;
329 if (n < 0) 317 if (n < 0)
330 goto out_nofds; 318 goto out_nofds;
@@ -362,18 +350,7 @@ sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
362 zero_fd_set(n, fds.res_out); 350 zero_fd_set(n, fds.res_out);
363 zero_fd_set(n, fds.res_ex); 351 zero_fd_set(n, fds.res_ex);
364 352
365 ret = do_select(n, &fds, &timeout); 353 ret = do_select(n, &fds, timeout);
366
367 if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
368 time_t sec = 0, usec = 0;
369 if (timeout) {
370 sec = timeout / HZ;
371 usec = timeout % HZ;
372 usec *= (1000000/HZ);
373 }
374 put_user(sec, &tvp->tv_sec);
375 put_user(usec, &tvp->tv_usec);
376 }
377 354
378 if (ret < 0) 355 if (ret < 0)
379 goto out; 356 goto out;
@@ -395,6 +372,154 @@ out_nofds:
395 return ret; 372 return ret;
396} 373}
397 374
375asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
376 fd_set __user *exp, struct timeval __user *tvp)
377{
378 s64 timeout = -1;
379 struct timeval tv;
380 int ret;
381
382 if (tvp) {
383 if (copy_from_user(&tv, tvp, sizeof(tv)))
384 return -EFAULT;
385
386 if (tv.tv_sec < 0 || tv.tv_usec < 0)
387 return -EINVAL;
388
389 /* Cast to u64 to make GCC stop complaining */
390 if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
391 timeout = -1; /* infinite */
392 else {
393 timeout = ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
394 timeout += tv.tv_sec * HZ;
395 }
396 }
397
398 ret = core_sys_select(n, inp, outp, exp, &timeout);
399
400 if (tvp) {
401 if (current->personality & STICKY_TIMEOUTS)
402 goto sticky;
403 tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
404 tv.tv_sec = timeout;
405 if (copy_to_user(tvp, &tv, sizeof(tv))) {
406sticky:
407 /*
408 * If an application puts its timeval in read-only
409 * memory, we don't want the Linux-specific update to
410 * the timeval to cause a fault after the select has
411 * completed successfully. However, because we're not
412 * updating the timeval, we can't restart the system
413 * call.
414 */
415 if (ret == -ERESTARTNOHAND)
416 ret = -EINTR;
417 }
418 }
419
420 return ret;
421}
422
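
sys_select() is now a thin wrapper: it converts the user's timeval into an s64 jiffies count, calls core_sys_select(), and, unless the task has STICKY_TIMEOUTS set, writes the unslept remainder back. In the write-back, do_div(timeout, HZ) leaves whole seconds in timeout and returns the leftover jiffies, which jiffies_to_usecs() turns into microseconds. The two conversions pulled out as a sketch, with invented helper names (MAX_INT64_SECONDS and ROUND_UP are from the patch):

        /* timeval -> s64 jiffies; a negative result means "infinite" */
        static s64 timeval_to_jiffies64(const struct timeval *tv)
        {
                if ((u64)tv->tv_sec >= (u64)MAX_INT64_SECONDS)
                        return -1;
                return ROUND_UP(tv->tv_usec, USEC_PER_SEC / HZ)
                        + tv->tv_sec * (s64)HZ;
        }

        /* s64 jiffies -> timeval, as the write-back path above does it */
        static void jiffies64_to_timeval(s64 j, struct timeval *tv)
        {
                tv->tv_usec = jiffies_to_usecs(do_div((*(u64 *)&j), HZ));
                tv->tv_sec = j;
        }
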
423#ifdef TIF_RESTORE_SIGMASK
424asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
425 fd_set __user *exp, struct timespec __user *tsp,
426 const sigset_t __user *sigmask, size_t sigsetsize)
427{
428 s64 timeout = MAX_SCHEDULE_TIMEOUT;
429 sigset_t ksigmask, sigsaved;
430 struct timespec ts;
431 int ret;
432
433 if (tsp) {
434 if (copy_from_user(&ts, tsp, sizeof(ts)))
435 return -EFAULT;
436
437 if (ts.tv_sec < 0 || ts.tv_nsec < 0)
438 return -EINVAL;
439
440 /* Cast to u64 to make GCC stop complaining */
441 if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
442 timeout = -1; /* infinite */
443 else {
444 timeout = ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
445 timeout += ts.tv_sec * HZ;
446 }
447 }
448
449 if (sigmask) {
450 /* XXX: Don't preclude handling different sized sigset_t's. */
451 if (sigsetsize != sizeof(sigset_t))
452 return -EINVAL;
453 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
454 return -EFAULT;
455
456 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
457 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
458 }
459
460 ret = core_sys_select(n, inp, outp, exp, &timeout);
461
462 if (tsp) {
463 if (current->personality & STICKY_TIMEOUTS)
464 goto sticky;
465 ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
466 ts.tv_sec = timeout;
467 if (copy_to_user(tsp, &ts, sizeof(ts))) {
468sticky:
469 /*
470 * If an application puts its timeval in read-only
471 * memory, we don't want the Linux-specific update to
472 * the timeval to cause a fault after the select has
473 * completed successfully. However, because we're not
474 * updating the timeval, we can't restart the system
475 * call.
476 */
477 if (ret == -ERESTARTNOHAND)
478 ret = -EINTR;
479 }
480 }
481
482 if (ret == -ERESTARTNOHAND) {
483 /*
484 * Don't restore the signal mask yet. Let do_signal() deliver
485 * the signal on the way back to userspace, before the signal
486 * mask is restored.
487 */
488 if (sigmask) {
489 memcpy(&current->saved_sigmask, &sigsaved,
490 sizeof(sigsaved));
491 set_thread_flag(TIF_RESTORE_SIGMASK);
492 }
493 } else if (sigmask)
494 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
495
496 return ret;
497}
498
499/*
500 * Most architectures can't handle 7-argument syscalls. So we provide a
501 * 6-argument version where the sixth argument is a pointer to a structure
502 * which has a pointer to the sigset_t itself followed by a size_t containing
503 * the sigset size.
504 */
505asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
506 fd_set __user *exp, struct timespec __user *tsp, void __user *sig)
507{
508 size_t sigsetsize = 0;
509 sigset_t __user *up = NULL;
510
511 if (sig) {
512 if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
513 || __get_user(up, (sigset_t * __user *)sig)
514 || __get_user(sigsetsize,
515 (size_t * __user)(sig+sizeof(void *))))
516 return -EFAULT;
517 }
518
519 return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize);
520}
521#endif /* TIF_RESTORE_SIGMASK */
522
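
sys_pselect6() reads its last argument as two adjacent words in user memory: a pointer to the sigset followed by its size. In userspace terms the expected layout is equivalent to this (illustrative; not a kernel header):

        struct pselect6_sigarg {
                const sigset_t *ss;     /* may be NULL for "no mask change" */
                size_t ss_len;          /* must equal sizeof(sigset_t) */
        };

The C library can build this pair on the stack when implementing pselect() on top of the 6-argument syscall.
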
398struct poll_list { 523struct poll_list {
399 struct poll_list *next; 524 struct poll_list *next;
400 int len; 525 int len;
@@ -436,16 +561,19 @@ static void do_pollfd(unsigned int num, struct pollfd * fdpage,
436} 561}
437 562
438static int do_poll(unsigned int nfds, struct poll_list *list, 563static int do_poll(unsigned int nfds, struct poll_list *list,
439 struct poll_wqueues *wait, long timeout) 564 struct poll_wqueues *wait, s64 *timeout)
440{ 565{
441 int count = 0; 566 int count = 0;
442 poll_table* pt = &wait->pt; 567 poll_table* pt = &wait->pt;
443 568
444 if (!timeout) 569 /* Optimise the no-wait case */
570 if (!(*timeout))
445 pt = NULL; 571 pt = NULL;
446 572
447 for (;;) { 573 for (;;) {
448 struct poll_list *walk; 574 struct poll_list *walk;
575 long __timeout;
576
449 set_current_state(TASK_INTERRUPTIBLE); 577 set_current_state(TASK_INTERRUPTIBLE);
450 walk = list; 578 walk = list;
451 while(walk != NULL) { 579 while(walk != NULL) {
@@ -453,18 +581,36 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
453 walk = walk->next; 581 walk = walk->next;
454 } 582 }
455 pt = NULL; 583 pt = NULL;
456 if (count || !timeout || signal_pending(current)) 584 if (count || !*timeout || signal_pending(current))
457 break; 585 break;
458 count = wait->error; 586 count = wait->error;
459 if (count) 587 if (count)
460 break; 588 break;
461 timeout = schedule_timeout(timeout); 589
590 if (*timeout < 0) {
591 /* Wait indefinitely */
592 __timeout = MAX_SCHEDULE_TIMEOUT;
593 } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {
594 /*
595 * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in
596 * a loop
597 */
598 __timeout = MAX_SCHEDULE_TIMEOUT - 1;
599 *timeout -= __timeout;
600 } else {
601 __timeout = *timeout;
602 *timeout = 0;
603 }
604
605 __timeout = schedule_timeout(__timeout);
606 if (*timeout >= 0)
607 *timeout += __timeout;
462 } 608 }
463 __set_current_state(TASK_RUNNING); 609 __set_current_state(TASK_RUNNING);
464 return count; 610 return count;
465} 611}
466 612
467asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout) 613int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
468{ 614{
469 struct poll_wqueues table; 615 struct poll_wqueues table;
470 int fdcount, err; 616 int fdcount, err;
@@ -482,14 +628,6 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
482 if (nfds > max_fdset && nfds > OPEN_MAX) 628 if (nfds > max_fdset && nfds > OPEN_MAX)
483 return -EINVAL; 629 return -EINVAL;
484 630
485 if (timeout) {
486 /* Careful about overflow in the intermediate values */
487 if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
488 timeout = (unsigned long)(timeout*HZ+999)/1000+1;
489 else /* Negative or overflow */
490 timeout = MAX_SCHEDULE_TIMEOUT;
491 }
492
493 poll_initwait(&table); 631 poll_initwait(&table);
494 632
495 head = NULL; 633 head = NULL;
@@ -519,6 +657,7 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
519 } 657 }
520 i -= pp->len; 658 i -= pp->len;
521 } 659 }
660
522 fdcount = do_poll(nfds, head, &table, timeout); 661 fdcount = do_poll(nfds, head, &table, timeout);
523 662
524 /* OK, now copy the revents fields back to user space. */ 663 /* OK, now copy the revents fields back to user space. */
@@ -547,3 +686,98 @@ out_fds:
547 poll_freewait(&table); 686 poll_freewait(&table);
548 return err; 687 return err;
549} 688}
689
690asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
691 long timeout_msecs)
692{
693 s64 timeout_jiffies = 0;
694
695 if (timeout_msecs) {
696#if HZ > 1000
697 /* We can only overflow if HZ > 1000 */
698 if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
699 timeout_jiffies = -1;
700 else
701#endif
702 timeout_jiffies = msecs_to_jiffies(timeout_msecs);
703 }
704
705 return do_sys_poll(ufds, nfds, &timeout_jiffies);
706}
707
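
sys_poll() keeps its historical millisecond argument and converts once at the boundary. The HZ > 1000 guard exists because the multiply by roughly HZ/1000 inside msecs_to_jiffies() is the only step that can overflow an s64, and only when a jiffy is shorter than a millisecond; negative timeouts fall through and are clamped by msecs_to_jiffies() to an effectively infinite wait. The conversion, isolated with an invented helper name:

        static s64 poll_msecs_to_jiffies(long msecs)
        {
                if (!msecs)
                        return 0;
        #if HZ > 1000
                /* msecs * HZ / 1000 could overflow: treat as infinite */
                if (msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
                        return -1;
        #endif
                return msecs_to_jiffies(msecs);
        }
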
708#ifdef TIF_RESTORE_SIGMASK
709asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
710 struct timespec __user *tsp, const sigset_t __user *sigmask,
711 size_t sigsetsize)
712{
713 sigset_t ksigmask, sigsaved;
714 struct timespec ts;
715 s64 timeout = -1;
716 int ret;
717
718 if (tsp) {
719 if (copy_from_user(&ts, tsp, sizeof(ts)))
720 return -EFAULT;
721
722 /* Cast to u64 to make GCC stop complaining */
723 if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
724 timeout = -1; /* infinite */
725 else {
726 timeout = ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
727 timeout += ts.tv_sec * HZ;
728 }
729 }
730
731 if (sigmask) {
732 /* XXX: Don't preclude handling different sized sigset_t's. */
733 if (sigsetsize != sizeof(sigset_t))
734 return -EINVAL;
735 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
736 return -EFAULT;
737
738 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
739 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
740 }
741
742 ret = do_sys_poll(ufds, nfds, &timeout);
743
744 /* We can restart this syscall, usually */
745 if (ret == -EINTR) {
746 /*
747 * Don't restore the signal mask yet. Let do_signal() deliver
748 * the signal on the way back to userspace, before the signal
749 * mask is restored.
750 */
751 if (sigmask) {
752 memcpy(&current->saved_sigmask, &sigsaved,
753 sizeof(sigsaved));
754 set_thread_flag(TIF_RESTORE_SIGMASK);
755 }
756 ret = -ERESTARTNOHAND;
757 } else if (sigmask)
758 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
759
760 if (tsp && timeout >= 0) {
761 if (current->personality & STICKY_TIMEOUTS)
762 goto sticky;
763 /* Yes, we know it's actually an s64, but it's also positive. */
764 ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
765 ts.tv_sec = timeout;
766 if (copy_to_user(tsp, &ts, sizeof(ts))) {
767 sticky:
768 /*
769 * If an application puts its timeval in read-only
770 * memory, we don't want the Linux-specific update to
771 * the timeval to cause a fault after the select has
772 * completed successfully. However, because we're not
773 * updating the timeval, we can't restart the system
774 * call.
775 */
776 if (ret == -ERESTARTNOHAND && timeout >= 0)
777 ret = -EINTR;
778 }
779 }
780
781 return ret;
782}
783#endif /* TIF_RESTORE_SIGMASK */
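
pselect7() and ppoll() share the same signal-mask tail: when the syscall is going to be interrupted (-ERESTARTNOHAND), the caller's original mask is parked in current->saved_sigmask and TIF_RESTORE_SIGMASK is raised, so the signal handler runs under the temporary mask and the original one is restored only on the return path to userspace; on any other outcome the mask is restored immediately. The shared shape, as a sketch around a hypothetical helper:

        static void restore_sigmask_after_wait(int ret, sigset_t *sigsaved)
        {
                if (ret == -ERESTARTNOHAND) {
                        memcpy(&current->saved_sigmask, sigsaved,
                               sizeof(*sigsaved));
                        set_thread_flag(TIF_RESTORE_SIGMASK);
                } else {
                        sigprocmask(SIG_SETMASK, sigsaved, NULL);
                }
        }
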
diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
index 93246b7dd6f..6673ee82cb4 100644
--- a/fs/smbfs/Makefile
+++ b/fs/smbfs/Makefile
@@ -13,7 +13,6 @@ smbfs-objs := proc.o dir.o cache.o sock.o inode.o file.o ioctl.o getopt.o \
13EXTRA_CFLAGS += -DSMBFS_PARANOIA 13EXTRA_CFLAGS += -DSMBFS_PARANOIA
14#EXTRA_CFLAGS += -DSMBFS_DEBUG 14#EXTRA_CFLAGS += -DSMBFS_DEBUG
15#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE 15#EXTRA_CFLAGS += -DSMBFS_DEBUG_VERBOSE
16#EXTRA_CFLAGS += -DDEBUG_SMB_MALLOC
17#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP 16#EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
18#EXTRA_CFLAGS += -Werror 17#EXTRA_CFLAGS += -Werror
19 18
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
index f3e6b81288a..74b86d9725a 100644
--- a/fs/smbfs/cache.c
+++ b/fs/smbfs/cache.c
@@ -66,7 +66,7 @@ smb_invalidate_dircache_entries(struct dentry *parent)
66 spin_lock(&dcache_lock); 66 spin_lock(&dcache_lock);
67 next = parent->d_subdirs.next; 67 next = parent->d_subdirs.next;
68 while (next != &parent->d_subdirs) { 68 while (next != &parent->d_subdirs) {
69 dentry = list_entry(next, struct dentry, d_child); 69 dentry = list_entry(next, struct dentry, d_u.d_child);
70 dentry->d_fsdata = NULL; 70 dentry->d_fsdata = NULL;
71 smb_age_dentry(server, dentry); 71 smb_age_dentry(server, dentry);
72 next = next->next; 72 next = next->next;
@@ -100,7 +100,7 @@ smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
100 spin_lock(&dcache_lock); 100 spin_lock(&dcache_lock);
101 next = parent->d_subdirs.next; 101 next = parent->d_subdirs.next;
102 while (next != &parent->d_subdirs) { 102 while (next != &parent->d_subdirs) {
103 dent = list_entry(next, struct dentry, d_child); 103 dent = list_entry(next, struct dentry, d_u.d_child);
104 if ((unsigned long)dent->d_fsdata == fpos) { 104 if ((unsigned long)dent->d_fsdata == fpos) {
105 if (dent->d_inode) 105 if (dent->d_inode)
106 dget_locked(dent); 106 dget_locked(dent);
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index b4fcfa8b55a..7042e62726a 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -209,8 +209,8 @@ smb_updatepage(struct file *file, struct page *page, unsigned long offset,
209{ 209{
210 struct dentry *dentry = file->f_dentry; 210 struct dentry *dentry = file->f_dentry;
211 211
212 DEBUG1("(%s/%s %d@%ld)\n", DENTRY_PATH(dentry), 212 DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count,
213 count, (page->index << PAGE_CACHE_SHIFT)+offset); 213 ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset);
214 214
215 return smb_writepage_sync(dentry->d_inode, page, offset, count); 215 return smb_writepage_sync(dentry->d_inode, page, offset, count);
216} 216}
@@ -374,8 +374,7 @@ smb_file_release(struct inode *inode, struct file * file)
374 /* We must flush any dirty pages now as we won't be able to 374 /* We must flush any dirty pages now as we won't be able to
375 write anything after close. mmap can trigger this. 375 write anything after close. mmap can trigger this.
376 "openers" should perhaps include mmap'ers ... */ 376 "openers" should perhaps include mmap'ers ... */
377 filemap_fdatawrite(inode->i_mapping); 377 filemap_write_and_wait(inode->i_mapping);
378 filemap_fdatawait(inode->i_mapping);
379 smb_close(inode); 378 smb_close(inode);
380 } 379 }
381 unlock_kernel(); 380 unlock_kernel();
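
The fdatawrite/fdatawait pairs in smbfs (here and in smb_notify_change() below) collapse into filemap_write_and_wait(), which also avoids losing the write-side error to the wait-side one. Its body in mm/filemap.c of this era is roughly:

        int filemap_write_and_wait(struct address_space *mapping)
        {
                int err = 0;

                if (mapping->nrpages) {
                        err = filemap_fdatawrite(mapping);
                        if (!err)
                                err = filemap_fdatawait(mapping);
                }
                return err;
        }
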
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 10b994428fe..02e3e82d465 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -487,11 +487,11 @@ smb_put_super(struct super_block *sb)
487 if (server->conn_pid) 487 if (server->conn_pid)
488 kill_proc(server->conn_pid, SIGTERM, 1); 488 kill_proc(server->conn_pid, SIGTERM, 1);
489 489
490 smb_kfree(server->ops); 490 kfree(server->ops);
491 smb_unload_nls(server); 491 smb_unload_nls(server);
492 sb->s_fs_info = NULL; 492 sb->s_fs_info = NULL;
493 smb_unlock_server(server); 493 smb_unlock_server(server);
494 smb_kfree(server); 494 kfree(server);
495} 495}
496 496
497static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) 497static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
@@ -519,11 +519,10 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
519 sb->s_op = &smb_sops; 519 sb->s_op = &smb_sops;
520 sb->s_time_gran = 100; 520 sb->s_time_gran = 100;
521 521
522 server = smb_kmalloc(sizeof(struct smb_sb_info), GFP_KERNEL); 522 server = kzalloc(sizeof(struct smb_sb_info), GFP_KERNEL);
523 if (!server) 523 if (!server)
524 goto out_no_server; 524 goto out_no_server;
525 sb->s_fs_info = server; 525 sb->s_fs_info = server;
526 memset(server, 0, sizeof(struct smb_sb_info));
527 526
528 server->super_block = sb; 527 server->super_block = sb;
529 server->mnt = NULL; 528 server->mnt = NULL;
@@ -542,8 +541,8 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
542 /* FIXME: move these to the smb_sb_info struct */ 541 /* FIXME: move these to the smb_sb_info struct */
543 VERBOSE("alloc chunk = %d\n", sizeof(struct smb_ops) + 542 VERBOSE("alloc chunk = %d\n", sizeof(struct smb_ops) +
544 sizeof(struct smb_mount_data_kernel)); 543 sizeof(struct smb_mount_data_kernel));
545 mem = smb_kmalloc(sizeof(struct smb_ops) + 544 mem = kmalloc(sizeof(struct smb_ops) +
546 sizeof(struct smb_mount_data_kernel), GFP_KERNEL); 545 sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
547 if (!mem) 546 if (!mem)
548 goto out_no_mem; 547 goto out_no_mem;
549 548
@@ -621,12 +620,12 @@ out_no_root:
621out_no_smbiod: 620out_no_smbiod:
622 smb_unload_nls(server); 621 smb_unload_nls(server);
623out_bad_option: 622out_bad_option:
624 smb_kfree(mem); 623 kfree(mem);
625out_no_mem: 624out_no_mem:
626 if (!server->mnt) 625 if (!server->mnt)
627 printk(KERN_ERR "smb_fill_super: allocation failure\n"); 626 printk(KERN_ERR "smb_fill_super: allocation failure\n");
628 sb->s_fs_info = NULL; 627 sb->s_fs_info = NULL;
629 smb_kfree(server); 628 kfree(server);
630 goto out_fail; 629 goto out_fail;
631out_wrong_data: 630out_wrong_data:
632 printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver); 631 printk(KERN_ERR "smbfs: mount_data version %d is not supported\n", ver);
@@ -697,8 +696,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
697 DENTRY_PATH(dentry), 696 DENTRY_PATH(dentry),
698 (long) inode->i_size, (long) attr->ia_size); 697 (long) inode->i_size, (long) attr->ia_size);
699 698
700 filemap_fdatawrite(inode->i_mapping); 699 filemap_write_and_wait(inode->i_mapping);
701 filemap_fdatawait(inode->i_mapping);
702 700
703 error = smb_open(dentry, O_WRONLY); 701 error = smb_open(dentry, O_WRONLY);
704 if (error) 702 if (error)
@@ -783,12 +781,6 @@ out:
783 return error; 781 return error;
784} 782}
785 783
786#ifdef DEBUG_SMB_MALLOC
787int smb_malloced;
788int smb_current_kmalloced;
789int smb_current_vmalloced;
790#endif
791
792static struct super_block *smb_get_sb(struct file_system_type *fs_type, 784static struct super_block *smb_get_sb(struct file_system_type *fs_type,
793 int flags, const char *dev_name, void *data) 785 int flags, const char *dev_name, void *data)
794{ 786{
@@ -808,12 +800,6 @@ static int __init init_smb_fs(void)
808 int err; 800 int err;
809 DEBUG1("registering ...\n"); 801 DEBUG1("registering ...\n");
810 802
811#ifdef DEBUG_SMB_MALLOC
812 smb_malloced = 0;
813 smb_current_kmalloced = 0;
814 smb_current_vmalloced = 0;
815#endif
816
817 err = init_inodecache(); 803 err = init_inodecache();
818 if (err) 804 if (err)
819 goto out_inode; 805 goto out_inode;
@@ -838,11 +824,6 @@ static void __exit exit_smb_fs(void)
838 unregister_filesystem(&smb_fs_type); 824 unregister_filesystem(&smb_fs_type);
839 smb_destroy_request_cache(); 825 smb_destroy_request_cache();
840 destroy_inodecache(); 826 destroy_inodecache();
841#ifdef DEBUG_SMB_MALLOC
842 printk(KERN_DEBUG "smb_malloced: %d\n", smb_malloced);
843 printk(KERN_DEBUG "smb_current_kmalloced: %d\n",smb_current_kmalloced);
844 printk(KERN_DEBUG "smb_current_vmalloced: %d\n",smb_current_vmalloced);
845#endif
846} 827}
847 828
848module_init(init_smb_fs) 829module_init(init_smb_fs)
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index 38ab558835c..b1b878b8173 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/capability.h>
11#include <linux/errno.h> 12#include <linux/errno.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
@@ -3113,7 +3114,7 @@ smb_proc_setattr_unix(struct dentry *d, struct iattr *attr,
3113 LSET(data, 32, SMB_TIME_NO_CHANGE); 3114 LSET(data, 32, SMB_TIME_NO_CHANGE);
3114 LSET(data, 40, SMB_UID_NO_CHANGE); 3115 LSET(data, 40, SMB_UID_NO_CHANGE);
3115 LSET(data, 48, SMB_GID_NO_CHANGE); 3116 LSET(data, 48, SMB_GID_NO_CHANGE);
3116 LSET(data, 56, smb_filetype_from_mode(attr->ia_mode)); 3117 DSET(data, 56, smb_filetype_from_mode(attr->ia_mode));
3117 LSET(data, 60, major); 3118 LSET(data, 60, major);
3118 LSET(data, 68, minor); 3119 LSET(data, 68, minor);
3119 LSET(data, 76, 0); 3120 LSET(data, 76, 0);
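
The LSET to DSET one-liner is a field-width fix. Judging by the surrounding offsets (gid at 48, file type at 56, major at 60), the type field of this UNIX_BASIC setattr block is 32 bits wide, so the 64-bit LSET store was spilling into the neighbouring major-number field. Assuming smbfs's usual little-endian store helpers, the difference amounts to:

        /* sketch of the helpers' effect; the real macros live in smbfs */
        #define DSET(buf, pos, val) \
                (*(__le32 *)((u8 *)(buf) + (pos)) = cpu_to_le32(val))
        #define LSET(buf, pos, val) \
                (*(__le64 *)((u8 *)(buf) + (pos)) = cpu_to_le64(val))
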
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index a0f296d9928..c71c375863c 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -68,7 +68,7 @@ static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server,
68 goto out; 68 goto out;
69 69
70 if (bufsize > 0) { 70 if (bufsize > 0) {
71 buf = smb_kmalloc(bufsize, GFP_NOFS); 71 buf = kmalloc(bufsize, GFP_NOFS);
72 if (!buf) { 72 if (!buf) {
73 kmem_cache_free(req_cachep, req); 73 kmem_cache_free(req_cachep, req);
74 return NULL; 74 return NULL;
@@ -124,9 +124,8 @@ static void smb_free_request(struct smb_request *req)
124{ 124{
125 atomic_dec(&req->rq_server->nr_requests); 125 atomic_dec(&req->rq_server->nr_requests);
126 if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC)) 126 if (req->rq_buffer && !(req->rq_flags & SMB_REQ_STATIC))
127 smb_kfree(req->rq_buffer); 127 kfree(req->rq_buffer);
128 if (req->rq_trans2buffer) 128 kfree(req->rq_trans2buffer);
129 smb_kfree(req->rq_trans2buffer);
130 kmem_cache_free(req_cachep, req); 129 kmem_cache_free(req_cachep, req);
131} 130}
132 131
@@ -183,8 +182,7 @@ static int smb_setup_request(struct smb_request *req)
183 req->rq_err = 0; 182 req->rq_err = 0;
184 req->rq_errno = 0; 183 req->rq_errno = 0;
185 req->rq_fragment = 0; 184 req->rq_fragment = 0;
186 if (req->rq_trans2buffer) 185 kfree(req->rq_trans2buffer);
187 smb_kfree(req->rq_trans2buffer);
188 186
189 return 0; 187 return 0;
190} 188}
@@ -647,10 +645,9 @@ static int smb_recv_trans2(struct smb_sb_info *server, struct smb_request *req)
647 goto out_too_long; 645 goto out_too_long;
648 646
649 req->rq_trans2bufsize = buf_len; 647 req->rq_trans2bufsize = buf_len;
650 req->rq_trans2buffer = smb_kmalloc(buf_len, GFP_NOFS); 648 req->rq_trans2buffer = kzalloc(buf_len, GFP_NOFS);
651 if (!req->rq_trans2buffer) 649 if (!req->rq_trans2buffer)
652 goto out_no_mem; 650 goto out_no_mem;
653 memset(req->rq_trans2buffer, 0, buf_len);
654 651
655 req->rq_parm = req->rq_trans2buffer; 652 req->rq_parm = req->rq_trans2buffer;
656 req->rq_data = req->rq_trans2buffer + parm_tot; 653 req->rq_data = req->rq_trans2buffer + parm_tot;
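
Alongside retiring the private smb_kmalloc()/smb_kfree() wrappers (and the DEBUG_SMB_MALLOC counters that fed them, removed above), the allocate-then-clear pairs become kzalloc(); and since kfree(NULL) is a no-op, the if (req->rq_trans2buffer) guards could go as well. The pattern:

        /* before */
        buf = kmalloc(len, GFP_NOFS);
        if (buf)
                memset(buf, 0, len);

        /* after */
        buf = kzalloc(len, GFP_NOFS);
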
diff --git a/fs/stat.c b/fs/stat.c
index b8a0e5110ab..24211b030f3 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -63,12 +63,12 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
63 63
64EXPORT_SYMBOL(vfs_getattr); 64EXPORT_SYMBOL(vfs_getattr);
65 65
66int vfs_stat(char __user *name, struct kstat *stat) 66int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat)
67{ 67{
68 struct nameidata nd; 68 struct nameidata nd;
69 int error; 69 int error;
70 70
71 error = user_path_walk(name, &nd); 71 error = __user_walk_fd(dfd, name, LOOKUP_FOLLOW, &nd);
72 if (!error) { 72 if (!error) {
73 error = vfs_getattr(nd.mnt, nd.dentry, stat); 73 error = vfs_getattr(nd.mnt, nd.dentry, stat);
74 path_release(&nd); 74 path_release(&nd);
@@ -76,14 +76,19 @@ int vfs_stat(char __user *name, struct kstat *stat)
76 return error; 76 return error;
77} 77}
78 78
79int vfs_stat(char __user *name, struct kstat *stat)
80{
81 return vfs_stat_fd(AT_FDCWD, name, stat);
82}
83
79EXPORT_SYMBOL(vfs_stat); 84EXPORT_SYMBOL(vfs_stat);
80 85
81int vfs_lstat(char __user *name, struct kstat *stat) 86int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat)
82{ 87{
83 struct nameidata nd; 88 struct nameidata nd;
84 int error; 89 int error;
85 90
86 error = user_path_walk_link(name, &nd); 91 error = __user_walk_fd(dfd, name, 0, &nd);
87 if (!error) { 92 if (!error) {
88 error = vfs_getattr(nd.mnt, nd.dentry, stat); 93 error = vfs_getattr(nd.mnt, nd.dentry, stat);
89 path_release(&nd); 94 path_release(&nd);
@@ -91,6 +96,11 @@ int vfs_lstat(char __user *name, struct kstat *stat)
91 return error; 96 return error;
92} 97}
93 98
99int vfs_lstat(char __user *name, struct kstat *stat)
100{
101 return vfs_lstat_fd(AT_FDCWD, name, stat);
102}
103
94EXPORT_SYMBOL(vfs_lstat); 104EXPORT_SYMBOL(vfs_lstat);
95 105
96int vfs_fstat(unsigned int fd, struct kstat *stat) 106int vfs_fstat(unsigned int fd, struct kstat *stat)
@@ -151,7 +161,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
151asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user * statbuf) 161asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user * statbuf)
152{ 162{
153 struct kstat stat; 163 struct kstat stat;
154 int error = vfs_stat(filename, &stat); 164 int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
155 165
156 if (!error) 166 if (!error)
157 error = cp_old_stat(&stat, statbuf); 167 error = cp_old_stat(&stat, statbuf);
@@ -161,7 +171,7 @@ asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user
161asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __user * statbuf) 171asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __user * statbuf)
162{ 172{
163 struct kstat stat; 173 struct kstat stat;
164 int error = vfs_lstat(filename, &stat); 174 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
165 175
166 if (!error) 176 if (!error)
167 error = cp_old_stat(&stat, statbuf); 177 error = cp_old_stat(&stat, statbuf);
@@ -229,27 +239,50 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
229 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 239 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
230} 240}
231 241
232asmlinkage long sys_newstat(char __user * filename, struct stat __user * statbuf) 242asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf)
233{ 243{
234 struct kstat stat; 244 struct kstat stat;
235 int error = vfs_stat(filename, &stat); 245 int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
236 246
237 if (!error) 247 if (!error)
238 error = cp_new_stat(&stat, statbuf); 248 error = cp_new_stat(&stat, statbuf);
239 249
240 return error; 250 return error;
241} 251}
242asmlinkage long sys_newlstat(char __user * filename, struct stat __user * statbuf) 252
253asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
243{ 254{
244 struct kstat stat; 255 struct kstat stat;
245 int error = vfs_lstat(filename, &stat); 256 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
246 257
247 if (!error) 258 if (!error)
248 error = cp_new_stat(&stat, statbuf); 259 error = cp_new_stat(&stat, statbuf);
249 260
250 return error; 261 return error;
251} 262}
252asmlinkage long sys_newfstat(unsigned int fd, struct stat __user * statbuf) 263
264asmlinkage long sys_newfstatat(int dfd, char __user *filename,
265 struct stat __user *statbuf, int flag)
266{
267 struct kstat stat;
268 int error = -EINVAL;
269
270 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
271 goto out;
272
273 if (flag & AT_SYMLINK_NOFOLLOW)
274 error = vfs_lstat_fd(dfd, filename, &stat);
275 else
276 error = vfs_stat_fd(dfd, filename, &stat);
277
278 if (!error)
279 error = cp_new_stat(&stat, statbuf);
280
281out:
282 return error;
283}
284
285asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
253{ 286{
254 struct kstat stat; 287 struct kstat stat;
255 int error = vfs_fstat(fd, &stat); 288 int error = vfs_fstat(fd, &stat);
@@ -260,7 +293,8 @@ asmlinkage long sys_newfstat(unsigned int fd, struct stat __user * statbuf)
260 return error; 293 return error;
261} 294}
262 295
263asmlinkage long sys_readlink(const char __user * path, char __user * buf, int bufsiz) 296asmlinkage long sys_readlinkat(int dfd, const char __user *path,
297 char __user *buf, int bufsiz)
264{ 298{
265 struct nameidata nd; 299 struct nameidata nd;
266 int error; 300 int error;
@@ -268,7 +302,7 @@ asmlinkage long sys_readlink(const char __user * path, char __user * buf, int bu
268 if (bufsiz <= 0) 302 if (bufsiz <= 0)
269 return -EINVAL; 303 return -EINVAL;
270 304
271 error = user_path_walk_link(path, &nd); 305 error = __user_walk_fd(dfd, path, 0, &nd);
272 if (!error) { 306 if (!error) {
273 struct inode * inode = nd.dentry->d_inode; 307 struct inode * inode = nd.dentry->d_inode;
274 308
@@ -285,6 +319,12 @@ asmlinkage long sys_readlink(const char __user * path, char __user * buf, int bu
285 return error; 319 return error;
286} 320}
287 321
322asmlinkage long sys_readlink(const char __user *path, char __user *buf,
323 int bufsiz)
324{
325 return sys_readlinkat(AT_FDCWD, path, buf, bufsiz);
326}
327
288 328
289/* ---------- LFS-64 ----------- */ 329/* ---------- LFS-64 ----------- */
290#ifdef __ARCH_WANT_STAT64 330#ifdef __ARCH_WANT_STAT64
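
fs/stat.c adopts the openat()-family convention: each path-taking entry point gains a _fd variant that threads a directory descriptor into the lookup, and the classic syscall becomes a wrapper passing AT_FDCWD, which preserves the old resolve-relative-to-cwd behaviour. sys_newfstatat() then just validates the flag and dispatches; its core, isolated with an invented helper name:

        static int do_fstatat(int dfd, char __user *name,
                              struct kstat *st, int flag)
        {
                if (flag & ~AT_SYMLINK_NOFOLLOW)
                        return -EINVAL;
                return (flag & AT_SYMLINK_NOFOLLOW)
                        ? vfs_lstat_fd(dfd, name, st)
                        : vfs_stat_fd(dfd, name, st);
        }
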
diff --git a/fs/super.c b/fs/super.c
index 6689dded3c8..c177b92419c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -72,7 +72,7 @@ static struct super_block *alloc_super(void)
72 INIT_HLIST_HEAD(&s->s_anon); 72 INIT_HLIST_HEAD(&s->s_anon);
73 INIT_LIST_HEAD(&s->s_inodes); 73 INIT_LIST_HEAD(&s->s_inodes);
74 init_rwsem(&s->s_umount); 74 init_rwsem(&s->s_umount);
75 sema_init(&s->s_lock, 1); 75 mutex_init(&s->s_lock);
76 down_write(&s->s_umount); 76 down_write(&s->s_umount);
77 s->s_count = S_BIAS; 77 s->s_count = S_BIAS;
78 atomic_set(&s->s_active, 1); 78 atomic_set(&s->s_active, 1);
@@ -665,16 +665,6 @@ static int test_bdev_super(struct super_block *s, void *data)
665 return (void *)s->s_bdev == data; 665 return (void *)s->s_bdev == data;
666} 666}
667 667
668static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
669{
670 if (bdev->bd_disk) {
671 if (bdev->bd_part)
672 kobject_uevent(&bdev->bd_part->kobj, action, NULL);
673 else
674 kobject_uevent(&bdev->bd_disk->kobj, action, NULL);
675 }
676}
677
678struct super_block *get_sb_bdev(struct file_system_type *fs_type, 668struct super_block *get_sb_bdev(struct file_system_type *fs_type,
679 int flags, const char *dev_name, void *data, 669 int flags, const char *dev_name, void *data,
680 int (*fill_super)(struct super_block *, void *, int)) 670 int (*fill_super)(struct super_block *, void *, int))
@@ -710,17 +700,14 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
710 700
711 s->s_flags = flags; 701 s->s_flags = flags;
712 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 702 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
713 s->s_old_blocksize = block_size(bdev); 703 sb_set_blocksize(s, block_size(bdev));
714 sb_set_blocksize(s, s->s_old_blocksize);
715 error = fill_super(s, data, flags & MS_VERBOSE ? 1 : 0); 704 error = fill_super(s, data, flags & MS_VERBOSE ? 1 : 0);
716 if (error) { 705 if (error) {
717 up_write(&s->s_umount); 706 up_write(&s->s_umount);
718 deactivate_super(s); 707 deactivate_super(s);
719 s = ERR_PTR(error); 708 s = ERR_PTR(error);
720 } else { 709 } else
721 s->s_flags |= MS_ACTIVE; 710 s->s_flags |= MS_ACTIVE;
722 bdev_uevent(bdev, KOBJ_MOUNT);
723 }
724 } 711 }
725 712
726 return s; 713 return s;
@@ -736,7 +723,6 @@ void kill_block_super(struct super_block *sb)
736{ 723{
737 struct block_device *bdev = sb->s_bdev; 724 struct block_device *bdev = sb->s_bdev;
738 725
739 bdev_uevent(bdev, KOBJ_UMOUNT);
740 generic_shutdown_super(sb); 726 generic_shutdown_super(sb);
741 sync_blockdev(bdev); 727 sync_blockdev(bdev);
742 close_bdev_excl(bdev); 728 close_bdev_excl(bdev);
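
With s_lock converted from a semaphore to a struct mutex, the lock_super()/unlock_super() helpers in include/linux/fs.h become plain mutex operations; roughly this, modulo the get_fs_excl() accounting the era's helpers also carry:

        static inline void lock_super(struct super_block *sb)
        {
                mutex_lock(&sb->s_lock);
        }

        static inline void unlock_super(struct super_block *sb)
        {
                mutex_unlock(&sb->s_lock);
        }
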
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 59734ba1ee6..49bd219275d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -99,7 +99,7 @@ static int create_dir(struct kobject * k, struct dentry * p,
99 int error; 99 int error;
100 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; 100 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
101 101
102 down(&p->d_inode->i_sem); 102 mutex_lock(&p->d_inode->i_mutex);
103 *d = lookup_one_len(n, p, strlen(n)); 103 *d = lookup_one_len(n, p, strlen(n));
104 if (!IS_ERR(*d)) { 104 if (!IS_ERR(*d)) {
105 error = sysfs_make_dirent(p->d_fsdata, *d, k, mode, SYSFS_DIR); 105 error = sysfs_make_dirent(p->d_fsdata, *d, k, mode, SYSFS_DIR);
@@ -112,13 +112,17 @@ static int create_dir(struct kobject * k, struct dentry * p,
112 } 112 }
113 } 113 }
114 if (error && (error != -EEXIST)) { 114 if (error && (error != -EEXIST)) {
115 sysfs_put((*d)->d_fsdata); 115 struct sysfs_dirent *sd = (*d)->d_fsdata;
116 if (sd) {
117 list_del_init(&sd->s_sibling);
118 sysfs_put(sd);
119 }
116 d_drop(*d); 120 d_drop(*d);
117 } 121 }
118 dput(*d); 122 dput(*d);
119 } else 123 } else
120 error = PTR_ERR(*d); 124 error = PTR_ERR(*d);
121 up(&p->d_inode->i_sem); 125 mutex_unlock(&p->d_inode->i_mutex);
122 return error; 126 return error;
123} 127}
124 128
@@ -242,7 +246,7 @@ static void remove_dir(struct dentry * d)
242 struct dentry * parent = dget(d->d_parent); 246 struct dentry * parent = dget(d->d_parent);
243 struct sysfs_dirent * sd; 247 struct sysfs_dirent * sd;
244 248
245 down(&parent->d_inode->i_sem); 249 mutex_lock(&parent->d_inode->i_mutex);
246 d_delete(d); 250 d_delete(d);
247 sd = d->d_fsdata; 251 sd = d->d_fsdata;
248 list_del_init(&sd->s_sibling); 252 list_del_init(&sd->s_sibling);
@@ -253,7 +257,7 @@ static void remove_dir(struct dentry * d)
253 pr_debug(" o %s removing done (%d)\n",d->d_name.name, 257 pr_debug(" o %s removing done (%d)\n",d->d_name.name,
254 atomic_read(&d->d_count)); 258 atomic_read(&d->d_count));
255 259
256 up(&parent->d_inode->i_sem); 260 mutex_unlock(&parent->d_inode->i_mutex);
257 dput(parent); 261 dput(parent);
258} 262}
259 263
@@ -282,7 +286,7 @@ void sysfs_remove_dir(struct kobject * kobj)
282 return; 286 return;
283 287
284 pr_debug("sysfs %s: removing dir\n",dentry->d_name.name); 288 pr_debug("sysfs %s: removing dir\n",dentry->d_name.name);
285 down(&dentry->d_inode->i_sem); 289 mutex_lock(&dentry->d_inode->i_mutex);
286 parent_sd = dentry->d_fsdata; 290 parent_sd = dentry->d_fsdata;
287 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { 291 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
288 if (!sd->s_element || !(sd->s_type & SYSFS_NOT_PINNED)) 292 if (!sd->s_element || !(sd->s_type & SYSFS_NOT_PINNED))
@@ -291,7 +295,7 @@ void sysfs_remove_dir(struct kobject * kobj)
291 sysfs_drop_dentry(sd, dentry); 295 sysfs_drop_dentry(sd, dentry);
292 sysfs_put(sd); 296 sysfs_put(sd);
293 } 297 }
294 up(&dentry->d_inode->i_sem); 298 mutex_unlock(&dentry->d_inode->i_mutex);
295 299
296 remove_dir(dentry); 300 remove_dir(dentry);
297 /** 301 /**
@@ -314,7 +318,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
314 down_write(&sysfs_rename_sem); 318 down_write(&sysfs_rename_sem);
315 parent = kobj->parent->dentry; 319 parent = kobj->parent->dentry;
316 320
317 down(&parent->d_inode->i_sem); 321 mutex_lock(&parent->d_inode->i_mutex);
318 322
319 new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); 323 new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
320 if (!IS_ERR(new_dentry)) { 324 if (!IS_ERR(new_dentry)) {
@@ -330,7 +334,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
330 error = -EEXIST; 334 error = -EEXIST;
331 dput(new_dentry); 335 dput(new_dentry);
332 } 336 }
333 up(&parent->d_inode->i_sem); 337 mutex_unlock(&parent->d_inode->i_mutex);
334 up_write(&sysfs_rename_sem); 338 up_write(&sysfs_rename_sem);
335 339
336 return error; 340 return error;
@@ -341,9 +345,9 @@ static int sysfs_dir_open(struct inode *inode, struct file *file)
341 struct dentry * dentry = file->f_dentry; 345 struct dentry * dentry = file->f_dentry;
342 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 346 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
343 347
344 down(&dentry->d_inode->i_sem); 348 mutex_lock(&dentry->d_inode->i_mutex);
345 file->private_data = sysfs_new_dirent(parent_sd, NULL); 349 file->private_data = sysfs_new_dirent(parent_sd, NULL);
346 up(&dentry->d_inode->i_sem); 350 mutex_unlock(&dentry->d_inode->i_mutex);
347 351
348 return file->private_data ? 0 : -ENOMEM; 352 return file->private_data ? 0 : -ENOMEM;
349 353
@@ -354,9 +358,9 @@ static int sysfs_dir_close(struct inode *inode, struct file *file)
354 struct dentry * dentry = file->f_dentry; 358 struct dentry * dentry = file->f_dentry;
355 struct sysfs_dirent * cursor = file->private_data; 359 struct sysfs_dirent * cursor = file->private_data;
356 360
357 down(&dentry->d_inode->i_sem); 361 mutex_lock(&dentry->d_inode->i_mutex);
358 list_del_init(&cursor->s_sibling); 362 list_del_init(&cursor->s_sibling);
359 up(&dentry->d_inode->i_sem); 363 mutex_unlock(&dentry->d_inode->i_mutex);
360 364
361 release_sysfs_dirent(cursor); 365 release_sysfs_dirent(cursor);
362 366
@@ -432,7 +436,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
432{ 436{
433 struct dentry * dentry = file->f_dentry; 437 struct dentry * dentry = file->f_dentry;
434 438
435 down(&dentry->d_inode->i_sem); 439 mutex_lock(&dentry->d_inode->i_mutex);
436 switch (origin) { 440 switch (origin) {
437 case 1: 441 case 1:
438 offset += file->f_pos; 442 offset += file->f_pos;
@@ -440,7 +444,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
440 if (offset >= 0) 444 if (offset >= 0)
441 break; 445 break;
442 default: 446 default:
443 up(&file->f_dentry->d_inode->i_sem); 447 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
444 return -EINVAL; 448 return -EINVAL;
445 } 449 }
446 if (offset != file->f_pos) { 450 if (offset != file->f_pos) {
@@ -464,7 +468,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
464 list_add_tail(&cursor->s_sibling, p); 468 list_add_tail(&cursor->s_sibling, p);
465 } 469 }
466 } 470 }
467 up(&dentry->d_inode->i_sem); 471 mutex_unlock(&dentry->d_inode->i_mutex);
468 return offset; 472 return offset;
469} 473}
470 474
@@ -479,4 +483,3 @@ struct file_operations sysfs_dir_operations = {
479EXPORT_SYMBOL_GPL(sysfs_create_dir); 483EXPORT_SYMBOL_GPL(sysfs_create_dir);
480EXPORT_SYMBOL_GPL(sysfs_remove_dir); 484EXPORT_SYMBOL_GPL(sysfs_remove_dir);
481EXPORT_SYMBOL_GPL(sysfs_rename_dir); 485EXPORT_SYMBOL_GPL(sysfs_rename_dir);
482
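
Most of the sysfs churn here, and in file.c, inode.c and symlink.c below, is the mechanical i_sem to i_mutex rename: inode protection moved from a semaphore used as a mutex to a real struct mutex, so every down()/up() on an inode becomes mutex_lock()/mutex_unlock(). The one behavioural change sits in create_dir()'s error path, which now unlinks the half-created sysfs_dirent from its parent's s_children list before dropping the reference. The conversion pattern:

        /* before */
        down(&inode->i_sem);
        /* ... critical section ... */
        up(&inode->i_sem);

        /* after */
        mutex_lock(&inode->i_mutex);
        /* ... critical section ... */
        mutex_unlock(&inode->i_mutex);
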
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 4013d7905e8..d0e3d849516 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -364,9 +364,9 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type)
364 umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG; 364 umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG;
365 int error = 0; 365 int error = 0;
366 366
367 down(&dir->d_inode->i_sem); 367 mutex_lock(&dir->d_inode->i_mutex);
368 error = sysfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); 368 error = sysfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
369 up(&dir->d_inode->i_sem); 369 mutex_unlock(&dir->d_inode->i_mutex);
370 370
371 return error; 371 return error;
372} 372}
@@ -398,7 +398,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
398 struct dentry * victim; 398 struct dentry * victim;
399 int res = -ENOENT; 399 int res = -ENOENT;
400 400
401 down(&dir->d_inode->i_sem); 401 mutex_lock(&dir->d_inode->i_mutex);
402 victim = lookup_one_len(attr->name, dir, strlen(attr->name)); 402 victim = lookup_one_len(attr->name, dir, strlen(attr->name));
403 if (!IS_ERR(victim)) { 403 if (!IS_ERR(victim)) {
404 /* make sure dentry is really there */ 404 /* make sure dentry is really there */
@@ -420,7 +420,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
420 */ 420 */
421 dput(victim); 421 dput(victim);
422 } 422 }
423 up(&dir->d_inode->i_sem); 423 mutex_unlock(&dir->d_inode->i_mutex);
424 424
425 return res; 425 return res;
426} 426}
@@ -441,22 +441,22 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
441 struct iattr newattrs; 441 struct iattr newattrs;
442 int res = -ENOENT; 442 int res = -ENOENT;
443 443
444 down(&dir->d_inode->i_sem); 444 mutex_lock(&dir->d_inode->i_mutex);
445 victim = lookup_one_len(attr->name, dir, strlen(attr->name)); 445 victim = lookup_one_len(attr->name, dir, strlen(attr->name));
446 if (!IS_ERR(victim)) { 446 if (!IS_ERR(victim)) {
447 if (victim->d_inode && 447 if (victim->d_inode &&
448 (victim->d_parent->d_inode == dir->d_inode)) { 448 (victim->d_parent->d_inode == dir->d_inode)) {
449 inode = victim->d_inode; 449 inode = victim->d_inode;
450 down(&inode->i_sem); 450 mutex_lock(&inode->i_mutex);
451 newattrs.ia_mode = (mode & S_IALLUGO) | 451 newattrs.ia_mode = (mode & S_IALLUGO) |
452 (inode->i_mode & ~S_IALLUGO); 452 (inode->i_mode & ~S_IALLUGO);
453 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 453 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
454 res = notify_change(victim, &newattrs); 454 res = notify_change(victim, &newattrs);
455 up(&inode->i_sem); 455 mutex_unlock(&inode->i_mutex);
456 } 456 }
457 dput(victim); 457 dput(victim);
458 } 458 }
459 up(&dir->d_inode->i_sem); 459 mutex_unlock(&dir->d_inode->i_mutex);
460 460
461 return res; 461 return res;
462} 462}
@@ -480,4 +480,3 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
480EXPORT_SYMBOL_GPL(sysfs_create_file); 480EXPORT_SYMBOL_GPL(sysfs_create_file);
481EXPORT_SYMBOL_GPL(sysfs_remove_file); 481EXPORT_SYMBOL_GPL(sysfs_remove_file);
482EXPORT_SYMBOL_GPL(sysfs_update_file); 482EXPORT_SYMBOL_GPL(sysfs_update_file);
483
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 970a33f0329..689f7bcfaf3 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -11,6 +11,7 @@
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/backing-dev.h> 13#include <linux/backing-dev.h>
14#include <linux/capability.h>
14#include "sysfs.h" 15#include "sysfs.h"

15 16
16extern struct super_block * sysfs_sb; 17extern struct super_block * sysfs_sb;
@@ -201,7 +202,7 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd)
201 202
202/* 203/*
203 * Unhashes the dentry corresponding to given sysfs_dirent 204 * Unhashes the dentry corresponding to given sysfs_dirent
204 * Called with parent inode's i_sem held. 205 * Called with parent inode's i_mutex held.
205 */ 206 */
206void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) 207void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent)
207{ 208{
@@ -232,7 +233,7 @@ void sysfs_hash_and_remove(struct dentry * dir, const char * name)
232 /* no inode means this hasn't been made visible yet */ 233 /* no inode means this hasn't been made visible yet */
233 return; 234 return;
234 235
235 down(&dir->d_inode->i_sem); 236 mutex_lock(&dir->d_inode->i_mutex);
236 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { 237 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
237 if (!sd->s_element) 238 if (!sd->s_element)
238 continue; 239 continue;
@@ -243,7 +244,5 @@ void sysfs_hash_and_remove(struct dentry * dir, const char * name)
243 break; 244 break;
244 } 245 }
245 } 246 }
246 up(&dir->d_inode->i_sem); 247 mutex_unlock(&dir->d_inode->i_mutex);
247} 248}
248
249
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index de402fa915f..e38d6338a20 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -86,9 +86,9 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
86 86
87 BUG_ON(!kobj || !kobj->dentry || !name); 87 BUG_ON(!kobj || !kobj->dentry || !name);
88 88
89 down(&dentry->d_inode->i_sem); 89 mutex_lock(&dentry->d_inode->i_mutex);
90 error = sysfs_add_link(dentry, name, target); 90 error = sysfs_add_link(dentry, name, target);
91 up(&dentry->d_inode->i_sem); 91 mutex_unlock(&dentry->d_inode->i_mutex);
92 return error; 92 return error;
93} 93}
94 94
@@ -177,4 +177,3 @@ struct inode_operations sysfs_symlink_inode_operations = {
177 177
178EXPORT_SYMBOL_GPL(sysfs_create_link); 178EXPORT_SYMBOL_GPL(sysfs_create_link);
179EXPORT_SYMBOL_GPL(sysfs_remove_link); 179EXPORT_SYMBOL_GPL(sysfs_remove_link);
180
diff --git a/fs/sysv/ChangeLog b/fs/sysv/ChangeLog
index 18e3487debd..f403f8b91b8 100644
--- a/fs/sysv/ChangeLog
+++ b/fs/sysv/ChangeLog
@@ -54,7 +54,7 @@ Fri Jan 4 2002 Alexander Viro <viro@parcelfarce.linux.theplanet.co.uk>
54 (sysv_read_super): Likewise. 54 (sysv_read_super): Likewise.
55 (v7_read_super): Likewise. 55 (v7_read_super): Likewise.
56 56
57Sun Dec 30 2001 Manfred Spraul <manfreds@colorfullife.com> 57Sun Dec 30 2001 Manfred Spraul <manfred@colorfullife.com>
58 58
59 * dir.c (dir_commit_chunk): Do not set dir->i_version. 59 * dir.c (dir_commit_chunk): Do not set dir->i_version.
60 (sysv_readdir): Likewise. 60 (sysv_readdir): Likewise.
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 69a085abad6..cce8b05cba5 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -103,7 +103,7 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
103 offset = (char *)de - kaddr; 103 offset = (char *)de - kaddr;
104 104
105 over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN), 105 over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN),
106 (n<<PAGE_CACHE_SHIFT) | offset, 106 ((loff_t)n<<PAGE_CACHE_SHIFT) | offset,
107 fs16_to_cpu(SYSV_SB(sb), de->inode), 107 fs16_to_cpu(SYSV_SB(sb), de->inode),
108 DT_UNKNOWN); 108 DT_UNKNOWN);
109 if (over) { 109 if (over) {
@@ -115,7 +115,7 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
115 } 115 }
116 116
117done: 117done:
118 filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; 118 filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
119 unlock_kernel(); 119 unlock_kernel();
120 return 0; 120 return 0;
121} 121}
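
The sysv_readdir() casts are the same class of fix as the romfs one earlier: n is an unsigned long, so n << PAGE_CACHE_SHIFT is evaluated at native word width and truncates before being stored into the 64-bit f_pos; widening to loff_t first keeps large directory offsets exact. In miniature:

        unsigned long n = 0x100000;                     /* page number, 4GB in */
        loff_t bad = n << PAGE_CACHE_SHIFT;             /* wraps on 32-bit */
        loff_t good = (loff_t)n << PAGE_CACHE_SHIFT;    /* widened first */
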
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index b9ded26b10a..4fae57d9d11 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Block allocation handling routines for the OSTA-UDF(tm) filesystem. 5 * Block allocation handling routines for the OSTA-UDF(tm) filesystem.
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
@@ -46,7 +41,7 @@
46#define uint(x) xuint(x) 41#define uint(x) xuint(x)
47#define xuint(x) __le ## x 42#define xuint(x) __le ## x
48 43
49extern inline int find_next_one_bit (void * addr, int size, int offset) 44static inline int find_next_one_bit (void * addr, int size, int offset)
50{ 45{
51 uintBPL_t * p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG); 46 uintBPL_t * p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG);
52 int result = offset & ~(BITS_PER_LONG-1); 47 int result = offset & ~(BITS_PER_LONG-1);
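
Turning find_next_one_bit() from extern inline into static inline matters under GNU C89 semantics: extern inline tells gcc never to emit an out-of-line definition, so any call it declines to inline (at -O0, say) becomes an unresolved symbol at link time, whereas static inline falls back to an ordinary file-local function. Schematically:

        /* GNU C89: no out-of-line copy is emitted; a non-inlined
         * call site leaves an undefined reference at link time */
        extern inline int f(int x) { return x + 1; }

        /* safe: gcc may inline, or emit a private copy per object file */
        static inline int g(int x) { return x + 1; }
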
diff --git a/fs/udf/crc.c b/fs/udf/crc.c
index d95c6e38a45..1b82a4adc2f 100644
--- a/fs/udf/crc.c
+++ b/fs/udf/crc.c
@@ -14,11 +14,6 @@
14 * 14 *
15 * AT&T gives permission for the free use of the CRC source code. 15 * AT&T gives permission for the free use of the CRC source code.
16 * 16 *
17 * CONTACTS
18 * E-mail regarding any portion of the Linux UDF file system should be
19 * directed to the development team mailing list (run by majordomo):
20 * linux_udf@hpesjro.fc.hp.com
21 *
22 * COPYRIGHT 17 * COPYRIGHT
23 * This file is distributed under the terms of the GNU General Public 18 * This file is distributed under the terms of the GNU General Public
24 * License (GPL). Copies of the GPL can be obtained from: 19 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 82440b73114..f5222527fe3 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Directory handling routines for the OSTA-UDF(tm) filesystem. 5 * Directory handling routines for the OSTA-UDF(tm) filesystem.
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 9a61ecc5451..fe751a2a0e4 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Directory related functions 5 * Directory related functions
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 01f520c71dc..a6f2acc1f15 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * File handling routines for the OSTA-UDF(tm) filesystem. 5 * File handling routines for the OSTA-UDF(tm) filesystem.
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
@@ -36,6 +31,7 @@
36#include <asm/uaccess.h> 31#include <asm/uaccess.h>
37#include <linux/kernel.h> 32#include <linux/kernel.h>
38#include <linux/string.h> /* memset */ 33#include <linux/string.h> /* memset */
34#include <linux/capability.h>
39#include <linux/errno.h> 35#include <linux/errno.h>
40#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
41#include <linux/pagemap.h> 37#include <linux/pagemap.h>
diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c
index 2dde6b888c2..5887d78cde4 100644
--- a/fs/udf/fsync.c
+++ b/fs/udf/fsync.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Fsync handling routines for the OSTA-UDF(tm) filesystem. 5 * Fsync handling routines for the OSTA-UDF(tm) filesystem.
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index a7e5d40f1eb..c9b707b470c 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Inode allocation handling routines for the OSTA-UDF(tm) filesystem. 5 * Inode allocation handling routines for the OSTA-UDF(tm) filesystem.
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index b83890beaaa..395e582ee54 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Inode handling routines for the OSTA-UDF(tm) filesystem. 5 * Inode handling routines for the OSTA-UDF(tm) filesystem.
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
@@ -1962,11 +1957,6 @@ int8_t inode_bmap(struct inode *inode, int block, kernel_lb_addr *bloc, uint32_t
1962 printk(KERN_ERR "udf: inode_bmap: block < 0\n"); 1957 printk(KERN_ERR "udf: inode_bmap: block < 0\n");
1963 return -1; 1958 return -1;
1964 } 1959 }
1965 if (!inode)
1966 {
1967 printk(KERN_ERR "udf: inode_bmap: NULL inode\n");
1968 return -1;
1969 }
1970 1960
1971 *extoffset = 0; 1961 *extoffset = 0;
1972 *elen = 0; 1962 *elen = 0;
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 2da5087dfe0..08421610766 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Low Level Device Routines for the UDF filesystem 5 * Low Level Device Routines for the UDF filesystem
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index fd321f9ace8..cc8ca3254db 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Miscellaneous routines for the OSTA-UDF(tm) filesystem. 5 * Miscellaneous routines for the OSTA-UDF(tm) filesystem.
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ac191ed7df0..ca732e79c48 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Inode name handling routines for the OSTA-UDF(tm) filesystem. 5 * Inode name handling routines for the OSTA-UDF(tm) filesystem.
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4d36f264be0..dabf2b841db 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Partition handling routines for the OSTA-UDF(tm) filesystem. 5 * Partition handling routines for the OSTA-UDF(tm) filesystem.
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 15bd4f24c5b..4a6f49adc60 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -14,11 +14,6 @@
14 * http://www.ecma.ch/ 14 * http://www.ecma.ch/
15 * http://www.iso.org/ 15 * http://www.iso.org/
16 * 16 *
17 * CONTACTS
18 * E-mail regarding any portion of the Linux UDF file system should be
19 * directed to the development team mailing list (run by majordomo):
20 * linux_udf@hpesjro.fc.hp.com
21 *
22 * COPYRIGHT 17 * COPYRIGHT
23 * This file is distributed under the terms of the GNU General Public 18 * This file is distributed under the terms of the GNU General Public
24 * License (GPL). Copies of the GPL can be obtained from: 19 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 43f3051ef75..674bb40edc8 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Symlink handling routines for the OSTA-UDF(tm) filesystem. 5 * Symlink handling routines for the OSTA-UDF(tm) filesystem.
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 7dc8a5572ca..e1b0e8cfecb 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -4,11 +4,6 @@
4 * PURPOSE 4 * PURPOSE
5 * Truncate handling routines for the OSTA-UDF(tm) filesystem. 5 * Truncate handling routines for the OSTA-UDF(tm) filesystem.
6 * 6 *
7 * CONTACTS
8 * E-mail regarding any portion of the Linux UDF file system should be
9 * directed to the development team mailing list (run by majordomo):
10 * linux_udf@hpesjro.fc.hp.com
11 *
12 * COPYRIGHT 7 * COPYRIGHT
13 * This file is distributed under the terms of the GNU General Public 8 * This file is distributed under the terms of the GNU General Public
14 * License (GPL). Copies of the GPL can be obtained from: 9 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 5a80efd8deb..706c92e1dcc 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -11,11 +11,6 @@
11 * UTF-8 is explained in the IETF RFC XXXX. 11 * UTF-8 is explained in the IETF RFC XXXX.
12 * ftp://ftp.internic.net/rfc/rfcxxxx.txt 12 * ftp://ftp.internic.net/rfc/rfcxxxx.txt
13 * 13 *
14 * CONTACTS
15 * E-mail regarding any portion of the Linux UDF file system should be
16 * directed to the development team's mailing list (run by majordomo):
17 * linux_udf@hpesjro.fc.hp.com
18 *
19 * COPYRIGHT 14 * COPYRIGHT
20 * This file is distributed under the terms of the GNU General Public 15 * This file is distributed under the terms of the GNU General Public
21 * License (GPL). Copies of the GPL can be obtained from: 16 * License (GPL). Copies of the GPL can be obtained from:
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index faf1512173e..3ada9dcf55b 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -13,6 +13,7 @@
13#include <linux/string.h> 13#include <linux/string.h>
14#include <linux/quotaops.h> 14#include <linux/quotaops.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/capability.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <linux/bitops.h> 18#include <linux/bitops.h>
18#include <asm/byteorder.h> 19#include <asm/byteorder.h>
@@ -48,7 +49,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
48 49
49 sb = inode->i_sb; 50 sb = inode->i_sb;
50 uspi = UFS_SB(sb)->s_uspi; 51 uspi = UFS_SB(sb)->s_uspi;
51 usb1 = ubh_get_usb_first(USPI_UBH); 52 usb1 = ubh_get_usb_first(uspi);
52 53
53 UFSD(("ENTER, fragment %u, count %u\n", fragment, count)) 54 UFSD(("ENTER, fragment %u, count %u\n", fragment, count))
54 55
@@ -80,8 +81,9 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
80 for (i = bit; i < end_bit; i++) { 81 for (i = bit; i < end_bit; i++) {
81 if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, i)) 82 if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, i))
82 ubh_setbit (UCPI_UBH, ucpi->c_freeoff, i); 83 ubh_setbit (UCPI_UBH, ucpi->c_freeoff, i);
83 else ufs_error (sb, "ufs_free_fragments", 84 else
84 "bit already cleared for fragment %u", i); 85 ufs_error (sb, "ufs_free_fragments",
86 "bit already cleared for fragment %u", i);
85 } 87 }
86 88
87 DQUOT_FREE_BLOCK (inode, count); 89 DQUOT_FREE_BLOCK (inode, count);
@@ -142,7 +144,7 @@ void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) {
142 144
143 sb = inode->i_sb; 145 sb = inode->i_sb;
144 uspi = UFS_SB(sb)->s_uspi; 146 uspi = UFS_SB(sb)->s_uspi;
145 usb1 = ubh_get_usb_first(USPI_UBH); 147 usb1 = ubh_get_usb_first(uspi);
146 148
147 UFSD(("ENTER, fragment %u, count %u\n", fragment, count)) 149 UFSD(("ENTER, fragment %u, count %u\n", fragment, count))
148 150
@@ -246,7 +248,7 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
246 248
247 sb = inode->i_sb; 249 sb = inode->i_sb;
248 uspi = UFS_SB(sb)->s_uspi; 250 uspi = UFS_SB(sb)->s_uspi;
249 usb1 = ubh_get_usb_first(USPI_UBH); 251 usb1 = ubh_get_usb_first(uspi);
250 *err = -ENOSPC; 252 *err = -ENOSPC;
251 253
252 lock_super (sb); 254 lock_super (sb);
@@ -406,7 +408,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
406 408
407 sb = inode->i_sb; 409 sb = inode->i_sb;
408 uspi = UFS_SB(sb)->s_uspi; 410 uspi = UFS_SB(sb)->s_uspi;
409 usb1 = ubh_get_usb_first (USPI_UBH); 411 usb1 = ubh_get_usb_first (uspi);
410 count = newcount - oldcount; 412 count = newcount - oldcount;
411 413
412 cgno = ufs_dtog(fragment); 414 cgno = ufs_dtog(fragment);
@@ -489,7 +491,7 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
489 491
490 sb = inode->i_sb; 492 sb = inode->i_sb;
491 uspi = UFS_SB(sb)->s_uspi; 493 uspi = UFS_SB(sb)->s_uspi;
492 usb1 = ubh_get_usb_first(USPI_UBH); 494 usb1 = ubh_get_usb_first(uspi);
493 oldcg = cgno; 495 oldcg = cgno;
494 496
495 /* 497 /*
@@ -605,7 +607,7 @@ static unsigned ufs_alloccg_block (struct inode * inode,
605 607
606 sb = inode->i_sb; 608 sb = inode->i_sb;
607 uspi = UFS_SB(sb)->s_uspi; 609 uspi = UFS_SB(sb)->s_uspi;
608 usb1 = ubh_get_usb_first(USPI_UBH); 610 usb1 = ubh_get_usb_first(uspi);
609 ucg = ubh_get_ucg(UCPI_UBH); 611 ucg = ubh_get_ucg(UCPI_UBH);
610 612
611 if (goal == 0) { 613 if (goal == 0) {
@@ -662,7 +664,7 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
662 UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count)) 664 UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count))
663 665
664 uspi = UFS_SB(sb)->s_uspi; 666 uspi = UFS_SB(sb)->s_uspi;
665 usb1 = ubh_get_usb_first (USPI_UBH); 667 usb1 = ubh_get_usb_first (uspi);
666 ucg = ubh_get_ucg(UCPI_UBH); 668 ucg = ubh_get_ucg(UCPI_UBH);
667 669
668 if (goal) 670 if (goal)
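The recurring change in fs/ufs/balloc.c is that ubh_get_usb_first() and friends now take uspi explicitly. As the old util.h macros further below show, USPI_UBH expanded in terms of a local variable that every caller had to name exactly uspi, a fragile implicit dependency. A runnable sketch of why such macros are brittle (the struct and names are hypothetical, for illustration only):

    #include <stdio.h>

    /* fragile pattern the UFS cleanup removes: a macro that silently
     * captures whatever local variable named `uspi` is in scope */
    #define USPI_STYLE (uspi->value)

    struct info { int value; };

    static int demo(struct info *uspi)  /* parameter must be called "uspi" */
    {
            return USPI_STYLE;          /* breaks if the local is renamed */
    }

    int main(void)
    {
            struct info i = { 42 };
            printf("%d\n", demo(&i));
            return 0;
    }

Passing the pointer as a real argument, as the new code does, lets the compiler check the call instead of relying on a naming convention.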
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index d0915fba155..7c10c68902a 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -491,7 +491,7 @@ int ufs_delete_entry (struct inode * inode, struct ufs_dir_entry * dir,
491 491
492 UFSD(("ino %u, reclen %u, namlen %u, name %s\n", 492 UFSD(("ino %u, reclen %u, namlen %u, name %s\n",
493 fs32_to_cpu(sb, de->d_ino), 493 fs32_to_cpu(sb, de->d_ino),
494 fs16to_cpu(sb, de->d_reclen), 494 fs16_to_cpu(sb, de->d_reclen),
495 ufs_get_de_namlen(sb, de), de->d_name)) 495 ufs_get_de_namlen(sb, de), de->d_name))
496 496
497 while (i < bh->b_size) { 497 while (i < bh->b_size) {
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 0938945b9cb..c7a47ed4f43 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -72,7 +72,7 @@ void ufs_free_inode (struct inode * inode)
72 72
73 sb = inode->i_sb; 73 sb = inode->i_sb;
74 uspi = UFS_SB(sb)->s_uspi; 74 uspi = UFS_SB(sb)->s_uspi;
75 usb1 = ubh_get_usb_first(USPI_UBH); 75 usb1 = ubh_get_usb_first(uspi);
76 76
77 ino = inode->i_ino; 77 ino = inode->i_ino;
78 78
@@ -167,7 +167,7 @@ struct inode * ufs_new_inode(struct inode * dir, int mode)
167 ufsi = UFS_I(inode); 167 ufsi = UFS_I(inode);
168 sbi = UFS_SB(sb); 168 sbi = UFS_SB(sb);
169 uspi = sbi->s_uspi; 169 uspi = sbi->s_uspi;
170 usb1 = ubh_get_usb_first(USPI_UBH); 170 usb1 = ubh_get_usb_first(uspi);
171 171
172 lock_super (sb); 172 lock_super (sb);
173 173
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 55f4aa16e3f..e0c04e36a05 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -61,7 +61,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
61 int n = 0; 61 int n = 0;
62 62
63 63
64 UFSD(("ptrs=uspi->s_apb = %d,double_blocks=%d \n",ptrs,double_blocks)); 64 UFSD(("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks));
65 if (i_block < 0) { 65 if (i_block < 0) {
66 ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0"); 66 ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
67 } else if (i_block < direct_blocks) { 67 } else if (i_block < direct_blocks) {
@@ -104,7 +104,7 @@ u64 ufs_frag_map(struct inode *inode, sector_t frag)
104 unsigned flags = UFS_SB(sb)->s_flags; 104 unsigned flags = UFS_SB(sb)->s_flags;
105 u64 temp = 0L; 105 u64 temp = 0L;
106 106
107 UFSD((": frag = %lu depth = %d\n",frag,depth)); 107 UFSD((": frag = %llu depth = %d\n", (unsigned long long)frag, depth));
108 UFSD((": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask)); 108 UFSD((": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask));
109 109
110 if (depth == 0) 110 if (depth == 0)
@@ -365,9 +365,10 @@ repeat:
365 sync_dirty_buffer(bh); 365 sync_dirty_buffer(bh);
366 inode->i_ctime = CURRENT_TIME_SEC; 366 inode->i_ctime = CURRENT_TIME_SEC;
367 mark_inode_dirty(inode); 367 mark_inode_dirty(inode);
368 UFSD(("result %u\n", tmp + blockoff));
368out: 369out:
369 brelse (bh); 370 brelse (bh);
370 UFSD(("EXIT, result %u\n", tmp + blockoff)) 371 UFSD(("EXIT\n"));
371 return result; 372 return result;
372} 373}
373 374
@@ -386,7 +387,7 @@ static int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buf
386 387
387 if (!create) { 388 if (!create) {
388 phys64 = ufs_frag_map(inode, fragment); 389 phys64 = ufs_frag_map(inode, fragment);
389 UFSD(("phys64 = %lu \n",phys64)); 390 UFSD(("phys64 = %llu \n",phys64));
390 if (phys64) 391 if (phys64)
391 map_bh(bh_result, sb, phys64); 392 map_bh(bh_result, sb, phys64);
392 return 0; 393 return 0;
@@ -401,7 +402,7 @@ static int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buf
401 402
402 lock_kernel(); 403 lock_kernel();
403 404
404 UFSD(("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment)) 405 UFSD(("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment))
405 if (fragment < 0) 406 if (fragment < 0)
406 goto abort_negative; 407 goto abort_negative;
407 if (fragment > 408 if (fragment >
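The UFSD() changes in fs/ufs/inode.c are printk format fixes: frag is a sector_t and phys64 a u64, so both must be printed with %llu, and sector_t additionally needs an explicit cast because its width depends on the kernel configuration. A minimal runnable sketch of the same portable-printing pattern (userspace stand-in with a hypothetical value):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* sector_t may be 32 or 64 bits; casting to unsigned long long
             * makes "%llu" correct either way */
            uint64_t frag = 123456789012ULL;  /* hypothetical fragment number */

            printf("frag = %llu\n", (unsigned long long)frag);
            return 0;
    }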
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 54828ebcf1b..d4aacee593f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -221,7 +221,7 @@ void ufs_error (struct super_block * sb, const char * function,
221 va_list args; 221 va_list args;
222 222
223 uspi = UFS_SB(sb)->s_uspi; 223 uspi = UFS_SB(sb)->s_uspi;
224 usb1 = ubh_get_usb_first(USPI_UBH); 224 usb1 = ubh_get_usb_first(uspi);
225 225
226 if (!(sb->s_flags & MS_RDONLY)) { 226 if (!(sb->s_flags & MS_RDONLY)) {
227 usb1->fs_clean = UFS_FSBAD; 227 usb1->fs_clean = UFS_FSBAD;
@@ -253,7 +253,7 @@ void ufs_panic (struct super_block * sb, const char * function,
253 va_list args; 253 va_list args;
254 254
255 uspi = UFS_SB(sb)->s_uspi; 255 uspi = UFS_SB(sb)->s_uspi;
256 usb1 = ubh_get_usb_first(USPI_UBH); 256 usb1 = ubh_get_usb_first(uspi);
257 257
258 if (!(sb->s_flags & MS_RDONLY)) { 258 if (!(sb->s_flags & MS_RDONLY)) {
259 usb1->fs_clean = UFS_FSBAD; 259 usb1->fs_clean = UFS_FSBAD;
@@ -420,21 +420,18 @@ static int ufs_read_cylinder_structures (struct super_block *sb) {
420 if (i + uspi->s_fpb > blks) 420 if (i + uspi->s_fpb > blks)
421 size = (blks - i) * uspi->s_fsize; 421 size = (blks - i) * uspi->s_fsize;
422 422
423 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { 423 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
424 ubh = ubh_bread(sb, 424 ubh = ubh_bread(sb,
425 fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_csaddr) + i, size); 425 fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_csaddr) + i, size);
426 if (!ubh) 426 else
427 goto failed;
428 ubh_ubhcpymem (space, ubh, size);
429 sbi->s_csp[ufs_fragstoblks(i)]=(struct ufs_csum *)space;
430 }
431 else {
432 ubh = ubh_bread(sb, uspi->s_csaddr + i, size); 427 ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
433 if (!ubh) 428
434 goto failed; 429 if (!ubh)
435 ubh_ubhcpymem(space, ubh, size); 430 goto failed;
436 sbi->s_csp[ufs_fragstoblks(i)]=(struct ufs_csum *)space; 431
437 } 432 ubh_ubhcpymem (space, ubh, size);
433 sbi->s_csp[ufs_fragstoblks(i)]=(struct ufs_csum *)space;
434
438 space += size; 435 space += size;
439 ubh_brelse (ubh); 436 ubh_brelse (ubh);
440 ubh = NULL; 437 ubh = NULL;
@@ -539,6 +536,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
539 struct inode *inode; 536 struct inode *inode;
540 unsigned block_size, super_block_size; 537 unsigned block_size, super_block_size;
541 unsigned flags; 538 unsigned flags;
539 unsigned super_block_offset;
542 540
543 uspi = NULL; 541 uspi = NULL;
544 ubh = NULL; 542 ubh = NULL;
@@ -586,10 +584,11 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
586 if (!uspi) 584 if (!uspi)
587 goto failed; 585 goto failed;
588 586
587 super_block_offset=UFS_SBLOCK;
588
589 /* Keep 2Gig file limit. Some UFS variants need to override 589 /* Keep 2Gig file limit. Some UFS variants need to override
590 this but as I don't know which I'll let those in the know loosen 590 this but as I don't know which I'll let those in the know loosen
591 the rules */ 591 the rules */
592
593 switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { 592 switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) {
594 case UFS_MOUNT_UFSTYPE_44BSD: 593 case UFS_MOUNT_UFSTYPE_44BSD:
595 UFSD(("ufstype=44bsd\n")) 594 UFSD(("ufstype=44bsd\n"))
@@ -601,7 +600,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
601 flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; 600 flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
602 break; 601 break;
603 case UFS_MOUNT_UFSTYPE_UFS2: 602 case UFS_MOUNT_UFSTYPE_UFS2:
604 UFSD(("ufstype=ufs2\n")) 603 UFSD(("ufstype=ufs2\n"));
604 super_block_offset=SBLOCK_UFS2;
605 uspi->s_fsize = block_size = 512; 605 uspi->s_fsize = block_size = 512;
606 uspi->s_fmask = ~(512 - 1); 606 uspi->s_fmask = ~(512 - 1);
607 uspi->s_fshift = 9; 607 uspi->s_fshift = 9;
@@ -725,19 +725,16 @@ again:
725 /* 725 /*
726 * read ufs super block from device 726 * read ufs super block from device
727 */ 727 */
728 if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { 728
729 ubh = ubh_bread_uspi(uspi, sb, uspi->s_sbbase + SBLOCK_UFS2/block_size, super_block_size); 729 ubh = ubh_bread_uspi(uspi, sb, uspi->s_sbbase + super_block_offset/block_size, super_block_size);
730 } 730
731 else {
732 ubh = ubh_bread_uspi(uspi, sb, uspi->s_sbbase + UFS_SBLOCK/block_size, super_block_size);
733 }
734 if (!ubh) 731 if (!ubh)
735 goto failed; 732 goto failed;
736 733
737 734
738 usb1 = ubh_get_usb_first(USPI_UBH); 735 usb1 = ubh_get_usb_first(uspi);
739 usb2 = ubh_get_usb_second(USPI_UBH); 736 usb2 = ubh_get_usb_second(uspi);
740 usb3 = ubh_get_usb_third(USPI_UBH); 737 usb3 = ubh_get_usb_third(uspi);
741 usb = (struct ufs_super_block *) 738 usb = (struct ufs_super_block *)
742 ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ; 739 ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
743 740
@@ -1006,8 +1003,8 @@ static void ufs_write_super (struct super_block *sb) {
1006 UFSD(("ENTER\n")) 1003 UFSD(("ENTER\n"))
1007 flags = UFS_SB(sb)->s_flags; 1004 flags = UFS_SB(sb)->s_flags;
1008 uspi = UFS_SB(sb)->s_uspi; 1005 uspi = UFS_SB(sb)->s_uspi;
1009 usb1 = ubh_get_usb_first(USPI_UBH); 1006 usb1 = ubh_get_usb_first(uspi);
1010 usb3 = ubh_get_usb_third(USPI_UBH); 1007 usb3 = ubh_get_usb_third(uspi);
1011 1008
1012 if (!(sb->s_flags & MS_RDONLY)) { 1009 if (!(sb->s_flags & MS_RDONLY)) {
1013 usb1->fs_time = cpu_to_fs32(sb, get_seconds()); 1010 usb1->fs_time = cpu_to_fs32(sb, get_seconds());
@@ -1049,8 +1046,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1049 1046
1050 uspi = UFS_SB(sb)->s_uspi; 1047 uspi = UFS_SB(sb)->s_uspi;
1051 flags = UFS_SB(sb)->s_flags; 1048 flags = UFS_SB(sb)->s_flags;
1052 usb1 = ubh_get_usb_first(USPI_UBH); 1049 usb1 = ubh_get_usb_first(uspi);
1053 usb3 = ubh_get_usb_third(USPI_UBH); 1050 usb3 = ubh_get_usb_third(uspi);
1054 1051
1055 /* 1052 /*
1056 * Allow the "check" option to be passed as a remount option. 1053 * Allow the "check" option to be passed as a remount option.
@@ -1124,7 +1121,7 @@ static int ufs_statfs (struct super_block *sb, struct kstatfs *buf)
1124 lock_kernel(); 1121 lock_kernel();
1125 1122
1126 uspi = UFS_SB(sb)->s_uspi; 1123 uspi = UFS_SB(sb)->s_uspi;
1127 usb1 = ubh_get_usb_first (USPI_UBH); 1124 usb1 = ubh_get_usb_first (uspi);
1128 usb = (struct ufs_super_block *) 1125 usb = (struct ufs_super_block *)
1129 ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ; 1126 ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
1130 1127
@@ -1275,7 +1272,7 @@ static ssize_t ufs_quota_write(struct super_block *sb, int type,
1275 size_t towrite = len; 1272 size_t towrite = len;
1276 struct buffer_head *bh; 1273 struct buffer_head *bh;
1277 1274
1278 down(&inode->i_sem); 1275 mutex_lock(&inode->i_mutex);
1279 while (towrite > 0) { 1276 while (towrite > 0) {
1280 tocopy = sb->s_blocksize - offset < towrite ? 1277 tocopy = sb->s_blocksize - offset < towrite ?
1281 sb->s_blocksize - offset : towrite; 1278 sb->s_blocksize - offset : towrite;
@@ -1296,14 +1293,16 @@ static ssize_t ufs_quota_write(struct super_block *sb, int type,
1296 blk++; 1293 blk++;
1297 } 1294 }
1298out: 1295out:
1299 if (len == towrite) 1296 if (len == towrite) {
1297 mutex_unlock(&inode->i_mutex);
1300 return err; 1298 return err;
1299 }
1301 if (inode->i_size < off+len-towrite) 1300 if (inode->i_size < off+len-towrite)
1302 i_size_write(inode, off+len-towrite); 1301 i_size_write(inode, off+len-towrite);
1303 inode->i_version++; 1302 inode->i_version++;
1304 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1303 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1305 mark_inode_dirty(inode); 1304 mark_inode_dirty(inode);
1306 up(&inode->i_sem); 1305 mutex_unlock(&inode->i_mutex);
1307 return len - towrite; 1306 return len - towrite;
1308} 1307}
1309 1308
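Two things happen in fs/ufs/super.c besides the USPI_UBH conversion: the duplicated UFS1/UFS2 superblock reads collapse into a single ubh_bread_uspi() call driven by a precomputed super_block_offset, and ufs_quota_write() moves from the old i_sem semaphore to i_mutex, gaining an unlock on the early "wrote nothing" return; as the old lines show, that path previously returned with the semaphore still held. A runnable userspace sketch of the unlock-on-every-path shape (pthread stand-in, logic abbreviated):

    #include <pthread.h>
    #include <sys/types.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* hypothetical stand-in for ufs_quota_write(): every return path
     * must release the lock taken at the top */
    static ssize_t quota_write(size_t len)
    {
            size_t towrite = len;

            pthread_mutex_lock(&lock);
            /* ... copy loop would decrement towrite here ... */
            if (len == towrite) {                   /* nothing was written */
                    pthread_mutex_unlock(&lock);    /* the fix: unlock before bailing */
                    return -1;
            }
            pthread_mutex_unlock(&lock);
            return (ssize_t)(len - towrite);
    }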
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index b2640076679..48d6d9bcc15 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -249,18 +249,28 @@ extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head
249 249
250 250
251/* 251/*
252 * macros to get important structures from ufs_buffer_head 252 * macros and an inline helper to get important structures from ufs_sb_private_info
253 */ 253 */
254#define ubh_get_usb_first(ubh) \
255 ((struct ufs_super_block_first *)((ubh)->bh[0]->b_data))
256 254
257#define ubh_get_usb_second(ubh) \ 255static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
258 ((struct ufs_super_block_second *)(ubh)-> \ 256 unsigned int offset)
259 bh[UFS_SECTOR_SIZE >> uspi->s_fshift]->b_data + (UFS_SECTOR_SIZE & ~uspi->s_fmask)) 257{
258 unsigned int index;
259
260 index = offset >> uspi->s_fshift;
261 offset &= ~uspi->s_fmask;
262 return uspi->s_ubh.bh[index]->b_data + offset;
263}
264
265#define ubh_get_usb_first(uspi) \
266 ((struct ufs_super_block_first *)get_usb_offset((uspi), 0))
267
268#define ubh_get_usb_second(uspi) \
269 ((struct ufs_super_block_second *)get_usb_offset((uspi), UFS_SECTOR_SIZE))
270
271#define ubh_get_usb_third(uspi) \
272 ((struct ufs_super_block_third *)get_usb_offset((uspi), 2*UFS_SECTOR_SIZE))
260 273
261#define ubh_get_usb_third(ubh) \
262 ((struct ufs_super_block_third *)((ubh)-> \
263 bh[UFS_SECTOR_SIZE*2 >> uspi->s_fshift]->b_data + (UFS_SECTOR_SIZE*2 & ~uspi->s_fmask)))
264 274
265#define ubh_get_ucg(ubh) \ 275#define ubh_get_ucg(ubh) \
266 ((struct ufs_cylinder_group *)((ubh)->bh[0]->b_data)) 276 ((struct ufs_cylinder_group *)((ubh)->bh[0]->b_data))
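The util.h rewrite replaces three offset-specific macros with one get_usb_offset() helper: shift the byte offset right by s_fshift to pick the buffer_head, and mask with ~s_fmask to get the byte offset inside it. A runnable sketch of just that arithmetic, using hypothetical values for a 1 KiB fragment size:

    #include <stdio.h>

    /* hypothetical ufs_sb_private_info values for fsize = 1024 */
    #define S_FSHIFT 10
    #define S_FMASK  (~(unsigned int)(1024 - 1))

    int main(void)
    {
            unsigned int offset = 2 * 512;             /* 2 * UFS_SECTOR_SIZE */
            unsigned int index  = offset >> S_FSHIFT;  /* which bh[] entry */
            unsigned int within = offset & ~S_FMASK;   /* offset inside that bh */

            printf("bh index = %u, offset in bh = %u\n", index, within);
            return 0;
    }

With these values the third superblock chunk (offset 1024) lands in bh[1] at offset 0, matching what the old ubh_get_usb_third macro computed by hand.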
diff --git a/fs/xattr.c b/fs/xattr.c
index a9db2255799..80eca7d3d69 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -19,6 +19,149 @@
19#include <linux/fsnotify.h> 19#include <linux/fsnotify.h>
20#include <asm/uaccess.h> 20#include <asm/uaccess.h>
21 21
22
23/*
24 * Check permissions for extended attribute access. This is a bit complicated
25 * because different namespaces have very different rules.
26 */
27static int
28xattr_permission(struct inode *inode, const char *name, int mask)
29{
30 /*
31 * We can never set or remove an extended attribute on a read-only
32 * filesystem or on an immutable / append-only inode.
33 */
34 if (mask & MAY_WRITE) {
35 if (IS_RDONLY(inode))
36 return -EROFS;
37 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
38 return -EPERM;
39 }
40
41 /*
42 * No restriction for security.* and system.* from the VFS. Decision
43 * on these is left to the underlying filesystem / security module.
44 */
45 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
46 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
47 return 0;
48
49 /*
 50	 * The trusted.* namespace can only be accessed by a privileged user.
51 */
52 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
53 return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM);
54
55 if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
56 if (!S_ISREG(inode->i_mode) &&
57 (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
58 return -EPERM;
59 }
60
61 return permission(inode, mask, NULL);
62}
63
64int
65vfs_setxattr(struct dentry *dentry, char *name, void *value,
66 size_t size, int flags)
67{
68 struct inode *inode = dentry->d_inode;
69 int error;
70
71 error = xattr_permission(inode, name, MAY_WRITE);
72 if (error)
73 return error;
74
75 mutex_lock(&inode->i_mutex);
76 error = security_inode_setxattr(dentry, name, value, size, flags);
77 if (error)
78 goto out;
79 error = -EOPNOTSUPP;
80 if (inode->i_op->setxattr) {
81 error = inode->i_op->setxattr(dentry, name, value, size, flags);
82 if (!error) {
83 fsnotify_xattr(dentry);
84 security_inode_post_setxattr(dentry, name, value,
85 size, flags);
86 }
87 } else if (!strncmp(name, XATTR_SECURITY_PREFIX,
88 XATTR_SECURITY_PREFIX_LEN)) {
89 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
90 error = security_inode_setsecurity(inode, suffix, value,
91 size, flags);
92 if (!error)
93 fsnotify_xattr(dentry);
94 }
95out:
96 mutex_unlock(&inode->i_mutex);
97 return error;
98}
99EXPORT_SYMBOL_GPL(vfs_setxattr);
100
101ssize_t
102vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
103{
104 struct inode *inode = dentry->d_inode;
105 int error;
106
107 error = xattr_permission(inode, name, MAY_READ);
108 if (error)
109 return error;
110
111 error = security_inode_getxattr(dentry, name);
112 if (error)
113 return error;
114
115 if (inode->i_op->getxattr)
116 error = inode->i_op->getxattr(dentry, name, value, size);
117 else
118 error = -EOPNOTSUPP;
119
120 if (!strncmp(name, XATTR_SECURITY_PREFIX,
121 XATTR_SECURITY_PREFIX_LEN)) {
122 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
123 int ret = security_inode_getsecurity(inode, suffix, value,
124 size, error);
125 /*
126 * Only overwrite the return value if a security module
127 * is actually active.
128 */
129 if (ret != -EOPNOTSUPP)
130 error = ret;
131 }
132
133 return error;
134}
135EXPORT_SYMBOL_GPL(vfs_getxattr);
136
137int
138vfs_removexattr(struct dentry *dentry, char *name)
139{
140 struct inode *inode = dentry->d_inode;
141 int error;
142
143 if (!inode->i_op->removexattr)
144 return -EOPNOTSUPP;
145
146 error = xattr_permission(inode, name, MAY_WRITE);
147 if (error)
148 return error;
149
150 error = security_inode_removexattr(dentry, name);
151 if (error)
152 return error;
153
154 mutex_lock(&inode->i_mutex);
155 error = inode->i_op->removexattr(dentry, name);
156 mutex_unlock(&inode->i_mutex);
157
158 if (!error)
159 fsnotify_xattr(dentry);
160 return error;
161}
162EXPORT_SYMBOL_GPL(vfs_removexattr);
163
164
22/* 165/*
23 * Extended attribute SET operations 166 * Extended attribute SET operations
24 */ 167 */
@@ -51,29 +194,7 @@ setxattr(struct dentry *d, char __user *name, void __user *value,
51 } 194 }
52 } 195 }
53 196
54 down(&d->d_inode->i_sem); 197 error = vfs_setxattr(d, kname, kvalue, size, flags);
55 error = security_inode_setxattr(d, kname, kvalue, size, flags);
56 if (error)
57 goto out;
58 error = -EOPNOTSUPP;
59 if (d->d_inode->i_op && d->d_inode->i_op->setxattr) {
60 error = d->d_inode->i_op->setxattr(d, kname, kvalue,
61 size, flags);
62 if (!error) {
63 fsnotify_xattr(d);
64 security_inode_post_setxattr(d, kname, kvalue,
65 size, flags);
66 }
67 } else if (!strncmp(kname, XATTR_SECURITY_PREFIX,
68 sizeof XATTR_SECURITY_PREFIX - 1)) {
69 const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1;
70 error = security_inode_setsecurity(d->d_inode, suffix, kvalue,
71 size, flags);
72 if (!error)
73 fsnotify_xattr(d);
74 }
75out:
76 up(&d->d_inode->i_sem);
77 kfree(kvalue); 198 kfree(kvalue);
78 return error; 199 return error;
79} 200}
@@ -147,22 +268,7 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size)
147 return -ENOMEM; 268 return -ENOMEM;
148 } 269 }
149 270
150 error = security_inode_getxattr(d, kname); 271 error = vfs_getxattr(d, kname, kvalue, size);
151 if (error)
152 goto out;
153 error = -EOPNOTSUPP;
154 if (d->d_inode->i_op && d->d_inode->i_op->getxattr)
155 error = d->d_inode->i_op->getxattr(d, kname, kvalue, size);
156
157 if (!strncmp(kname, XATTR_SECURITY_PREFIX,
158 sizeof XATTR_SECURITY_PREFIX - 1)) {
159 const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1;
160 int rv = security_inode_getsecurity(d->d_inode, suffix, kvalue,
161 size, error);
162 /* Security module active: overwrite error value */
163 if (rv != -EOPNOTSUPP)
164 error = rv;
165 }
166 if (error > 0) { 272 if (error > 0) {
167 if (size && copy_to_user(value, kvalue, error)) 273 if (size && copy_to_user(value, kvalue, error))
168 error = -EFAULT; 274 error = -EFAULT;
@@ -171,7 +277,6 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size)
171 than XATTR_SIZE_MAX bytes. Not possible. */ 277 than XATTR_SIZE_MAX bytes. Not possible. */
172 error = -E2BIG; 278 error = -E2BIG;
173 } 279 }
174out:
175 kfree(kvalue); 280 kfree(kvalue);
176 return error; 281 return error;
177} 282}
@@ -245,7 +350,7 @@ listxattr(struct dentry *d, char __user *list, size_t size)
245 error = d->d_inode->i_op->listxattr(d, klist, size); 350 error = d->d_inode->i_op->listxattr(d, klist, size);
246 } else { 351 } else {
247 error = security_inode_listsecurity(d->d_inode, klist, size); 352 error = security_inode_listsecurity(d->d_inode, klist, size);
248 if (size && error >= size) 353 if (size && error > size)
249 error = -ERANGE; 354 error = -ERANGE;
250 } 355 }
251 if (error > 0) { 356 if (error > 0) {
@@ -318,19 +423,7 @@ removexattr(struct dentry *d, char __user *name)
318 if (error < 0) 423 if (error < 0)
319 return error; 424 return error;
320 425
321 error = -EOPNOTSUPP; 426 return vfs_removexattr(d, kname);
322 if (d->d_inode->i_op && d->d_inode->i_op->removexattr) {
323 error = security_inode_removexattr(d, kname);
324 if (error)
325 goto out;
326 down(&d->d_inode->i_sem);
327 error = d->d_inode->i_op->removexattr(d, kname);
328 up(&d->d_inode->i_sem);
329 if (!error)
330 fsnotify_xattr(d);
331 }
332out:
333 return error;
334} 427}
335 428
336asmlinkage long 429asmlinkage long
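The fs/xattr.c change factors the bodies of the setxattr, getxattr and removexattr syscalls into exported vfs_setxattr(), vfs_getxattr() and vfs_removexattr() helpers, with the namespace policy centralized in xattr_permission(). Userspace-visible behaviour is unchanged; here is a runnable example exercising the unprivileged user.* namespace through the same syscalls (assumes a pre-existing file named testfile on an xattr-capable filesystem):

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/xattr.h>

    int main(void)
    {
            const char *path = "testfile";  /* hypothetical; must already exist */
            char buf[64];
            ssize_t n;

            /* user.* is the namespace xattr_permission() gates by file mode */
            if (setxattr(path, "user.comment", "hello", 5, 0) != 0) {
                    perror("setxattr");
                    return 1;
            }
            n = getxattr(path, "user.comment", buf, sizeof(buf));
            if (n < 0) {
                    perror("getxattr");
                    return 1;
            }
            printf("user.comment = %.*s\n", (int)n, buf);
            return removexattr(path, "user.comment") ? 1 : 0;
    }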
diff --git a/fs/xfs/Kbuild b/fs/xfs/Kbuild
new file mode 100644
index 00000000000..2566e96706f
--- /dev/null
+++ b/fs/xfs/Kbuild
@@ -0,0 +1,6 @@
1#
2# The XFS people like to share the Makefile between 2.6 and 2.4.
3# Use a file named Kbuild, which takes precedence over Makefile.
4#
5
6include $(srctree)/$(obj)/Makefile-linux-2.6
diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h
index ce773d89a92..2a88d56c4dc 100644
--- a/fs/xfs/linux-2.6/mutex.h
+++ b/fs/xfs/linux-2.6/mutex.h
@@ -18,22 +18,8 @@
18#ifndef __XFS_SUPPORT_MUTEX_H__ 18#ifndef __XFS_SUPPORT_MUTEX_H__
19#define __XFS_SUPPORT_MUTEX_H__ 19#define __XFS_SUPPORT_MUTEX_H__
20 20
21#include <linux/spinlock.h> 21#include <linux/mutex.h>
22#include <asm/semaphore.h>
23 22
24/* 23typedef struct mutex mutex_t;
25 * Map the mutex'es from IRIX to Linux semaphores.
26 *
27 * Destroy just simply initializes to -99 which should block all other
28 * callers.
29 */
30#define MUTEX_DEFAULT 0x0
31typedef struct semaphore mutex_t;
32
33#define mutex_init(lock, type, name) sema_init(lock, 1)
34#define mutex_destroy(lock) sema_init(lock, -99)
35#define mutex_lock(lock, num) down(lock)
36#define mutex_trylock(lock) (down_trylock(lock) ? 0 : 1)
37#define mutex_unlock(lock) up(lock)
38 24
39#endif /* __XFS_SUPPORT_MUTEX_H__ */ 25#endif /* __XFS_SUPPORT_MUTEX_H__ */
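With the generic mutex subsystem now in the tree, the XFS compatibility header no longer needs to emulate IRIX mutexes on top of semaphores; mutex_t becomes a plain typedef for struct mutex, and the old lock/unlock macros give way to the kernel-wide API. A kernel-style fragment of the resulting call pattern (illustration only, not buildable outside a kernel tree; demo_lock is hypothetical):

    #include <linux/mutex.h>

    static DEFINE_MUTEX(demo_lock);         /* statically initialized struct mutex */

    static void demo(void)
    {
            mutex_lock(&demo_lock);         /* was: down() on the emulated semaphore */
            /* ... critical section ... */
            mutex_unlock(&demo_lock);       /* was: up() */
    }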
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c6108971b4e..12062678940 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -40,11 +40,10 @@
40#include "xfs_rw.h" 40#include "xfs_rw.h"
41#include "xfs_iomap.h" 41#include "xfs_iomap.h"
42#include <linux/mpage.h> 42#include <linux/mpage.h>
43#include <linux/pagevec.h>
43#include <linux/writeback.h> 44#include <linux/writeback.h>
44 45
45STATIC void xfs_count_page_state(struct page *, int *, int *, int *); 46STATIC void xfs_count_page_state(struct page *, int *, int *, int *);
46STATIC void xfs_convert_page(struct inode *, struct page *, xfs_iomap_t *,
47 struct writeback_control *wbc, void *, int, int);
48 47
49#if defined(XFS_RW_TRACE) 48#if defined(XFS_RW_TRACE)
50void 49void
@@ -55,17 +54,15 @@ xfs_page_trace(
55 int mask) 54 int mask)
56{ 55{
57 xfs_inode_t *ip; 56 xfs_inode_t *ip;
58 bhv_desc_t *bdp;
59 vnode_t *vp = LINVFS_GET_VP(inode); 57 vnode_t *vp = LINVFS_GET_VP(inode);
60 loff_t isize = i_size_read(inode); 58 loff_t isize = i_size_read(inode);
61 loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; 59 loff_t offset = page_offset(page);
62 int delalloc = -1, unmapped = -1, unwritten = -1; 60 int delalloc = -1, unmapped = -1, unwritten = -1;
63 61
64 if (page_has_buffers(page)) 62 if (page_has_buffers(page))
65 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); 63 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
66 64
67 bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops); 65 ip = xfs_vtoi(vp);
68 ip = XFS_BHVTOI(bdp);
69 if (!ip->i_rwtrace) 66 if (!ip->i_rwtrace)
70 return; 67 return;
71 68
@@ -103,15 +100,56 @@ xfs_finish_ioend(
103 queue_work(xfsdatad_workqueue, &ioend->io_work); 100 queue_work(xfsdatad_workqueue, &ioend->io_work);
104} 101}
105 102
103/*
104 * We're now finished for good with this ioend structure.
105 * Update the page state via the associated buffer_heads,
106 * release holds on the inode and bio, and finally free
107 * up memory. Do not use the ioend after this.
108 */
106STATIC void 109STATIC void
107xfs_destroy_ioend( 110xfs_destroy_ioend(
108 xfs_ioend_t *ioend) 111 xfs_ioend_t *ioend)
109{ 112{
113 struct buffer_head *bh, *next;
114
115 for (bh = ioend->io_buffer_head; bh; bh = next) {
116 next = bh->b_private;
117 bh->b_end_io(bh, ioend->io_uptodate);
118 }
119
110 vn_iowake(ioend->io_vnode); 120 vn_iowake(ioend->io_vnode);
111 mempool_free(ioend, xfs_ioend_pool); 121 mempool_free(ioend, xfs_ioend_pool);
112} 122}
113 123
114/* 124/*
125 * Buffered IO write completion for delayed allocate extents.
126 * TODO: Update ondisk isize now that we know the file data
127 * has been flushed (i.e. the notorious "NULL file" problem).
128 */
129STATIC void
130xfs_end_bio_delalloc(
131 void *data)
132{
133 xfs_ioend_t *ioend = data;
134
135 xfs_destroy_ioend(ioend);
136}
137
138/*
139 * Buffered IO write completion for regular, written extents.
140 */
141STATIC void
142xfs_end_bio_written(
143 void *data)
144{
145 xfs_ioend_t *ioend = data;
146
147 xfs_destroy_ioend(ioend);
148}
149
150/*
151 * IO write completion for unwritten extents.
152 *
115 * Issue transactions to convert a buffer range from unwritten 153 * Issue transactions to convert a buffer range from unwritten
116 * to written extents. 154 * to written extents.
117 */ 155 */
@@ -123,21 +161,10 @@ xfs_end_bio_unwritten(
123 vnode_t *vp = ioend->io_vnode; 161 vnode_t *vp = ioend->io_vnode;
124 xfs_off_t offset = ioend->io_offset; 162 xfs_off_t offset = ioend->io_offset;
125 size_t size = ioend->io_size; 163 size_t size = ioend->io_size;
126 struct buffer_head *bh, *next;
127 int error; 164 int error;
128 165
129 if (ioend->io_uptodate) 166 if (ioend->io_uptodate)
130 VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error); 167 VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
131
132 /* ioend->io_buffer_head is only non-NULL for buffered I/O */
133 for (bh = ioend->io_buffer_head; bh; bh = next) {
134 next = bh->b_private;
135
136 bh->b_end_io = NULL;
137 clear_buffer_unwritten(bh);
138 end_buffer_async_write(bh, ioend->io_uptodate);
139 }
140
141 xfs_destroy_ioend(ioend); 168 xfs_destroy_ioend(ioend);
142} 169}
143 170
@@ -149,7 +176,8 @@ xfs_end_bio_unwritten(
149 */ 176 */
150STATIC xfs_ioend_t * 177STATIC xfs_ioend_t *
151xfs_alloc_ioend( 178xfs_alloc_ioend(
152 struct inode *inode) 179 struct inode *inode,
180 unsigned int type)
153{ 181{
154 xfs_ioend_t *ioend; 182 xfs_ioend_t *ioend;
155 183
@@ -162,45 +190,25 @@ xfs_alloc_ioend(
162 */ 190 */
163 atomic_set(&ioend->io_remaining, 1); 191 atomic_set(&ioend->io_remaining, 1);
164 ioend->io_uptodate = 1; /* cleared if any I/O fails */ 192 ioend->io_uptodate = 1; /* cleared if any I/O fails */
193 ioend->io_list = NULL;
194 ioend->io_type = type;
165 ioend->io_vnode = LINVFS_GET_VP(inode); 195 ioend->io_vnode = LINVFS_GET_VP(inode);
166 ioend->io_buffer_head = NULL; 196 ioend->io_buffer_head = NULL;
197 ioend->io_buffer_tail = NULL;
167 atomic_inc(&ioend->io_vnode->v_iocount); 198 atomic_inc(&ioend->io_vnode->v_iocount);
168 ioend->io_offset = 0; 199 ioend->io_offset = 0;
169 ioend->io_size = 0; 200 ioend->io_size = 0;
170 201
171 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend); 202 if (type == IOMAP_UNWRITTEN)
203 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend);
204 else if (type == IOMAP_DELAY)
205 INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc, ioend);
206 else
207 INIT_WORK(&ioend->io_work, xfs_end_bio_written, ioend);
172 208
173 return ioend; 209 return ioend;
174} 210}
175 211
176void
177linvfs_unwritten_done(
178 struct buffer_head *bh,
179 int uptodate)
180{
181 xfs_ioend_t *ioend = bh->b_private;
182 static spinlock_t unwritten_done_lock = SPIN_LOCK_UNLOCKED;
183 unsigned long flags;
184
185 ASSERT(buffer_unwritten(bh));
186 bh->b_end_io = NULL;
187
188 if (!uptodate)
189 ioend->io_uptodate = 0;
190
191 /*
192 * Deep magic here. We reuse b_private in the buffer_heads to build
193 * a chain for completing the I/O from user context after we've issued
194 * a transaction to convert the unwritten extent.
195 */
196 spin_lock_irqsave(&unwritten_done_lock, flags);
197 bh->b_private = ioend->io_buffer_head;
198 ioend->io_buffer_head = bh;
199 spin_unlock_irqrestore(&unwritten_done_lock, flags);
200
201 xfs_finish_ioend(ioend);
202}
203
204STATIC int 212STATIC int
205xfs_map_blocks( 213xfs_map_blocks(
206 struct inode *inode, 214 struct inode *inode,
@@ -218,138 +226,283 @@ xfs_map_blocks(
218 return -error; 226 return -error;
219} 227}
220 228
229STATIC inline int
230xfs_iomap_valid(
231 xfs_iomap_t *iomapp,
232 loff_t offset)
233{
234 return offset >= iomapp->iomap_offset &&
235 offset < iomapp->iomap_offset + iomapp->iomap_bsize;
236}
237
221/* 238/*
222 * Finds the corresponding mapping in block @map array of the 239 * BIO completion handler for buffered IO.
223 * given @offset within a @page.
224 */ 240 */
225STATIC xfs_iomap_t * 241STATIC int
226xfs_offset_to_map( 242xfs_end_bio(
243 struct bio *bio,
244 unsigned int bytes_done,
245 int error)
246{
247 xfs_ioend_t *ioend = bio->bi_private;
248
249 if (bio->bi_size)
250 return 1;
251
252 ASSERT(ioend);
253 ASSERT(atomic_read(&bio->bi_cnt) >= 1);
254
255 /* Toss bio and pass work off to an xfsdatad thread */
256 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
257 ioend->io_uptodate = 0;
258 bio->bi_private = NULL;
259 bio->bi_end_io = NULL;
260
261 bio_put(bio);
262 xfs_finish_ioend(ioend);
263 return 0;
264}
265
266STATIC void
267xfs_submit_ioend_bio(
268 xfs_ioend_t *ioend,
269 struct bio *bio)
270{
271 atomic_inc(&ioend->io_remaining);
272
273 bio->bi_private = ioend;
274 bio->bi_end_io = xfs_end_bio;
275
276 submit_bio(WRITE, bio);
277 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
278 bio_put(bio);
279}
280
281STATIC struct bio *
282xfs_alloc_ioend_bio(
283 struct buffer_head *bh)
284{
285 struct bio *bio;
286 int nvecs = bio_get_nr_vecs(bh->b_bdev);
287
288 do {
289 bio = bio_alloc(GFP_NOIO, nvecs);
290 nvecs >>= 1;
291 } while (!bio);
292
293 ASSERT(bio->bi_private == NULL);
294 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
295 bio->bi_bdev = bh->b_bdev;
296 bio_get(bio);
297 return bio;
298}
299
300STATIC void
301xfs_start_buffer_writeback(
302 struct buffer_head *bh)
303{
304 ASSERT(buffer_mapped(bh));
305 ASSERT(buffer_locked(bh));
306 ASSERT(!buffer_delay(bh));
307 ASSERT(!buffer_unwritten(bh));
308
309 mark_buffer_async_write(bh);
310 set_buffer_uptodate(bh);
311 clear_buffer_dirty(bh);
312}
313
314STATIC void
315xfs_start_page_writeback(
227 struct page *page, 316 struct page *page,
228 xfs_iomap_t *iomapp, 317 struct writeback_control *wbc,
229 unsigned long offset) 318 int clear_dirty,
319 int buffers)
320{
321 ASSERT(PageLocked(page));
322 ASSERT(!PageWriteback(page));
323 set_page_writeback(page);
324 if (clear_dirty)
325 clear_page_dirty(page);
326 unlock_page(page);
327 if (!buffers) {
328 end_page_writeback(page);
329 wbc->pages_skipped++; /* We didn't write this page */
330 }
331}
332
333static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
334{
335 return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
336}
337
338/*
339 * Submit all of the bios for all of the ioends we have saved up, covering the
340 * initial writepage page and also any probed pages.
341 *
342 * Because we may have multiple ioends spanning a page, we need to start
343 * writeback on all the buffers before we submit them for I/O. If we mark the
344 * buffers as we go, we can end up with a page that only has some buffers
345 * marked async write, and I/O completion can occur before we mark the other
346 * buffers async write.
347 *
348 * The end result of this is that we trip a bug in end_page_writeback() because
349 * we call it twice for the one page as the code in end_buffer_async_write()
350 * assumes that all buffers on the page are started at the same time.
351 *
352 * The fix is two passes across the ioend list - one to start writeback on the
353 * bufferheads, and then the second one submit them for I/O.
354 */
355STATIC void
356xfs_submit_ioend(
357 xfs_ioend_t *ioend)
230{ 358{
231 loff_t full_offset; /* offset from start of file */ 359 xfs_ioend_t *head = ioend;
360 xfs_ioend_t *next;
361 struct buffer_head *bh;
362 struct bio *bio;
363 sector_t lastblock = 0;
364
365 /* Pass 1 - start writeback */
366 do {
367 next = ioend->io_list;
368 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
369 xfs_start_buffer_writeback(bh);
370 }
371 } while ((ioend = next) != NULL);
372
373 /* Pass 2 - submit I/O */
374 ioend = head;
375 do {
376 next = ioend->io_list;
377 bio = NULL;
378
379 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
380
381 if (!bio) {
382 retry:
383 bio = xfs_alloc_ioend_bio(bh);
384 } else if (bh->b_blocknr != lastblock + 1) {
385 xfs_submit_ioend_bio(ioend, bio);
386 goto retry;
387 }
388
389 if (bio_add_buffer(bio, bh) != bh->b_size) {
390 xfs_submit_ioend_bio(ioend, bio);
391 goto retry;
392 }
232 393
233 ASSERT(offset < PAGE_CACHE_SIZE); 394 lastblock = bh->b_blocknr;
395 }
396 if (bio)
397 xfs_submit_ioend_bio(ioend, bio);
398 xfs_finish_ioend(ioend);
399 } while ((ioend = next) != NULL);
400}
401
402/*
403 * Cancel submission of all buffer_heads so far in this endio.
404 * Toss the endio too. Only ever called for the initial page
405 * in a writepage request, so only ever one page.
406 */
407STATIC void
408xfs_cancel_ioend(
409 xfs_ioend_t *ioend)
410{
411 xfs_ioend_t *next;
412 struct buffer_head *bh, *next_bh;
234 413
235 full_offset = page->index; /* NB: using 64bit number */ 414 do {
236 full_offset <<= PAGE_CACHE_SHIFT; /* offset from file start */ 415 next = ioend->io_list;
237 full_offset += offset; /* offset from page start */ 416 bh = ioend->io_buffer_head;
417 do {
418 next_bh = bh->b_private;
419 clear_buffer_async_write(bh);
420 unlock_buffer(bh);
421 } while ((bh = next_bh) != NULL);
238 422
239 if (full_offset < iomapp->iomap_offset) 423 vn_iowake(ioend->io_vnode);
240 return NULL; 424 mempool_free(ioend, xfs_ioend_pool);
241 if (iomapp->iomap_offset + (iomapp->iomap_bsize -1) >= full_offset) 425 } while ((ioend = next) != NULL);
242 return iomapp; 426}
243 return NULL; 427
428/*
429 * Test to see if we've been building up a completion structure for
430 * earlier buffers -- if so, we try to append to this ioend if we
431 * can, otherwise we finish off any current ioend and start another.
432 * The ioend being built (possibly a new one) is returned through *result.
433 */
434STATIC void
435xfs_add_to_ioend(
436 struct inode *inode,
437 struct buffer_head *bh,
438 xfs_off_t offset,
439 unsigned int type,
440 xfs_ioend_t **result,
441 int need_ioend)
442{
443 xfs_ioend_t *ioend = *result;
444
445 if (!ioend || need_ioend || type != ioend->io_type) {
446 xfs_ioend_t *previous = *result;
447
448 ioend = xfs_alloc_ioend(inode, type);
449 ioend->io_offset = offset;
450 ioend->io_buffer_head = bh;
451 ioend->io_buffer_tail = bh;
452 if (previous)
453 previous->io_list = ioend;
454 *result = ioend;
455 } else {
456 ioend->io_buffer_tail->b_private = bh;
457 ioend->io_buffer_tail = bh;
458 }
459
460 bh->b_private = NULL;
461 ioend->io_size += bh->b_size;
244} 462}
245 463
246STATIC void 464STATIC void
247xfs_map_at_offset( 465xfs_map_at_offset(
248 struct page *page,
249 struct buffer_head *bh, 466 struct buffer_head *bh,
250 unsigned long offset, 467 loff_t offset,
251 int block_bits, 468 int block_bits,
252 xfs_iomap_t *iomapp) 469 xfs_iomap_t *iomapp)
253{ 470{
254 xfs_daddr_t bn; 471 xfs_daddr_t bn;
255 loff_t delta;
256 int sector_shift; 472 int sector_shift;
257 473
258 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); 474 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
259 ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); 475 ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
260 ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL); 476 ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL);
261 477
262 delta = page->index;
263 delta <<= PAGE_CACHE_SHIFT;
264 delta += offset;
265 delta -= iomapp->iomap_offset;
266 delta >>= block_bits;
267
268 sector_shift = block_bits - BBSHIFT; 478 sector_shift = block_bits - BBSHIFT;
269 bn = iomapp->iomap_bn >> sector_shift; 479 bn = (iomapp->iomap_bn >> sector_shift) +
270 bn += delta; 480 ((offset - iomapp->iomap_offset) >> block_bits);
271 BUG_ON(!bn && !(iomapp->iomap_flags & IOMAP_REALTIME)); 481
482 ASSERT(bn || (iomapp->iomap_flags & IOMAP_REALTIME));
272 ASSERT((bn << sector_shift) >= iomapp->iomap_bn); 483 ASSERT((bn << sector_shift) >= iomapp->iomap_bn);
273 484
274 lock_buffer(bh); 485 lock_buffer(bh);
275 bh->b_blocknr = bn; 486 bh->b_blocknr = bn;
276 bh->b_bdev = iomapp->iomap_target->pbr_bdev; 487 bh->b_bdev = iomapp->iomap_target->bt_bdev;
277 set_buffer_mapped(bh); 488 set_buffer_mapped(bh);
278 clear_buffer_delay(bh); 489 clear_buffer_delay(bh);
490 clear_buffer_unwritten(bh);
279} 491}
280 492
281/* 493/*
282 * Look for a page at index which is unlocked and contains our 494 * Look for a page at index that is suitable for clustering.
283 * unwritten extent flagged buffers at its head. Returns page
284 * locked and with an extra reference count, and length of the
285 * unwritten extent component on this page that we can write,
286 * in units of filesystem blocks.
287 */
288STATIC struct page *
289xfs_probe_unwritten_page(
290 struct address_space *mapping,
291 pgoff_t index,
292 xfs_iomap_t *iomapp,
293 xfs_ioend_t *ioend,
294 unsigned long max_offset,
295 unsigned long *fsbs,
296 unsigned int bbits)
297{
298 struct page *page;
299
300 page = find_trylock_page(mapping, index);
301 if (!page)
302 return NULL;
303 if (PageWriteback(page))
304 goto out;
305
306 if (page->mapping && page_has_buffers(page)) {
307 struct buffer_head *bh, *head;
308 unsigned long p_offset = 0;
309
310 *fsbs = 0;
311 bh = head = page_buffers(page);
312 do {
313 if (!buffer_unwritten(bh) || !buffer_uptodate(bh))
314 break;
315 if (!xfs_offset_to_map(page, iomapp, p_offset))
316 break;
317 if (p_offset >= max_offset)
318 break;
319 xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
320 set_buffer_unwritten_io(bh);
321 bh->b_private = ioend;
322 p_offset += bh->b_size;
323 (*fsbs)++;
324 } while ((bh = bh->b_this_page) != head);
325
326 if (p_offset)
327 return page;
328 }
329
330out:
331 unlock_page(page);
332 return NULL;
333}
334
335/*
336 * Look for a page at index which is unlocked and not mapped
337 * yet - clustering for mmap write case.
338 */ 495 */
339STATIC unsigned int 496STATIC unsigned int
340xfs_probe_unmapped_page( 497xfs_probe_page(
341 struct address_space *mapping, 498 struct page *page,
342 pgoff_t index, 499 unsigned int pg_offset,
343 unsigned int pg_offset) 500 int mapped)
344{ 501{
345 struct page *page;
346 int ret = 0; 502 int ret = 0;
347 503
348 page = find_trylock_page(mapping, index);
349 if (!page)
350 return 0;
351 if (PageWriteback(page)) 504 if (PageWriteback(page))
352 goto out; 505 return 0;
353 506
354 if (page->mapping && PageDirty(page)) { 507 if (page->mapping && PageDirty(page)) {
355 if (page_has_buffers(page)) { 508 if (page_has_buffers(page)) {
@@ -357,79 +510,101 @@ xfs_probe_unmapped_page(
357 510
358 bh = head = page_buffers(page); 511 bh = head = page_buffers(page);
359 do { 512 do {
360 if (buffer_mapped(bh) || !buffer_uptodate(bh)) 513 if (!buffer_uptodate(bh))
514 break;
515 if (mapped != buffer_mapped(bh))
361 break; 516 break;
362 ret += bh->b_size; 517 ret += bh->b_size;
363 if (ret >= pg_offset) 518 if (ret >= pg_offset)
364 break; 519 break;
365 } while ((bh = bh->b_this_page) != head); 520 } while ((bh = bh->b_this_page) != head);
366 } else 521 } else
367 ret = PAGE_CACHE_SIZE; 522 ret = mapped ? 0 : PAGE_CACHE_SIZE;
368 } 523 }
369 524
370out:
371 unlock_page(page);
372 return ret; 525 return ret;
373} 526}
374 527
375STATIC unsigned int 528STATIC size_t
376xfs_probe_unmapped_cluster( 529xfs_probe_cluster(
377 struct inode *inode, 530 struct inode *inode,
378 struct page *startpage, 531 struct page *startpage,
379 struct buffer_head *bh, 532 struct buffer_head *bh,
380 struct buffer_head *head) 533 struct buffer_head *head,
534 int mapped)
381{ 535{
536 struct pagevec pvec;
382 pgoff_t tindex, tlast, tloff; 537 pgoff_t tindex, tlast, tloff;
383 unsigned int pg_offset, len, total = 0; 538 size_t total = 0;
384 struct address_space *mapping = inode->i_mapping; 539 int done = 0, i;
385 540
386 /* First sum forwards in this page */ 541 /* First sum forwards in this page */
387 do { 542 do {
388 if (buffer_mapped(bh)) 543 if (mapped != buffer_mapped(bh))
389 break; 544 return total;
390 total += bh->b_size; 545 total += bh->b_size;
391 } while ((bh = bh->b_this_page) != head); 546 } while ((bh = bh->b_this_page) != head);
392 547
393 /* If we reached the end of the page, sum forwards in 548 /* if we reached the end of the page, sum forwards in following pages */
394 * following pages. 549 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
395 */ 550 tindex = startpage->index + 1;
396 if (bh == head) { 551
397 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; 552 /* Prune this back to avoid pathological behavior */
398 /* Prune this back to avoid pathological behavior */ 553 tloff = min(tlast, startpage->index + 64);
399 tloff = min(tlast, startpage->index + 64); 554
400 for (tindex = startpage->index + 1; tindex < tloff; tindex++) { 555 pagevec_init(&pvec, 0);
401 len = xfs_probe_unmapped_page(mapping, tindex, 556 while (!done && tindex <= tloff) {
402 PAGE_CACHE_SIZE); 557 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
403 if (!len) 558
404 return total; 559 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
560 break;
561
562 for (i = 0; i < pagevec_count(&pvec); i++) {
563 struct page *page = pvec.pages[i];
564 size_t pg_offset, len = 0;
565
566 if (tindex == tlast) {
567 pg_offset =
568 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
569 if (!pg_offset) {
570 done = 1;
571 break;
572 }
573 } else
574 pg_offset = PAGE_CACHE_SIZE;
575
576 if (page->index == tindex && !TestSetPageLocked(page)) {
577 len = xfs_probe_page(page, pg_offset, mapped);
578 unlock_page(page);
579 }
580
581 if (!len) {
582 done = 1;
583 break;
584 }
585
405 total += len; 586 total += len;
587 tindex++;
406 } 588 }
407 if (tindex == tlast && 589
408 (pg_offset = i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { 590 pagevec_release(&pvec);
409 total += xfs_probe_unmapped_page(mapping, 591 cond_resched();
410 tindex, pg_offset);
411 }
412 } 592 }
593
413 return total; 594 return total;
414} 595}
415 596
416/* 597/*
417 * Probe for a given page (index) in the inode and test if it is delayed 598 * Test if a given page is suitable for writing as part of an unwritten
418 * and without unwritten buffers. Returns page locked and with an extra 599 * or delayed allocate extent.
419 * reference count.
420 */ 600 */
421STATIC struct page * 601STATIC int
422xfs_probe_delalloc_page( 602xfs_is_delayed_page(
423 struct inode *inode, 603 struct page *page,
424 pgoff_t index) 604 unsigned int type)
425{ 605{
426 struct page *page;
427
428 page = find_trylock_page(inode->i_mapping, index);
429 if (!page)
430 return NULL;
431 if (PageWriteback(page)) 606 if (PageWriteback(page))
432 goto out; 607 return 0;
433 608
434 if (page->mapping && page_has_buffers(page)) { 609 if (page->mapping && page_has_buffers(page)) {
435 struct buffer_head *bh, *head; 610 struct buffer_head *bh, *head;
@@ -437,243 +612,156 @@ xfs_probe_delalloc_page(
437 612
438 bh = head = page_buffers(page); 613 bh = head = page_buffers(page);
439 do { 614 do {
440 if (buffer_unwritten(bh)) { 615 if (buffer_unwritten(bh))
441 acceptable = 0; 616 acceptable = (type == IOMAP_UNWRITTEN);
617 else if (buffer_delay(bh))
618 acceptable = (type == IOMAP_DELAY);
619 else if (buffer_mapped(bh))
620 acceptable = (type == 0);
621 else
442 break; 622 break;
443 } else if (buffer_delay(bh)) {
444 acceptable = 1;
445 }
446 } while ((bh = bh->b_this_page) != head); 623 } while ((bh = bh->b_this_page) != head);
447 624
448 if (acceptable) 625 if (acceptable)
449 return page; 626 return 1;
450 } 627 }
451 628
452out:
453 unlock_page(page);
454 return NULL;
455}
456
457STATIC int
458xfs_map_unwritten(
459 struct inode *inode,
460 struct page *start_page,
461 struct buffer_head *head,
462 struct buffer_head *curr,
463 unsigned long p_offset,
464 int block_bits,
465 xfs_iomap_t *iomapp,
466 struct writeback_control *wbc,
467 int startio,
468 int all_bh)
469{
470 struct buffer_head *bh = curr;
471 xfs_iomap_t *tmp;
472 xfs_ioend_t *ioend;
473 loff_t offset;
474 unsigned long nblocks = 0;
475
476 offset = start_page->index;
477 offset <<= PAGE_CACHE_SHIFT;
478 offset += p_offset;
479
480 ioend = xfs_alloc_ioend(inode);
481
482 /* First map forwards in the page consecutive buffers
483 * covering this unwritten extent
484 */
485 do {
486 if (!buffer_unwritten(bh))
487 break;
488 tmp = xfs_offset_to_map(start_page, iomapp, p_offset);
489 if (!tmp)
490 break;
491 xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
492 set_buffer_unwritten_io(bh);
493 bh->b_private = ioend;
494 p_offset += bh->b_size;
495 nblocks++;
496 } while ((bh = bh->b_this_page) != head);
497
498 atomic_add(nblocks, &ioend->io_remaining);
499
500 /* If we reached the end of the page, map forwards in any
501 * following pages which are also covered by this extent.
502 */
503 if (bh == head) {
504 struct address_space *mapping = inode->i_mapping;
505 pgoff_t tindex, tloff, tlast;
506 unsigned long bs;
507 unsigned int pg_offset, bbits = inode->i_blkbits;
508 struct page *page;
509
510 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
511 tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT;
512 tloff = min(tlast, tloff);
513 for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
514 page = xfs_probe_unwritten_page(mapping,
515 tindex, iomapp, ioend,
516 PAGE_CACHE_SIZE, &bs, bbits);
517 if (!page)
518 break;
519 nblocks += bs;
520 atomic_add(bs, &ioend->io_remaining);
521 xfs_convert_page(inode, page, iomapp, wbc, ioend,
522 startio, all_bh);
523 /* stop if converting the next page might add
524 * enough blocks that the corresponding byte
525 * count won't fit in our ulong page buf length */
526 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
527 goto enough;
528 }
529
530 if (tindex == tlast &&
531 (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
532 page = xfs_probe_unwritten_page(mapping,
533 tindex, iomapp, ioend,
534 pg_offset, &bs, bbits);
535 if (page) {
536 nblocks += bs;
537 atomic_add(bs, &ioend->io_remaining);
538 xfs_convert_page(inode, page, iomapp, wbc, ioend,
539 startio, all_bh);
540 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
541 goto enough;
542 }
543 }
544 }
545
546enough:
547 ioend->io_size = (xfs_off_t)nblocks << block_bits;
548 ioend->io_offset = offset;
549 xfs_finish_ioend(ioend);
550 return 0; 629 return 0;
551} 630}
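xfs_is_delayed_page() above relies on the standard buffer_head ring walk: a page's buffers form a circular, singly linked list threaded through b_this_page, so the loop runs until it comes back around to the head. A condensed sketch of the idiom; the predicate pointer is a stand-in for the type checks in the diff:

/* Sketch: visit each buffer_head on a page exactly once. */
#include <linux/buffer_head.h>

static int page_buffers_all(struct page *page,
			    int (*match)(struct buffer_head *))
{
	struct buffer_head *bh, *head;

	if (!page_has_buffers(page))
		return 0;
	bh = head = page_buffers(page);
	do {
		if (!match(bh))
			return 0;
	} while ((bh = bh->b_this_page) != head);	/* ring closes at head */
	return 1;
}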
552 631
553STATIC void
554xfs_submit_page(
555 struct page *page,
556 struct writeback_control *wbc,
557 struct buffer_head *bh_arr[],
558 int bh_count,
559 int probed_page,
560 int clear_dirty)
561{
562 struct buffer_head *bh;
563 int i;
564
565 BUG_ON(PageWriteback(page));
566 if (bh_count)
567 set_page_writeback(page);
568 if (clear_dirty)
569 clear_page_dirty(page);
570 unlock_page(page);
571
572 if (bh_count) {
573 for (i = 0; i < bh_count; i++) {
574 bh = bh_arr[i];
575 mark_buffer_async_write(bh);
576 if (buffer_unwritten(bh))
577 set_buffer_unwritten_io(bh);
578 set_buffer_uptodate(bh);
579 clear_buffer_dirty(bh);
580 }
581
582 for (i = 0; i < bh_count; i++)
583 submit_bh(WRITE, bh_arr[i]);
584
585 if (probed_page && clear_dirty)
586 wbc->nr_to_write--; /* Wrote an "extra" page */
587 }
588}
589
590/* 632/*
591 * Allocate & map buffers for page given the extent map. Write it out. 633 * Allocate & map buffers for page given the extent map. Write it out.
 592 * Except for the original page of a writepage, this is called on 634 * Except for the original page of a writepage, this is called on
 593 * delalloc/unwritten pages only; for the original page it is possible 635 * delalloc/unwritten pages only; for the original page it is possible
594 * that the page has no mapping at all. 636 * that the page has no mapping at all.
595 */ 637 */
596STATIC void 638STATIC int
597xfs_convert_page( 639xfs_convert_page(
598 struct inode *inode, 640 struct inode *inode,
599 struct page *page, 641 struct page *page,
600 xfs_iomap_t *iomapp, 642 loff_t tindex,
643 xfs_iomap_t *mp,
644 xfs_ioend_t **ioendp,
601 struct writeback_control *wbc, 645 struct writeback_control *wbc,
602 void *private,
603 int startio, 646 int startio,
604 int all_bh) 647 int all_bh)
605{ 648{
606 struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; 649 struct buffer_head *bh, *head;
607 xfs_iomap_t *mp = iomapp, *tmp; 650 xfs_off_t end_offset;
608 unsigned long offset, end_offset; 651 unsigned long p_offset;
609 int index = 0; 652 unsigned int type;
610 int bbits = inode->i_blkbits; 653 int bbits = inode->i_blkbits;
611 int len, page_dirty; 654 int len, page_dirty;
655 int count = 0, done = 0, uptodate = 1;
656 xfs_off_t offset = page_offset(page);
612 657
613 end_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)); 658 if (page->index != tindex)
659 goto fail;
660 if (TestSetPageLocked(page))
661 goto fail;
662 if (PageWriteback(page))
663 goto fail_unlock_page;
664 if (page->mapping != inode->i_mapping)
665 goto fail_unlock_page;
666 if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
667 goto fail_unlock_page;
614 668
615 /* 669 /*
616 * page_dirty is initially a count of buffers on the page before 670 * page_dirty is initially a count of buffers on the page before
 617 * EOF and is decremented as we move each into a cleanable state. 671 * EOF and is decremented as we move each into a cleanable state.
672 *
673 * Derivation:
674 *
675 * End offset is the highest offset that this page should represent.
676 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
677 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
678 * hence give us the correct page_dirty count. On any other page,
679 * it will be zero and in that case we need page_dirty to be the
680 * count of buffers on the page.
618 */ 681 */
682 end_offset = min_t(unsigned long long,
683 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
684 i_size_read(inode));
685
619 len = 1 << inode->i_blkbits; 686 len = 1 << inode->i_blkbits;
620 end_offset = max(end_offset, PAGE_CACHE_SIZE); 687 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
621 end_offset = roundup(end_offset, len); 688 PAGE_CACHE_SIZE);
622 page_dirty = end_offset / len; 689 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
690 page_dirty = p_offset / len;
623 691
624 offset = 0;
625 bh = head = page_buffers(page); 692 bh = head = page_buffers(page);
626 do { 693 do {
627 if (offset >= end_offset) 694 if (offset >= end_offset)
628 break; 695 break;
629 if (!(PageUptodate(page) || buffer_uptodate(bh))) 696 if (!buffer_uptodate(bh))
697 uptodate = 0;
698 if (!(PageUptodate(page) || buffer_uptodate(bh))) {
699 done = 1;
630 continue; 700 continue;
631 if (buffer_mapped(bh) && all_bh && 701 }
632 !(buffer_unwritten(bh) || buffer_delay(bh))) { 702
703 if (buffer_unwritten(bh) || buffer_delay(bh)) {
704 if (buffer_unwritten(bh))
705 type = IOMAP_UNWRITTEN;
706 else
707 type = IOMAP_DELAY;
708
709 if (!xfs_iomap_valid(mp, offset)) {
710 done = 1;
711 continue;
712 }
713
714 ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
715 ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
716
717 xfs_map_at_offset(bh, offset, bbits, mp);
633 if (startio) { 718 if (startio) {
719 xfs_add_to_ioend(inode, bh, offset,
720 type, ioendp, done);
721 } else {
722 set_buffer_dirty(bh);
723 unlock_buffer(bh);
724 mark_buffer_dirty(bh);
725 }
726 page_dirty--;
727 count++;
728 } else {
729 type = 0;
730 if (buffer_mapped(bh) && all_bh && startio) {
634 lock_buffer(bh); 731 lock_buffer(bh);
635 bh_arr[index++] = bh; 732 xfs_add_to_ioend(inode, bh, offset,
733 type, ioendp, done);
734 count++;
636 page_dirty--; 735 page_dirty--;
736 } else {
737 done = 1;
637 } 738 }
638 continue;
639 } 739 }
640 tmp = xfs_offset_to_map(page, mp, offset); 740 } while (offset += len, (bh = bh->b_this_page) != head);
641 if (!tmp)
642 continue;
643 ASSERT(!(tmp->iomap_flags & IOMAP_HOLE));
644 ASSERT(!(tmp->iomap_flags & IOMAP_DELAY));
645 741
646 /* If this is a new unwritten extent buffer (i.e. one 742 if (uptodate && bh == head)
647 * that we haven't passed in private data for, we must 743 SetPageUptodate(page);
648 * now map this buffer too. 744
649 */ 745 if (startio) {
650 if (buffer_unwritten(bh) && !bh->b_end_io) { 746 if (count) {
651 ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN); 747 struct backing_dev_info *bdi;
652 xfs_map_unwritten(inode, page, head, bh, offset, 748
653 bbits, tmp, wbc, startio, all_bh); 749 bdi = inode->i_mapping->backing_dev_info;
654 } else if (! (buffer_unwritten(bh) && buffer_locked(bh))) { 750 if (bdi_write_congested(bdi)) {
655 xfs_map_at_offset(page, bh, offset, bbits, tmp); 751 wbc->encountered_congestion = 1;
656 if (buffer_unwritten(bh)) { 752 done = 1;
657 set_buffer_unwritten_io(bh); 753 } else if (--wbc->nr_to_write <= 0) {
658 bh->b_private = private; 754 done = 1;
659 ASSERT(private);
660 } 755 }
661 } 756 }
662 if (startio) { 757 xfs_start_page_writeback(page, wbc, !page_dirty, count);
663 bh_arr[index++] = bh;
664 } else {
665 set_buffer_dirty(bh);
666 unlock_buffer(bh);
667 mark_buffer_dirty(bh);
668 }
669 page_dirty--;
670 } while (offset += len, (bh = bh->b_this_page) != head);
671
672 if (startio && index) {
673 xfs_submit_page(page, wbc, bh_arr, index, 1, !page_dirty);
674 } else {
675 unlock_page(page);
676 } 758 }
759
760 return done;
761 fail_unlock_page:
762 unlock_page(page);
763 fail:
764 return 1;
677} 765}
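The fail/fail_unlock_page ladder at the top of xfs_convert_page() is a non-blocking page claim: between the pagevec lookup and the lock the page can be truncated, recycled, or put under writeback, so every assumption is rechecked once the lock is held. Roughly, as a standalone sketch (the helper name is invented):

/* Sketch: claim a looked-up page without blocking; 0 means skip it. */
static int try_claim_page(struct page *page,
			  struct address_space *mapping, pgoff_t index)
{
	if (page->index != index)		/* page was recycled */
		return 0;
	if (TestSetPageLocked(page))		/* lock contended, skip */
		return 0;
	if (page->mapping != mapping || PageWriteback(page)) {
		unlock_page(page);
		return 0;
	}
	return 1;				/* locked and still valid */
}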
678 766
679/* 767/*
@@ -685,19 +773,31 @@ xfs_cluster_write(
685 struct inode *inode, 773 struct inode *inode,
686 pgoff_t tindex, 774 pgoff_t tindex,
687 xfs_iomap_t *iomapp, 775 xfs_iomap_t *iomapp,
776 xfs_ioend_t **ioendp,
688 struct writeback_control *wbc, 777 struct writeback_control *wbc,
689 int startio, 778 int startio,
690 int all_bh, 779 int all_bh,
691 pgoff_t tlast) 780 pgoff_t tlast)
692{ 781{
693 struct page *page; 782 struct pagevec pvec;
783 int done = 0, i;
784
785 pagevec_init(&pvec, 0);
786 while (!done && tindex <= tlast) {
787 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
694 788
695 for (; tindex <= tlast; tindex++) { 789 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
696 page = xfs_probe_delalloc_page(inode, tindex);
697 if (!page)
698 break; 790 break;
699 xfs_convert_page(inode, page, iomapp, wbc, NULL, 791
700 startio, all_bh); 792 for (i = 0; i < pagevec_count(&pvec); i++) {
793 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
794 iomapp, ioendp, wbc, startio, all_bh);
795 if (done)
796 break;
797 }
798
799 pagevec_release(&pvec);
800 cond_resched();
701 } 801 }
702} 802}
703 803
@@ -728,18 +828,22 @@ xfs_page_state_convert(
728 int startio, 828 int startio,
729 int unmapped) /* also implies page uptodate */ 829 int unmapped) /* also implies page uptodate */
730{ 830{
731 struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; 831 struct buffer_head *bh, *head;
732 xfs_iomap_t *iomp, iomap; 832 xfs_iomap_t iomap;
833 xfs_ioend_t *ioend = NULL, *iohead = NULL;
733 loff_t offset; 834 loff_t offset;
734 unsigned long p_offset = 0; 835 unsigned long p_offset = 0;
836 unsigned int type;
735 __uint64_t end_offset; 837 __uint64_t end_offset;
736 pgoff_t end_index, last_index, tlast; 838 pgoff_t end_index, last_index, tlast;
737 int len, err, i, cnt = 0, uptodate = 1; 839 ssize_t size, len;
738 int flags; 840 int flags, err, iomap_valid = 0, uptodate = 1;
739 int page_dirty; 841 int page_dirty, count = 0, trylock_flag = 0;
842 int all_bh = unmapped;
740 843
741 /* wait for other IO threads? */ 844 /* wait for other IO threads? */
742 flags = (startio && wbc->sync_mode != WB_SYNC_NONE) ? 0 : BMAPI_TRYLOCK; 845 if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking))
846 trylock_flag |= BMAPI_TRYLOCK;
743 847
744 /* Is this page beyond the end of the file? */ 848 /* Is this page beyond the end of the file? */
745 offset = i_size_read(inode); 849 offset = i_size_read(inode);
@@ -754,161 +858,173 @@ xfs_page_state_convert(
754 } 858 }
755 } 859 }
756 860
757 end_offset = min_t(unsigned long long,
758 (loff_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
759 offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
760
761 /* 861 /*
762 * page_dirty is initially a count of buffers on the page before 862 * page_dirty is initially a count of buffers on the page before
 763 * EOF and is decremented as we move each into a cleanable state. 863 * EOF and is decremented as we move each into a cleanable state.
764 */ 864 *
865 * Derivation:
866 *
867 * End offset is the highest offset that this page should represent.
868 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
869 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
870 * hence give us the correct page_dirty count. On any other page,
871 * it will be zero and in that case we need page_dirty to be the
872 * count of buffers on the page.
873 */
874 end_offset = min_t(unsigned long long,
875 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
765 len = 1 << inode->i_blkbits; 876 len = 1 << inode->i_blkbits;
766 p_offset = max(p_offset, PAGE_CACHE_SIZE); 877 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
767 p_offset = roundup(p_offset, len); 878 PAGE_CACHE_SIZE);
879 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
768 page_dirty = p_offset / len; 880 page_dirty = p_offset / len;
769 881
770 iomp = NULL;
771 p_offset = 0;
772 bh = head = page_buffers(page); 882 bh = head = page_buffers(page);
883 offset = page_offset(page);
884 flags = -1;
885 type = 0;
886
887 /* TODO: cleanup count and page_dirty */
773 888
774 do { 889 do {
775 if (offset >= end_offset) 890 if (offset >= end_offset)
776 break; 891 break;
777 if (!buffer_uptodate(bh)) 892 if (!buffer_uptodate(bh))
778 uptodate = 0; 893 uptodate = 0;
779 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) 894 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) {
895 /*
896 * the iomap is actually still valid, but the ioend
897 * isn't. shouldn't happen too often.
898 */
899 iomap_valid = 0;
780 continue; 900 continue;
781
782 if (iomp) {
783 iomp = xfs_offset_to_map(page, &iomap, p_offset);
784 } 901 }
785 902
903 if (iomap_valid)
904 iomap_valid = xfs_iomap_valid(&iomap, offset);
905
786 /* 906 /*
787 * First case, map an unwritten extent and prepare for 907 * First case, map an unwritten extent and prepare for
788 * extent state conversion transaction on completion. 908 * extent state conversion transaction on completion.
789 */ 909 *
790 if (buffer_unwritten(bh)) { 910 * Second case, allocate space for a delalloc buffer.
791 if (!startio) 911 * We can return EAGAIN here in the release page case.
792 continue; 912 *
793 if (!iomp) { 913 * Third case, an unmapped buffer was found, and we are
794 err = xfs_map_blocks(inode, offset, len, &iomap, 914 * in a path where we need to write the whole page out.
795 BMAPI_WRITE|BMAPI_IGNSTATE); 915 */
796 if (err) { 916 if (buffer_unwritten(bh) || buffer_delay(bh) ||
797 goto error; 917 ((buffer_uptodate(bh) || PageUptodate(page)) &&
798 } 918 !buffer_mapped(bh) && (unmapped || startio))) {
799 iomp = xfs_offset_to_map(page, &iomap, 919 /*
800 p_offset); 920 * Make sure we don't use a read-only iomap
921 */
922 if (flags == BMAPI_READ)
923 iomap_valid = 0;
924
925 if (buffer_unwritten(bh)) {
926 type = IOMAP_UNWRITTEN;
927 flags = BMAPI_WRITE|BMAPI_IGNSTATE;
928 } else if (buffer_delay(bh)) {
929 type = IOMAP_DELAY;
930 flags = BMAPI_ALLOCATE;
931 if (!startio)
932 flags |= trylock_flag;
933 } else {
934 type = IOMAP_NEW;
935 flags = BMAPI_WRITE|BMAPI_MMAP;
801 } 936 }
802 if (iomp) { 937
803 if (!bh->b_end_io) { 938 if (!iomap_valid) {
804 err = xfs_map_unwritten(inode, page, 939 if (type == IOMAP_NEW) {
805 head, bh, p_offset, 940 size = xfs_probe_cluster(inode,
806 inode->i_blkbits, iomp, 941 page, bh, head, 0);
807 wbc, startio, unmapped);
808 if (err) {
809 goto error;
810 }
811 } else { 942 } else {
812 set_bit(BH_Lock, &bh->b_state); 943 size = len;
813 } 944 }
814 BUG_ON(!buffer_locked(bh)); 945
815 bh_arr[cnt++] = bh; 946 err = xfs_map_blocks(inode, offset, size,
816 page_dirty--; 947 &iomap, flags);
817 } 948 if (err)
818 /*
819 * Second case, allocate space for a delalloc buffer.
820 * We can return EAGAIN here in the release page case.
821 */
822 } else if (buffer_delay(bh)) {
823 if (!iomp) {
824 err = xfs_map_blocks(inode, offset, len, &iomap,
825 BMAPI_ALLOCATE | flags);
826 if (err) {
827 goto error; 949 goto error;
828 } 950 iomap_valid = xfs_iomap_valid(&iomap, offset);
829 iomp = xfs_offset_to_map(page, &iomap,
830 p_offset);
831 } 951 }
832 if (iomp) { 952 if (iomap_valid) {
833 xfs_map_at_offset(page, bh, p_offset, 953 xfs_map_at_offset(bh, offset,
834 inode->i_blkbits, iomp); 954 inode->i_blkbits, &iomap);
835 if (startio) { 955 if (startio) {
836 bh_arr[cnt++] = bh; 956 xfs_add_to_ioend(inode, bh, offset,
957 type, &ioend,
958 !iomap_valid);
837 } else { 959 } else {
838 set_buffer_dirty(bh); 960 set_buffer_dirty(bh);
839 unlock_buffer(bh); 961 unlock_buffer(bh);
840 mark_buffer_dirty(bh); 962 mark_buffer_dirty(bh);
841 } 963 }
842 page_dirty--; 964 page_dirty--;
965 count++;
966 }
967 } else if (buffer_uptodate(bh) && startio) {
968 /*
969 * we got here because the buffer is already mapped.
970 * That means it must already have extents allocated
971 * underneath it. Map the extent by reading it.
972 */
973 if (!iomap_valid || type != 0) {
974 flags = BMAPI_READ;
975 size = xfs_probe_cluster(inode, page, bh,
976 head, 1);
977 err = xfs_map_blocks(inode, offset, size,
978 &iomap, flags);
979 if (err)
980 goto error;
981 iomap_valid = xfs_iomap_valid(&iomap, offset);
843 } 982 }
844 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
845 (unmapped || startio)) {
846 983
847 if (!buffer_mapped(bh)) { 984 type = 0;
848 int size; 985 if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
849 986 ASSERT(buffer_mapped(bh));
850 /* 987 if (iomap_valid)
851 * Getting here implies an unmapped buffer 988 all_bh = 1;
852 * was found, and we are in a path where we 989 xfs_add_to_ioend(inode, bh, offset, type,
853 * need to write the whole page out. 990 &ioend, !iomap_valid);
854 */ 991 page_dirty--;
855 if (!iomp) { 992 count++;
856 size = xfs_probe_unmapped_cluster( 993 } else {
857 inode, page, bh, head); 994 iomap_valid = 0;
858 err = xfs_map_blocks(inode, offset,
859 size, &iomap,
860 BMAPI_WRITE|BMAPI_MMAP);
861 if (err) {
862 goto error;
863 }
864 iomp = xfs_offset_to_map(page, &iomap,
865 p_offset);
866 }
867 if (iomp) {
868 xfs_map_at_offset(page,
869 bh, p_offset,
870 inode->i_blkbits, iomp);
871 if (startio) {
872 bh_arr[cnt++] = bh;
873 } else {
874 set_buffer_dirty(bh);
875 unlock_buffer(bh);
876 mark_buffer_dirty(bh);
877 }
878 page_dirty--;
879 }
880 } else if (startio) {
881 if (buffer_uptodate(bh) &&
882 !test_and_set_bit(BH_Lock, &bh->b_state)) {
883 bh_arr[cnt++] = bh;
884 page_dirty--;
885 }
886 } 995 }
996 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
997 (unmapped || startio)) {
998 iomap_valid = 0;
887 } 999 }
888 } while (offset += len, p_offset += len, 1000
889 ((bh = bh->b_this_page) != head)); 1001 if (!iohead)
1002 iohead = ioend;
1003
1004 } while (offset += len, ((bh = bh->b_this_page) != head));
890 1005
891 if (uptodate && bh == head) 1006 if (uptodate && bh == head)
892 SetPageUptodate(page); 1007 SetPageUptodate(page);
893 1008
894 if (startio) { 1009 if (startio)
895 xfs_submit_page(page, wbc, bh_arr, cnt, 0, !page_dirty); 1010 xfs_start_page_writeback(page, wbc, 1, count);
896 }
897 1011
898 if (iomp) { 1012 if (ioend && iomap_valid) {
899 offset = (iomp->iomap_offset + iomp->iomap_bsize - 1) >> 1013 offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
900 PAGE_CACHE_SHIFT; 1014 PAGE_CACHE_SHIFT;
901 tlast = min_t(pgoff_t, offset, last_index); 1015 tlast = min_t(pgoff_t, offset, last_index);
902 xfs_cluster_write(inode, page->index + 1, iomp, wbc, 1016 xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
903 startio, unmapped, tlast); 1017 wbc, startio, all_bh, tlast);
904 } 1018 }
905 1019
1020 if (iohead)
1021 xfs_submit_ioend(iohead);
1022
906 return page_dirty; 1023 return page_dirty;
907 1024
908error: 1025error:
909 for (i = 0; i < cnt; i++) { 1026 if (iohead)
910 unlock_buffer(bh_arr[i]); 1027 xfs_cancel_ioend(iohead);
911 }
912 1028
913 /* 1029 /*
 914 * If it's delalloc and we have nowhere to put it, 1030 * If it's delalloc and we have nowhere to put it,
 915 * throw it away, unless the lower layers told 1031 * throw it away, unless the lower layers told
@@ -916,9 +1032,8 @@ error:
916 * us to try again. 1032 * us to try again.
917 */ 1033 */
918 if (err != -EAGAIN) { 1034 if (err != -EAGAIN) {
919 if (!unmapped) { 1035 if (!unmapped)
920 block_invalidatepage(page, 0); 1036 block_invalidatepage(page, 0);
921 }
922 ClearPageUptodate(page); 1037 ClearPageUptodate(page);
923 } 1038 }
924 return err; 1039 return err;
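The new writepage path threads each mapped buffer onto an ioend chain and decides only at the end whether to issue it or unwind it. A compressed sketch of that lifecycle using the helpers named in this diff; map_buffer() and the surrounding locals are invented stand-ins for the iomap logic above:

/* Sketch: build an ioend chain, then submit or cancel it whole. */
xfs_ioend_t *ioend = NULL, *iohead = NULL;
struct buffer_head *bh, *head;
xfs_off_t offset = page_offset(page);
size_t len = 1 << inode->i_blkbits;
unsigned int type = 0;				/* plain overwrite */
int err = 0;

bh = head = page_buffers(page);
do {
	err = map_buffer(bh, offset);		/* invented stand-in */
	if (err)
		break;
	xfs_add_to_ioend(inode, bh, offset, type, &ioend, 0);
	if (!iohead)
		iohead = ioend;			/* remember the chain head */
} while (offset += len, (bh = bh->b_this_page) != head);

if (iohead) {
	if (err)
		xfs_cancel_ioend(iohead);	/* unwind, no bios were issued */
	else
		xfs_submit_ioend(iohead);	/* walk io_list and issue bios */
}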
@@ -941,13 +1056,12 @@ __linvfs_get_block(
941 int retpbbm = 1; 1056 int retpbbm = 1;
942 int error; 1057 int error;
943 1058
944 if (blocks) {
945 offset = blocks << inode->i_blkbits; /* 64 bit goodness */
946 size = (ssize_t) min_t(xfs_off_t, offset, LONG_MAX);
947 } else {
948 size = 1 << inode->i_blkbits;
949 }
950 offset = (xfs_off_t)iblock << inode->i_blkbits; 1059 offset = (xfs_off_t)iblock << inode->i_blkbits;
1060 if (blocks)
1061 size = (ssize_t) min_t(xfs_off_t, LONG_MAX,
1062 (xfs_off_t)blocks << inode->i_blkbits);
1063 else
1064 size = 1 << inode->i_blkbits;
951 1065
952 VOP_BMAP(vp, offset, size, 1066 VOP_BMAP(vp, offset, size,
953 create ? flags : BMAPI_READ, &iomap, &retpbbm, error); 1067 create ? flags : BMAPI_READ, &iomap, &retpbbm, error);
@@ -983,7 +1097,7 @@ __linvfs_get_block(
983 } 1097 }
984 1098
985 /* If this is a realtime file, data might be on a new device */ 1099 /* If this is a realtime file, data might be on a new device */
986 bh_result->b_bdev = iomap.iomap_target->pbr_bdev; 1100 bh_result->b_bdev = iomap.iomap_target->bt_bdev;
987 1101
988 /* If we previously allocated a block out beyond eof and 1102 /* If we previously allocated a block out beyond eof and
989 * we are now coming back to use it then we will need to 1103 * we are now coming back to use it then we will need to
@@ -1007,7 +1121,7 @@ __linvfs_get_block(
1007 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); 1121 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0);
1008 offset = min_t(xfs_off_t, 1122 offset = min_t(xfs_off_t,
1009 iomap.iomap_bsize - iomap.iomap_delta, 1123 iomap.iomap_bsize - iomap.iomap_delta,
1010 blocks << inode->i_blkbits); 1124 (xfs_off_t)blocks << inode->i_blkbits);
1011 bh_result->b_size = (u32) min_t(xfs_off_t, UINT_MAX, offset); 1125 bh_result->b_size = (u32) min_t(xfs_off_t, UINT_MAX, offset);
1012 } 1126 }
1013 1127
@@ -1095,10 +1209,10 @@ linvfs_direct_IO(
1095 if (error) 1209 if (error)
1096 return -error; 1210 return -error;
1097 1211
1098 iocb->private = xfs_alloc_ioend(inode); 1212 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
1099 1213
1100 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1214 ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
1101 iomap.iomap_target->pbr_bdev, 1215 iomap.iomap_target->bt_bdev,
1102 iov, offset, nr_segs, 1216 iov, offset, nr_segs,
1103 linvfs_get_blocks_direct, 1217 linvfs_get_blocks_direct,
1104 linvfs_end_io_direct); 1218 linvfs_end_io_direct);
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 4720758a9ad..55339dd5a30 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,14 +23,24 @@ extern mempool_t *xfs_ioend_pool;
23 23
24typedef void (*xfs_ioend_func_t)(void *); 24typedef void (*xfs_ioend_func_t)(void *);
25 25
26/*
27 * xfs_ioend struct manages large extent writes for XFS.
 28 * It can manage several multi-page bios at once.
29 */
26typedef struct xfs_ioend { 30typedef struct xfs_ioend {
31 struct xfs_ioend *io_list; /* next ioend in chain */
32 unsigned int io_type; /* delalloc / unwritten */
27 unsigned int io_uptodate; /* I/O status register */ 33 unsigned int io_uptodate; /* I/O status register */
28 atomic_t io_remaining; /* hold count */ 34 atomic_t io_remaining; /* hold count */
29 struct vnode *io_vnode; /* file being written to */ 35 struct vnode *io_vnode; /* file being written to */
30 struct buffer_head *io_buffer_head;/* buffer linked list head */ 36 struct buffer_head *io_buffer_head;/* buffer linked list head */
37 struct buffer_head *io_buffer_tail;/* buffer linked list tail */
31 size_t io_size; /* size of the extent */ 38 size_t io_size; /* size of the extent */
32 xfs_off_t io_offset; /* offset in the file */ 39 xfs_off_t io_offset; /* offset in the file */
33 struct work_struct io_work; /* xfsdatad work queue */ 40 struct work_struct io_work; /* xfsdatad work queue */
34} xfs_ioend_t; 41} xfs_ioend_t;
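io_remaining is a hold count: every bio attached to the ioend takes a reference, and whichever completion drops the last one schedules io_work on the xfsdatad workqueue. A minimal sketch of that convention; the helper names are illustrative, not part of the patch:

/* Sketch: io_remaining as a completion hold count. */
static void ioend_hold(xfs_ioend_t *ioend)
{
	atomic_inc(&ioend->io_remaining);	/* one hold per in-flight bio */
}

static void ioend_drop(xfs_ioend_t *ioend)
{
	/* the last completer kicks the end-of-I/O work to xfsdatad */
	if (atomic_dec_and_test(&ioend->io_remaining))
		queue_work(xfsdatad_workqueue, &ioend->io_work);
}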
35 42
43extern struct address_space_operations linvfs_aops;
44extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
45
36#endif /* __XFS_IOPS_H__ */ 46#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 6fe21d2b884..e44b7c1a3a3 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -31,76 +31,77 @@
31#include <linux/kthread.h> 31#include <linux/kthread.h>
32#include "xfs_linux.h" 32#include "xfs_linux.h"
33 33
34STATIC kmem_cache_t *pagebuf_zone; 34STATIC kmem_zone_t *xfs_buf_zone;
35STATIC kmem_shaker_t pagebuf_shake; 35STATIC kmem_shaker_t xfs_buf_shake;
36STATIC int xfsbufd(void *);
36STATIC int xfsbufd_wakeup(int, gfp_t); 37STATIC int xfsbufd_wakeup(int, gfp_t);
37STATIC void pagebuf_delwri_queue(xfs_buf_t *, int); 38STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
38 39
39STATIC struct workqueue_struct *xfslogd_workqueue; 40STATIC struct workqueue_struct *xfslogd_workqueue;
40struct workqueue_struct *xfsdatad_workqueue; 41struct workqueue_struct *xfsdatad_workqueue;
41 42
42#ifdef PAGEBUF_TRACE 43#ifdef XFS_BUF_TRACE
43void 44void
44pagebuf_trace( 45xfs_buf_trace(
45 xfs_buf_t *pb, 46 xfs_buf_t *bp,
46 char *id, 47 char *id,
47 void *data, 48 void *data,
48 void *ra) 49 void *ra)
49{ 50{
50 ktrace_enter(pagebuf_trace_buf, 51 ktrace_enter(xfs_buf_trace_buf,
51 pb, id, 52 bp, id,
52 (void *)(unsigned long)pb->pb_flags, 53 (void *)(unsigned long)bp->b_flags,
53 (void *)(unsigned long)pb->pb_hold.counter, 54 (void *)(unsigned long)bp->b_hold.counter,
54 (void *)(unsigned long)pb->pb_sema.count.counter, 55 (void *)(unsigned long)bp->b_sema.count.counter,
55 (void *)current, 56 (void *)current,
56 data, ra, 57 data, ra,
57 (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff), 58 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
58 (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff), 59 (void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
59 (void *)(unsigned long)pb->pb_buffer_length, 60 (void *)(unsigned long)bp->b_buffer_length,
60 NULL, NULL, NULL, NULL, NULL); 61 NULL, NULL, NULL, NULL, NULL);
61} 62}
62ktrace_t *pagebuf_trace_buf; 63ktrace_t *xfs_buf_trace_buf;
63#define PAGEBUF_TRACE_SIZE 4096 64#define XFS_BUF_TRACE_SIZE 4096
64#define PB_TRACE(pb, id, data) \ 65#define XB_TRACE(bp, id, data) \
65 pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0)) 66 xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
66#else 67#else
67#define PB_TRACE(pb, id, data) do { } while (0) 68#define XB_TRACE(bp, id, data) do { } while (0)
68#endif 69#endif
69 70
70#ifdef PAGEBUF_LOCK_TRACKING 71#ifdef XFS_BUF_LOCK_TRACKING
71# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid) 72# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
72# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1) 73# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
73# define PB_GET_OWNER(pb) ((pb)->pb_last_holder) 74# define XB_GET_OWNER(bp) ((bp)->b_last_holder)
74#else 75#else
75# define PB_SET_OWNER(pb) do { } while (0) 76# define XB_SET_OWNER(bp) do { } while (0)
76# define PB_CLEAR_OWNER(pb) do { } while (0) 77# define XB_CLEAR_OWNER(bp) do { } while (0)
77# define PB_GET_OWNER(pb) do { } while (0) 78# define XB_GET_OWNER(bp) do { } while (0)
78#endif 79#endif
79 80
80#define pb_to_gfp(flags) \ 81#define xb_to_gfp(flags) \
81 ((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \ 82 ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
82 ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) 83 ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
83 84
84#define pb_to_km(flags) \ 85#define xb_to_km(flags) \
85 (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) 86 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
86 87
87#define pagebuf_allocate(flags) \ 88#define xfs_buf_allocate(flags) \
88 kmem_zone_alloc(pagebuf_zone, pb_to_km(flags)) 89 kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
89#define pagebuf_deallocate(pb) \ 90#define xfs_buf_deallocate(bp) \
90 kmem_zone_free(pagebuf_zone, (pb)); 91 kmem_zone_free(xfs_buf_zone, (bp));
91 92
92/* 93/*
93 * Page Region interfaces. 94 * Page Region interfaces.
94 * 95 *
95 * For pages in filesystems where the blocksize is smaller than the 96 * For pages in filesystems where the blocksize is smaller than the
96 * pagesize, we use the page->private field (long) to hold a bitmap 97 * pagesize, we use the page->private field (long) to hold a bitmap
97 * of uptodate regions within the page. 98 * of uptodate regions within the page.
98 * 99 *
99 * Each such region is "bytes per page / bits per long" bytes long. 100 * Each such region is "bytes per page / bits per long" bytes long.
100 * 101 *
101 * NBPPR == number-of-bytes-per-page-region 102 * NBPPR == number-of-bytes-per-page-region
102 * BTOPR == bytes-to-page-region (rounded up) 103 * BTOPR == bytes-to-page-region (rounded up)
103 * BTOPRT == bytes-to-page-region-truncated (rounded down) 104 * BTOPRT == bytes-to-page-region-truncated (rounded down)
104 */ 105 */
105#if (BITS_PER_LONG == 32) 106#if (BITS_PER_LONG == 32)
106#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ 107#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
@@ -159,7 +160,7 @@ test_page_region(
159} 160}
160 161
161/* 162/*
162 * Mapping of multi-page buffers into contiguous virtual space 163 * Mapping of multi-page buffers into contiguous virtual space
163 */ 164 */
164 165
165typedef struct a_list { 166typedef struct a_list {
@@ -172,7 +173,7 @@ STATIC int as_list_len;
172STATIC DEFINE_SPINLOCK(as_lock); 173STATIC DEFINE_SPINLOCK(as_lock);
173 174
174/* 175/*
175 * Try to batch vunmaps because they are costly. 176 * Try to batch vunmaps because they are costly.
176 */ 177 */
177STATIC void 178STATIC void
178free_address( 179free_address(
@@ -215,83 +216,83 @@ purge_addresses(void)
215} 216}
216 217
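The a_list machinery above exists because vunmap() is expensive (global locking and TLB flushing), so unmap requests are queued and purged in one pass. A sketch of the drain side, assuming a_list_t carries next and vm_addr fields as in the driver code:

/* Sketch: drain a deferred-vunmap list in one batch. */
static void drain_deferred_vunmaps(a_list_t **listp)
{
	a_list_t *aentry = *listp;

	*listp = NULL;				/* detach under the caller's lock */
	while (aentry) {
		a_list_t *old = aentry;

		vunmap(aentry->vm_addr);	/* the costly call, now batched */
		aentry = aentry->next;
		kfree(old);
	}
}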
217/* 218/*
218 * Internal pagebuf object manipulation 219 * Internal xfs_buf_t object manipulation
219 */ 220 */
220 221
221STATIC void 222STATIC void
222_pagebuf_initialize( 223_xfs_buf_initialize(
223 xfs_buf_t *pb, 224 xfs_buf_t *bp,
224 xfs_buftarg_t *target, 225 xfs_buftarg_t *target,
225 loff_t range_base, 226 xfs_off_t range_base,
226 size_t range_length, 227 size_t range_length,
227 page_buf_flags_t flags) 228 xfs_buf_flags_t flags)
228{ 229{
229 /* 230 /*
230 * We don't want certain flags to appear in pb->pb_flags. 231 * We don't want certain flags to appear in b_flags.
231 */ 232 */
232 flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD); 233 flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
233 234
234 memset(pb, 0, sizeof(xfs_buf_t)); 235 memset(bp, 0, sizeof(xfs_buf_t));
235 atomic_set(&pb->pb_hold, 1); 236 atomic_set(&bp->b_hold, 1);
236 init_MUTEX_LOCKED(&pb->pb_iodonesema); 237 init_MUTEX_LOCKED(&bp->b_iodonesema);
237 INIT_LIST_HEAD(&pb->pb_list); 238 INIT_LIST_HEAD(&bp->b_list);
238 INIT_LIST_HEAD(&pb->pb_hash_list); 239 INIT_LIST_HEAD(&bp->b_hash_list);
239 init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */ 240 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
240 PB_SET_OWNER(pb); 241 XB_SET_OWNER(bp);
241 pb->pb_target = target; 242 bp->b_target = target;
242 pb->pb_file_offset = range_base; 243 bp->b_file_offset = range_base;
243 /* 244 /*
244 * Set buffer_length and count_desired to the same value initially. 245 * Set buffer_length and count_desired to the same value initially.
245 * I/O routines should use count_desired, which will be the same in 246 * I/O routines should use count_desired, which will be the same in
246 * most cases but may be reset (e.g. XFS recovery). 247 * most cases but may be reset (e.g. XFS recovery).
247 */ 248 */
248 pb->pb_buffer_length = pb->pb_count_desired = range_length; 249 bp->b_buffer_length = bp->b_count_desired = range_length;
249 pb->pb_flags = flags; 250 bp->b_flags = flags;
250 pb->pb_bn = XFS_BUF_DADDR_NULL; 251 bp->b_bn = XFS_BUF_DADDR_NULL;
251 atomic_set(&pb->pb_pin_count, 0); 252 atomic_set(&bp->b_pin_count, 0);
252 init_waitqueue_head(&pb->pb_waiters); 253 init_waitqueue_head(&bp->b_waiters);
253 254
254 XFS_STATS_INC(pb_create); 255 XFS_STATS_INC(xb_create);
255 PB_TRACE(pb, "initialize", target); 256 XB_TRACE(bp, "initialize", target);
256} 257}
257 258
258/* 259/*
259 * Allocate a page array capable of holding a specified number 260 * Allocate a page array capable of holding a specified number
260 * of pages, and point the page buf at it. 261 * of pages, and point the page buf at it.
261 */ 262 */
262STATIC int 263STATIC int
263_pagebuf_get_pages( 264_xfs_buf_get_pages(
264 xfs_buf_t *pb, 265 xfs_buf_t *bp,
265 int page_count, 266 int page_count,
266 page_buf_flags_t flags) 267 xfs_buf_flags_t flags)
267{ 268{
268 /* Make sure that we have a page list */ 269 /* Make sure that we have a page list */
269 if (pb->pb_pages == NULL) { 270 if (bp->b_pages == NULL) {
270 pb->pb_offset = page_buf_poff(pb->pb_file_offset); 271 bp->b_offset = xfs_buf_poff(bp->b_file_offset);
271 pb->pb_page_count = page_count; 272 bp->b_page_count = page_count;
272 if (page_count <= PB_PAGES) { 273 if (page_count <= XB_PAGES) {
273 pb->pb_pages = pb->pb_page_array; 274 bp->b_pages = bp->b_page_array;
274 } else { 275 } else {
275 pb->pb_pages = kmem_alloc(sizeof(struct page *) * 276 bp->b_pages = kmem_alloc(sizeof(struct page *) *
276 page_count, pb_to_km(flags)); 277 page_count, xb_to_km(flags));
277 if (pb->pb_pages == NULL) 278 if (bp->b_pages == NULL)
278 return -ENOMEM; 279 return -ENOMEM;
279 } 280 }
280 memset(pb->pb_pages, 0, sizeof(struct page *) * page_count); 281 memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
281 } 282 }
282 return 0; 283 return 0;
283} 284}
284 285
285/* 286/*
286 * Frees pb_pages if it was malloced. 287 * Frees b_pages if it was allocated.
287 */ 288 */
288STATIC void 289STATIC void
289_pagebuf_free_pages( 290_xfs_buf_free_pages(
290 xfs_buf_t *bp) 291 xfs_buf_t *bp)
291{ 292{
292 if (bp->pb_pages != bp->pb_page_array) { 293 if (bp->b_pages != bp->b_page_array) {
293 kmem_free(bp->pb_pages, 294 kmem_free(bp->b_pages,
294 bp->pb_page_count * sizeof(struct page *)); 295 bp->b_page_count * sizeof(struct page *));
295 } 296 }
296} 297}
297 298
@@ -299,79 +300,79 @@ _pagebuf_free_pages(
299 * Releases the specified buffer. 300 * Releases the specified buffer.
300 * 301 *
301 * The modification state of any associated pages is left unchanged. 302 * The modification state of any associated pages is left unchanged.
 302 * The buffer must not be on any hash - use pagebuf_rele instead for 303 * The buffer must not be on any hash - use xfs_buf_rele instead for
303 * hashed and refcounted buffers 304 * hashed and refcounted buffers
304 */ 305 */
305void 306void
306pagebuf_free( 307xfs_buf_free(
307 xfs_buf_t *bp) 308 xfs_buf_t *bp)
308{ 309{
309 PB_TRACE(bp, "free", 0); 310 XB_TRACE(bp, "free", 0);
310 311
311 ASSERT(list_empty(&bp->pb_hash_list)); 312 ASSERT(list_empty(&bp->b_hash_list));
312 313
313 if (bp->pb_flags & _PBF_PAGE_CACHE) { 314 if (bp->b_flags & _XBF_PAGE_CACHE) {
314 uint i; 315 uint i;
315 316
316 if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1)) 317 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
317 free_address(bp->pb_addr - bp->pb_offset); 318 free_address(bp->b_addr - bp->b_offset);
318 319
319 for (i = 0; i < bp->pb_page_count; i++) 320 for (i = 0; i < bp->b_page_count; i++)
320 page_cache_release(bp->pb_pages[i]); 321 page_cache_release(bp->b_pages[i]);
321 _pagebuf_free_pages(bp); 322 _xfs_buf_free_pages(bp);
322 } else if (bp->pb_flags & _PBF_KMEM_ALLOC) { 323 } else if (bp->b_flags & _XBF_KMEM_ALLOC) {
323 /* 324 /*
324 * XXX(hch): bp->pb_count_desired might be incorrect (see 325 * XXX(hch): bp->b_count_desired might be incorrect (see
325 * pagebuf_associate_memory for details), but fortunately 326 * xfs_buf_associate_memory for details), but fortunately
 326 * the Linux version of kmem_free ignores the len argument. 327 * the Linux version of kmem_free ignores the len argument.
327 */ 328 */
328 kmem_free(bp->pb_addr, bp->pb_count_desired); 329 kmem_free(bp->b_addr, bp->b_count_desired);
329 _pagebuf_free_pages(bp); 330 _xfs_buf_free_pages(bp);
330 } 331 }
331 332
332 pagebuf_deallocate(bp); 333 xfs_buf_deallocate(bp);
333} 334}
334 335
335/* 336/*
 336 * Finds all pages for buffer in question and builds its page list. 337 * Finds all pages for buffer in question and builds its page list.
337 */ 338 */
338STATIC int 339STATIC int
339_pagebuf_lookup_pages( 340_xfs_buf_lookup_pages(
340 xfs_buf_t *bp, 341 xfs_buf_t *bp,
341 uint flags) 342 uint flags)
342{ 343{
343 struct address_space *mapping = bp->pb_target->pbr_mapping; 344 struct address_space *mapping = bp->b_target->bt_mapping;
344 size_t blocksize = bp->pb_target->pbr_bsize; 345 size_t blocksize = bp->b_target->bt_bsize;
345 size_t size = bp->pb_count_desired; 346 size_t size = bp->b_count_desired;
346 size_t nbytes, offset; 347 size_t nbytes, offset;
347 gfp_t gfp_mask = pb_to_gfp(flags); 348 gfp_t gfp_mask = xb_to_gfp(flags);
348 unsigned short page_count, i; 349 unsigned short page_count, i;
349 pgoff_t first; 350 pgoff_t first;
350 loff_t end; 351 xfs_off_t end;
351 int error; 352 int error;
352 353
353 end = bp->pb_file_offset + bp->pb_buffer_length; 354 end = bp->b_file_offset + bp->b_buffer_length;
354 page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset); 355 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
355 356
356 error = _pagebuf_get_pages(bp, page_count, flags); 357 error = _xfs_buf_get_pages(bp, page_count, flags);
357 if (unlikely(error)) 358 if (unlikely(error))
358 return error; 359 return error;
359 bp->pb_flags |= _PBF_PAGE_CACHE; 360 bp->b_flags |= _XBF_PAGE_CACHE;
360 361
361 offset = bp->pb_offset; 362 offset = bp->b_offset;
362 first = bp->pb_file_offset >> PAGE_CACHE_SHIFT; 363 first = bp->b_file_offset >> PAGE_CACHE_SHIFT;
363 364
364 for (i = 0; i < bp->pb_page_count; i++) { 365 for (i = 0; i < bp->b_page_count; i++) {
365 struct page *page; 366 struct page *page;
366 uint retries = 0; 367 uint retries = 0;
367 368
368 retry: 369 retry:
369 page = find_or_create_page(mapping, first + i, gfp_mask); 370 page = find_or_create_page(mapping, first + i, gfp_mask);
370 if (unlikely(page == NULL)) { 371 if (unlikely(page == NULL)) {
371 if (flags & PBF_READ_AHEAD) { 372 if (flags & XBF_READ_AHEAD) {
372 bp->pb_page_count = i; 373 bp->b_page_count = i;
373 for (i = 0; i < bp->pb_page_count; i++) 374 for (i = 0; i < bp->b_page_count; i++)
374 unlock_page(bp->pb_pages[i]); 375 unlock_page(bp->b_pages[i]);
375 return -ENOMEM; 376 return -ENOMEM;
376 } 377 }
377 378
@@ -387,13 +388,13 @@ _pagebuf_lookup_pages(
387 "deadlock in %s (mode:0x%x)\n", 388 "deadlock in %s (mode:0x%x)\n",
388 __FUNCTION__, gfp_mask); 389 __FUNCTION__, gfp_mask);
389 390
390 XFS_STATS_INC(pb_page_retries); 391 XFS_STATS_INC(xb_page_retries);
391 xfsbufd_wakeup(0, gfp_mask); 392 xfsbufd_wakeup(0, gfp_mask);
392 blk_congestion_wait(WRITE, HZ/50); 393 blk_congestion_wait(WRITE, HZ/50);
393 goto retry; 394 goto retry;
394 } 395 }
395 396
396 XFS_STATS_INC(pb_page_found); 397 XFS_STATS_INC(xb_page_found);
397 398
398 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 399 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
399 size -= nbytes; 400 size -= nbytes;
@@ -401,27 +402,27 @@ _pagebuf_lookup_pages(
401 if (!PageUptodate(page)) { 402 if (!PageUptodate(page)) {
402 page_count--; 403 page_count--;
403 if (blocksize >= PAGE_CACHE_SIZE) { 404 if (blocksize >= PAGE_CACHE_SIZE) {
404 if (flags & PBF_READ) 405 if (flags & XBF_READ)
405 bp->pb_locked = 1; 406 bp->b_locked = 1;
406 } else if (!PagePrivate(page)) { 407 } else if (!PagePrivate(page)) {
407 if (test_page_region(page, offset, nbytes)) 408 if (test_page_region(page, offset, nbytes))
408 page_count++; 409 page_count++;
409 } 410 }
410 } 411 }
411 412
412 bp->pb_pages[i] = page; 413 bp->b_pages[i] = page;
413 offset = 0; 414 offset = 0;
414 } 415 }
415 416
416 if (!bp->pb_locked) { 417 if (!bp->b_locked) {
417 for (i = 0; i < bp->pb_page_count; i++) 418 for (i = 0; i < bp->b_page_count; i++)
418 unlock_page(bp->pb_pages[i]); 419 unlock_page(bp->b_pages[i]);
419 } 420 }
420 421
421 if (page_count == bp->pb_page_count) 422 if (page_count == bp->b_page_count)
422 bp->pb_flags |= PBF_DONE; 423 bp->b_flags |= XBF_DONE;
423 424
424 PB_TRACE(bp, "lookup_pages", (long)page_count); 425 XB_TRACE(bp, "lookup_pages", (long)page_count);
425 return error; 426 return error;
426} 427}
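_xfs_buf_lookup_pages() refuses to fail an ordinary allocation: readahead gives up immediately, but everything else wakes the delayed-write flusher and waits for write congestion to ease before retrying. The retry shape, reduced to a sketch with the same 2.6-era calls:

/* Sketch: retry a page-cache allocation under memory pressure. */
struct page *page;
uint retries = 0;

for (;;) {
	page = find_or_create_page(mapping, first + i, gfp_mask);
	if (page)
		break;
	if (++retries == 2)		/* warn once, keep trying */
		printk(KERN_ERR "possible memory allocation deadlock\n");
	xfsbufd_wakeup(0, gfp_mask);	/* push delayed-write buffers out */
	blk_congestion_wait(WRITE, HZ/50);
}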
427 428
@@ -429,23 +430,23 @@ _pagebuf_lookup_pages(
 429 * Map buffer into kernel address-space if necessary. 430 * Map buffer into kernel address-space if necessary.
 430 */ 431 */
431STATIC int 432STATIC int
432_pagebuf_map_pages( 433_xfs_buf_map_pages(
433 xfs_buf_t *bp, 434 xfs_buf_t *bp,
434 uint flags) 435 uint flags)
435{ 436{
436 /* A single page buffer is always mappable */ 437 /* A single page buffer is always mappable */
437 if (bp->pb_page_count == 1) { 438 if (bp->b_page_count == 1) {
438 bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset; 439 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
439 bp->pb_flags |= PBF_MAPPED; 440 bp->b_flags |= XBF_MAPPED;
440 } else if (flags & PBF_MAPPED) { 441 } else if (flags & XBF_MAPPED) {
441 if (as_list_len > 64) 442 if (as_list_len > 64)
442 purge_addresses(); 443 purge_addresses();
443 bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count, 444 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
444 VM_MAP, PAGE_KERNEL); 445 VM_MAP, PAGE_KERNEL);
445 if (unlikely(bp->pb_addr == NULL)) 446 if (unlikely(bp->b_addr == NULL))
446 return -ENOMEM; 447 return -ENOMEM;
447 bp->pb_addr += bp->pb_offset; 448 bp->b_addr += bp->b_offset;
448 bp->pb_flags |= PBF_MAPPED; 449 bp->b_flags |= XBF_MAPPED;
449 } 450 }
450 451
451 return 0; 452 return 0;
@@ -456,9 +457,7 @@ _pagebuf_map_pages(
456 */ 457 */
457 458
458/* 459/*
459 * _pagebuf_find 460 * Look up, and creates if absent, a lockable buffer for
460 *
461 * Looks up, and creates if absent, a lockable buffer for
462 * a given range of an inode. The buffer is returned 461 * a given range of an inode. The buffer is returned
463 * locked. If other overlapping buffers exist, they are 462 * locked. If other overlapping buffers exist, they are
464 * released before the new buffer is created and locked, 463 * released before the new buffer is created and locked,
@@ -466,55 +465,55 @@ _pagebuf_map_pages(
466 * are unlocked. No I/O is implied by this call. 465 * are unlocked. No I/O is implied by this call.
467 */ 466 */
468xfs_buf_t * 467xfs_buf_t *
469_pagebuf_find( 468_xfs_buf_find(
470 xfs_buftarg_t *btp, /* block device target */ 469 xfs_buftarg_t *btp, /* block device target */
471 loff_t ioff, /* starting offset of range */ 470 xfs_off_t ioff, /* starting offset of range */
472 size_t isize, /* length of range */ 471 size_t isize, /* length of range */
473 page_buf_flags_t flags, /* PBF_TRYLOCK */ 472 xfs_buf_flags_t flags,
474 xfs_buf_t *new_pb)/* newly allocated buffer */ 473 xfs_buf_t *new_bp)
475{ 474{
476 loff_t range_base; 475 xfs_off_t range_base;
477 size_t range_length; 476 size_t range_length;
478 xfs_bufhash_t *hash; 477 xfs_bufhash_t *hash;
479 xfs_buf_t *pb, *n; 478 xfs_buf_t *bp, *n;
480 479
481 range_base = (ioff << BBSHIFT); 480 range_base = (ioff << BBSHIFT);
482 range_length = (isize << BBSHIFT); 481 range_length = (isize << BBSHIFT);
483 482
484 /* Check for IOs smaller than the sector size / not sector aligned */ 483 /* Check for IOs smaller than the sector size / not sector aligned */
485 ASSERT(!(range_length < (1 << btp->pbr_sshift))); 484 ASSERT(!(range_length < (1 << btp->bt_sshift)));
486 ASSERT(!(range_base & (loff_t)btp->pbr_smask)); 485 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
487 486
488 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; 487 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
489 488
490 spin_lock(&hash->bh_lock); 489 spin_lock(&hash->bh_lock);
491 490
492 list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) { 491 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
493 ASSERT(btp == pb->pb_target); 492 ASSERT(btp == bp->b_target);
494 if (pb->pb_file_offset == range_base && 493 if (bp->b_file_offset == range_base &&
495 pb->pb_buffer_length == range_length) { 494 bp->b_buffer_length == range_length) {
496 /* 495 /*
497 * If we look at something bring it to the 496 * If we look at something, bring it to the
498 * front of the list for next time. 497 * front of the list for next time.
499 */ 498 */
500 atomic_inc(&pb->pb_hold); 499 atomic_inc(&bp->b_hold);
501 list_move(&pb->pb_hash_list, &hash->bh_list); 500 list_move(&bp->b_hash_list, &hash->bh_list);
502 goto found; 501 goto found;
503 } 502 }
504 } 503 }
505 504
506 /* No match found */ 505 /* No match found */
507 if (new_pb) { 506 if (new_bp) {
508 _pagebuf_initialize(new_pb, btp, range_base, 507 _xfs_buf_initialize(new_bp, btp, range_base,
509 range_length, flags); 508 range_length, flags);
510 new_pb->pb_hash = hash; 509 new_bp->b_hash = hash;
511 list_add(&new_pb->pb_hash_list, &hash->bh_list); 510 list_add(&new_bp->b_hash_list, &hash->bh_list);
512 } else { 511 } else {
513 XFS_STATS_INC(pb_miss_locked); 512 XFS_STATS_INC(xb_miss_locked);
514 } 513 }
515 514
516 spin_unlock(&hash->bh_lock); 515 spin_unlock(&hash->bh_lock);
517 return new_pb; 516 return new_bp;
518 517
519found: 518found:
520 spin_unlock(&hash->bh_lock); 519 spin_unlock(&hash->bh_lock);
@@ -523,74 +522,72 @@ found:
523 * if this does not work then we need to drop the 522 * if this does not work then we need to drop the
524 * spinlock and do a hard attempt on the semaphore. 523 * spinlock and do a hard attempt on the semaphore.
525 */ 524 */
526 if (down_trylock(&pb->pb_sema)) { 525 if (down_trylock(&bp->b_sema)) {
527 if (!(flags & PBF_TRYLOCK)) { 526 if (!(flags & XBF_TRYLOCK)) {
528 /* wait for buffer ownership */ 527 /* wait for buffer ownership */
529 PB_TRACE(pb, "get_lock", 0); 528 XB_TRACE(bp, "get_lock", 0);
530 pagebuf_lock(pb); 529 xfs_buf_lock(bp);
531 XFS_STATS_INC(pb_get_locked_waited); 530 XFS_STATS_INC(xb_get_locked_waited);
532 } else { 531 } else {
533 /* We asked for a trylock and failed, no need 532 /* We asked for a trylock and failed, no need
534 * to look at file offset and length here, we 533 * to look at file offset and length here, we
535 * know that this pagebuf at least overlaps our 534 * know that this buffer at least overlaps our
536 * pagebuf and is locked, therefore our buffer 535 * buffer and is locked, therefore our buffer
537 * either does not exist, or is this buffer 536 * either does not exist, or is this buffer.
538 */ 537 */
539 538 xfs_buf_rele(bp);
540 pagebuf_rele(pb); 539 XFS_STATS_INC(xb_busy_locked);
541 XFS_STATS_INC(pb_busy_locked); 540 return NULL;
542 return (NULL);
543 } 541 }
544 } else { 542 } else {
545 /* trylock worked */ 543 /* trylock worked */
546 PB_SET_OWNER(pb); 544 XB_SET_OWNER(bp);
547 } 545 }
548 546
549 if (pb->pb_flags & PBF_STALE) { 547 if (bp->b_flags & XBF_STALE) {
550 ASSERT((pb->pb_flags & _PBF_DELWRI_Q) == 0); 548 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
551 pb->pb_flags &= PBF_MAPPED; 549 bp->b_flags &= XBF_MAPPED;
552 } 550 }
553 PB_TRACE(pb, "got_lock", 0); 551 XB_TRACE(bp, "got_lock", 0);
554 XFS_STATS_INC(pb_get_locked); 552 XFS_STATS_INC(xb_get_locked);
555 return (pb); 553 return bp;
556} 554}
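The tail of _xfs_buf_find() is a trylock-with-fallback: take the semaphore opportunistically while it is cheap, and only block when the caller did not ask for XBF_TRYLOCK semantics. Condensed from the logic above:

/* Sketch: opportunistic buffer lock with optional blocking fallback. */
if (down_trylock(&bp->b_sema)) {
	if (flags & XBF_TRYLOCK) {
		xfs_buf_rele(bp);	/* drop the hold taken during lookup */
		return NULL;		/* caller backs off and retries */
	}
	xfs_buf_lock(bp);		/* block until the owner releases it */
}
XB_SET_OWNER(bp);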
557 555
558/* 556/*
559 * xfs_buf_get_flags assembles a buffer covering the specified range. 557 * Assembles a buffer covering the specified range.
560 *
561 * Storage in memory for all portions of the buffer will be allocated, 558 * Storage in memory for all portions of the buffer will be allocated,
562 * although backing storage may not be. 559 * although backing storage may not be.
563 */ 560 */
564xfs_buf_t * 561xfs_buf_t *
565xfs_buf_get_flags( /* allocate a buffer */ 562xfs_buf_get_flags(
566 xfs_buftarg_t *target,/* target for buffer */ 563 xfs_buftarg_t *target,/* target for buffer */
567 loff_t ioff, /* starting offset of range */ 564 xfs_off_t ioff, /* starting offset of range */
568 size_t isize, /* length of range */ 565 size_t isize, /* length of range */
569 page_buf_flags_t flags) /* PBF_TRYLOCK */ 566 xfs_buf_flags_t flags)
570{ 567{
571 xfs_buf_t *pb, *new_pb; 568 xfs_buf_t *bp, *new_bp;
572 int error = 0, i; 569 int error = 0, i;
573 570
574 new_pb = pagebuf_allocate(flags); 571 new_bp = xfs_buf_allocate(flags);
575 if (unlikely(!new_pb)) 572 if (unlikely(!new_bp))
576 return NULL; 573 return NULL;
577 574
578 pb = _pagebuf_find(target, ioff, isize, flags, new_pb); 575 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
579 if (pb == new_pb) { 576 if (bp == new_bp) {
580 error = _pagebuf_lookup_pages(pb, flags); 577 error = _xfs_buf_lookup_pages(bp, flags);
581 if (error) 578 if (error)
582 goto no_buffer; 579 goto no_buffer;
583 } else { 580 } else {
584 pagebuf_deallocate(new_pb); 581 xfs_buf_deallocate(new_bp);
585 if (unlikely(pb == NULL)) 582 if (unlikely(bp == NULL))
586 return NULL; 583 return NULL;
587 } 584 }
588 585
589 for (i = 0; i < pb->pb_page_count; i++) 586 for (i = 0; i < bp->b_page_count; i++)
590 mark_page_accessed(pb->pb_pages[i]); 587 mark_page_accessed(bp->b_pages[i]);
591 588
592 if (!(pb->pb_flags & PBF_MAPPED)) { 589 if (!(bp->b_flags & XBF_MAPPED)) {
593 error = _pagebuf_map_pages(pb, flags); 590 error = _xfs_buf_map_pages(bp, flags);
594 if (unlikely(error)) { 591 if (unlikely(error)) {
595 printk(KERN_WARNING "%s: failed to map pages\n", 592 printk(KERN_WARNING "%s: failed to map pages\n",
596 __FUNCTION__); 593 __FUNCTION__);
@@ -598,97 +595,97 @@ xfs_buf_get_flags( /* allocate a buffer */
598 } 595 }
599 } 596 }
600 597
601 XFS_STATS_INC(pb_get); 598 XFS_STATS_INC(xb_get);
602 599
603 /* 600 /*
604 * Always fill in the block number now, the mapped cases can do 601 * Always fill in the block number now, the mapped cases can do
605 * their own overlay of this later. 602 * their own overlay of this later.
606 */ 603 */
607 pb->pb_bn = ioff; 604 bp->b_bn = ioff;
608 pb->pb_count_desired = pb->pb_buffer_length; 605 bp->b_count_desired = bp->b_buffer_length;
609 606
610 PB_TRACE(pb, "get", (unsigned long)flags); 607 XB_TRACE(bp, "get", (unsigned long)flags);
611 return pb; 608 return bp;
612 609
613 no_buffer: 610 no_buffer:
614 if (flags & (PBF_LOCK | PBF_TRYLOCK)) 611 if (flags & (XBF_LOCK | XBF_TRYLOCK))
615 pagebuf_unlock(pb); 612 xfs_buf_unlock(bp);
616 pagebuf_rele(pb); 613 xfs_buf_rele(bp);
617 return NULL; 614 return NULL;
618} 615}
619 616
620xfs_buf_t * 617xfs_buf_t *
621xfs_buf_read_flags( 618xfs_buf_read_flags(
622 xfs_buftarg_t *target, 619 xfs_buftarg_t *target,
623 loff_t ioff, 620 xfs_off_t ioff,
624 size_t isize, 621 size_t isize,
625 page_buf_flags_t flags) 622 xfs_buf_flags_t flags)
626{ 623{
627 xfs_buf_t *pb; 624 xfs_buf_t *bp;
628 625
629 flags |= PBF_READ; 626 flags |= XBF_READ;
630 627
631 pb = xfs_buf_get_flags(target, ioff, isize, flags); 628 bp = xfs_buf_get_flags(target, ioff, isize, flags);
632 if (pb) { 629 if (bp) {
633 if (!XFS_BUF_ISDONE(pb)) { 630 if (!XFS_BUF_ISDONE(bp)) {
634 PB_TRACE(pb, "read", (unsigned long)flags); 631 XB_TRACE(bp, "read", (unsigned long)flags);
635 XFS_STATS_INC(pb_get_read); 632 XFS_STATS_INC(xb_get_read);
636 pagebuf_iostart(pb, flags); 633 xfs_buf_iostart(bp, flags);
637 } else if (flags & PBF_ASYNC) { 634 } else if (flags & XBF_ASYNC) {
638 PB_TRACE(pb, "read_async", (unsigned long)flags); 635 XB_TRACE(bp, "read_async", (unsigned long)flags);
639 /* 636 /*
640 * Read ahead call which is already satisfied, 637 * Read ahead call which is already satisfied,
641 * drop the buffer 638 * drop the buffer
642 */ 639 */
643 goto no_buffer; 640 goto no_buffer;
644 } else { 641 } else {
645 PB_TRACE(pb, "read_done", (unsigned long)flags); 642 XB_TRACE(bp, "read_done", (unsigned long)flags);
646 /* We do not want read in the flags */ 643 /* We do not want read in the flags */
647 pb->pb_flags &= ~PBF_READ; 644 bp->b_flags &= ~XBF_READ;
648 } 645 }
649 } 646 }
650 647
651 return pb; 648 return bp;
652 649
653 no_buffer: 650 no_buffer:
654 if (flags & (PBF_LOCK | PBF_TRYLOCK)) 651 if (flags & (XBF_LOCK | XBF_TRYLOCK))
655 pagebuf_unlock(pb); 652 xfs_buf_unlock(bp);
656 pagebuf_rele(pb); 653 xfs_buf_rele(bp);
657 return NULL; 654 return NULL;
658} 655}
659 656
660/* 657/*
661 * If we are not low on memory then do the readahead in a deadlock 658 * If we are not low on memory then do the readahead in a deadlock
662 * safe manner. 659 * safe manner.
663 */ 660 */
664void 661void
665pagebuf_readahead( 662xfs_buf_readahead(
666 xfs_buftarg_t *target, 663 xfs_buftarg_t *target,
667 loff_t ioff, 664 xfs_off_t ioff,
668 size_t isize, 665 size_t isize,
669 page_buf_flags_t flags) 666 xfs_buf_flags_t flags)
670{ 667{
671 struct backing_dev_info *bdi; 668 struct backing_dev_info *bdi;
672 669
673 bdi = target->pbr_mapping->backing_dev_info; 670 bdi = target->bt_mapping->backing_dev_info;
674 if (bdi_read_congested(bdi)) 671 if (bdi_read_congested(bdi))
675 return; 672 return;
676 673
677 flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD); 674 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
678 xfs_buf_read_flags(target, ioff, isize, flags); 675 xfs_buf_read_flags(target, ioff, isize, flags);
679} 676}
680 677
681xfs_buf_t * 678xfs_buf_t *
682pagebuf_get_empty( 679xfs_buf_get_empty(
683 size_t len, 680 size_t len,
684 xfs_buftarg_t *target) 681 xfs_buftarg_t *target)
685{ 682{
686 xfs_buf_t *pb; 683 xfs_buf_t *bp;
687 684
688 pb = pagebuf_allocate(0); 685 bp = xfs_buf_allocate(0);
689 if (pb) 686 if (bp)
690 _pagebuf_initialize(pb, target, 0, len, 0); 687 _xfs_buf_initialize(bp, target, 0, len, 0);
691 return pb; 688 return bp;
692} 689}
693 690
694static inline struct page * 691static inline struct page *
@@ -704,8 +701,8 @@ mem_to_page(
704} 701}
705 702
706int 703int
707pagebuf_associate_memory( 704xfs_buf_associate_memory(
708 xfs_buf_t *pb, 705 xfs_buf_t *bp,
709 void *mem, 706 void *mem,
710 size_t len) 707 size_t len)
711{ 708{
@@ -722,40 +719,40 @@ pagebuf_associate_memory(
722 page_count++; 719 page_count++;
723 720
724 /* Free any previous set of page pointers */ 721 /* Free any previous set of page pointers */
725 if (pb->pb_pages) 722 if (bp->b_pages)
726 _pagebuf_free_pages(pb); 723 _xfs_buf_free_pages(bp);
727 724
728 pb->pb_pages = NULL; 725 bp->b_pages = NULL;
729 pb->pb_addr = mem; 726 bp->b_addr = mem;
730 727
731 rval = _pagebuf_get_pages(pb, page_count, 0); 728 rval = _xfs_buf_get_pages(bp, page_count, 0);
732 if (rval) 729 if (rval)
733 return rval; 730 return rval;
734 731
735 pb->pb_offset = offset; 732 bp->b_offset = offset;
736 ptr = (size_t) mem & PAGE_CACHE_MASK; 733 ptr = (size_t) mem & PAGE_CACHE_MASK;
737 end = PAGE_CACHE_ALIGN((size_t) mem + len); 734 end = PAGE_CACHE_ALIGN((size_t) mem + len);
738 end_cur = end; 735 end_cur = end;
739 /* set up first page */ 736 /* set up first page */
740 pb->pb_pages[0] = mem_to_page(mem); 737 bp->b_pages[0] = mem_to_page(mem);
741 738
742 ptr += PAGE_CACHE_SIZE; 739 ptr += PAGE_CACHE_SIZE;
743 pb->pb_page_count = ++i; 740 bp->b_page_count = ++i;
744 while (ptr < end) { 741 while (ptr < end) {
745 pb->pb_pages[i] = mem_to_page((void *)ptr); 742 bp->b_pages[i] = mem_to_page((void *)ptr);
746 pb->pb_page_count = ++i; 743 bp->b_page_count = ++i;
747 ptr += PAGE_CACHE_SIZE; 744 ptr += PAGE_CACHE_SIZE;
748 } 745 }
749 pb->pb_locked = 0; 746 bp->b_locked = 0;
750 747
751 pb->pb_count_desired = pb->pb_buffer_length = len; 748 bp->b_count_desired = bp->b_buffer_length = len;
752 pb->pb_flags |= PBF_MAPPED; 749 bp->b_flags |= XBF_MAPPED;
753 750
754 return 0; 751 return 0;
755} 752}
756 753
757xfs_buf_t * 754xfs_buf_t *
758pagebuf_get_no_daddr( 755xfs_buf_get_noaddr(
759 size_t len, 756 size_t len,
760 xfs_buftarg_t *target) 757 xfs_buftarg_t *target)
761{ 758{
@@ -764,10 +761,10 @@ pagebuf_get_no_daddr(
764 void *data; 761 void *data;
765 int error; 762 int error;
766 763
767 bp = pagebuf_allocate(0); 764 bp = xfs_buf_allocate(0);
768 if (unlikely(bp == NULL)) 765 if (unlikely(bp == NULL))
769 goto fail; 766 goto fail;
770 _pagebuf_initialize(bp, target, 0, len, 0); 767 _xfs_buf_initialize(bp, target, 0, len, 0);
771 768
772 try_again: 769 try_again:
773 data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL); 770 data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
@@ -776,78 +773,73 @@ pagebuf_get_no_daddr(
776 773
777 /* check whether alignment matches.. */ 774 /* check whether alignment matches.. */
778 if ((__psunsigned_t)data != 775 if ((__psunsigned_t)data !=
779 ((__psunsigned_t)data & ~target->pbr_smask)) { 776 ((__psunsigned_t)data & ~target->bt_smask)) {
780 /* .. else double the size and try again */ 777 /* .. else double the size and try again */
781 kmem_free(data, malloc_len); 778 kmem_free(data, malloc_len);
782 malloc_len <<= 1; 779 malloc_len <<= 1;
783 goto try_again; 780 goto try_again;
784 } 781 }
785 782
786 error = pagebuf_associate_memory(bp, data, len); 783 error = xfs_buf_associate_memory(bp, data, len);
787 if (error) 784 if (error)
788 goto fail_free_mem; 785 goto fail_free_mem;
789 bp->pb_flags |= _PBF_KMEM_ALLOC; 786 bp->b_flags |= _XBF_KMEM_ALLOC;
790 787
791 pagebuf_unlock(bp); 788 xfs_buf_unlock(bp);
792 789
793 PB_TRACE(bp, "no_daddr", data); 790 XB_TRACE(bp, "no_daddr", data);
794 return bp; 791 return bp;
795 fail_free_mem: 792 fail_free_mem:
796 kmem_free(data, malloc_len); 793 kmem_free(data, malloc_len);
797 fail_free_buf: 794 fail_free_buf:
798 pagebuf_free(bp); 795 xfs_buf_free(bp);
799 fail: 796 fail:
800 return NULL; 797 return NULL;
801} 798}
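To make the alignment retry above concrete, assume 512-byte sectors (bt_smask == 0x1ff); the addresses are illustrative only:

	/*
	 * data == 0x...1200: data & ~0x1ff == 0x...1200 -> aligned, keep it.
	 * data == 0x...1234: data & ~0x1ff == 0x...1200 -> mismatch, so free
	 *                    it, double malloc_len, and allocate again.  The
	 *                    idea is that large enough allocations come back
	 *                    page-aligned, which satisfies any sector mask.
	 */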
802 799
803/* 800/*
804 * pagebuf_hold
805 *
806 * Increment reference count on buffer, to hold the buffer concurrently 801 * Increment reference count on buffer, to hold the buffer concurrently
807 * with another thread which may release (free) the buffer asynchronously. 802 * with another thread which may release (free) the buffer asynchronously.
808 *
809 * Must hold the buffer already to call this function. 803 * Must hold the buffer already to call this function.
810 */ 804 */
811void 805void
812pagebuf_hold( 806xfs_buf_hold(
813 xfs_buf_t *pb) 807 xfs_buf_t *bp)
814{ 808{
815 atomic_inc(&pb->pb_hold); 809 atomic_inc(&bp->b_hold);
816 PB_TRACE(pb, "hold", 0); 810 XB_TRACE(bp, "hold", 0);
817} 811}
818 812
819/* 813/*
820 * pagebuf_rele 814 * Releases a hold on the specified buffer. If the
 821 * 815 * hold count is 1, calls xfs_buf_free.
 822 * pagebuf_rele releases a hold on the specified buffer. If the
 823 * hold count is 1, pagebuf_rele calls pagebuf_free.
824 */ 816 */
825void 817void
826pagebuf_rele( 818xfs_buf_rele(
827 xfs_buf_t *pb) 819 xfs_buf_t *bp)
828{ 820{
829 xfs_bufhash_t *hash = pb->pb_hash; 821 xfs_bufhash_t *hash = bp->b_hash;
830 822
831 PB_TRACE(pb, "rele", pb->pb_relse); 823 XB_TRACE(bp, "rele", bp->b_relse);
832 824
833 if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) { 825 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
834 if (pb->pb_relse) { 826 if (bp->b_relse) {
835 atomic_inc(&pb->pb_hold); 827 atomic_inc(&bp->b_hold);
836 spin_unlock(&hash->bh_lock); 828 spin_unlock(&hash->bh_lock);
837 (*(pb->pb_relse)) (pb); 829 (*(bp->b_relse)) (bp);
838 } else if (pb->pb_flags & PBF_FS_MANAGED) { 830 } else if (bp->b_flags & XBF_FS_MANAGED) {
839 spin_unlock(&hash->bh_lock); 831 spin_unlock(&hash->bh_lock);
840 } else { 832 } else {
841 ASSERT(!(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q))); 833 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
842 list_del_init(&pb->pb_hash_list); 834 list_del_init(&bp->b_hash_list);
843 spin_unlock(&hash->bh_lock); 835 spin_unlock(&hash->bh_lock);
844 pagebuf_free(pb); 836 xfs_buf_free(bp);
845 } 837 }
846 } else { 838 } else {
847 /* 839 /*
848 * Catch reference count leaks 840 * Catch reference count leaks
849 */ 841 */
850 ASSERT(atomic_read(&pb->pb_hold) >= 0); 842 ASSERT(atomic_read(&bp->b_hold) >= 0);
851 } 843 }
852} 844}
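A sketch of the hold discipline these two routines implement (hypothetical caller; "hand_off_to_async_user" is a placeholder, not a name from this patch): take an extra hold before handing the buffer to an asynchronous consumer, so the local pointer stays valid until we drop our own reference:

	xfs_buf_hold(bp);		/* b_hold++, keeps bp alive */
	hand_off_to_async_user(bp);	/* placeholder: may xfs_buf_rele() */
	/* ... bp is still safe to use here ... */
	xfs_buf_rele(bp);		/* drop ours; the last ref frees it
					 * or runs b_relse */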
853 845
@@ -863,168 +855,122 @@ pagebuf_rele(
863 */ 855 */
864 856
865/* 857/*
866 * pagebuf_cond_lock 858 * Locks a buffer object, if it is not already locked.
867 * 859 * Note that this in no way locks the underlying pages, so it is only
868 * pagebuf_cond_lock locks a buffer object, if it is not already locked. 860 * useful for synchronizing concurrent use of buffer objects, not for
869 * Note that this in no way 861 * synchronizing independent access to the underlying pages.
870 * locks the underlying pages, so it is only useful for synchronizing
871 * concurrent use of page buffer objects, not for synchronizing independent
872 * access to the underlying pages.
873 */ 862 */
874int 863int
875pagebuf_cond_lock( /* lock buffer, if not locked */ 864xfs_buf_cond_lock(
876 /* returns -EBUSY if locked) */ 865 xfs_buf_t *bp)
877 xfs_buf_t *pb)
878{ 866{
879 int locked; 867 int locked;
880 868
881 locked = down_trylock(&pb->pb_sema) == 0; 869 locked = down_trylock(&bp->b_sema) == 0;
882 if (locked) { 870 if (locked) {
883 PB_SET_OWNER(pb); 871 XB_SET_OWNER(bp);
884 } 872 }
885 PB_TRACE(pb, "cond_lock", (long)locked); 873 XB_TRACE(bp, "cond_lock", (long)locked);
886 return(locked ? 0 : -EBUSY); 874 return locked ? 0 : -EBUSY;
887} 875}
888 876
889#if defined(DEBUG) || defined(XFS_BLI_TRACE) 877#if defined(DEBUG) || defined(XFS_BLI_TRACE)
890/*
891 * pagebuf_lock_value
892 *
893 * Return lock value for a pagebuf
894 */
895int 878int
896pagebuf_lock_value( 879xfs_buf_lock_value(
897 xfs_buf_t *pb) 880 xfs_buf_t *bp)
898{ 881{
899 return(atomic_read(&pb->pb_sema.count)); 882 return atomic_read(&bp->b_sema.count);
900} 883}
901#endif 884#endif
902 885
903/* 886/*
904 * pagebuf_lock 887 * Locks a buffer object.
905 * 888 * Note that this in no way locks the underlying pages, so it is only
906 * pagebuf_lock locks a buffer object. Note that this in no way 889 * useful for synchronizing concurrent use of buffer objects, not for
907 * locks the underlying pages, so it is only useful for synchronizing 890 * synchronizing independent access to the underlying pages.
908 * concurrent use of page buffer objects, not for synchronizing independent
909 * access to the underlying pages.
910 */ 891 */
911int 892void
912pagebuf_lock( 893xfs_buf_lock(
913 xfs_buf_t *pb) 894 xfs_buf_t *bp)
914{ 895{
915 PB_TRACE(pb, "lock", 0); 896 XB_TRACE(bp, "lock", 0);
916 if (atomic_read(&pb->pb_io_remaining)) 897 if (atomic_read(&bp->b_io_remaining))
917 blk_run_address_space(pb->pb_target->pbr_mapping); 898 blk_run_address_space(bp->b_target->bt_mapping);
918 down(&pb->pb_sema); 899 down(&bp->b_sema);
919 PB_SET_OWNER(pb); 900 XB_SET_OWNER(bp);
920 PB_TRACE(pb, "locked", 0); 901 XB_TRACE(bp, "locked", 0);
921 return 0;
922} 902}
923 903
924/* 904/*
925 * pagebuf_unlock 905 * Releases the lock on the buffer object.
926 *
927 * pagebuf_unlock releases the lock on the buffer object created by
928 * pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying pages
929 * created by pagebuf_pin).
930 *
931 * If the buffer is marked delwri but is not queued, do so before we 906 * If the buffer is marked delwri but is not queued, do so before we
932 * unlock the buffer as we need to set flags correctly. We also need to 907 * unlock the buffer as we need to set flags correctly. We also need to
933 * take a reference for the delwri queue because the unlocker is going to 908 * take a reference for the delwri queue because the unlocker is going to
 934 * drop theirs and they don't know we just queued it. 909 * drop theirs and they don't know we just queued it.
935 */ 910 */
936void 911void
937pagebuf_unlock( /* unlock buffer */ 912xfs_buf_unlock(
938 xfs_buf_t *pb) /* buffer to unlock */ 913 xfs_buf_t *bp)
939{ 914{
940 if ((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == PBF_DELWRI) { 915 if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
941 atomic_inc(&pb->pb_hold); 916 atomic_inc(&bp->b_hold);
942 pb->pb_flags |= PBF_ASYNC; 917 bp->b_flags |= XBF_ASYNC;
943 pagebuf_delwri_queue(pb, 0); 918 xfs_buf_delwri_queue(bp, 0);
944 } 919 }
945 920
946 PB_CLEAR_OWNER(pb); 921 XB_CLEAR_OWNER(bp);
947 up(&pb->pb_sema); 922 up(&bp->b_sema);
948 PB_TRACE(pb, "unlock", 0); 923 XB_TRACE(bp, "unlock", 0);
949} 924}
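Taken together, the three routines support the usual trylock-then-block pattern; a minimal sketch (hypothetical caller, not from this patch):

	STATIC int
	modify_buf(
		xfs_buf_t	*bp,
		int		can_block)
	{
		if (xfs_buf_cond_lock(bp)) {	/* returned -EBUSY */
			if (!can_block)
				return -EBUSY;
			xfs_buf_lock(bp);	/* sleep on b_sema */
		}
		/* ... update the buffer contents ... */
		xfs_buf_unlock(bp);	/* may also delwri-queue the buffer,
					 * per the comment above */
		return 0;
	}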
950 925
951 926
952/* 927/*
953 * Pinning Buffer Storage in Memory 928 * Pinning Buffer Storage in Memory
954 */ 929 * Ensure that no attempt to force a buffer to disk will succeed.
955
956/*
957 * pagebuf_pin
958 *
959 * pagebuf_pin locks all of the memory represented by a buffer in
960 * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for
961 * the same or different buffers affecting a given page, will
962 * properly count the number of outstanding "pin" requests. The
963 * buffer may be released after the pagebuf_pin and a different
964 * buffer used when calling pagebuf_unpin, if desired.
965 * pagebuf_pin should be used by the file system when it wants be
966 * assured that no attempt will be made to force the affected
967 * memory to disk. It does not assure that a given logical page
968 * will not be moved to a different physical page.
969 */ 930 */
970void 931void
971pagebuf_pin( 932xfs_buf_pin(
972 xfs_buf_t *pb) 933 xfs_buf_t *bp)
973{ 934{
974 atomic_inc(&pb->pb_pin_count); 935 atomic_inc(&bp->b_pin_count);
975 PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter); 936 XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
976} 937}
977 938
978/*
979 * pagebuf_unpin
980 *
981 * pagebuf_unpin reverses the locking of memory performed by
982 * pagebuf_pin. Note that both functions affected the logical
983 * pages associated with the buffer, not the buffer itself.
984 */
985void 939void
986pagebuf_unpin( 940xfs_buf_unpin(
987 xfs_buf_t *pb) 941 xfs_buf_t *bp)
988{ 942{
989 if (atomic_dec_and_test(&pb->pb_pin_count)) { 943 if (atomic_dec_and_test(&bp->b_pin_count))
990 wake_up_all(&pb->pb_waiters); 944 wake_up_all(&bp->b_waiters);
991 } 945 XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
992 PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
993} 946}
994 947
995int 948int
996pagebuf_ispin( 949xfs_buf_ispin(
997 xfs_buf_t *pb) 950 xfs_buf_t *bp)
998{ 951{
999 return atomic_read(&pb->pb_pin_count); 952 return atomic_read(&bp->b_pin_count);
1000} 953}
1001 954
1002/* 955STATIC void
1003 * pagebuf_wait_unpin 956xfs_buf_wait_unpin(
1004 * 957 xfs_buf_t *bp)
1005 * pagebuf_wait_unpin waits until all of the memory associated
 1006 * with the buffer is no longer locked in memory. It returns
1007 * immediately if none of the affected pages are locked.
1008 */
1009static inline void
1010_pagebuf_wait_unpin(
1011 xfs_buf_t *pb)
1012{ 958{
1013 DECLARE_WAITQUEUE (wait, current); 959 DECLARE_WAITQUEUE (wait, current);
1014 960
1015 if (atomic_read(&pb->pb_pin_count) == 0) 961 if (atomic_read(&bp->b_pin_count) == 0)
1016 return; 962 return;
1017 963
1018 add_wait_queue(&pb->pb_waiters, &wait); 964 add_wait_queue(&bp->b_waiters, &wait);
1019 for (;;) { 965 for (;;) {
1020 set_current_state(TASK_UNINTERRUPTIBLE); 966 set_current_state(TASK_UNINTERRUPTIBLE);
1021 if (atomic_read(&pb->pb_pin_count) == 0) 967 if (atomic_read(&bp->b_pin_count) == 0)
1022 break; 968 break;
1023 if (atomic_read(&pb->pb_io_remaining)) 969 if (atomic_read(&bp->b_io_remaining))
1024 blk_run_address_space(pb->pb_target->pbr_mapping); 970 blk_run_address_space(bp->b_target->bt_mapping);
1025 schedule(); 971 schedule();
1026 } 972 }
1027 remove_wait_queue(&pb->pb_waiters, &wait); 973 remove_wait_queue(&bp->b_waiters, &wait);
1028 set_current_state(TASK_RUNNING); 974 set_current_state(TASK_RUNNING);
1029} 975}
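The pin protocol in a nutshell (hypothetical caller; in XFS proper the log item code plays this role): pin across the window where the on-disk image must not change, and the write path blocks in xfs_buf_wait_unpin() until the count drops back to zero:

	xfs_buf_pin(bp);	/* b_pin_count++: writes now block */
	/* ... commit the change to the log; disk image stays frozen ... */
	xfs_buf_unpin(bp);	/* wakes b_waiters when the count hits zero */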
1030 976
@@ -1032,241 +978,216 @@ _pagebuf_wait_unpin(
1032 * Buffer Utility Routines 978 * Buffer Utility Routines
1033 */ 979 */
1034 980
1035/*
1036 * pagebuf_iodone
1037 *
1038 * pagebuf_iodone marks a buffer for which I/O is in progress
1039 * done with respect to that I/O. The pb_iodone routine, if
1040 * present, will be called as a side-effect.
1041 */
1042STATIC void 981STATIC void
1043pagebuf_iodone_work( 982xfs_buf_iodone_work(
1044 void *v) 983 void *v)
1045{ 984{
1046 xfs_buf_t *bp = (xfs_buf_t *)v; 985 xfs_buf_t *bp = (xfs_buf_t *)v;
1047 986
1048 if (bp->pb_iodone) 987 if (bp->b_iodone)
1049 (*(bp->pb_iodone))(bp); 988 (*(bp->b_iodone))(bp);
1050 else if (bp->pb_flags & PBF_ASYNC) 989 else if (bp->b_flags & XBF_ASYNC)
1051 xfs_buf_relse(bp); 990 xfs_buf_relse(bp);
1052} 991}
1053 992
1054void 993void
1055pagebuf_iodone( 994xfs_buf_ioend(
1056 xfs_buf_t *pb, 995 xfs_buf_t *bp,
1057 int schedule) 996 int schedule)
1058{ 997{
1059 pb->pb_flags &= ~(PBF_READ | PBF_WRITE); 998 bp->b_flags &= ~(XBF_READ | XBF_WRITE);
1060 if (pb->pb_error == 0) 999 if (bp->b_error == 0)
1061 pb->pb_flags |= PBF_DONE; 1000 bp->b_flags |= XBF_DONE;
1062 1001
1063 PB_TRACE(pb, "iodone", pb->pb_iodone); 1002 XB_TRACE(bp, "iodone", bp->b_iodone);
1064 1003
1065 if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) { 1004 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
1066 if (schedule) { 1005 if (schedule) {
1067 INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb); 1006 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work, bp);
1068 queue_work(xfslogd_workqueue, &pb->pb_iodone_work); 1007 queue_work(xfslogd_workqueue, &bp->b_iodone_work);
1069 } else { 1008 } else {
1070 pagebuf_iodone_work(pb); 1009 xfs_buf_iodone_work(bp);
1071 } 1010 }
1072 } else { 1011 } else {
1073 up(&pb->pb_iodonesema); 1012 up(&bp->b_iodonesema);
1074 } 1013 }
1075} 1014}
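A sketch of a completion hook wired through b_iodone (the xfs_buf_iodone_t type declared in xfs_buf.h); the callback name and body here are hypothetical:

	STATIC void
	my_buf_iodone(
		xfs_buf_t	*bp)
	{
		if (bp->b_error)
			printk(KERN_WARNING "xfs_buf error %d\n",
				(int)bp->b_error);
		xfs_buf_relse(bp);	/* the callback owns the release */
	}

	/* installed as bp->b_iodone = my_buf_iodone; xfs_buf_ioend() then
	 * runs it inline, or via xfslogd_workqueue when schedule is nonzero.
	 */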
1076 1015
1077/*
1078 * pagebuf_ioerror
1079 *
1080 * pagebuf_ioerror sets the error code for a buffer.
1081 */
1082void 1016void
1083pagebuf_ioerror( /* mark/clear buffer error flag */ 1017xfs_buf_ioerror(
1084 xfs_buf_t *pb, /* buffer to mark */ 1018 xfs_buf_t *bp,
1085 int error) /* error to store (0 if none) */ 1019 int error)
1086{ 1020{
1087 ASSERT(error >= 0 && error <= 0xffff); 1021 ASSERT(error >= 0 && error <= 0xffff);
1088 pb->pb_error = (unsigned short)error; 1022 bp->b_error = (unsigned short)error;
1089 PB_TRACE(pb, "ioerror", (unsigned long)error); 1023 XB_TRACE(bp, "ioerror", (unsigned long)error);
1090} 1024}
1091 1025
1092/* 1026/*
1093 * pagebuf_iostart 1027 * Initiate I/O on a buffer, based on the flags supplied.
1094 * 1028 * The b_iodone routine in the buffer supplied will only be called
1095 * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
1096 * If necessary, it will arrange for any disk space allocation required,
1097 * and it will break up the request if the block mappings require it.
1098 * The pb_iodone routine in the buffer supplied will only be called
1099 * when all of the subsidiary I/O requests, if any, have been completed. 1029 * when all of the subsidiary I/O requests, if any, have been completed.
1100 * pagebuf_iostart calls the pagebuf_ioinitiate routine or
1101 * pagebuf_iorequest, if the former routine is not defined, to start
1102 * the I/O on a given low-level request.
1103 */ 1030 */
1104int 1031int
1105pagebuf_iostart( /* start I/O on a buffer */ 1032xfs_buf_iostart(
1106 xfs_buf_t *pb, /* buffer to start */ 1033 xfs_buf_t *bp,
1107 page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */ 1034 xfs_buf_flags_t flags)
1108 /* PBF_WRITE, PBF_DELWRI, */
1109 /* PBF_DONT_BLOCK */
1110{ 1035{
1111 int status = 0; 1036 int status = 0;
1112 1037
1113 PB_TRACE(pb, "iostart", (unsigned long)flags); 1038 XB_TRACE(bp, "iostart", (unsigned long)flags);
1114 1039
1115 if (flags & PBF_DELWRI) { 1040 if (flags & XBF_DELWRI) {
1116 pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC); 1041 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
1117 pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC); 1042 bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
1118 pagebuf_delwri_queue(pb, 1); 1043 xfs_buf_delwri_queue(bp, 1);
1119 return status; 1044 return status;
1120 } 1045 }
1121 1046
1122 pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \ 1047 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
1123 PBF_READ_AHEAD | _PBF_RUN_QUEUES); 1048 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
1124 pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \ 1049 bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \
1125 PBF_READ_AHEAD | _PBF_RUN_QUEUES); 1050 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
1126 1051
1127 BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL); 1052 BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL);
1128 1053
1129 /* For writes allow an alternate strategy routine to precede 1054 /* For writes allow an alternate strategy routine to precede
1130 * the actual I/O request (which may not be issued at all in 1055 * the actual I/O request (which may not be issued at all in
1131 * a shutdown situation, for example). 1056 * a shutdown situation, for example).
1132 */ 1057 */
1133 status = (flags & PBF_WRITE) ? 1058 status = (flags & XBF_WRITE) ?
1134 pagebuf_iostrategy(pb) : pagebuf_iorequest(pb); 1059 xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp);
1135 1060
1136 /* Wait for I/O if we are not an async request. 1061 /* Wait for I/O if we are not an async request.
1137 * Note: async I/O request completion will release the buffer, 1062 * Note: async I/O request completion will release the buffer,
 1138 * and that may already have happened by this point. So using the 1063 * and that may already have happened by this point. So using the
1139 * buffer pointer from here on, after async I/O, is invalid. 1064 * buffer pointer from here on, after async I/O, is invalid.
1140 */ 1065 */
1141 if (!status && !(flags & PBF_ASYNC)) 1066 if (!status && !(flags & XBF_ASYNC))
1142 status = pagebuf_iowait(pb); 1067 status = xfs_buf_iowait(bp);
1143 1068
1144 return status; 1069 return status;
1145} 1070}
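The two common entries into xfs_buf_iostart(), as a sketch (hypothetical wrapper functions):

	STATIC int
	read_buf_sync(
		xfs_buf_t	*bp)
	{
		return xfs_buf_iostart(bp, XBF_READ);	/* blocks in
							 * xfs_buf_iowait() */
	}

	STATIC void
	write_buf_async(
		xfs_buf_t	*bp)
	{
		xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC);
		/* completion releases the buffer: bp must not be
		 * dereferenced after this returns */
	}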
1146 1071
1147/*
1148 * Helper routine for pagebuf_iorequest
1149 */
1150
1151STATIC __inline__ int 1072STATIC __inline__ int
1152_pagebuf_iolocked( 1073_xfs_buf_iolocked(
1153 xfs_buf_t *pb) 1074 xfs_buf_t *bp)
1154{ 1075{
1155 ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE)); 1076 ASSERT(bp->b_flags & (XBF_READ | XBF_WRITE));
1156 if (pb->pb_flags & PBF_READ) 1077 if (bp->b_flags & XBF_READ)
1157 return pb->pb_locked; 1078 return bp->b_locked;
1158 return 0; 1079 return 0;
1159} 1080}
1160 1081
1161STATIC __inline__ void 1082STATIC __inline__ void
1162_pagebuf_iodone( 1083_xfs_buf_ioend(
1163 xfs_buf_t *pb, 1084 xfs_buf_t *bp,
1164 int schedule) 1085 int schedule)
1165{ 1086{
1166 if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { 1087 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1167 pb->pb_locked = 0; 1088 bp->b_locked = 0;
1168 pagebuf_iodone(pb, schedule); 1089 xfs_buf_ioend(bp, schedule);
1169 } 1090 }
1170} 1091}
1171 1092
1172STATIC int 1093STATIC int
1173bio_end_io_pagebuf( 1094xfs_buf_bio_end_io(
1174 struct bio *bio, 1095 struct bio *bio,
1175 unsigned int bytes_done, 1096 unsigned int bytes_done,
1176 int error) 1097 int error)
1177{ 1098{
1178 xfs_buf_t *pb = (xfs_buf_t *)bio->bi_private; 1099 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1179 unsigned int blocksize = pb->pb_target->pbr_bsize; 1100 unsigned int blocksize = bp->b_target->bt_bsize;
1180 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1101 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1181 1102
1182 if (bio->bi_size) 1103 if (bio->bi_size)
1183 return 1; 1104 return 1;
1184 1105
1185 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1106 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1186 pb->pb_error = EIO; 1107 bp->b_error = EIO;
1187 1108
1188 do { 1109 do {
1189 struct page *page = bvec->bv_page; 1110 struct page *page = bvec->bv_page;
1190 1111
1191 if (unlikely(pb->pb_error)) { 1112 if (unlikely(bp->b_error)) {
1192 if (pb->pb_flags & PBF_READ) 1113 if (bp->b_flags & XBF_READ)
1193 ClearPageUptodate(page); 1114 ClearPageUptodate(page);
1194 SetPageError(page); 1115 SetPageError(page);
1195 } else if (blocksize == PAGE_CACHE_SIZE) { 1116 } else if (blocksize >= PAGE_CACHE_SIZE) {
1196 SetPageUptodate(page); 1117 SetPageUptodate(page);
1197 } else if (!PagePrivate(page) && 1118 } else if (!PagePrivate(page) &&
1198 (pb->pb_flags & _PBF_PAGE_CACHE)) { 1119 (bp->b_flags & _XBF_PAGE_CACHE)) {
1199 set_page_region(page, bvec->bv_offset, bvec->bv_len); 1120 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1200 } 1121 }
1201 1122
1202 if (--bvec >= bio->bi_io_vec) 1123 if (--bvec >= bio->bi_io_vec)
1203 prefetchw(&bvec->bv_page->flags); 1124 prefetchw(&bvec->bv_page->flags);
1204 1125
1205 if (_pagebuf_iolocked(pb)) { 1126 if (_xfs_buf_iolocked(bp)) {
1206 unlock_page(page); 1127 unlock_page(page);
1207 } 1128 }
1208 } while (bvec >= bio->bi_io_vec); 1129 } while (bvec >= bio->bi_io_vec);
1209 1130
1210 _pagebuf_iodone(pb, 1); 1131 _xfs_buf_ioend(bp, 1);
1211 bio_put(bio); 1132 bio_put(bio);
1212 return 0; 1133 return 0;
1213} 1134}
1214 1135
1215STATIC void 1136STATIC void
1216_pagebuf_ioapply( 1137_xfs_buf_ioapply(
1217 xfs_buf_t *pb) 1138 xfs_buf_t *bp)
1218{ 1139{
1219 int i, rw, map_i, total_nr_pages, nr_pages; 1140 int i, rw, map_i, total_nr_pages, nr_pages;
1220 struct bio *bio; 1141 struct bio *bio;
1221 int offset = pb->pb_offset; 1142 int offset = bp->b_offset;
1222 int size = pb->pb_count_desired; 1143 int size = bp->b_count_desired;
1223 sector_t sector = pb->pb_bn; 1144 sector_t sector = bp->b_bn;
1224 unsigned int blocksize = pb->pb_target->pbr_bsize; 1145 unsigned int blocksize = bp->b_target->bt_bsize;
1225 int locking = _pagebuf_iolocked(pb); 1146 int locking = _xfs_buf_iolocked(bp);
1226 1147
1227 total_nr_pages = pb->pb_page_count; 1148 total_nr_pages = bp->b_page_count;
1228 map_i = 0; 1149 map_i = 0;
1229 1150
1230 if (pb->pb_flags & _PBF_RUN_QUEUES) { 1151 if (bp->b_flags & _XBF_RUN_QUEUES) {
1231 pb->pb_flags &= ~_PBF_RUN_QUEUES; 1152 bp->b_flags &= ~_XBF_RUN_QUEUES;
1232 rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC; 1153 rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC;
1233 } else { 1154 } else {
1234 rw = (pb->pb_flags & PBF_READ) ? READ : WRITE; 1155 rw = (bp->b_flags & XBF_READ) ? READ : WRITE;
1235 } 1156 }
1236 1157
1237 if (pb->pb_flags & PBF_ORDERED) { 1158 if (bp->b_flags & XBF_ORDERED) {
1238 ASSERT(!(pb->pb_flags & PBF_READ)); 1159 ASSERT(!(bp->b_flags & XBF_READ));
1239 rw = WRITE_BARRIER; 1160 rw = WRITE_BARRIER;
1240 } 1161 }
1241 1162
1242 /* Special code path for reading a sub page size pagebuf in -- 1163 /* Special code path for reading a sub page size buffer in --
 1243 * we populate the whole page, and hence the other metadata 1164 * we populate the whole page, and hence the other metadata
1244 * in the same page. This optimization is only valid when the 1165 * in the same page. This optimization is only valid when the
1245 * filesystem block size and the page size are equal. 1166 * filesystem block size is not smaller than the page size.
1246 */ 1167 */
1247 if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) && 1168 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1248 (pb->pb_flags & PBF_READ) && locking && 1169 (bp->b_flags & XBF_READ) && locking &&
1249 (blocksize == PAGE_CACHE_SIZE)) { 1170 (blocksize >= PAGE_CACHE_SIZE)) {
1250 bio = bio_alloc(GFP_NOIO, 1); 1171 bio = bio_alloc(GFP_NOIO, 1);
1251 1172
1252 bio->bi_bdev = pb->pb_target->pbr_bdev; 1173 bio->bi_bdev = bp->b_target->bt_bdev;
1253 bio->bi_sector = sector - (offset >> BBSHIFT); 1174 bio->bi_sector = sector - (offset >> BBSHIFT);
1254 bio->bi_end_io = bio_end_io_pagebuf; 1175 bio->bi_end_io = xfs_buf_bio_end_io;
1255 bio->bi_private = pb; 1176 bio->bi_private = bp;
1256 1177
1257 bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0); 1178 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
1258 size = 0; 1179 size = 0;
1259 1180
1260 atomic_inc(&pb->pb_io_remaining); 1181 atomic_inc(&bp->b_io_remaining);
1261 1182
1262 goto submit_io; 1183 goto submit_io;
1263 } 1184 }
1264 1185
1265 /* Lock down the pages which we need to for the request */ 1186 /* Lock down the pages which we need to for the request */
1266 if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) { 1187 if (locking && (bp->b_flags & XBF_WRITE) && (bp->b_locked == 0)) {
1267 for (i = 0; size; i++) { 1188 for (i = 0; size; i++) {
1268 int nbytes = PAGE_CACHE_SIZE - offset; 1189 int nbytes = PAGE_CACHE_SIZE - offset;
1269 struct page *page = pb->pb_pages[i]; 1190 struct page *page = bp->b_pages[i];
1270 1191
1271 if (nbytes > size) 1192 if (nbytes > size)
1272 nbytes = size; 1193 nbytes = size;
@@ -1276,30 +1197,30 @@ _pagebuf_ioapply(
1276 size -= nbytes; 1197 size -= nbytes;
1277 offset = 0; 1198 offset = 0;
1278 } 1199 }
1279 offset = pb->pb_offset; 1200 offset = bp->b_offset;
1280 size = pb->pb_count_desired; 1201 size = bp->b_count_desired;
1281 } 1202 }
1282 1203
1283next_chunk: 1204next_chunk:
1284 atomic_inc(&pb->pb_io_remaining); 1205 atomic_inc(&bp->b_io_remaining);
1285 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); 1206 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
1286 if (nr_pages > total_nr_pages) 1207 if (nr_pages > total_nr_pages)
1287 nr_pages = total_nr_pages; 1208 nr_pages = total_nr_pages;
1288 1209
1289 bio = bio_alloc(GFP_NOIO, nr_pages); 1210 bio = bio_alloc(GFP_NOIO, nr_pages);
1290 bio->bi_bdev = pb->pb_target->pbr_bdev; 1211 bio->bi_bdev = bp->b_target->bt_bdev;
1291 bio->bi_sector = sector; 1212 bio->bi_sector = sector;
1292 bio->bi_end_io = bio_end_io_pagebuf; 1213 bio->bi_end_io = xfs_buf_bio_end_io;
1293 bio->bi_private = pb; 1214 bio->bi_private = bp;
1294 1215
1295 for (; size && nr_pages; nr_pages--, map_i++) { 1216 for (; size && nr_pages; nr_pages--, map_i++) {
1296 int nbytes = PAGE_CACHE_SIZE - offset; 1217 int rbytes, nbytes = PAGE_CACHE_SIZE - offset;
1297 1218
1298 if (nbytes > size) 1219 if (nbytes > size)
1299 nbytes = size; 1220 nbytes = size;
1300 1221
1301 if (bio_add_page(bio, pb->pb_pages[map_i], 1222 rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
1302 nbytes, offset) < nbytes) 1223 if (rbytes < nbytes)
1303 break; 1224 break;
1304 1225
1305 offset = 0; 1226 offset = 0;
@@ -1315,107 +1236,102 @@ submit_io:
1315 goto next_chunk; 1236 goto next_chunk;
1316 } else { 1237 } else {
1317 bio_put(bio); 1238 bio_put(bio);
1318 pagebuf_ioerror(pb, EIO); 1239 xfs_buf_ioerror(bp, EIO);
1319 } 1240 }
1320} 1241}
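To make the chunking arithmetic concrete: assuming 4K pages (PAGE_SHIFT == 12; BBSHIFT is the 512-byte basic-block shift, 9), each page holds 8 sectors and each bio covers at most

	nr_pages = BIO_MAX_SECTORS >> (12 - 9) = BIO_MAX_SECTORS / 8

pages; a buffer larger than that simply loops back to next_chunk: and issues further bios.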
1321 1242
1322/*
1323 * pagebuf_iorequest -- the core I/O request routine.
1324 */
1325int 1243int
1326pagebuf_iorequest( /* start real I/O */ 1244xfs_buf_iorequest(
1327 xfs_buf_t *pb) /* buffer to convey to device */ 1245 xfs_buf_t *bp)
1328{ 1246{
1329 PB_TRACE(pb, "iorequest", 0); 1247 XB_TRACE(bp, "iorequest", 0);
1330 1248
1331 if (pb->pb_flags & PBF_DELWRI) { 1249 if (bp->b_flags & XBF_DELWRI) {
1332 pagebuf_delwri_queue(pb, 1); 1250 xfs_buf_delwri_queue(bp, 1);
1333 return 0; 1251 return 0;
1334 } 1252 }
1335 1253
1336 if (pb->pb_flags & PBF_WRITE) { 1254 if (bp->b_flags & XBF_WRITE) {
1337 _pagebuf_wait_unpin(pb); 1255 xfs_buf_wait_unpin(bp);
1338 } 1256 }
1339 1257
1340 pagebuf_hold(pb); 1258 xfs_buf_hold(bp);
1341 1259
 1342 /* Set the count to 1 initially; this will stop an I/O 1260 /* Set the count to 1 initially; this will stop an I/O
1343 * completion callout which happens before we have started 1261 * completion callout which happens before we have started
1344 * all the I/O from calling pagebuf_iodone too early. 1262 * all the I/O from calling xfs_buf_ioend too early.
1345 */ 1263 */
1346 atomic_set(&pb->pb_io_remaining, 1); 1264 atomic_set(&bp->b_io_remaining, 1);
1347 _pagebuf_ioapply(pb); 1265 _xfs_buf_ioapply(bp);
1348 _pagebuf_iodone(pb, 0); 1266 _xfs_buf_ioend(bp, 0);
1349 1267
1350 pagebuf_rele(pb); 1268 xfs_buf_rele(bp);
1351 return 0; 1269 return 0;
1352} 1270}
1353 1271
1354/* 1272/*
1355 * pagebuf_iowait 1273 * Waits for I/O to complete on the buffer supplied.
1356 * 1274 * It returns immediately if no I/O is pending.
1357 * pagebuf_iowait waits for I/O to complete on the buffer supplied. 1275 * It returns the I/O error code, if any, or 0 if there was no error.
1358 * It returns immediately if no I/O is pending. In any case, it returns
1359 * the error code, if any, or 0 if there is no error.
1360 */ 1276 */
1361int 1277int
1362pagebuf_iowait( 1278xfs_buf_iowait(
1363 xfs_buf_t *pb) 1279 xfs_buf_t *bp)
1364{ 1280{
1365 PB_TRACE(pb, "iowait", 0); 1281 XB_TRACE(bp, "iowait", 0);
1366 if (atomic_read(&pb->pb_io_remaining)) 1282 if (atomic_read(&bp->b_io_remaining))
1367 blk_run_address_space(pb->pb_target->pbr_mapping); 1283 blk_run_address_space(bp->b_target->bt_mapping);
1368 down(&pb->pb_iodonesema); 1284 down(&bp->b_iodonesema);
1369 PB_TRACE(pb, "iowaited", (long)pb->pb_error); 1285 XB_TRACE(bp, "iowaited", (long)bp->b_error);
1370 return pb->pb_error; 1286 return bp->b_error;
1371} 1287}
1372 1288
1373caddr_t 1289xfs_caddr_t
1374pagebuf_offset( 1290xfs_buf_offset(
1375 xfs_buf_t *pb, 1291 xfs_buf_t *bp,
1376 size_t offset) 1292 size_t offset)
1377{ 1293{
1378 struct page *page; 1294 struct page *page;
1379 1295
1380 offset += pb->pb_offset; 1296 if (bp->b_flags & XBF_MAPPED)
1297 return XFS_BUF_PTR(bp) + offset;
1381 1298
1382 page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT]; 1299 offset += bp->b_offset;
1383 return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1)); 1300 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT];
1301 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1));
1384} 1302}
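Usage sketch (hypothetical caller; "dst" is a placeholder): xfs_buf_offset() hides whether the buffer has a virtual mapping, but note that for an unmapped buffer the returned pointer is only valid within the single backing page:

	xfs_caddr_t	ptr;

	ptr = xfs_buf_offset(bp, 512);	/* address of byte 512 in bp */
	memcpy(dst, ptr, 64);		/* stays within one page here */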
1385 1303
1386/* 1304/*
1387 * pagebuf_iomove
1388 *
1389 * Move data into or out of a buffer. 1305 * Move data into or out of a buffer.
1390 */ 1306 */
1391void 1307void
1392pagebuf_iomove( 1308xfs_buf_iomove(
1393 xfs_buf_t *pb, /* buffer to process */ 1309 xfs_buf_t *bp, /* buffer to process */
1394 size_t boff, /* starting buffer offset */ 1310 size_t boff, /* starting buffer offset */
1395 size_t bsize, /* length to copy */ 1311 size_t bsize, /* length to copy */
1396 caddr_t data, /* data address */ 1312 caddr_t data, /* data address */
1397 page_buf_rw_t mode) /* read/write flag */ 1313 xfs_buf_rw_t mode) /* read/write/zero flag */
1398{ 1314{
1399 size_t bend, cpoff, csize; 1315 size_t bend, cpoff, csize;
1400 struct page *page; 1316 struct page *page;
1401 1317
1402 bend = boff + bsize; 1318 bend = boff + bsize;
1403 while (boff < bend) { 1319 while (boff < bend) {
1404 page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)]; 1320 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1405 cpoff = page_buf_poff(boff + pb->pb_offset); 1321 cpoff = xfs_buf_poff(boff + bp->b_offset);
1406 csize = min_t(size_t, 1322 csize = min_t(size_t,
1407 PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff); 1323 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff);
1408 1324
1409 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1325 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
1410 1326
1411 switch (mode) { 1327 switch (mode) {
1412 case PBRW_ZERO: 1328 case XBRW_ZERO:
1413 memset(page_address(page) + cpoff, 0, csize); 1329 memset(page_address(page) + cpoff, 0, csize);
1414 break; 1330 break;
1415 case PBRW_READ: 1331 case XBRW_READ:
1416 memcpy(data, page_address(page) + cpoff, csize); 1332 memcpy(data, page_address(page) + cpoff, csize);
1417 break; 1333 break;
1418 case PBRW_WRITE: 1334 case XBRW_WRITE:
1419 memcpy(page_address(page) + cpoff, data, csize); 1335 memcpy(page_address(page) + cpoff, data, csize);
1420 } 1336 }
1421 1337
@@ -1425,12 +1341,12 @@ pagebuf_iomove(
1425} 1341}
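xfs_buf_iomove() is the page-safe way to copy through a possibly unmapped buffer, since it walks b_pages itself rather than assuming a contiguous mapping; a sketch (hypothetical caller):

	char	rec[128];

	xfs_buf_iomove(bp, 0, sizeof(rec), rec, XBRW_READ);  /* buf -> rec */
	xfs_buf_iomove(bp, 0, sizeof(rec), rec, XBRW_WRITE); /* rec -> buf */
	xfs_buf_iomove(bp, 0, 512, NULL, XBRW_ZERO);	/* data unused */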
1426 1342
1427/* 1343/*
1428 * Handling of buftargs. 1344 * Handling of buffer targets (buftargs).
1429 */ 1345 */
1430 1346
1431/* 1347/*
1432 * Wait for any bufs with callbacks that have been submitted but 1348 * Wait for any bufs with callbacks that have been submitted but
1433 * have not yet returned... walk the hash list for the target. 1349 * have not yet returned... walk the hash list for the target.
1434 */ 1350 */
1435void 1351void
1436xfs_wait_buftarg( 1352xfs_wait_buftarg(
@@ -1444,15 +1360,15 @@ xfs_wait_buftarg(
1444 hash = &btp->bt_hash[i]; 1360 hash = &btp->bt_hash[i];
1445again: 1361again:
1446 spin_lock(&hash->bh_lock); 1362 spin_lock(&hash->bh_lock);
1447 list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) { 1363 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
1448 ASSERT(btp == bp->pb_target); 1364 ASSERT(btp == bp->b_target);
1449 if (!(bp->pb_flags & PBF_FS_MANAGED)) { 1365 if (!(bp->b_flags & XBF_FS_MANAGED)) {
1450 spin_unlock(&hash->bh_lock); 1366 spin_unlock(&hash->bh_lock);
1451 /* 1367 /*
1452 * Catch superblock reference count leaks 1368 * Catch superblock reference count leaks
1453 * immediately 1369 * immediately
1454 */ 1370 */
1455 BUG_ON(bp->pb_bn == 0); 1371 BUG_ON(bp->b_bn == 0);
1456 delay(100); 1372 delay(100);
1457 goto again; 1373 goto again;
1458 } 1374 }
@@ -1462,9 +1378,9 @@ again:
1462} 1378}
1463 1379
1464/* 1380/*
1465 * Allocate buffer hash table for a given target. 1381 * Allocate buffer hash table for a given target.
1466 * For devices containing metadata (i.e. not the log/realtime devices) 1382 * For devices containing metadata (i.e. not the log/realtime devices)
1467 * we need to allocate a much larger hash table. 1383 * we need to allocate a much larger hash table.
1468 */ 1384 */
1469STATIC void 1385STATIC void
1470xfs_alloc_bufhash( 1386xfs_alloc_bufhash(
@@ -1487,11 +1403,34 @@ STATIC void
1487xfs_free_bufhash( 1403xfs_free_bufhash(
1488 xfs_buftarg_t *btp) 1404 xfs_buftarg_t *btp)
1489{ 1405{
1490 kmem_free(btp->bt_hash, 1406 kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t));
1491 (1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t));
1492 btp->bt_hash = NULL; 1407 btp->bt_hash = NULL;
1493} 1408}
1494 1409
1410/*
1411 * buftarg list for delwrite queue processing
1412 */
1413STATIC LIST_HEAD(xfs_buftarg_list);
1414STATIC DEFINE_SPINLOCK(xfs_buftarg_lock);
1415
1416STATIC void
1417xfs_register_buftarg(
1418 xfs_buftarg_t *btp)
1419{
1420 spin_lock(&xfs_buftarg_lock);
1421 list_add(&btp->bt_list, &xfs_buftarg_list);
1422 spin_unlock(&xfs_buftarg_lock);
1423}
1424
1425STATIC void
1426xfs_unregister_buftarg(
1427 xfs_buftarg_t *btp)
1428{
1429 spin_lock(&xfs_buftarg_lock);
1430 list_del(&btp->bt_list);
1431 spin_unlock(&xfs_buftarg_lock);
1432}
1433
1495void 1434void
1496xfs_free_buftarg( 1435xfs_free_buftarg(
1497 xfs_buftarg_t *btp, 1436 xfs_buftarg_t *btp,
@@ -1499,9 +1438,16 @@ xfs_free_buftarg(
1499{ 1438{
1500 xfs_flush_buftarg(btp, 1); 1439 xfs_flush_buftarg(btp, 1);
1501 if (external) 1440 if (external)
1502 xfs_blkdev_put(btp->pbr_bdev); 1441 xfs_blkdev_put(btp->bt_bdev);
1503 xfs_free_bufhash(btp); 1442 xfs_free_bufhash(btp);
1504 iput(btp->pbr_mapping->host); 1443 iput(btp->bt_mapping->host);
1444
1445 /* Unregister the buftarg first so that we don't get a
1446 * wakeup finding a non-existent task
1447 */
1448 xfs_unregister_buftarg(btp);
1449 kthread_stop(btp->bt_task);
1450
1505 kmem_free(btp, sizeof(*btp)); 1451 kmem_free(btp, sizeof(*btp));
1506} 1452}
1507 1453
@@ -1512,11 +1458,11 @@ xfs_setsize_buftarg_flags(
1512 unsigned int sectorsize, 1458 unsigned int sectorsize,
1513 int verbose) 1459 int verbose)
1514{ 1460{
1515 btp->pbr_bsize = blocksize; 1461 btp->bt_bsize = blocksize;
1516 btp->pbr_sshift = ffs(sectorsize) - 1; 1462 btp->bt_sshift = ffs(sectorsize) - 1;
1517 btp->pbr_smask = sectorsize - 1; 1463 btp->bt_smask = sectorsize - 1;
1518 1464
1519 if (set_blocksize(btp->pbr_bdev, sectorsize)) { 1465 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1520 printk(KERN_WARNING 1466 printk(KERN_WARNING
1521 "XFS: Cannot set_blocksize to %u on device %s\n", 1467 "XFS: Cannot set_blocksize to %u on device %s\n",
1522 sectorsize, XFS_BUFTARG_NAME(btp)); 1468 sectorsize, XFS_BUFTARG_NAME(btp));
@@ -1536,10 +1482,10 @@ xfs_setsize_buftarg_flags(
1536} 1482}
1537 1483
1538/* 1484/*
1539* When allocating the initial buffer target we have not yet 1485 * When allocating the initial buffer target we have not yet
 1540* read in the superblock, so we don't know what sector size 1486 * read in the superblock, so we don't know what sector size
 1541* is being used at this early stage. Play safe. 1487 * is being used at this early stage. Play safe.
1542*/ 1488 */
1543STATIC int 1489STATIC int
1544xfs_setsize_buftarg_early( 1490xfs_setsize_buftarg_early(
1545 xfs_buftarg_t *btp, 1491 xfs_buftarg_t *btp,
@@ -1587,10 +1533,30 @@ xfs_mapping_buftarg(
1587 mapping->a_ops = &mapping_aops; 1533 mapping->a_ops = &mapping_aops;
1588 mapping->backing_dev_info = bdi; 1534 mapping->backing_dev_info = bdi;
1589 mapping_set_gfp_mask(mapping, GFP_NOFS); 1535 mapping_set_gfp_mask(mapping, GFP_NOFS);
1590 btp->pbr_mapping = mapping; 1536 btp->bt_mapping = mapping;
1591 return 0; 1537 return 0;
1592} 1538}
1593 1539
1540STATIC int
1541xfs_alloc_delwrite_queue(
1542 xfs_buftarg_t *btp)
1543{
1544 int error = 0;
1545
1546 INIT_LIST_HEAD(&btp->bt_list);
1547 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1548 spinlock_init(&btp->bt_delwrite_lock, "delwri_lock");
1549 btp->bt_flags = 0;
1550 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd");
1551 if (IS_ERR(btp->bt_task)) {
1552 error = PTR_ERR(btp->bt_task);
1553 goto out_error;
1554 }
1555 xfs_register_buftarg(btp);
1556out_error:
1557 return error;
1558}
1559
1594xfs_buftarg_t * 1560xfs_buftarg_t *
1595xfs_alloc_buftarg( 1561xfs_alloc_buftarg(
1596 struct block_device *bdev, 1562 struct block_device *bdev,
@@ -1600,12 +1566,14 @@ xfs_alloc_buftarg(
1600 1566
1601 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1567 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1602 1568
1603 btp->pbr_dev = bdev->bd_dev; 1569 btp->bt_dev = bdev->bd_dev;
1604 btp->pbr_bdev = bdev; 1570 btp->bt_bdev = bdev;
1605 if (xfs_setsize_buftarg_early(btp, bdev)) 1571 if (xfs_setsize_buftarg_early(btp, bdev))
1606 goto error; 1572 goto error;
1607 if (xfs_mapping_buftarg(btp, bdev)) 1573 if (xfs_mapping_buftarg(btp, bdev))
1608 goto error; 1574 goto error;
1575 if (xfs_alloc_delwrite_queue(btp))
1576 goto error;
1609 xfs_alloc_bufhash(btp, external); 1577 xfs_alloc_bufhash(btp, external);
1610 return btp; 1578 return btp;
1611 1579
@@ -1616,83 +1584,81 @@ error:
1616 1584
1617 1585
1618/* 1586/*
1619 * Pagebuf delayed write buffer handling 1587 * Delayed write buffer handling
1620 */ 1588 */
1621
1622STATIC LIST_HEAD(pbd_delwrite_queue);
1623STATIC DEFINE_SPINLOCK(pbd_delwrite_lock);
1624
1625STATIC void 1589STATIC void
1626pagebuf_delwri_queue( 1590xfs_buf_delwri_queue(
1627 xfs_buf_t *pb, 1591 xfs_buf_t *bp,
1628 int unlock) 1592 int unlock)
1629{ 1593{
1630 PB_TRACE(pb, "delwri_q", (long)unlock); 1594 struct list_head *dwq = &bp->b_target->bt_delwrite_queue;
1631 ASSERT((pb->pb_flags & (PBF_DELWRI|PBF_ASYNC)) == 1595 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
1632 (PBF_DELWRI|PBF_ASYNC)); 1596
1597 XB_TRACE(bp, "delwri_q", (long)unlock);
1598 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
1633 1599
1634 spin_lock(&pbd_delwrite_lock); 1600 spin_lock(dwlk);
1635 /* If already in the queue, dequeue and place at tail */ 1601 /* If already in the queue, dequeue and place at tail */
1636 if (!list_empty(&pb->pb_list)) { 1602 if (!list_empty(&bp->b_list)) {
1637 ASSERT(pb->pb_flags & _PBF_DELWRI_Q); 1603 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1638 if (unlock) { 1604 if (unlock)
1639 atomic_dec(&pb->pb_hold); 1605 atomic_dec(&bp->b_hold);
1640 } 1606 list_del(&bp->b_list);
1641 list_del(&pb->pb_list);
1642 } 1607 }
1643 1608
1644 pb->pb_flags |= _PBF_DELWRI_Q; 1609 bp->b_flags |= _XBF_DELWRI_Q;
1645 list_add_tail(&pb->pb_list, &pbd_delwrite_queue); 1610 list_add_tail(&bp->b_list, dwq);
1646 pb->pb_queuetime = jiffies; 1611 bp->b_queuetime = jiffies;
1647 spin_unlock(&pbd_delwrite_lock); 1612 spin_unlock(dwlk);
1648 1613
1649 if (unlock) 1614 if (unlock)
1650 pagebuf_unlock(pb); 1615 xfs_buf_unlock(bp);
1651} 1616}
1652 1617
1653void 1618void
1654pagebuf_delwri_dequeue( 1619xfs_buf_delwri_dequeue(
1655 xfs_buf_t *pb) 1620 xfs_buf_t *bp)
1656{ 1621{
1622 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
1657 int dequeued = 0; 1623 int dequeued = 0;
1658 1624
1659 spin_lock(&pbd_delwrite_lock); 1625 spin_lock(dwlk);
1660 if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) { 1626 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
1661 ASSERT(pb->pb_flags & _PBF_DELWRI_Q); 1627 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1662 list_del_init(&pb->pb_list); 1628 list_del_init(&bp->b_list);
1663 dequeued = 1; 1629 dequeued = 1;
1664 } 1630 }
1665 pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); 1631 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
1666 spin_unlock(&pbd_delwrite_lock); 1632 spin_unlock(dwlk);
1667 1633
1668 if (dequeued) 1634 if (dequeued)
1669 pagebuf_rele(pb); 1635 xfs_buf_rele(bp);
1670 1636
1671 PB_TRACE(pb, "delwri_dq", (long)dequeued); 1637 XB_TRACE(bp, "delwri_dq", (long)dequeued);
1672} 1638}
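The delayed-write life cycle, sketched from the caller's side (xfs_buf_delwri_queue() itself is STATIC; external code reaches it through xfs_buf_iostart() or xfs_buf_unlock(), with bp locked on entry):

	xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
	/* queued on the target's bt_delwrite_queue and unlocked; xfsbufd
	 * writes it out once it has aged, or sooner under memory pressure
	 */

	xfs_buf_delwri_dequeue(bp);	/* reclaim it for an immediate,
					 * caller-driven write instead */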
1673 1639
1674STATIC void 1640STATIC void
1675pagebuf_runall_queues( 1641xfs_buf_runall_queues(
1676 struct workqueue_struct *queue) 1642 struct workqueue_struct *queue)
1677{ 1643{
1678 flush_workqueue(queue); 1644 flush_workqueue(queue);
1679} 1645}
1680 1646
1681/* Defines for pagebuf daemon */
1682STATIC struct task_struct *xfsbufd_task;
1683STATIC int xfsbufd_force_flush;
1684STATIC int xfsbufd_force_sleep;
1685
1686STATIC int 1647STATIC int
1687xfsbufd_wakeup( 1648xfsbufd_wakeup(
1688 int priority, 1649 int priority,
1689 gfp_t mask) 1650 gfp_t mask)
1690{ 1651{
1691 if (xfsbufd_force_sleep) 1652 xfs_buftarg_t *btp;
1692 return 0; 1653
1693 xfsbufd_force_flush = 1; 1654 spin_lock(&xfs_buftarg_lock);
1694 barrier(); 1655 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1695 wake_up_process(xfsbufd_task); 1656 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1657 continue;
1658 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1659 wake_up_process(btp->bt_task);
1660 }
1661 spin_unlock(&xfs_buftarg_lock);
1696 return 0; 1662 return 0;
1697} 1663}
1698 1664
@@ -1702,67 +1668,70 @@ xfsbufd(
1702{ 1668{
1703 struct list_head tmp; 1669 struct list_head tmp;
1704 unsigned long age; 1670 unsigned long age;
1705 xfs_buftarg_t *target; 1671 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1706 xfs_buf_t *pb, *n; 1672 xfs_buf_t *bp, *n;
1673 struct list_head *dwq = &target->bt_delwrite_queue;
1674 spinlock_t *dwlk = &target->bt_delwrite_lock;
1707 1675
1708 current->flags |= PF_MEMALLOC; 1676 current->flags |= PF_MEMALLOC;
1709 1677
1710 INIT_LIST_HEAD(&tmp); 1678 INIT_LIST_HEAD(&tmp);
1711 do { 1679 do {
1712 if (unlikely(freezing(current))) { 1680 if (unlikely(freezing(current))) {
1713 xfsbufd_force_sleep = 1; 1681 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1714 refrigerator(); 1682 refrigerator();
1715 } else { 1683 } else {
1716 xfsbufd_force_sleep = 0; 1684 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1717 } 1685 }
1718 1686
1719 schedule_timeout_interruptible( 1687 schedule_timeout_interruptible(
1720 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1688 xfs_buf_timer_centisecs * msecs_to_jiffies(10));
1721 1689
1722 age = xfs_buf_age_centisecs * msecs_to_jiffies(10); 1690 age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1723 spin_lock(&pbd_delwrite_lock); 1691 spin_lock(dwlk);
1724 list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) { 1692 list_for_each_entry_safe(bp, n, dwq, b_list) {
1725 PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb)); 1693 XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
1726 ASSERT(pb->pb_flags & PBF_DELWRI); 1694 ASSERT(bp->b_flags & XBF_DELWRI);
1727 1695
1728 if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) { 1696 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
1729 if (!xfsbufd_force_flush && 1697 if (!test_bit(XBT_FORCE_FLUSH,
1698 &target->bt_flags) &&
1730 time_before(jiffies, 1699 time_before(jiffies,
1731 pb->pb_queuetime + age)) { 1700 bp->b_queuetime + age)) {
1732 pagebuf_unlock(pb); 1701 xfs_buf_unlock(bp);
1733 break; 1702 break;
1734 } 1703 }
1735 1704
1736 pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); 1705 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
1737 pb->pb_flags |= PBF_WRITE; 1706 bp->b_flags |= XBF_WRITE;
1738 list_move(&pb->pb_list, &tmp); 1707 list_move(&bp->b_list, &tmp);
1739 } 1708 }
1740 } 1709 }
1741 spin_unlock(&pbd_delwrite_lock); 1710 spin_unlock(dwlk);
1742 1711
1743 while (!list_empty(&tmp)) { 1712 while (!list_empty(&tmp)) {
1744 pb = list_entry(tmp.next, xfs_buf_t, pb_list); 1713 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1745 target = pb->pb_target; 1714 ASSERT(target == bp->b_target);
1746 1715
1747 list_del_init(&pb->pb_list); 1716 list_del_init(&bp->b_list);
1748 pagebuf_iostrategy(pb); 1717 xfs_buf_iostrategy(bp);
1749 1718
1750 blk_run_address_space(target->pbr_mapping); 1719 blk_run_address_space(target->bt_mapping);
1751 } 1720 }
1752 1721
1753 if (as_list_len > 0) 1722 if (as_list_len > 0)
1754 purge_addresses(); 1723 purge_addresses();
1755 1724
1756 xfsbufd_force_flush = 0; 1725 clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1757 } while (!kthread_should_stop()); 1726 } while (!kthread_should_stop());
1758 1727
1759 return 0; 1728 return 0;
1760} 1729}
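For scale: the sleep interval is xfs_buf_timer_centisecs * msecs_to_jiffies(10), one 10ms tick per centisecond, so assuming the usual sysctl defaults (100 and 1500 centiseconds), each per-target xfsbufd wakes roughly once a second and flushes buffers that have sat queued for about 15 seconds.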
1761 1730
1762/* 1731/*
1763 * Go through all incore buffers, and release buffers if they belong to 1732 * Go through all incore buffers, and release buffers if they belong to
1764 * the given device. This is used in filesystem error handling to 1733 * the given device. This is used in filesystem error handling to
1765 * preserve the consistency of its metadata. 1734 * preserve the consistency of its metadata.
1766 */ 1735 */
1767int 1736int
1768xfs_flush_buftarg( 1737xfs_flush_buftarg(
@@ -1770,73 +1739,72 @@ xfs_flush_buftarg(
1770 int wait) 1739 int wait)
1771{ 1740{
1772 struct list_head tmp; 1741 struct list_head tmp;
1773 xfs_buf_t *pb, *n; 1742 xfs_buf_t *bp, *n;
1774 int pincount = 0; 1743 int pincount = 0;
1744 struct list_head *dwq = &target->bt_delwrite_queue;
1745 spinlock_t *dwlk = &target->bt_delwrite_lock;
1775 1746
1776 pagebuf_runall_queues(xfsdatad_workqueue); 1747 xfs_buf_runall_queues(xfsdatad_workqueue);
1777 pagebuf_runall_queues(xfslogd_workqueue); 1748 xfs_buf_runall_queues(xfslogd_workqueue);
1778 1749
1779 INIT_LIST_HEAD(&tmp); 1750 INIT_LIST_HEAD(&tmp);
1780 spin_lock(&pbd_delwrite_lock); 1751 spin_lock(dwlk);
1781 list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) { 1752 list_for_each_entry_safe(bp, n, dwq, b_list) {
1782 1753 ASSERT(bp->b_target == target);
1783 if (pb->pb_target != target) 1754 ASSERT(bp->b_flags & (XBF_DELWRI | _XBF_DELWRI_Q));
1784 continue; 1755 XB_TRACE(bp, "walkq2", (long)xfs_buf_ispin(bp));
1785 1756 if (xfs_buf_ispin(bp)) {
1786 ASSERT(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q));
1787 PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
1788 if (pagebuf_ispin(pb)) {
1789 pincount++; 1757 pincount++;
1790 continue; 1758 continue;
1791 } 1759 }
1792 1760
1793 list_move(&pb->pb_list, &tmp); 1761 list_move(&bp->b_list, &tmp);
1794 } 1762 }
1795 spin_unlock(&pbd_delwrite_lock); 1763 spin_unlock(dwlk);
1796 1764
1797 /* 1765 /*
1798 * Dropped the delayed write list lock, now walk the temporary list 1766 * Dropped the delayed write list lock, now walk the temporary list
1799 */ 1767 */
1800 list_for_each_entry_safe(pb, n, &tmp, pb_list) { 1768 list_for_each_entry_safe(bp, n, &tmp, b_list) {
1801 pagebuf_lock(pb); 1769 xfs_buf_lock(bp);
1802 pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); 1770 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
1803 pb->pb_flags |= PBF_WRITE; 1771 bp->b_flags |= XBF_WRITE;
1804 if (wait) 1772 if (wait)
1805 pb->pb_flags &= ~PBF_ASYNC; 1773 bp->b_flags &= ~XBF_ASYNC;
1806 else 1774 else
1807 list_del_init(&pb->pb_list); 1775 list_del_init(&bp->b_list);
1808 1776
1809 pagebuf_iostrategy(pb); 1777 xfs_buf_iostrategy(bp);
1810 } 1778 }
1811 1779
1812 /* 1780 /*
1813 * Remaining list items must be flushed before returning 1781 * Remaining list items must be flushed before returning
1814 */ 1782 */
1815 while (!list_empty(&tmp)) { 1783 while (!list_empty(&tmp)) {
1816 pb = list_entry(tmp.next, xfs_buf_t, pb_list); 1784 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1817 1785
1818 list_del_init(&pb->pb_list); 1786 list_del_init(&bp->b_list);
1819 xfs_iowait(pb); 1787 xfs_iowait(bp);
1820 xfs_buf_relse(pb); 1788 xfs_buf_relse(bp);
1821 } 1789 }
1822 1790
1823 if (wait) 1791 if (wait)
1824 blk_run_address_space(target->pbr_mapping); 1792 blk_run_address_space(target->bt_mapping);
1825 1793
1826 return pincount; 1794 return pincount;
1827} 1795}
1828 1796
1829int __init 1797int __init
1830pagebuf_init(void) 1798xfs_buf_init(void)
1831{ 1799{
1832 int error = -ENOMEM; 1800 int error = -ENOMEM;
1833 1801
1834#ifdef PAGEBUF_TRACE 1802#ifdef XFS_BUF_TRACE
1835 pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP); 1803 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP);
1836#endif 1804#endif
1837 1805
1838 pagebuf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf"); 1806 xfs_buf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf");
1839 if (!pagebuf_zone) 1807 if (!xfs_buf_zone)
1840 goto out_free_trace_buf; 1808 goto out_free_trace_buf;
1841 1809
1842 xfslogd_workqueue = create_workqueue("xfslogd"); 1810 xfslogd_workqueue = create_workqueue("xfslogd");
@@ -1847,42 +1815,33 @@ pagebuf_init(void)
1847 if (!xfsdatad_workqueue) 1815 if (!xfsdatad_workqueue)
1848 goto out_destroy_xfslogd_workqueue; 1816 goto out_destroy_xfslogd_workqueue;
1849 1817
1850 xfsbufd_task = kthread_run(xfsbufd, NULL, "xfsbufd"); 1818 xfs_buf_shake = kmem_shake_register(xfsbufd_wakeup);
1851 if (IS_ERR(xfsbufd_task)) { 1819 if (!xfs_buf_shake)
1852 error = PTR_ERR(xfsbufd_task);
1853 goto out_destroy_xfsdatad_workqueue; 1820 goto out_destroy_xfsdatad_workqueue;
1854 }
1855
1856 pagebuf_shake = kmem_shake_register(xfsbufd_wakeup);
1857 if (!pagebuf_shake)
1858 goto out_stop_xfsbufd;
1859 1821
1860 return 0; 1822 return 0;
1861 1823
1862 out_stop_xfsbufd:
1863 kthread_stop(xfsbufd_task);
1864 out_destroy_xfsdatad_workqueue: 1824 out_destroy_xfsdatad_workqueue:
1865 destroy_workqueue(xfsdatad_workqueue); 1825 destroy_workqueue(xfsdatad_workqueue);
1866 out_destroy_xfslogd_workqueue: 1826 out_destroy_xfslogd_workqueue:
1867 destroy_workqueue(xfslogd_workqueue); 1827 destroy_workqueue(xfslogd_workqueue);
1868 out_free_buf_zone: 1828 out_free_buf_zone:
1869 kmem_zone_destroy(pagebuf_zone); 1829 kmem_zone_destroy(xfs_buf_zone);
1870 out_free_trace_buf: 1830 out_free_trace_buf:
1871#ifdef PAGEBUF_TRACE 1831#ifdef XFS_BUF_TRACE
1872 ktrace_free(pagebuf_trace_buf); 1832 ktrace_free(xfs_buf_trace_buf);
1873#endif 1833#endif
1874 return error; 1834 return error;
1875} 1835}
1876 1836
1877void 1837void
1878pagebuf_terminate(void) 1838xfs_buf_terminate(void)
1879{ 1839{
1880 kmem_shake_deregister(pagebuf_shake); 1840 kmem_shake_deregister(xfs_buf_shake);
1881 kthread_stop(xfsbufd_task);
1882 destroy_workqueue(xfsdatad_workqueue); 1841 destroy_workqueue(xfsdatad_workqueue);
1883 destroy_workqueue(xfslogd_workqueue); 1842 destroy_workqueue(xfslogd_workqueue);
1884 kmem_zone_destroy(pagebuf_zone); 1843 kmem_zone_destroy(xfs_buf_zone);
1885#ifdef PAGEBUF_TRACE 1844#ifdef XFS_BUF_TRACE
1886 ktrace_free(pagebuf_trace_buf); 1845 ktrace_free(xfs_buf_trace_buf);
1887#endif 1846#endif
1888} 1847}
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 237a35b915d..4dd6592d5a4 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -32,44 +32,47 @@
32 * Base types 32 * Base types
33 */ 33 */
34 34
35#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) 35#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
36 36
37#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) 37#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
38#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) 38#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT)
39#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) 39#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
40#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) 40#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
41 41
42typedef enum page_buf_rw_e { 42typedef enum {
43 PBRW_READ = 1, /* transfer into target memory */ 43 XBRW_READ = 1, /* transfer into target memory */
44 PBRW_WRITE = 2, /* transfer from target memory */ 44 XBRW_WRITE = 2, /* transfer from target memory */
45 PBRW_ZERO = 3 /* Zero target memory */ 45 XBRW_ZERO = 3, /* Zero target memory */
46} page_buf_rw_t; 46} xfs_buf_rw_t;
47 47
48 48typedef enum {
49typedef enum page_buf_flags_e { /* pb_flags values */ 49 XBF_READ = (1 << 0), /* buffer intended for reading from device */
50 PBF_READ = (1 << 0), /* buffer intended for reading from device */ 50 XBF_WRITE = (1 << 1), /* buffer intended for writing to device */
51 PBF_WRITE = (1 << 1), /* buffer intended for writing to device */ 51 XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */
52 PBF_MAPPED = (1 << 2), /* buffer mapped (pb_addr valid) */ 52 XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */
53 PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ 53 XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */
54 PBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ 54 XBF_DELWRI = (1 << 6), /* buffer has dirty pages */
55 PBF_DELWRI = (1 << 6), /* buffer has dirty pages */ 55 XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */
56 PBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ 56 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */
57 PBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ 57 XBF_ORDERED = (1 << 11), /* use ordered writes */
58 PBF_ORDERED = (1 << 11), /* use ordered writes */ 58 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */
59 PBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */
60 59
61 /* flags used only as arguments to access routines */ 60 /* flags used only as arguments to access routines */
62 PBF_LOCK = (1 << 14), /* lock requested */ 61 XBF_LOCK = (1 << 14), /* lock requested */
63 PBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ 62 XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */
64 PBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ 63 XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */
65 64
66 /* flags used only internally */ 65 /* flags used only internally */
67 _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ 66 _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */
68 _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ 67 _XBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */
69 _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ 68 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
70 _PBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ 69 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */
71} page_buf_flags_t; 70} xfs_buf_flags_t;
72 71
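Typical combinations of these flags, the first mirroring xfs_buf_readahead() earlier in this patch, the second an illustrative blocking read (not a specific call site):

	xfs_buf_flags_t	flags;

	flags = XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD;   /* readahead */
	flags = XBF_READ | XBF_LOCK | XBF_MAPPED;           /* blocking read */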
72typedef enum {
73 XBT_FORCE_SLEEP = (0 << 1),
74 XBT_FORCE_FLUSH = (1 << 1),
75} xfs_buftarg_flags_t;
73 76
74typedef struct xfs_bufhash { 77typedef struct xfs_bufhash {
75 struct list_head bh_list; 78 struct list_head bh_list;
@@ -77,477 +80,350 @@ typedef struct xfs_bufhash {
77} xfs_bufhash_t; 80} xfs_bufhash_t;
78 81
79typedef struct xfs_buftarg { 82typedef struct xfs_buftarg {
80 dev_t pbr_dev; 83 dev_t bt_dev;
81 struct block_device *pbr_bdev; 84 struct block_device *bt_bdev;
82 struct address_space *pbr_mapping; 85 struct address_space *bt_mapping;
83 unsigned int pbr_bsize; 86 unsigned int bt_bsize;
84 unsigned int pbr_sshift; 87 unsigned int bt_sshift;
85 size_t pbr_smask; 88 size_t bt_smask;
86 89
87 /* per-device buffer hash table */ 90 /* per device buffer hash table */
88 uint bt_hashmask; 91 uint bt_hashmask;
89 uint bt_hashshift; 92 uint bt_hashshift;
90 xfs_bufhash_t *bt_hash; 93 xfs_bufhash_t *bt_hash;
94
95 /* per device delwri queue */
96 struct task_struct *bt_task;
97 struct list_head bt_list;
98 struct list_head bt_delwrite_queue;
99 spinlock_t bt_delwrite_lock;
100 unsigned long bt_flags;
91} xfs_buftarg_t; 101} xfs_buftarg_t;
92 102
93/* 103/*
94 * xfs_buf_t: Buffer structure for page cache-based buffers 104 * xfs_buf_t: Buffer structure for pagecache-based buffers
105 *
106 * This buffer structure is used by the pagecache buffer management routines
107 * to refer to an assembly of pages forming a logical buffer.
95 * 108 *
96 * This buffer structure is used by the page cache buffer management routines 109 * The buffer structure is used on a temporary basis only, and discarded when
97 * to refer to an assembly of pages forming a logical buffer. The actual I/O 110 * released. The real data storage is recorded in the pagecache. Buffers are
98 * is performed with buffer_head structures, as required by drivers.
99 *
100 * The buffer structure is used on temporary basis only, and discarded when
101 * released. The real data storage is recorded in the page cache. Metadata is
102 * hashed to the block device on which the file system resides. 111 * hashed to the block device on which the file system resides.
103 */ 112 */
104 113
 struct xfs_buf;
+typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
+typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
+typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 
-/* call-back function on I/O completion */
-typedef void (*page_buf_iodone_t)(struct xfs_buf *);
-/* call-back function on I/O completion */
-typedef void (*page_buf_relse_t)(struct xfs_buf *);
-/* pre-write function */
-typedef int (*page_buf_bdstrat_t)(struct xfs_buf *);
-
-#define PB_PAGES	2
+#define XB_PAGES	2
 
 typedef struct xfs_buf {
-	struct semaphore	pb_sema;	/* semaphore for lockables */
-	unsigned long		pb_queuetime;	/* time buffer was queued */
-	atomic_t		pb_pin_count;	/* pin count */
-	wait_queue_head_t	pb_waiters;	/* unpin waiters */
-	struct list_head	pb_list;
-	page_buf_flags_t	pb_flags;	/* status flags */
-	struct list_head	pb_hash_list;	/* hash table list */
-	xfs_bufhash_t		*pb_hash;	/* hash table list start */
-	xfs_buftarg_t		*pb_target;	/* buffer target (device) */
-	atomic_t		pb_hold;	/* reference count */
-	xfs_daddr_t		pb_bn;		/* block number for I/O */
-	loff_t			pb_file_offset;	/* offset in file */
-	size_t			pb_buffer_length; /* size of buffer in bytes */
-	size_t			pb_count_desired; /* desired transfer size */
-	void			*pb_addr;	/* virtual address of buffer */
-	struct work_struct	pb_iodone_work;
-	atomic_t		pb_io_remaining; /* #outstanding I/O requests */
-	page_buf_iodone_t	pb_iodone;	/* I/O completion function */
-	page_buf_relse_t	pb_relse;	/* releasing function */
-	page_buf_bdstrat_t	pb_strat;	/* pre-write function */
-	struct semaphore	pb_iodonesema;	/* Semaphore for I/O waiters */
-	void			*pb_fspriv;
-	void			*pb_fspriv2;
-	void			*pb_fspriv3;
-	unsigned short		pb_error;	/* error code on I/O */
-	unsigned short		pb_locked;	/* page array is locked */
-	unsigned int		pb_page_count;	/* size of page array */
-	unsigned int		pb_offset;	/* page offset in first page */
-	struct page		**pb_pages;	/* array of page pointers */
-	struct page		*pb_page_array[PB_PAGES]; /* inline pages */
-#ifdef PAGEBUF_LOCK_TRACKING
-	int			pb_last_holder;
+	struct semaphore	b_sema;		/* semaphore for lockables */
+	unsigned long		b_queuetime;	/* time buffer was queued */
+	atomic_t		b_pin_count;	/* pin count */
+	wait_queue_head_t	b_waiters;	/* unpin waiters */
+	struct list_head	b_list;
+	xfs_buf_flags_t		b_flags;	/* status flags */
+	struct list_head	b_hash_list;	/* hash table list */
+	xfs_bufhash_t		*b_hash;	/* hash table list start */
+	xfs_buftarg_t		*b_target;	/* buffer target (device) */
+	atomic_t		b_hold;		/* reference count */
+	xfs_daddr_t		b_bn;		/* block number for I/O */
+	xfs_off_t		b_file_offset;	/* offset in file */
+	size_t			b_buffer_length;/* size of buffer in bytes */
+	size_t			b_count_desired;/* desired transfer size */
+	void			*b_addr;	/* virtual address of buffer */
+	struct work_struct	b_iodone_work;
+	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
+	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
+	xfs_buf_relse_t		b_relse;	/* releasing function */
+	xfs_buf_bdstrat_t	b_strat;	/* pre-write function */
+	struct semaphore	b_iodonesema;	/* Semaphore for I/O waiters */
+	void			*b_fspriv;
+	void			*b_fspriv2;
+	void			*b_fspriv3;
+	unsigned short		b_error;	/* error code on I/O */
+	unsigned short		b_locked;	/* page array is locked */
+	unsigned int		b_page_count;	/* size of page array */
+	unsigned int		b_offset;	/* page offset in first page */
+	struct page		**b_pages;	/* array of page pointers */
+	struct page		*b_page_array[XB_PAGES]; /* inline pages */
+#ifdef XFS_BUF_LOCK_TRACKING
+	int			b_last_holder;
 #endif
 } xfs_buf_t;
 
 
 /* Finding and Reading Buffers */
-
-extern xfs_buf_t *_pagebuf_find(	/* find buffer for block if	*/
-					/* the block is in memory	*/
-		xfs_buftarg_t *,	/* inode for block		*/
-		loff_t,			/* starting offset of range	*/
-		size_t,			/* length of range		*/
-		page_buf_flags_t,	/* PBF_LOCK			*/
-		xfs_buf_t *);		/* newly allocated buffer	*/
-
+extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
+				xfs_buf_flags_t, xfs_buf_t *);
 #define xfs_incore(buftarg,blkno,len,lockit) \
-	_pagebuf_find(buftarg, blkno ,len, lockit, NULL)
-
-extern xfs_buf_t *xfs_buf_get_flags(	/* allocate a buffer		*/
-		xfs_buftarg_t *,	/* inode for buffer		*/
-		loff_t,			/* starting offset of range	*/
-		size_t,			/* length of range		*/
-		page_buf_flags_t);	/* PBF_LOCK, PBF_READ,		*/
-					/* PBF_ASYNC			*/
+	_xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
 
+extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t,
+				xfs_buf_flags_t);
 #define xfs_buf_get(target, blkno, len, flags) \
-	xfs_buf_get_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED)
-
-extern xfs_buf_t *xfs_buf_read_flags(	/* allocate and read a buffer	*/
-		xfs_buftarg_t *,	/* inode for buffer		*/
-		loff_t,			/* starting offset of range	*/
-		size_t,			/* length of range		*/
-		page_buf_flags_t);	/* PBF_LOCK, PBF_ASYNC		*/
+	xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
 
+extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t,
+				xfs_buf_flags_t);
 #define xfs_buf_read(target, blkno, len, flags) \
-	xfs_buf_read_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED)
-
-extern xfs_buf_t *pagebuf_get_empty(	/* allocate pagebuf struct with	*/
-					/*  no memory or disk address	*/
-		size_t len,
-		xfs_buftarg_t *);	/* mount point "fake" inode	*/
-
-extern xfs_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct	*/
-					/* without disk address		*/
-		size_t len,
-		xfs_buftarg_t *);	/* mount point "fake" inode	*/
-
-extern int pagebuf_associate_memory(
-		xfs_buf_t *,
-		void *,
-		size_t);
-
-extern void pagebuf_hold(		/* increment reference count	*/
-		xfs_buf_t *);		/* buffer to hold		*/
+	xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
 
-extern void pagebuf_readahead(		/* read ahead into cache	*/
-		xfs_buftarg_t *,	/* target for buffer (or NULL)	*/
-		loff_t,			/* starting offset of range	*/
-		size_t,			/* length of range		*/
-		page_buf_flags_t);	/* additional read flags	*/
+extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
+extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
+extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
+extern void xfs_buf_hold(xfs_buf_t *);
+extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
+				xfs_buf_flags_t);
 
 /* Releasing Buffers */
-
-extern void pagebuf_free(		/* deallocate a buffer		*/
-		xfs_buf_t *);		/* buffer to deallocate		*/
-
-extern void pagebuf_rele(		/* release hold on a buffer	*/
-		xfs_buf_t *);		/* buffer to release		*/
+extern void xfs_buf_free(xfs_buf_t *);
+extern void xfs_buf_rele(xfs_buf_t *);
 
 /* Locking and Unlocking Buffers */
-
-extern int pagebuf_cond_lock(		/* lock buffer, if not locked	*/
-					/* (returns -EBUSY if locked)	*/
-		xfs_buf_t *);		/* buffer to lock		*/
-
-extern int pagebuf_lock_value(		/* return count on lock		*/
-		xfs_buf_t *);		/* buffer to check		*/
-
-extern int pagebuf_lock(		/* lock buffer			*/
-		xfs_buf_t *);		/* buffer to lock		*/
-
-extern void pagebuf_unlock(		/* unlock buffer		*/
-		xfs_buf_t *);		/* buffer to unlock		*/
+extern int xfs_buf_cond_lock(xfs_buf_t *);
+extern int xfs_buf_lock_value(xfs_buf_t *);
+extern void xfs_buf_lock(xfs_buf_t *);
+extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
-
-extern void pagebuf_iodone(		/* mark buffer I/O complete	*/
-		xfs_buf_t *,		/* buffer to mark		*/
-		int);			/* run completion locally, or in
-					 * a helper thread. */
-
-extern void pagebuf_ioerror(		/* mark buffer in error (or not) */
-		xfs_buf_t *,		/* buffer to mark		*/
-		int);			/* error to store (0 if none)	*/
-
-extern int pagebuf_iostart(		/* start I/O on a buffer	*/
-		xfs_buf_t *,		/* buffer to start		*/
-		page_buf_flags_t);	/* PBF_LOCK, PBF_ASYNC,		*/
-					/* PBF_READ, PBF_WRITE,		*/
-					/* PBF_DELWRI			*/
-
-extern int pagebuf_iorequest(		/* start real I/O		*/
-		xfs_buf_t *);		/* buffer to convey to device	*/
-
-extern int pagebuf_iowait(		/* wait for buffer I/O done	*/
-		xfs_buf_t *);		/* buffer to wait on		*/
-
-extern void pagebuf_iomove(		/* move data in/out of pagebuf	*/
-		xfs_buf_t *,		/* buffer to manipulate		*/
-		size_t,			/* starting buffer offset	*/
-		size_t,			/* length in buffer		*/
-		caddr_t,		/* data pointer			*/
-		page_buf_rw_t);		/* direction			*/
-
-static inline int pagebuf_iostrategy(xfs_buf_t *pb)
+extern void xfs_buf_ioend(xfs_buf_t *, int);
+extern void xfs_buf_ioerror(xfs_buf_t *, int);
+extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t);
+extern int xfs_buf_iorequest(xfs_buf_t *);
+extern int xfs_buf_iowait(xfs_buf_t *);
+extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
+				xfs_buf_rw_t);
+
+static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
 {
-	return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb);
+	return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp);
 }
 
-static inline int pagebuf_geterror(xfs_buf_t *pb)
+static inline int xfs_buf_geterror(xfs_buf_t *bp)
 {
-	return pb ? pb->pb_error : ENOMEM;
+	return bp ? bp->b_error : ENOMEM;
 }
 
 /* Buffer Utility Routines */
-
-extern caddr_t pagebuf_offset(		/* pointer at offset in buffer	*/
-		xfs_buf_t *,		/* buffer to offset into	*/
-		size_t);		/* offset			*/
+extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
 
 /* Pinning Buffer Storage in Memory */
-
-extern void pagebuf_pin(		/* pin buffer in memory		*/
-		xfs_buf_t *);		/* buffer to pin		*/
-
-extern void pagebuf_unpin(		/* unpin buffered data		*/
-		xfs_buf_t *);		/* buffer to unpin		*/
-
-extern int pagebuf_ispin(		/* check if buffer is pinned	*/
-		xfs_buf_t *);		/* buffer to check		*/
+extern void xfs_buf_pin(xfs_buf_t *);
+extern void xfs_buf_unpin(xfs_buf_t *);
+extern int xfs_buf_ispin(xfs_buf_t *);
 
 /* Delayed Write Buffer Routines */
-
-extern void pagebuf_delwri_dequeue(xfs_buf_t *);
+extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
 
 /* Buffer Daemon Setup Routines */
+extern int xfs_buf_init(void);
+extern void xfs_buf_terminate(void);
 
-extern int pagebuf_init(void);
-extern void pagebuf_terminate(void);
-
-
-#ifdef PAGEBUF_TRACE
-extern ktrace_t *pagebuf_trace_buf;
-extern void pagebuf_trace(
-		xfs_buf_t *,		/* buffer being traced		*/
-		char *,			/* description of operation	*/
-		void *,			/* arbitrary diagnostic value	*/
-		void *);		/* return address		*/
+#ifdef XFS_BUF_TRACE
+extern ktrace_t *xfs_buf_trace_buf;
+extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
 #else
-# define pagebuf_trace(pb, id, ptr, ra)	do { } while (0)
+#define xfs_buf_trace(bp,id,ptr,ra)	do { } while (0)
 #endif
 
-#define pagebuf_target_name(target)	\
-	({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; })
+#define xfs_buf_target_name(target)	\
+	({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
 
 
+#define XFS_B_ASYNC		XBF_ASYNC
+#define XFS_B_DELWRI		XBF_DELWRI
+#define XFS_B_READ		XBF_READ
+#define XFS_B_WRITE		XBF_WRITE
+#define XFS_B_STALE		XBF_STALE
 
-/* These are just for xfs_syncsub... it sets an internal variable
- * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t
- */
-#define XFS_B_ASYNC		PBF_ASYNC
-#define XFS_B_DELWRI		PBF_DELWRI
-#define XFS_B_READ		PBF_READ
-#define XFS_B_WRITE		PBF_WRITE
-#define XFS_B_STALE		PBF_STALE
-
-#define XFS_BUF_TRYLOCK		PBF_TRYLOCK
-#define XFS_INCORE_TRYLOCK	PBF_TRYLOCK
-#define XFS_BUF_LOCK		PBF_LOCK
-#define XFS_BUF_MAPPED		PBF_MAPPED
-
-#define BUF_BUSY		PBF_DONT_BLOCK
-
-#define XFS_BUF_BFLAGS(x)	((x)->pb_flags)
-#define XFS_BUF_ZEROFLAGS(x)	\
-	((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI))
-
-#define XFS_BUF_STALE(x)	((x)->pb_flags |= XFS_B_STALE)
-#define XFS_BUF_UNSTALE(x)	((x)->pb_flags &= ~XFS_B_STALE)
-#define XFS_BUF_ISSTALE(x)	((x)->pb_flags & XFS_B_STALE)
-#define XFS_BUF_SUPER_STALE(x)	do {	\
-		XFS_BUF_STALE(x);	\
-		pagebuf_delwri_dequeue(x);	\
-		XFS_BUF_DONE(x);	\
-	} while (0)
+#define XFS_BUF_TRYLOCK		XBF_TRYLOCK
+#define XFS_INCORE_TRYLOCK	XBF_TRYLOCK
+#define XFS_BUF_LOCK		XBF_LOCK
+#define XFS_BUF_MAPPED		XBF_MAPPED
 
-#define XFS_BUF_MANAGE		PBF_FS_MANAGED
-#define XFS_BUF_UNMANAGE(x)	((x)->pb_flags &= ~PBF_FS_MANAGED)
-
-#define XFS_BUF_DELAYWRITE(x)	 ((x)->pb_flags |= PBF_DELWRI)
-#define XFS_BUF_UNDELAYWRITE(x)	 pagebuf_delwri_dequeue(x)
-#define XFS_BUF_ISDELAYWRITE(x)	 ((x)->pb_flags & PBF_DELWRI)
-
-#define XFS_BUF_ERROR(x,no)	 pagebuf_ioerror(x,no)
-#define XFS_BUF_GETERROR(x)	 pagebuf_geterror(x)
-#define XFS_BUF_ISERROR(x)	 (pagebuf_geterror(x)?1:0)
-
-#define XFS_BUF_DONE(x)		 ((x)->pb_flags |= PBF_DONE)
-#define XFS_BUF_UNDONE(x)	 ((x)->pb_flags &= ~PBF_DONE)
-#define XFS_BUF_ISDONE(x)	 ((x)->pb_flags & PBF_DONE)
-
-#define XFS_BUF_BUSY(x)		 do { } while (0)
-#define XFS_BUF_UNBUSY(x)	 do { } while (0)
-#define XFS_BUF_ISBUSY(x)	 (1)
-
-#define XFS_BUF_ASYNC(x)	 ((x)->pb_flags |= PBF_ASYNC)
-#define XFS_BUF_UNASYNC(x)	 ((x)->pb_flags &= ~PBF_ASYNC)
-#define XFS_BUF_ISASYNC(x)	 ((x)->pb_flags & PBF_ASYNC)
-
-#define XFS_BUF_ORDERED(x)	 ((x)->pb_flags |= PBF_ORDERED)
-#define XFS_BUF_UNORDERED(x)	 ((x)->pb_flags &= ~PBF_ORDERED)
-#define XFS_BUF_ISORDERED(x)	 ((x)->pb_flags & PBF_ORDERED)
-
-#define XFS_BUF_SHUT(x)		 printk("XFS_BUF_SHUT not implemented yet\n")
-#define XFS_BUF_UNSHUT(x)	 printk("XFS_BUF_UNSHUT not implemented yet\n")
-#define XFS_BUF_ISSHUT(x)	 (0)
-
-#define XFS_BUF_HOLD(x)		pagebuf_hold(x)
-#define XFS_BUF_READ(x)		((x)->pb_flags |= PBF_READ)
-#define XFS_BUF_UNREAD(x)	((x)->pb_flags &= ~PBF_READ)
-#define XFS_BUF_ISREAD(x)	((x)->pb_flags & PBF_READ)
-
-#define XFS_BUF_WRITE(x)	((x)->pb_flags |= PBF_WRITE)
-#define XFS_BUF_UNWRITE(x)	((x)->pb_flags &= ~PBF_WRITE)
-#define XFS_BUF_ISWRITE(x)	((x)->pb_flags & PBF_WRITE)
-
-#define XFS_BUF_ISUNINITIAL(x)	 (0)
-#define XFS_BUF_UNUNINITIAL(x)	 (0)
-
-#define XFS_BUF_BP_ISMAPPED(bp)	 1
-
-#define XFS_BUF_IODONE_FUNC(buf)	(buf)->pb_iodone
-#define XFS_BUF_SET_IODONE_FUNC(buf, func)	\
-			(buf)->pb_iodone = (func)
-#define XFS_BUF_CLR_IODONE_FUNC(buf)		\
-			(buf)->pb_iodone = NULL
-#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func)	\
-			(buf)->pb_strat = (func)
-#define XFS_BUF_CLR_BDSTRAT_FUNC(buf)		\
-			(buf)->pb_strat = NULL
-
-#define XFS_BUF_FSPRIVATE(buf, type)		\
-			((type)(buf)->pb_fspriv)
-#define XFS_BUF_SET_FSPRIVATE(buf, value)	\
-			(buf)->pb_fspriv = (void *)(value)
-#define XFS_BUF_FSPRIVATE2(buf, type)		\
-			((type)(buf)->pb_fspriv2)
-#define XFS_BUF_SET_FSPRIVATE2(buf, value)	\
-			(buf)->pb_fspriv2 = (void *)(value)
-#define XFS_BUF_FSPRIVATE3(buf, type)		\
-			((type)(buf)->pb_fspriv3)
-#define XFS_BUF_SET_FSPRIVATE3(buf, value)	\
-			(buf)->pb_fspriv3 = (void *)(value)
-#define XFS_BUF_SET_START(buf)
-
-#define XFS_BUF_SET_BRELSE_FUNC(buf, value)	\
-			(buf)->pb_relse = (value)
-
-#define XFS_BUF_PTR(bp)		(xfs_caddr_t)((bp)->pb_addr)
-
-static inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset)
-{
-	if (bp->pb_flags & PBF_MAPPED)
-		return XFS_BUF_PTR(bp) + offset;
-	return (xfs_caddr_t) pagebuf_offset(bp, offset);
-}
+#define BUF_BUSY		XBF_DONT_BLOCK
+
+#define XFS_BUF_BFLAGS(bp)	((bp)->b_flags)
+#define XFS_BUF_ZEROFLAGS(bp)	\
+	((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI))
+
+#define XFS_BUF_STALE(bp)	((bp)->b_flags |= XFS_B_STALE)
+#define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XFS_B_STALE)
+#define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XFS_B_STALE)
+#define XFS_BUF_SUPER_STALE(bp)	do {				\
+					XFS_BUF_STALE(bp);	\
+					xfs_buf_delwri_dequeue(bp);	\
+					XFS_BUF_DONE(bp);	\
+				} while (0)
 
-#define XFS_BUF_SET_PTR(bp, val, count)		\
-			pagebuf_associate_memory(bp, val, count)
-#define XFS_BUF_ADDR(bp)	((bp)->pb_bn)
-#define XFS_BUF_SET_ADDR(bp, blk)		\
-			((bp)->pb_bn = (xfs_daddr_t)(blk))
-#define XFS_BUF_OFFSET(bp)	((bp)->pb_file_offset)
-#define XFS_BUF_SET_OFFSET(bp, off)		\
-			((bp)->pb_file_offset = (off))
-#define XFS_BUF_COUNT(bp)	((bp)->pb_count_desired)
-#define XFS_BUF_SET_COUNT(bp, cnt)		\
-			((bp)->pb_count_desired = (cnt))
-#define XFS_BUF_SIZE(bp)	((bp)->pb_buffer_length)
-#define XFS_BUF_SET_SIZE(bp, cnt)		\
-			((bp)->pb_buffer_length = (cnt))
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
-#define XFS_BUF_SET_VTYPE(bp, type)
-#define XFS_BUF_SET_REF(bp, ref)
-
-#define XFS_BUF_ISPINNED(bp)	pagebuf_ispin(bp)
-
-#define XFS_BUF_VALUSEMA(bp)	pagebuf_lock_value(bp)
-#define XFS_BUF_CPSEMA(bp)	(pagebuf_cond_lock(bp) == 0)
-#define XFS_BUF_VSEMA(bp)	pagebuf_unlock(bp)
-#define XFS_BUF_PSEMA(bp,x)	pagebuf_lock(bp)
-#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema);
-
-/* setup the buffer target from a buftarg structure */
-#define XFS_BUF_SET_TARGET(bp, target)	\
-		(bp)->pb_target = (target)
-#define XFS_BUF_TARGET(bp)	((bp)->pb_target)
-#define XFS_BUFTARG_NAME(target)	\
-		pagebuf_target_name(target)
-
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
-#define XFS_BUF_SET_VTYPE(bp, type)
-#define XFS_BUF_SET_REF(bp, ref)
-
-static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
+#define XFS_BUF_MANAGE		XBF_FS_MANAGED
+#define XFS_BUF_UNMANAGE(bp)	((bp)->b_flags &= ~XBF_FS_MANAGED)
+
+#define XFS_BUF_DELAYWRITE(bp)		((bp)->b_flags |= XBF_DELWRI)
+#define XFS_BUF_UNDELAYWRITE(bp)	xfs_buf_delwri_dequeue(bp)
+#define XFS_BUF_ISDELAYWRITE(bp)	((bp)->b_flags & XBF_DELWRI)
+
+#define XFS_BUF_ERROR(bp,no)	xfs_buf_ioerror(bp,no)
+#define XFS_BUF_GETERROR(bp)	xfs_buf_geterror(bp)
+#define XFS_BUF_ISERROR(bp)	(xfs_buf_geterror(bp) ? 1 : 0)
+
+#define XFS_BUF_DONE(bp)	((bp)->b_flags |= XBF_DONE)
+#define XFS_BUF_UNDONE(bp)	((bp)->b_flags &= ~XBF_DONE)
+#define XFS_BUF_ISDONE(bp)	((bp)->b_flags & XBF_DONE)
+
+#define XFS_BUF_BUSY(bp)	do { } while (0)
+#define XFS_BUF_UNBUSY(bp)	do { } while (0)
+#define XFS_BUF_ISBUSY(bp)	(1)
+
+#define XFS_BUF_ASYNC(bp)	((bp)->b_flags |= XBF_ASYNC)
+#define XFS_BUF_UNASYNC(bp)	((bp)->b_flags &= ~XBF_ASYNC)
+#define XFS_BUF_ISASYNC(bp)	((bp)->b_flags & XBF_ASYNC)
+
+#define XFS_BUF_ORDERED(bp)	((bp)->b_flags |= XBF_ORDERED)
+#define XFS_BUF_UNORDERED(bp)	((bp)->b_flags &= ~XBF_ORDERED)
+#define XFS_BUF_ISORDERED(bp)	((bp)->b_flags & XBF_ORDERED)
+
+#define XFS_BUF_SHUT(bp)	do { } while (0)
+#define XFS_BUF_UNSHUT(bp)	do { } while (0)
+#define XFS_BUF_ISSHUT(bp)	(0)
+
+#define XFS_BUF_HOLD(bp)	xfs_buf_hold(bp)
+#define XFS_BUF_READ(bp)	((bp)->b_flags |= XBF_READ)
+#define XFS_BUF_UNREAD(bp)	((bp)->b_flags &= ~XBF_READ)
+#define XFS_BUF_ISREAD(bp)	((bp)->b_flags & XBF_READ)
+
+#define XFS_BUF_WRITE(bp)	((bp)->b_flags |= XBF_WRITE)
+#define XFS_BUF_UNWRITE(bp)	((bp)->b_flags &= ~XBF_WRITE)
+#define XFS_BUF_ISWRITE(bp)	((bp)->b_flags & XBF_WRITE)
+
+#define XFS_BUF_ISUNINITIAL(bp)	(0)
+#define XFS_BUF_UNUNINITIAL(bp)	(0)
+
+#define XFS_BUF_BP_ISMAPPED(bp)	(1)
+
+#define XFS_BUF_IODONE_FUNC(bp)			((bp)->b_iodone)
+#define XFS_BUF_SET_IODONE_FUNC(bp, func)	((bp)->b_iodone = (func))
+#define XFS_BUF_CLR_IODONE_FUNC(bp)		((bp)->b_iodone = NULL)
+#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func)	((bp)->b_strat = (func))
+#define XFS_BUF_CLR_BDSTRAT_FUNC(bp)		((bp)->b_strat = NULL)
+
+#define XFS_BUF_FSPRIVATE(bp, type)		((type)(bp)->b_fspriv)
+#define XFS_BUF_SET_FSPRIVATE(bp, val)		((bp)->b_fspriv = (void*)(val))
+#define XFS_BUF_FSPRIVATE2(bp, type)		((type)(bp)->b_fspriv2)
+#define XFS_BUF_SET_FSPRIVATE2(bp, val)		((bp)->b_fspriv2 = (void*)(val))
+#define XFS_BUF_FSPRIVATE3(bp, type)		((type)(bp)->b_fspriv3)
+#define XFS_BUF_SET_FSPRIVATE3(bp, val)		((bp)->b_fspriv3 = (void*)(val))
+#define XFS_BUF_SET_START(bp)			do { } while (0)
+#define XFS_BUF_SET_BRELSE_FUNC(bp, func)	((bp)->b_relse = (func))
+
+#define XFS_BUF_PTR(bp)			(xfs_caddr_t)((bp)->b_addr)
+#define XFS_BUF_SET_PTR(bp, val, cnt)	xfs_buf_associate_memory(bp, val, cnt)
+#define XFS_BUF_ADDR(bp)		((bp)->b_bn)
+#define XFS_BUF_SET_ADDR(bp, bno)	((bp)->b_bn = (xfs_daddr_t)(bno))
+#define XFS_BUF_OFFSET(bp)		((bp)->b_file_offset)
+#define XFS_BUF_SET_OFFSET(bp, off)	((bp)->b_file_offset = (off))
+#define XFS_BUF_COUNT(bp)		((bp)->b_count_desired)
+#define XFS_BUF_SET_COUNT(bp, cnt)	((bp)->b_count_desired = (cnt))
+#define XFS_BUF_SIZE(bp)		((bp)->b_buffer_length)
+#define XFS_BUF_SET_SIZE(bp, cnt)	((bp)->b_buffer_length = (cnt))
+
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)	do { } while (0)
+#define XFS_BUF_SET_VTYPE(bp, type)		do { } while (0)
+#define XFS_BUF_SET_REF(bp, ref)		do { } while (0)
+
+#define XFS_BUF_ISPINNED(bp)	xfs_buf_ispin(bp)
+
+#define XFS_BUF_VALUSEMA(bp)	xfs_buf_lock_value(bp)
+#define XFS_BUF_CPSEMA(bp)	(xfs_buf_cond_lock(bp) == 0)
+#define XFS_BUF_VSEMA(bp)	xfs_buf_unlock(bp)
+#define XFS_BUF_PSEMA(bp,x)	xfs_buf_lock(bp)
+#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema);
+
+#define XFS_BUF_SET_TARGET(bp, target)	((bp)->b_target = (target))
+#define XFS_BUF_TARGET(bp)		((bp)->b_target)
+#define XFS_BUFTARG_NAME(target)	xfs_buf_target_name(target)
+
+static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
 {
-	bp->pb_fspriv3 = mp;
-	bp->pb_strat = xfs_bdstrat_cb;
-	pagebuf_delwri_dequeue(bp);
-	return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | _PBF_RUN_QUEUES);
+	bp->b_fspriv3 = mp;
+	bp->b_strat = xfs_bdstrat_cb;
+	xfs_buf_delwri_dequeue(bp);
+	return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
 }
 
 static inline void xfs_buf_relse(xfs_buf_t *bp)
 {
-	if (!bp->pb_relse)
-		pagebuf_unlock(bp);
-	pagebuf_rele(bp);
+	if (!bp->b_relse)
+		xfs_buf_unlock(bp);
+	xfs_buf_rele(bp);
 }
 
-#define xfs_bpin(bp)		pagebuf_pin(bp)
-#define xfs_bunpin(bp)		pagebuf_unpin(bp)
+#define xfs_bpin(bp)		xfs_buf_pin(bp)
+#define xfs_bunpin(bp)		xfs_buf_unpin(bp)
 
 #define xfs_buftrace(id, bp)	\
-	pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
+	xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
 
-#define xfs_biodone(pb)	\
-	pagebuf_iodone(pb, 0)
+#define xfs_biodone(bp)		xfs_buf_ioend(bp, 0)
 
-#define xfs_biomove(pb, off, len, data, rw) \
-	pagebuf_iomove((pb), (off), (len), (data), \
-		((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ)
+#define xfs_biomove(bp, off, len, data, rw) \
+	xfs_buf_iomove((bp), (off), (len), (data), \
+		((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ)
 
-#define xfs_biozero(pb, off, len) \
-	pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO)
+#define xfs_biozero(bp, off, len) \
+	xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
 
-static inline int XFS_bwrite(xfs_buf_t *pb)
+static inline int XFS_bwrite(xfs_buf_t *bp)
 {
-	int iowait = (pb->pb_flags & PBF_ASYNC) == 0;
+	int	iowait = (bp->b_flags & XBF_ASYNC) == 0;
 	int	error = 0;
 
 	if (!iowait)
-		pb->pb_flags |= _PBF_RUN_QUEUES;
+		bp->b_flags |= _XBF_RUN_QUEUES;
 
-	pagebuf_delwri_dequeue(pb);
-	pagebuf_iostrategy(pb);
+	xfs_buf_delwri_dequeue(bp);
+	xfs_buf_iostrategy(bp);
 	if (iowait) {
-		error = pagebuf_iowait(pb);
-		xfs_buf_relse(pb);
+		error = xfs_buf_iowait(bp);
+		xfs_buf_relse(bp);
 	}
 	return error;
 }
 
-#define XFS_bdwrite(pb) \
-	pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC)
+#define XFS_bdwrite(bp)		xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC)
 
 static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
 {
-	bp->pb_strat = xfs_bdstrat_cb;
-	bp->pb_fspriv3 = mp;
-
-	return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC);
+	bp->b_strat = xfs_bdstrat_cb;
+	bp->b_fspriv3 = mp;
+	return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
 }
 
-#define XFS_bdstrat(bp) pagebuf_iorequest(bp)
+#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
 
-#define xfs_iowait(pb)	pagebuf_iowait(pb)
+#define xfs_iowait(bp)	xfs_buf_iowait(bp)
 
 #define xfs_baread(target, rablkno, ralen)  \
-	pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK)
-
-#define xfs_buf_get_empty(len, target)	pagebuf_get_empty((len), (target))
-#define xfs_buf_get_noaddr(len, target)	pagebuf_get_no_daddr((len), (target))
-#define xfs_buf_free(bp)		pagebuf_free(bp)
+	xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
 
 
 /*
  * Handling of buftargs.
  */
-
 extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
 extern void xfs_free_buftarg(xfs_buftarg_t *, int);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
 
-#define xfs_getsize_buftarg(buftarg) \
-	block_size((buftarg)->pbr_bdev)
-#define xfs_readonly_buftarg(buftarg) \
-	bdev_read_only((buftarg)->pbr_bdev)
-#define xfs_binval(buftarg) \
-	xfs_flush_buftarg(buftarg, 1)
-#define XFS_bflush(buftarg) \
-	xfs_flush_buftarg(buftarg, 1)
+#define xfs_getsize_buftarg(buftarg)	block_size((buftarg)->bt_bdev)
+#define xfs_readonly_buftarg(buftarg)	bdev_read_only((buftarg)->bt_bdev)
+
+#define xfs_binval(buftarg)		xfs_flush_buftarg(buftarg, 1)
+#define XFS_bflush(buftarg)		xfs_flush_buftarg(buftarg, 1)
 
 #endif	/* __XFS_BUF_H__ */
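The xfs_buf.h hunk above is almost entirely renaming: the old pagebuf_*/PBF_* identifiers become xfs_buf_*/XBF_* with no behavioural change. A minimal standalone C sketch (userspace, not kernel code; the struct and main() harness are illustrative assumptions) modelling two conventions visible in the new header: flag bits are tested with bitwise AND, and xfs_buf_geterror() treats a NULL buffer as ENOMEM.

/* cc -o xbf_demo xbf_demo.c && ./xbf_demo */
#include <errno.h>
#include <stdio.h>

enum {
	XBF_LOCK	= (1 << 14),	/* lock requested */
	XBF_TRYLOCK	= (1 << 15),	/* lock requested, but do not wait */
	XBF_DONT_BLOCK	= (1 << 16),	/* do not block in current thread */
};

struct fake_buf {
	unsigned int	b_flags;
	unsigned short	b_error;	/* error code on I/O */
};

/* Same shape as the new xfs_buf_geterror(): NULL reads as ENOMEM. */
static int fake_geterror(struct fake_buf *bp)
{
	return bp ? bp->b_error : ENOMEM;
}

int main(void)
{
	struct fake_buf b = { XBF_LOCK | XBF_DONT_BLOCK, 0 };

	printf("trylock set: %d\n", !!(b.b_flags & XBF_TRYLOCK));
	printf("NULL buffer error: %d\n", fake_geterror(NULL));
	printf("real buffer error: %d\n", fake_geterror(&b));
	return 0;
}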
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index 4af49102472..e7f3da61c6c 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -18,6 +18,8 @@
 #ifndef __XFS_CRED_H__
 #define __XFS_CRED_H__
 
+#include <linux/capability.h>
+
 /*
  * Credentials
  */
@@ -27,7 +29,7 @@ typedef struct cred {
 
 extern struct cred *sys_cred;
 
-/* this is a hack.. (assums sys_cred is the only cred_t in the system) */
+/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
 static __inline int capable_cred(cred_t *cr, int cid)
 {
 	return (cr == sys_cred) ? 1 : capable(cid);
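For context, capable_cred() above short-circuits the capability check whenever the caller presents the global sys_cred. A small userspace model of that test; the stub capable() standing in for the kernel helper is an assumption.

#include <stdio.h>

typedef struct cred { int unused; } cred_t;

static cred_t sys_cred_obj;
static cred_t *sys_cred = &sys_cred_obj;

static int capable(int cid)		/* stand-in for kernel capable() */
{
	return cid == 0;		/* pretend only capability 0 is held */
}

static int capable_cred(cred_t *cr, int cid)
{
	return (cr == sys_cred) ? 1 : capable(cid);
}

int main(void)
{
	cred_t user;

	printf("sys_cred, cap 7:  %d\n", capable_cred(sys_cred, 7));
	printf("user cred, cap 7: %d\n", capable_cred(&user, 7));
	return 0;
}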
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 06111d0bbae..ced4404339c 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -509,16 +509,14 @@ linvfs_open_exec(
 	vnode_t	*vp = LINVFS_GET_VP(inode);
 	xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp);
 	int error = 0;
-	bhv_desc_t *bdp;
 	xfs_inode_t *ip;
 
 	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
-		bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
-		if (!bdp) {
+		ip = xfs_vtoi(vp);
+		if (!ip) {
 			error = -EINVAL;
 			goto open_exec_out;
 		}
-		ip = XFS_BHVTOI(bdp);
 		if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) {
 			error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp,
 						0, 0, 0, NULL);
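The change above replaces an open-coded behavior-descriptor walk with a single xfs_vtoi()-style helper that may return NULL, so callers reduce to one lookup plus one NULL check. A hedged userspace sketch of that caller pattern; the types and lookup body are illustrative assumptions, not the kernel implementation.

#include <stddef.h>
#include <stdio.h>

struct vnode { void *xfs_private; };
struct xfs_inode { int ino; };

/* Models xfs_vtoi(): NULL when the vnode has no XFS inode attached. */
static struct xfs_inode *fake_vtoi(struct vnode *vp)
{
	return (struct xfs_inode *)vp->xfs_private;
}

static int open_exec_check(struct vnode *vp)
{
	struct xfs_inode *ip = fake_vtoi(vp);

	if (!ip)
		return -22;	/* -EINVAL, as in the hunk above */
	return 0;
}

int main(void)
{
	struct xfs_inode xi = { 42 };
	struct vnode good = { &xi }, bad = { NULL };

	printf("good vnode: %d\n", open_exec_check(&good));
	printf("bad vnode:  %d\n", open_exec_check(&bad));
	return 0;
}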
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index f89340c61bf..4fa4b1a5187 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -79,8 +79,7 @@ fs_flushinval_pages(
 	struct inode	*ip = LINVFS_GET_IP(vp);
 
 	if (VN_CACHED(vp)) {
-		filemap_fdatawrite(ip->i_mapping);
-		filemap_fdatawait(ip->i_mapping);
+		filemap_write_and_wait(ip->i_mapping);
 
 		truncate_inode_pages(ip->i_mapping, first);
 	}
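The hunk above folds a write-then-wait pair into one filemap_write_and_wait() call. A userspace sketch of the shape of such a combined helper, including the usual "report the first error" convention; the stub functions are assumptions standing in for the kernel's writeback API.

#include <stdio.h>

struct address_space { int dirty; };

static int fdatawrite(struct address_space *m) { m->dirty = 0; return 0; }
static int fdatawait(struct address_space *m)  { (void)m; return 0; }

/* Combined helper: start writeback, then wait; first error wins. */
static int write_and_wait(struct address_space *m)
{
	int err = fdatawrite(m);
	int err2 = fdatawait(m);

	return err ? err : err2;
}

int main(void)
{
	struct address_space m = { 1 };

	printf("ret=%d dirty=%d\n", write_and_wait(&m), m.dirty);
	return 0;
}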
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index b78b5eb9e96..4db47790415 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -52,6 +52,7 @@
 #include "xfs_dfrag.h"
 #include "xfs_fsops.h"
 
+#include <linux/capability.h>
 #include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
@@ -145,13 +146,10 @@ xfs_find_handle(
 
 	if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
 		xfs_inode_t	*ip;
-		bhv_desc_t	*bhv;
 		int		lock_mode;
 
 		/* need to get access to the xfs_inode to read the generation */
-		bhv = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops);
-		ASSERT(bhv);
-		ip = XFS_BHVTOI(bhv);
+		ip = xfs_vtoi(vp);
 		ASSERT(ip);
 		lock_mode = xfs_ilock_map_shared(ip);
 
@@ -530,6 +528,8 @@ xfs_attrmulti_attr_set(
 	char		*kbuf;
 	int		error = EFAULT;
 
+	if (IS_RDONLY(&vp->v_inode))
+		return -EROFS;
 	if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
 		return EPERM;
 	if (len > XATTR_SIZE_MAX)
@@ -557,6 +557,9 @@ xfs_attrmulti_attr_remove(
 {
 	int		error;
 
+
+	if (IS_RDONLY(&vp->v_inode))
+		return -EROFS;
 	if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
 		return EPERM;
 
@@ -745,9 +748,8 @@ xfs_ioctl(
 			(ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
 			mp->m_rtdev_targp : mp->m_ddev_targp;
 
-		da.d_mem = da.d_miniosz = 1 << target->pbr_sshift;
-		/* The size dio will do in one go */
-		da.d_maxiosz = 64 * PAGE_CACHE_SIZE;
+		da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
+		da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
 
 		if (copy_to_user(arg, &da, sizeof(da)))
 			return -XFS_ERROR(EFAULT);
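The dioinfo change above stops hard-coding the maximum direct-I/O size at 64 pages and instead reports INT_MAX rounded down to a multiple of the minimum I/O size. A small runnable demo of that mask arithmetic, assuming a 512-byte sector (the minimum size is always a power of two here, since it is 1 << bt_sshift):

#include <limits.h>
#include <stdio.h>

int main(void)
{
	int sshift = 9;				/* assumed: 512-byte sectors */
	int miniosz = 1 << sshift;
	int maxiosz = INT_MAX & ~(miniosz - 1);	/* round down to miniosz */

	printf("miniosz=%d maxiosz=%d remainder=%d\n",
	       miniosz, maxiosz, maxiosz % miniosz);
	return 0;
}

The mask works because miniosz - 1 has only the low bits set, so clearing those bits rounds any value down to the nearest multiple.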
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index c83ae15bb0e..a7c9ba1a9f7 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -19,7 +19,6 @@
 #include <linux/compat.h>
 #include <linux/init.h>
 #include <linux/ioctl.h>
-#include <linux/ioctl32.h>
 #include <linux/syscalls.h>
 #include <linux/types.h>
 #include <linux/fs.h>
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 14215a7db59..76c6df34d0d 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -51,8 +51,44 @@
51#include "xfs_buf_item.h" 51#include "xfs_buf_item.h"
52#include "xfs_utils.h" 52#include "xfs_utils.h"
53 53
54#include <linux/capability.h>
54#include <linux/xattr.h> 55#include <linux/xattr.h>
55#include <linux/namei.h> 56#include <linux/namei.h>
57#include <linux/security.h>
58
59/*
60 * Get a XFS inode from a given vnode.
61 */
62xfs_inode_t *
63xfs_vtoi(
64 struct vnode *vp)
65{
66 bhv_desc_t *bdp;
67
68 bdp = bhv_lookup_range(VN_BHV_HEAD(vp),
69 VNODE_POSITION_XFS, VNODE_POSITION_XFS);
70 if (unlikely(bdp == NULL))
71 return NULL;
72 return XFS_BHVTOI(bdp);
73}
74
75/*
76 * Bring the atime in the XFS inode uptodate.
77 * Used before logging the inode to disk or when the Linux inode goes away.
78 */
79void
80xfs_synchronize_atime(
81 xfs_inode_t *ip)
82{
83 vnode_t *vp;
84
85 vp = XFS_ITOV_NULL(ip);
86 if (vp) {
87 struct inode *inode = &vp->v_inode;
88 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
89 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
90 }
91}
56 92
57/* 93/*
58 * Change the requested timestamp in the given inode. 94 * Change the requested timestamp in the given inode.
@@ -73,23 +109,6 @@ xfs_ichgtime(
73 struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip)); 109 struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip));
74 timespec_t tv; 110 timespec_t tv;
75 111
76 /*
77 * We're not supposed to change timestamps in readonly-mounted
78 * filesystems. Throw it away if anyone asks us.
79 */
80 if (unlikely(IS_RDONLY(inode)))
81 return;
82
83 /*
84 * Don't update access timestamps on reads if mounted "noatime".
85 * Throw it away if anyone asks us.
86 */
87 if (unlikely(
88 (ip->i_mount->m_flags & XFS_MOUNT_NOATIME || IS_NOATIME(inode)) &&
89 (flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG)) ==
90 XFS_ICHGTIME_ACC))
91 return;
92
93 nanotime(&tv); 112 nanotime(&tv);
94 if (flags & XFS_ICHGTIME_MOD) { 113 if (flags & XFS_ICHGTIME_MOD) {
95 inode->i_mtime = tv; 114 inode->i_mtime = tv;
@@ -126,8 +145,6 @@ xfs_ichgtime(
126 * Variant on the above which avoids querying the system clock 145 * Variant on the above which avoids querying the system clock
127 * in situations where we know the Linux inode timestamps have 146 * in situations where we know the Linux inode timestamps have
128 * just been updated (and so we can update our inode cheaply). 147 * just been updated (and so we can update our inode cheaply).
129 * We also skip the readonly and noatime checks here, they are
130 * also catered for already.
131 */ 148 */
132void 149void
133xfs_ichgtime_fast( 150xfs_ichgtime_fast(
@@ -138,20 +155,16 @@ xfs_ichgtime_fast(
138 timespec_t *tvp; 155 timespec_t *tvp;
139 156
140 /* 157 /*
141 * We're not supposed to change timestamps in readonly-mounted 158 * Atime updates for read() & friends are handled lazily now, and
142 * filesystems. Throw it away if anyone asks us. 159 * explicit updates must go through xfs_ichgtime()
143 */ 160 */
144 if (unlikely(IS_RDONLY(inode))) 161 ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
145 return;
146 162
147 /* 163 /*
148 * Don't update access timestamps on reads if mounted "noatime". 164 * We're not supposed to change timestamps in readonly-mounted
149 * Throw it away if anyone asks us. 165 * filesystems. Throw it away if anyone asks us.
150 */ 166 */
151 if (unlikely( 167 if (unlikely(IS_RDONLY(inode)))
152 (ip->i_mount->m_flags & XFS_MOUNT_NOATIME || IS_NOATIME(inode)) &&
153 ((flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG)) ==
154 XFS_ICHGTIME_ACC)))
155 return; 168 return;
156 169
157 if (flags & XFS_ICHGTIME_MOD) { 170 if (flags & XFS_ICHGTIME_MOD) {
@@ -159,11 +172,6 @@ xfs_ichgtime_fast(
159 ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec; 172 ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
160 ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec; 173 ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec;
161 } 174 }
162 if (flags & XFS_ICHGTIME_ACC) {
163 tvp = &inode->i_atime;
164 ip->i_d.di_atime.t_sec = (__int32_t)tvp->tv_sec;
165 ip->i_d.di_atime.t_nsec = (__int32_t)tvp->tv_nsec;
166 }
167 if (flags & XFS_ICHGTIME_CHG) { 175 if (flags & XFS_ICHGTIME_CHG) {
168 tvp = &inode->i_ctime; 176 tvp = &inode->i_ctime;
169 ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec; 177 ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec;
@@ -203,13 +211,46 @@ validate_fields(
203 ip->i_nlink = va.va_nlink; 211 ip->i_nlink = va.va_nlink;
204 ip->i_blocks = va.va_nblocks; 212 ip->i_blocks = va.va_nblocks;
205 213
206 /* we're under i_sem so i_size can't change under us */ 214 /* we're under i_mutex so i_size can't change under us */
207 if (i_size_read(ip) != va.va_size) 215 if (i_size_read(ip) != va.va_size)
208 i_size_write(ip, va.va_size); 216 i_size_write(ip, va.va_size);
209 } 217 }
210} 218}
211 219
212/* 220/*
221 * Hook in SELinux. This is not quite correct yet, what we really need
222 * here (as we do for default ACLs) is a mechanism by which creation of
223 * these attrs can be journalled at inode creation time (along with the
224 * inode, of course, such that log replay can't cause these to be lost).
225 */
226STATIC int
227linvfs_init_security(
228 struct vnode *vp,
229 struct inode *dir)
230{
231 struct inode *ip = LINVFS_GET_IP(vp);
232 size_t length;
233 void *value;
234 char *name;
235 int error;
236
237 error = security_inode_init_security(ip, dir, &name, &value, &length);
238 if (error) {
239 if (error == -EOPNOTSUPP)
240 return 0;
241 return -error;
242 }
243
244 VOP_ATTR_SET(vp, name, value, length, ATTR_SECURE, NULL, error);
245 if (!error)
246 VMODIFY(vp);
247
248 kfree(name);
249 kfree(value);
250 return error;
251}
252
253/*
213 * Determine whether a process has a valid fs_struct (kernel daemons 254 * Determine whether a process has a valid fs_struct (kernel daemons
214 * like knfsd don't have an fs_struct). 255 * like knfsd don't have an fs_struct).
215 * 256 *
@@ -274,6 +315,9 @@ linvfs_mknod(
274 break; 315 break;
275 } 316 }
276 317
318 if (!error)
319 error = linvfs_init_security(vp, dir);
320
277 if (default_acl) { 321 if (default_acl) {
278 if (!error) { 322 if (!error) {
279 error = _ACL_INHERIT(vp, &va, default_acl); 323 error = _ACL_INHERIT(vp, &va, default_acl);
@@ -290,8 +334,6 @@ linvfs_mknod(
290 teardown.d_inode = ip = LINVFS_GET_IP(vp); 334 teardown.d_inode = ip = LINVFS_GET_IP(vp);
291 teardown.d_name = dentry->d_name; 335 teardown.d_name = dentry->d_name;
292 336
293 vn_mark_bad(vp);
294
295 if (S_ISDIR(mode)) 337 if (S_ISDIR(mode))
296 VOP_RMDIR(dvp, &teardown, NULL, err2); 338 VOP_RMDIR(dvp, &teardown, NULL, err2);
297 else 339 else
@@ -429,11 +471,14 @@ linvfs_symlink(
429 471
430 error = 0; 472 error = 0;
431 VOP_SYMLINK(dvp, dentry, &va, (char *)symname, &cvp, NULL, error); 473 VOP_SYMLINK(dvp, dentry, &va, (char *)symname, &cvp, NULL, error);
432 if (!error && cvp) { 474 if (likely(!error && cvp)) {
433 ip = LINVFS_GET_IP(cvp); 475 error = linvfs_init_security(cvp, dir);
434 d_instantiate(dentry, ip); 476 if (likely(!error)) {
435 validate_fields(dir); 477 ip = LINVFS_GET_IP(cvp);
436 validate_fields(ip); /* size needs update */ 478 d_instantiate(dentry, ip);
479 validate_fields(dir);
480 validate_fields(ip);
481 }
437 } 482 }
438 return -error; 483 return -error;
439} 484}
@@ -502,7 +547,7 @@ linvfs_follow_link(
502 ASSERT(dentry); 547 ASSERT(dentry);
503 ASSERT(nd); 548 ASSERT(nd);
504 549
505 link = (char *)kmalloc(MAXNAMELEN+1, GFP_KERNEL); 550 link = (char *)kmalloc(MAXPATHLEN+1, GFP_KERNEL);
506 if (!link) { 551 if (!link) {
507 nd_set_link(nd, ERR_PTR(-ENOMEM)); 552 nd_set_link(nd, ERR_PTR(-ENOMEM));
508 return NULL; 553 return NULL;
@@ -518,12 +563,12 @@ linvfs_follow_link(
518 vp = LINVFS_GET_VP(dentry->d_inode); 563 vp = LINVFS_GET_VP(dentry->d_inode);
519 564
520 iov.iov_base = link; 565 iov.iov_base = link;
521 iov.iov_len = MAXNAMELEN; 566 iov.iov_len = MAXPATHLEN;
522 567
523 uio->uio_iov = &iov; 568 uio->uio_iov = &iov;
524 uio->uio_offset = 0; 569 uio->uio_offset = 0;
525 uio->uio_segflg = UIO_SYSSPACE; 570 uio->uio_segflg = UIO_SYSSPACE;
526 uio->uio_resid = MAXNAMELEN; 571 uio->uio_resid = MAXPATHLEN;
527 uio->uio_iovcnt = 1; 572 uio->uio_iovcnt = 1;
528 573
529 VOP_READLINK(vp, uio, 0, NULL, error); 574 VOP_READLINK(vp, uio, 0, NULL, error);
@@ -531,7 +576,7 @@ linvfs_follow_link(
531 kfree(link); 576 kfree(link);
532 link = ERR_PTR(-error); 577 link = ERR_PTR(-error);
533 } else { 578 } else {
534 link[MAXNAMELEN - uio->uio_resid] = '\0'; 579 link[MAXPATHLEN - uio->uio_resid] = '\0';
535 } 580 }
536 kfree(uio); 581 kfree(uio);
537 582
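The linvfs_init_security() helper added above follows a common LSM pattern: ask for an initial security attribute, treat -EOPNOTSUPP as success (no LSM active), otherwise store the attribute and free the returned name and value. A hedged userspace model of that control flow only; the stubs stand in for security_inode_init_security() and the attribute-set operation and are assumptions, not kernel code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define EOPNOTSUPP 95

static int stub_init_security(char **name, void **value, size_t *len)
{
	*name = strdup("selinux");	/* assumed attribute name */
	*value = strdup("ctx");
	*len = 3;
	return 0;			/* or -EOPNOTSUPP with no LSM */
}

static int stub_attr_set(const char *n, const void *v, size_t len)
{
	(void)v;
	printf("set attr %s (%zu bytes)\n", n, len);
	return 0;
}

static int init_security(void)
{
	char *name;
	void *value;
	size_t len;
	int error = stub_init_security(&name, &value, &len);

	if (error)
		return (error == -EOPNOTSUPP) ? 0 : -error;

	error = stub_attr_set(name, value, len);
	free(name);		/* caller owns the returned buffers */
	free(value);
	return error;
}

int main(void)
{
	printf("init_security: %d\n", init_security());
	return 0;
}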
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index ee784b63acb..6899a6b4a50 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -26,11 +26,6 @@ extern struct file_operations linvfs_file_operations;
 extern struct file_operations linvfs_invis_file_operations;
 extern struct file_operations linvfs_dir_operations;
 
-extern struct address_space_operations linvfs_aops;
-
-extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
-extern void linvfs_unwritten_done(struct buffer_head *, int);
-
 extern int xfs_ioctl(struct bhv_desc *, struct inode *, struct file *,
 			int, unsigned int, void __user *);
 
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index d8e21ba0ccc..67389b74552 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -110,10 +110,6 @@
  * delalloc and these ondisk-uninitialised buffers.
  */
 BUFFER_FNS(PrivateStart, unwritten);
-static inline void set_buffer_unwritten_io(struct buffer_head *bh)
-{
-	bh->b_end_io = linvfs_unwritten_done;
-}
 
 #define restricted_chown	xfs_params.restrict_chown.val
 #define irix_sgid_inherit	xfs_params.sgid_inherit.val
@@ -232,7 +228,7 @@ static inline void set_buffer_unwritten_io(struct buffer_head *bh)
 #define xfs_itruncate_data(ip, off)	\
 	(-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off)))
 #define xfs_statvfs_fsid(statp, mp)	\
-	({ u64 id = huge_encode_dev((mp)->m_dev);	\
+	({ u64 id = huge_encode_dev((mp)->m_ddev_targp->bt_dev);	\
 	   __kernel_fsid_t *fsid = &(statp)->f_fsid;	\
 	   (fsid->val[0] = (u32)id, fsid->val[1] = (u32)(id >> 32)); })
 
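The xfs_statvfs_fsid change above only alters where the device number comes from (the buftarg's bt_dev); the encoding itself still splits a 64-bit id into the two 32-bit halves of f_fsid. A tiny runnable demo of that split; the sample id value is made up, and huge_encode_dev itself is not reimplemented here.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t id = 0x0000000800000021ULL;	/* assumed encoded dev id */
	uint32_t val0 = (uint32_t)id;		/* low 32 bits */
	uint32_t val1 = (uint32_t)(id >> 32);	/* high 32 bits */

	printf("fsid = { 0x%08x, 0x%08x }\n", val0, val1);
	return 0;
}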
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 279e9bc92ab..e0ab45fbfeb 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -233,8 +233,8 @@ xfs_read(
233 xfs_buftarg_t *target = 233 xfs_buftarg_t *target =
234 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? 234 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
235 mp->m_rtdev_targp : mp->m_ddev_targp; 235 mp->m_rtdev_targp : mp->m_ddev_targp;
236 if ((*offset & target->pbr_smask) || 236 if ((*offset & target->bt_smask) ||
237 (size & target->pbr_smask)) { 237 (size & target->bt_smask)) {
238 if (*offset == ip->i_d.di_size) { 238 if (*offset == ip->i_d.di_size) {
239 return (0); 239 return (0);
240 } 240 }
@@ -254,7 +254,7 @@ xfs_read(
254 } 254 }
255 255
256 if (unlikely(ioflags & IO_ISDIRECT)) 256 if (unlikely(ioflags & IO_ISDIRECT))
257 down(&inode->i_sem); 257 mutex_lock(&inode->i_mutex);
258 xfs_ilock(ip, XFS_IOLOCK_SHARED); 258 xfs_ilock(ip, XFS_IOLOCK_SHARED);
259 259
260 if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && 260 if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
@@ -281,12 +281,9 @@ xfs_read(
281 281
282 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 282 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
283 283
284 if (likely(!(ioflags & IO_INVIS)))
285 xfs_ichgtime_fast(ip, inode, XFS_ICHGTIME_ACC);
286
287unlock_isem: 284unlock_isem:
288 if (unlikely(ioflags & IO_ISDIRECT)) 285 if (unlikely(ioflags & IO_ISDIRECT))
289 up(&inode->i_sem); 286 mutex_unlock(&inode->i_mutex);
290 return ret; 287 return ret;
291} 288}
292 289
@@ -346,9 +343,6 @@ xfs_sendfile(
346 if (ret > 0) 343 if (ret > 0)
347 XFS_STATS_ADD(xs_read_bytes, ret); 344 XFS_STATS_ADD(xs_read_bytes, ret);
348 345
349 if (likely(!(ioflags & IO_INVIS)))
350 xfs_ichgtime_fast(ip, LINVFS_GET_IP(vp), XFS_ICHGTIME_ACC);
351
352 return ret; 346 return ret;
353} 347}
354 348
@@ -362,7 +356,6 @@ STATIC int /* error (positive) */
362xfs_zero_last_block( 356xfs_zero_last_block(
363 struct inode *ip, 357 struct inode *ip,
364 xfs_iocore_t *io, 358 xfs_iocore_t *io,
365 xfs_off_t offset,
366 xfs_fsize_t isize, 359 xfs_fsize_t isize,
367 xfs_fsize_t end_size) 360 xfs_fsize_t end_size)
368{ 361{
@@ -371,19 +364,16 @@ xfs_zero_last_block(
371 int nimaps; 364 int nimaps;
372 int zero_offset; 365 int zero_offset;
373 int zero_len; 366 int zero_len;
374 int isize_fsb_offset;
375 int error = 0; 367 int error = 0;
376 xfs_bmbt_irec_t imap; 368 xfs_bmbt_irec_t imap;
377 loff_t loff; 369 loff_t loff;
378 size_t lsize;
379 370
380 ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); 371 ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
381 ASSERT(offset > isize);
382 372
383 mp = io->io_mount; 373 mp = io->io_mount;
384 374
385 isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize); 375 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
386 if (isize_fsb_offset == 0) { 376 if (zero_offset == 0) {
387 /* 377 /*
388 * There are no extra bytes in the last block on disk to 378 * There are no extra bytes in the last block on disk to
389 * zero, so return. 379 * zero, so return.
@@ -413,10 +403,8 @@ xfs_zero_last_block(
413 */ 403 */
414 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); 404 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
415 loff = XFS_FSB_TO_B(mp, last_fsb); 405 loff = XFS_FSB_TO_B(mp, last_fsb);
416 lsize = XFS_FSB_TO_B(mp, 1);
417 406
418 zero_offset = isize_fsb_offset; 407 zero_len = mp->m_sb.sb_blocksize - zero_offset;
419 zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
420 408
421 error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); 409 error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
422 410
@@ -447,20 +435,17 @@ xfs_zero_eof(
447 struct inode *ip = LINVFS_GET_IP(vp); 435 struct inode *ip = LINVFS_GET_IP(vp);
448 xfs_fileoff_t start_zero_fsb; 436 xfs_fileoff_t start_zero_fsb;
449 xfs_fileoff_t end_zero_fsb; 437 xfs_fileoff_t end_zero_fsb;
450 xfs_fileoff_t prev_zero_fsb;
451 xfs_fileoff_t zero_count_fsb; 438 xfs_fileoff_t zero_count_fsb;
452 xfs_fileoff_t last_fsb; 439 xfs_fileoff_t last_fsb;
453 xfs_extlen_t buf_len_fsb; 440 xfs_extlen_t buf_len_fsb;
454 xfs_extlen_t prev_zero_count;
455 xfs_mount_t *mp; 441 xfs_mount_t *mp;
456 int nimaps; 442 int nimaps;
457 int error = 0; 443 int error = 0;
458 xfs_bmbt_irec_t imap; 444 xfs_bmbt_irec_t imap;
459 loff_t loff;
460 size_t lsize;
461 445
462 ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); 446 ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
463 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); 447 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
448 ASSERT(offset > isize);
464 449
465 mp = io->io_mount; 450 mp = io->io_mount;
466 451
@@ -468,7 +453,7 @@ xfs_zero_eof(
468 * First handle zeroing the block on which isize resides. 453 * First handle zeroing the block on which isize resides.
469 * We only zero a part of that block so it is handled specially. 454 * We only zero a part of that block so it is handled specially.
470 */ 455 */
471 error = xfs_zero_last_block(ip, io, offset, isize, end_size); 456 error = xfs_zero_last_block(ip, io, isize, end_size);
472 if (error) { 457 if (error) {
473 ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); 458 ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
474 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); 459 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
@@ -496,8 +481,6 @@ xfs_zero_eof(
496 } 481 }
497 482
498 ASSERT(start_zero_fsb <= end_zero_fsb); 483 ASSERT(start_zero_fsb <= end_zero_fsb);
499 prev_zero_fsb = NULLFILEOFF;
500 prev_zero_count = 0;
501 while (start_zero_fsb <= end_zero_fsb) { 484 while (start_zero_fsb <= end_zero_fsb) {
502 nimaps = 1; 485 nimaps = 1;
503 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; 486 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
@@ -519,10 +502,7 @@ xfs_zero_eof(
519 * that sits on a hole and sets the page as P_HOLE 502 * that sits on a hole and sets the page as P_HOLE
520 * and calls remapf if it is a mapped file. 503 * and calls remapf if it is a mapped file.
521 */ 504 */
522 prev_zero_fsb = NULLFILEOFF; 505 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
523 prev_zero_count = 0;
524 start_zero_fsb = imap.br_startoff +
525 imap.br_blockcount;
526 ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 506 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
527 continue; 507 continue;
528 } 508 }
@@ -543,17 +523,15 @@ xfs_zero_eof(
543 */ 523 */
544 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); 524 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
545 525
546 loff = XFS_FSB_TO_B(mp, start_zero_fsb); 526 error = xfs_iozero(ip,
547 lsize = XFS_FSB_TO_B(mp, buf_len_fsb); 527 XFS_FSB_TO_B(mp, start_zero_fsb),
548 528 XFS_FSB_TO_B(mp, buf_len_fsb),
549 error = xfs_iozero(ip, loff, lsize, end_size); 529 end_size);
550 530
551 if (error) { 531 if (error) {
552 goto out_lock; 532 goto out_lock;
553 } 533 }
554 534
555 prev_zero_fsb = start_zero_fsb;
556 prev_zero_count = buf_len_fsb;
557 start_zero_fsb = imap.br_startoff + buf_len_fsb; 535 start_zero_fsb = imap.br_startoff + buf_len_fsb;
558 ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 536 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
559 537
@@ -640,7 +618,7 @@ xfs_write(
640 (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? 618 (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
641 mp->m_rtdev_targp : mp->m_ddev_targp; 619 mp->m_rtdev_targp : mp->m_ddev_targp;
642 620
643 if ((pos & target->pbr_smask) || (count & target->pbr_smask)) 621 if ((pos & target->bt_smask) || (count & target->bt_smask))
644 return XFS_ERROR(-EINVAL); 622 return XFS_ERROR(-EINVAL);
645 623
646 if (!VN_CACHED(vp) && pos < i_size_read(inode)) 624 if (!VN_CACHED(vp) && pos < i_size_read(inode))
@@ -655,7 +633,7 @@ relock:
655 iolock = XFS_IOLOCK_EXCL; 633 iolock = XFS_IOLOCK_EXCL;
656 locktype = VRWLOCK_WRITE; 634 locktype = VRWLOCK_WRITE;
657 635
658 down(&inode->i_sem); 636 mutex_lock(&inode->i_mutex);
659 } else { 637 } else {
660 iolock = XFS_IOLOCK_SHARED; 638 iolock = XFS_IOLOCK_SHARED;
661 locktype = VRWLOCK_WRITE_DIRECT; 639 locktype = VRWLOCK_WRITE_DIRECT;
@@ -686,7 +664,7 @@ start:
686 int dmflags = FILP_DELAY_FLAG(file); 664 int dmflags = FILP_DELAY_FLAG(file);
687 665
688 if (need_isem) 666 if (need_isem)
689 dmflags |= DM_FLAGS_ISEM; 667 dmflags |= DM_FLAGS_IMUX;
690 668
691 xfs_iunlock(xip, XFS_ILOCK_EXCL); 669 xfs_iunlock(xip, XFS_ILOCK_EXCL);
692 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, 670 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
@@ -713,7 +691,7 @@ start:
713 } 691 }
714 692
715 if (likely(!(ioflags & IO_INVIS))) { 693 if (likely(!(ioflags & IO_INVIS))) {
716 inode_update_time(inode, 1); 694 file_update_time(file);
717 xfs_ichgtime_fast(xip, inode, 695 xfs_ichgtime_fast(xip, inode,
718 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 696 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
719 } 697 }
@@ -772,7 +750,7 @@ retry:
772 if (need_isem) { 750 if (need_isem) {
773 /* demote the lock now the cached pages are gone */ 751 /* demote the lock now the cached pages are gone */
774 XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL); 752 XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
775 up(&inode->i_sem); 753 mutex_unlock(&inode->i_mutex);
776 754
777 iolock = XFS_IOLOCK_SHARED; 755 iolock = XFS_IOLOCK_SHARED;
778 locktype = VRWLOCK_WRITE_DIRECT; 756 locktype = VRWLOCK_WRITE_DIRECT;
@@ -817,20 +795,24 @@ retry:
817 795
818 xfs_rwunlock(bdp, locktype); 796 xfs_rwunlock(bdp, locktype);
819 if (need_isem) 797 if (need_isem)
820 up(&inode->i_sem); 798 mutex_unlock(&inode->i_mutex);
821 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, 799 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
822 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 800 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
823 0, 0, 0); /* Delay flag intentionally unused */ 801 0, 0, 0); /* Delay flag intentionally unused */
824 if (error) 802 if (error)
825 goto out_nounlocks; 803 goto out_nounlocks;
826 if (need_isem) 804 if (need_isem)
827 down(&inode->i_sem); 805 mutex_lock(&inode->i_mutex);
828 xfs_rwlock(bdp, locktype); 806 xfs_rwlock(bdp, locktype);
829 pos = xip->i_d.di_size; 807 pos = xip->i_d.di_size;
830 ret = 0; 808 ret = 0;
831 goto retry; 809 goto retry;
832 } 810 }
833 811
812 isize = i_size_read(inode);
813 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
814 *offset = isize;
815
834 if (*offset > xip->i_d.di_size) { 816 if (*offset > xip->i_d.di_size) {
835 xfs_ilock(xip, XFS_ILOCK_EXCL); 817 xfs_ilock(xip, XFS_ILOCK_EXCL);
836 if (*offset > xip->i_d.di_size) { 818 if (*offset > xip->i_d.di_size) {
@@ -926,7 +908,7 @@ retry:
926 908
927 xfs_rwunlock(bdp, locktype); 909 xfs_rwunlock(bdp, locktype);
928 if (need_isem) 910 if (need_isem)
929 up(&inode->i_sem); 911 mutex_unlock(&inode->i_mutex);
930 912
931 error = sync_page_range(inode, mapping, pos, ret); 913 error = sync_page_range(inode, mapping, pos, ret);
932 if (!error) 914 if (!error)
@@ -938,7 +920,7 @@ retry:
938 xfs_rwunlock(bdp, locktype); 920 xfs_rwunlock(bdp, locktype);
939 out_unlock_isem: 921 out_unlock_isem:
940 if (need_isem) 922 if (need_isem)
941 up(&inode->i_sem); 923 mutex_unlock(&inode->i_mutex);
942 out_nounlocks: 924 out_nounlocks:
943 return -error; 925 return -error;
944} 926}
@@ -956,7 +938,7 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
956 938
957 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); 939 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
958 if (!XFS_FORCED_SHUTDOWN(mp)) { 940 if (!XFS_FORCED_SHUTDOWN(mp)) {
959 pagebuf_iorequest(bp); 941 xfs_buf_iorequest(bp);
960 return 0; 942 return 0;
961 } else { 943 } else {
962 xfs_buftrace("XFS__BDSTRAT IOERROR", bp); 944 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
@@ -1009,7 +991,7 @@ xfsbdstrat(
1009 * if (XFS_BUF_IS_GRIO(bp)) { 991 * if (XFS_BUF_IS_GRIO(bp)) {
1010 */ 992 */
1011 993
1012 pagebuf_iorequest(bp); 994 xfs_buf_iorequest(bp);
1013 return 0; 995 return 0;
1014 } 996 }
1015 997
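The reworked xfs_zero_last_block() in the xfs_lrw.c hunks above computes the partial-block zeroing range directly: the offset of the EOF within its filesystem block, and the remaining bytes up to the block boundary; if the EOF is block aligned there is nothing to zero. A runnable model of just that arithmetic, assuming a power-of-two block size of 4096 bytes:

#include <stdio.h>

int main(void)
{
	unsigned long blocksize = 4096;		/* assumed fs block size */
	unsigned long long isize = 10000;	/* current EOF */
	unsigned long zero_offset = isize & (blocksize - 1);

	if (zero_offset == 0) {
		printf("EOF is block aligned, nothing to zero\n");
	} else {
		unsigned long zero_len = blocksize - zero_offset;
		printf("zero %lu bytes starting at offset %llu\n",
		       zero_len, isize);
	}
	return 0;
}

With these inputs the EOF sits 1808 bytes into its block, so 2288 bytes are zeroed up to the block boundary.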
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index 6c40a74be7c..8955720a2c6 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -34,7 +34,7 @@ xfs_read_xfsstats(
 	__uint64_t	xs_write_bytes = 0;
 	__uint64_t	xs_read_bytes = 0;
 
-	static struct xstats_entry {
+	static const struct xstats_entry {
 		char	*desc;
 		int	endpoint;
 	} xstats[] = {
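xfs_read_xfsstats(), touched above, walks a flat counter array using a { desc, endpoint } table, printing each named group on one line; marking the table const is safe because it is never modified. A minimal userspace model of that table-driven walk; the counter values and group layout here are invented for illustration.

#include <stdio.h>

int main(void)
{
	unsigned int counters[6] = { 1, 2, 3, 4, 5, 6 };
	static const struct { const char *desc; int endpoint; } groups[] = {
		{ "extent_alloc", 2 },
		{ "buf",          6 },	/* endpoints are cumulative */
	};
	int i, j = 0;

	for (i = 0; i < 2; i++) {
		printf("%s", groups[i].desc);
		for (; j < groups[i].endpoint; j++)
			printf(" %u", counters[j]);
		printf("\n");
	}
	return 0;
}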
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index 50027c4a561..8ba7a2fa6c1 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -109,15 +109,15 @@ struct xfsstats {
 	__uint32_t		vn_remove;	/* # times vn_remove called */
 	__uint32_t		vn_free;	/* # times vn_free called */
 #define XFSSTAT_END_BUF			(XFSSTAT_END_VNODE_OPS+9)
-	__uint32_t		pb_get;
-	__uint32_t		pb_create;
-	__uint32_t		pb_get_locked;
-	__uint32_t		pb_get_locked_waited;
-	__uint32_t		pb_busy_locked;
-	__uint32_t		pb_miss_locked;
-	__uint32_t		pb_page_retries;
-	__uint32_t		pb_page_found;
-	__uint32_t		pb_get_read;
+	__uint32_t		xb_get;
+	__uint32_t		xb_create;
+	__uint32_t		xb_get_locked;
+	__uint32_t		xb_get_locked_waited;
+	__uint32_t		xb_busy_locked;
+	__uint32_t		xb_miss_locked;
+	__uint32_t		xb_page_retries;
+	__uint32_t		xb_page_found;
+	__uint32_t		xb_get_read;
 /* Extra precision counters */
 	__uint64_t		xs_xstrat_bytes;
 	__uint64_t		xs_write_bytes;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 6116b5bf433..f22e426d9e4 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -306,13 +306,15 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
306 xfs_fs_cmn_err(CE_NOTE, mp, 306 xfs_fs_cmn_err(CE_NOTE, mp,
307 "Disabling barriers, not supported with external log device"); 307 "Disabling barriers, not supported with external log device");
308 mp->m_flags &= ~XFS_MOUNT_BARRIER; 308 mp->m_flags &= ~XFS_MOUNT_BARRIER;
309 return;
309 } 310 }
310 311
311 if (mp->m_ddev_targp->pbr_bdev->bd_disk->queue->ordered == 312 if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered ==
312 QUEUE_ORDERED_NONE) { 313 QUEUE_ORDERED_NONE) {
313 xfs_fs_cmn_err(CE_NOTE, mp, 314 xfs_fs_cmn_err(CE_NOTE, mp,
314 "Disabling barriers, not supported by the underlying device"); 315 "Disabling barriers, not supported by the underlying device");
315 mp->m_flags &= ~XFS_MOUNT_BARRIER; 316 mp->m_flags &= ~XFS_MOUNT_BARRIER;
317 return;
316 } 318 }
317 319
318 error = xfs_barrier_test(mp); 320 error = xfs_barrier_test(mp);
@@ -320,6 +322,7 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
320 xfs_fs_cmn_err(CE_NOTE, mp, 322 xfs_fs_cmn_err(CE_NOTE, mp,
321 "Disabling barriers, trial barrier write failed"); 323 "Disabling barriers, trial barrier write failed");
322 mp->m_flags &= ~XFS_MOUNT_BARRIER; 324 mp->m_flags &= ~XFS_MOUNT_BARRIER;
325 return;
323 } 326 }
324} 327}
325 328
@@ -327,7 +330,7 @@ void
327xfs_blkdev_issue_flush( 330xfs_blkdev_issue_flush(
328 xfs_buftarg_t *buftarg) 331 xfs_buftarg_t *buftarg)
329{ 332{
330 blkdev_issue_flush(buftarg->pbr_bdev, NULL); 333 blkdev_issue_flush(buftarg->bt_bdev, NULL);
331} 334}
332 335
333STATIC struct inode * 336STATIC struct inode *
@@ -576,7 +579,7 @@ xfssyncd(
576 timeleft = schedule_timeout_interruptible(timeleft); 579 timeleft = schedule_timeout_interruptible(timeleft);
577 /* swsusp */ 580 /* swsusp */
578 try_to_freeze(); 581 try_to_freeze();
579 if (kthread_should_stop()) 582 if (kthread_should_stop() && list_empty(&vfsp->vfs_sync_list))
580 break; 583 break;
581 584
582 spin_lock(&vfsp->vfs_sync_lock); 585 spin_lock(&vfsp->vfs_sync_lock);
@@ -966,9 +969,9 @@ init_xfs_fs( void )
966 if (error < 0) 969 if (error < 0)
967 goto undo_zones; 970 goto undo_zones;
968 971
969 error = pagebuf_init(); 972 error = xfs_buf_init();
970 if (error < 0) 973 if (error < 0)
971 goto undo_pagebuf; 974 goto undo_buffers;
972 975
973 vn_init(); 976 vn_init();
974 xfs_init(); 977 xfs_init();
@@ -982,9 +985,9 @@ init_xfs_fs( void )
982 return 0; 985 return 0;
983 986
984undo_register: 987undo_register:
985 pagebuf_terminate(); 988 xfs_buf_terminate();
986 989
987undo_pagebuf: 990undo_buffers:
988 linvfs_destroy_zones(); 991 linvfs_destroy_zones();
989 992
990undo_zones: 993undo_zones:
@@ -998,7 +1001,7 @@ exit_xfs_fs( void )
998 XFS_DM_EXIT(&xfs_fs_type); 1001 XFS_DM_EXIT(&xfs_fs_type);
999 unregister_filesystem(&xfs_fs_type); 1002 unregister_filesystem(&xfs_fs_type);
1000 xfs_cleanup(); 1003 xfs_cleanup();
1001 pagebuf_terminate(); 1004 xfs_buf_terminate();
1002 linvfs_destroy_zones(); 1005 linvfs_destroy_zones();
1003 ktrace_uninit(); 1006 ktrace_uninit();
1004} 1007}
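
Three themes run through this xfs_super.c hunk: early returns so xfs_mountfs_check_barriers() stops probing once barriers are disabled, the pagebuf-to-xfs_buf rename, and a fix that keeps xfssyncd alive until its work list drains. A minimal sketch of that last kthread shutdown pattern (function and list names are hypothetical):

	#include <linux/kthread.h>
	#include <linux/list.h>
	#include <linux/sched.h>

	static int example_syncd(void *arg)
	{
		struct list_head *work_list = arg;
		long timeleft = HZ;

		for (;;) {
			timeleft = schedule_timeout_interruptible(timeleft);
			try_to_freeze();
			/* exit only once stop is requested AND the queue is drained */
			if (kthread_should_stop() && list_empty(work_list))
				break;
			/* ... process and dequeue entries from work_list ... */
		}
		return 0;
	}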
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index e9bbcb4d624..260dd8415dd 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -106,7 +106,6 @@ vn_revalidate_core(
106 inode->i_blocks = vap->va_nblocks; 106 inode->i_blocks = vap->va_nblocks;
107 inode->i_mtime = vap->va_mtime; 107 inode->i_mtime = vap->va_mtime;
108 inode->i_ctime = vap->va_ctime; 108 inode->i_ctime = vap->va_ctime;
109 inode->i_atime = vap->va_atime;
110 inode->i_blksize = vap->va_blocksize; 109 inode->i_blksize = vap->va_blocksize;
111 if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) 110 if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
112 inode->i_flags |= S_IMMUTABLE; 111 inode->i_flags |= S_IMMUTABLE;
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index f2bbb327c08..0fe2419461d 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -566,6 +566,25 @@ static inline int VN_BAD(struct vnode *vp)
566} 566}
567 567
568/* 568/*
569 * Extracting atime values in various formats
570 */
571static inline void vn_atime_to_bstime(struct vnode *vp, xfs_bstime_t *bs_atime)
572{
573 bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec;
574 bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec;
575}
576
577static inline void vn_atime_to_timespec(struct vnode *vp, struct timespec *ts)
578{
579 *ts = vp->v_inode.i_atime;
580}
581
582static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
583{
584 *tt = vp->v_inode.i_atime.tv_sec;
585}
586
587/*
569 * Some useful predicates. 588 * Some useful predicates.
570 */ 589 */
571#define VN_MAPPED(vp) mapping_mapped(LINVFS_GET_IP(vp)->i_mapping) 590#define VN_MAPPED(vp) mapping_mapped(LINVFS_GET_IP(vp)->i_mapping)
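
With vn_revalidate_core no longer copying va_atime (previous hunk), the Linux inode's i_atime becomes the single authoritative copy, and callers use these new accessors instead of reaching into the XFS vattr. Brief illustrative usage (the surrounding function is hypothetical):

	static void atime_examples(struct vnode *vp)
	{
		xfs_bstime_t bs;
		struct timespec ts;
		time_t t;

		vn_atime_to_bstime(vp, &bs);	/* bulkstat format */
		vn_atime_to_timespec(vp, &ts);	/* stat-style timespec */
		vn_atime_to_time_t(vp, &t);	/* seconds only */
	}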
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 00b5043dfa5..772ac48329e 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -104,7 +104,7 @@ xfs_qm_dqinit(
104 */ 104 */
105 if (brandnewdquot) { 105 if (brandnewdquot) {
106 dqp->dq_flnext = dqp->dq_flprev = dqp; 106 dqp->dq_flnext = dqp->dq_flprev = dqp;
107 mutex_init(&dqp->q_qlock, MUTEX_DEFAULT, "xdq"); 107 mutex_init(&dqp->q_qlock);
108 initnsema(&dqp->q_flock, 1, "fdq"); 108 initnsema(&dqp->q_flock, 1, "fdq");
109 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); 109 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
110 110
@@ -1382,7 +1382,7 @@ void
1382xfs_dqlock( 1382xfs_dqlock(
1383 xfs_dquot_t *dqp) 1383 xfs_dquot_t *dqp)
1384{ 1384{
1385 mutex_lock(&(dqp->q_qlock), PINOD); 1385 mutex_lock(&(dqp->q_qlock));
1386} 1386}
1387 1387
1388void 1388void
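
The same two-part conversion repeats through the remaining quota hunks: the IRIX-style three-argument mutex_init() drops its type and name arguments, and mutex_lock() drops PINOD, matching the Linux mutex API. A minimal sketch:

	#include <linux/mutex.h>

	static struct mutex q_qlock;

	static void qlock_example(void)
	{
		mutex_init(&q_qlock);	/* was: mutex_init(&q_qlock, MUTEX_DEFAULT, "xdq") */
		mutex_lock(&q_qlock);	/* was: mutex_lock(&q_qlock, PINOD) */
		mutex_unlock(&q_qlock);
	}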
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 2f69822344e..2ec6b441849 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -239,7 +239,7 @@ xfs_qm_dquot_logitem_pushbuf(
239 * trying to duplicate our effort. 239 * trying to duplicate our effort.
240 */ 240 */
241 ASSERT(qip->qli_pushbuf_flag != 0); 241 ASSERT(qip->qli_pushbuf_flag != 0);
242 ASSERT(qip->qli_push_owner == get_thread_id()); 242 ASSERT(qip->qli_push_owner == current_pid());
243 243
244 /* 244 /*
245 * If flushlock isn't locked anymore, chances are that the 245 * If flushlock isn't locked anymore, chances are that the
@@ -333,7 +333,7 @@ xfs_qm_dquot_logitem_trylock(
333 qip->qli_pushbuf_flag = 1; 333 qip->qli_pushbuf_flag = 1;
334 ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno); 334 ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
335#ifdef DEBUG 335#ifdef DEBUG
336 qip->qli_push_owner = get_thread_id(); 336 qip->qli_push_owner = current_pid();
337#endif 337#endif
338 /* 338 /*
339 * The dquot is left locked. 339 * The dquot is left locked.
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 1aea42d71a6..53a00fb217f 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -78,7 +78,7 @@ STATIC int xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
78 78
79STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 79STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
80STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 80STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
81STATIC int xfs_qm_shake(int, unsigned int); 81STATIC int xfs_qm_shake(int, gfp_t);
82 82
83#ifdef DEBUG 83#ifdef DEBUG
84extern mutex_t qcheck_lock; 84extern mutex_t qcheck_lock;
@@ -167,7 +167,7 @@ xfs_Gqm_init(void)
167 xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; 167 xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
168 xqm->qm_nrefs = 0; 168 xqm->qm_nrefs = 0;
169#ifdef DEBUG 169#ifdef DEBUG
170 mutex_init(&qcheck_lock, MUTEX_DEFAULT, "qchk"); 170 mutex_init(&qcheck_lock);
171#endif 171#endif
172 return xqm; 172 return xqm;
173} 173}
@@ -497,7 +497,7 @@ xfs_qm_dqflush_all(
497 int error; 497 int error;
498 498
499 if (mp->m_quotainfo == NULL) 499 if (mp->m_quotainfo == NULL)
500 return (0); 500 return 0;
501 niters = 0; 501 niters = 0;
502again: 502again:
503 xfs_qm_mplist_lock(mp); 503 xfs_qm_mplist_lock(mp);
@@ -528,7 +528,7 @@ again:
528 error = xfs_qm_dqflush(dqp, flags); 528 error = xfs_qm_dqflush(dqp, flags);
529 xfs_dqunlock(dqp); 529 xfs_dqunlock(dqp);
530 if (error) 530 if (error)
531 return (error); 531 return error;
532 532
533 xfs_qm_mplist_lock(mp); 533 xfs_qm_mplist_lock(mp);
534 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 534 if (recl != XFS_QI_MPLRECLAIMS(mp)) {
@@ -540,7 +540,7 @@ again:
540 540
541 xfs_qm_mplist_unlock(mp); 541 xfs_qm_mplist_unlock(mp);
542 /* return ! busy */ 542 /* return ! busy */
543 return (0); 543 return 0;
544} 544}
545/* 545/*
546 * Release the group dquot pointers the user dquots may be 546 * Release the group dquot pointers the user dquots may be
@@ -599,7 +599,7 @@ xfs_qm_dqpurge_int(
599 int nmisses; 599 int nmisses;
600 600
601 if (mp->m_quotainfo == NULL) 601 if (mp->m_quotainfo == NULL)
602 return (0); 602 return 0;
603 603
604 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; 604 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
605 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; 605 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
@@ -796,7 +796,7 @@ xfs_qm_dqattach_one(
796 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 796 ASSERT(XFS_DQ_IS_LOCKED(dqp));
797 } 797 }
798#endif 798#endif
799 return (error); 799 return error;
800} 800}
801 801
802 802
@@ -897,7 +897,7 @@ xfs_qm_dqattach(
897 (! XFS_NOT_DQATTACHED(mp, ip)) || 897 (! XFS_NOT_DQATTACHED(mp, ip)) ||
898 (ip->i_ino == mp->m_sb.sb_uquotino) || 898 (ip->i_ino == mp->m_sb.sb_uquotino) ||
899 (ip->i_ino == mp->m_sb.sb_gquotino)) 899 (ip->i_ino == mp->m_sb.sb_gquotino))
900 return (0); 900 return 0;
901 901
902 ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 || 902 ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 ||
903 XFS_ISLOCKED_INODE_EXCL(ip)); 903 XFS_ISLOCKED_INODE_EXCL(ip));
@@ -984,7 +984,7 @@ xfs_qm_dqattach(
984 else 984 else
985 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); 985 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
986#endif 986#endif
987 return (error); 987 return error;
988} 988}
989 989
990/* 990/*
@@ -1049,7 +1049,7 @@ xfs_qm_sync(
1049 */ 1049 */
1050 if (! XFS_IS_QUOTA_ON(mp)) { 1050 if (! XFS_IS_QUOTA_ON(mp)) {
1051 xfs_qm_mplist_unlock(mp); 1051 xfs_qm_mplist_unlock(mp);
1052 return (0); 1052 return 0;
1053 } 1053 }
1054 FOREACH_DQUOT_IN_MP(dqp, mp) { 1054 FOREACH_DQUOT_IN_MP(dqp, mp) {
1055 /* 1055 /*
@@ -1109,9 +1109,9 @@ xfs_qm_sync(
1109 error = xfs_qm_dqflush(dqp, flush_flags); 1109 error = xfs_qm_dqflush(dqp, flush_flags);
1110 xfs_dqunlock(dqp); 1110 xfs_dqunlock(dqp);
1111 if (error && XFS_FORCED_SHUTDOWN(mp)) 1111 if (error && XFS_FORCED_SHUTDOWN(mp))
1112 return(0); /* Need to prevent umount failure */ 1112 return 0; /* Need to prevent umount failure */
1113 else if (error) 1113 else if (error)
1114 return (error); 1114 return error;
1115 1115
1116 xfs_qm_mplist_lock(mp); 1116 xfs_qm_mplist_lock(mp);
1117 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 1117 if (recl != XFS_QI_MPLRECLAIMS(mp)) {
@@ -1124,7 +1124,7 @@ xfs_qm_sync(
1124 } 1124 }
1125 1125
1126 xfs_qm_mplist_unlock(mp); 1126 xfs_qm_mplist_unlock(mp);
1127 return (0); 1127 return 0;
1128} 1128}
1129 1129
1130 1130
@@ -1146,7 +1146,7 @@ xfs_qm_init_quotainfo(
1146 * Tell XQM that we exist as soon as possible. 1146 * Tell XQM that we exist as soon as possible.
1147 */ 1147 */
1148 if ((error = xfs_qm_hold_quotafs_ref(mp))) { 1148 if ((error = xfs_qm_hold_quotafs_ref(mp))) {
1149 return (error); 1149 return error;
1150 } 1150 }
1151 1151
1152 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 1152 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
@@ -1158,7 +1158,7 @@ xfs_qm_init_quotainfo(
1158 if ((error = xfs_qm_init_quotainos(mp))) { 1158 if ((error = xfs_qm_init_quotainos(mp))) {
1159 kmem_free(qinf, sizeof(xfs_quotainfo_t)); 1159 kmem_free(qinf, sizeof(xfs_quotainfo_t));
1160 mp->m_quotainfo = NULL; 1160 mp->m_quotainfo = NULL;
1161 return (error); 1161 return error;
1162 } 1162 }
1163 1163
1164 spinlock_init(&qinf->qi_pinlock, "xfs_qinf_pin"); 1164 spinlock_init(&qinf->qi_pinlock, "xfs_qinf_pin");
@@ -1166,7 +1166,7 @@ xfs_qm_init_quotainfo(
1166 qinf->qi_dqreclaims = 0; 1166 qinf->qi_dqreclaims = 0;
1167 1167
1168 /* mutex used to serialize quotaoffs */ 1168 /* mutex used to serialize quotaoffs */
1169 mutex_init(&qinf->qi_quotaofflock, MUTEX_DEFAULT, "qoff"); 1169 mutex_init(&qinf->qi_quotaofflock);
1170 1170
1171 /* Precalc some constants */ 1171 /* Precalc some constants */
1172 qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); 1172 qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -1232,7 +1232,7 @@ xfs_qm_init_quotainfo(
1232 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; 1232 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
1233 } 1233 }
1234 1234
1235 return (0); 1235 return 0;
1236} 1236}
1237 1237
1238 1238
@@ -1285,7 +1285,7 @@ xfs_qm_list_init(
1285 char *str, 1285 char *str,
1286 int n) 1286 int n)
1287{ 1287{
1288 mutex_init(&list->qh_lock, MUTEX_DEFAULT, str); 1288 mutex_init(&list->qh_lock);
1289 list->qh_next = NULL; 1289 list->qh_next = NULL;
1290 list->qh_version = 0; 1290 list->qh_version = 0;
1291 list->qh_nelems = 0; 1291 list->qh_nelems = 0;
@@ -1332,7 +1332,7 @@ xfs_qm_dqget_noattach(
1332 */ 1332 */
1333 ASSERT(error != ESRCH); 1333 ASSERT(error != ESRCH);
1334 ASSERT(error != ENOENT); 1334 ASSERT(error != ENOENT);
1335 return (error); 1335 return error;
1336 } 1336 }
1337 ASSERT(udqp); 1337 ASSERT(udqp);
1338 } 1338 }
@@ -1355,7 +1355,7 @@ xfs_qm_dqget_noattach(
1355 xfs_qm_dqrele(udqp); 1355 xfs_qm_dqrele(udqp);
1356 ASSERT(error != ESRCH); 1356 ASSERT(error != ESRCH);
1357 ASSERT(error != ENOENT); 1357 ASSERT(error != ENOENT);
1358 return (error); 1358 return error;
1359 } 1359 }
1360 ASSERT(gdqp); 1360 ASSERT(gdqp);
1361 1361
@@ -1376,7 +1376,7 @@ xfs_qm_dqget_noattach(
1376 if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp)); 1376 if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
1377 if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp)); 1377 if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
1378#endif 1378#endif
1379 return (0); 1379 return 0;
1380} 1380}
1381 1381
1382/* 1382/*
@@ -1392,26 +1392,28 @@ xfs_qm_qino_alloc(
1392{ 1392{
1393 xfs_trans_t *tp; 1393 xfs_trans_t *tp;
1394 int error; 1394 int error;
1395 unsigned long s; 1395 unsigned long s;
1396 cred_t zerocr; 1396 cred_t zerocr;
1397 xfs_inode_t zeroino;
1397 int committed; 1398 int committed;
1398 1399
1399 tp = xfs_trans_alloc(mp,XFS_TRANS_QM_QINOCREATE); 1400 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
1400 if ((error = xfs_trans_reserve(tp, 1401 if ((error = xfs_trans_reserve(tp,
1401 XFS_QM_QINOCREATE_SPACE_RES(mp), 1402 XFS_QM_QINOCREATE_SPACE_RES(mp),
1402 XFS_CREATE_LOG_RES(mp), 0, 1403 XFS_CREATE_LOG_RES(mp), 0,
1403 XFS_TRANS_PERM_LOG_RES, 1404 XFS_TRANS_PERM_LOG_RES,
1404 XFS_CREATE_LOG_COUNT))) { 1405 XFS_CREATE_LOG_COUNT))) {
1405 xfs_trans_cancel(tp, 0); 1406 xfs_trans_cancel(tp, 0);
1406 return (error); 1407 return error;
1407 } 1408 }
1408 memset(&zerocr, 0, sizeof(zerocr)); 1409 memset(&zerocr, 0, sizeof(zerocr));
1410 memset(&zeroino, 0, sizeof(zeroino));
1409 1411
1410 if ((error = xfs_dir_ialloc(&tp, mp->m_rootip, S_IFREG, 1, 0, 1412 if ((error = xfs_dir_ialloc(&tp, &zeroino, S_IFREG, 1, 0,
1411 &zerocr, 0, 1, ip, &committed))) { 1413 &zerocr, 0, 1, ip, &committed))) {
1412 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 1414 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1413 XFS_TRANS_ABORT); 1415 XFS_TRANS_ABORT);
1414 return (error); 1416 return error;
1415 } 1417 }
1416 1418
1417 /* 1419 /*
@@ -1459,9 +1461,9 @@ xfs_qm_qino_alloc(
1459 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, 1461 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
1460 NULL))) { 1462 NULL))) {
1461 xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); 1463 xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!");
1462 return (error); 1464 return error;
1463 } 1465 }
1464 return (0); 1466 return 0;
1465} 1467}
1466 1468
1467 1469
@@ -1506,7 +1508,7 @@ xfs_qm_reset_dqcounts(
1506 ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); 1508 ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
1507 } 1509 }
1508 1510
1509 return (0); 1511 return 0;
1510} 1512}
1511 1513
1512STATIC int 1514STATIC int
@@ -1555,7 +1557,7 @@ xfs_qm_dqiter_bufs(
1555 bno++; 1557 bno++;
1556 firstid += XFS_QM_DQPERBLK(mp); 1558 firstid += XFS_QM_DQPERBLK(mp);
1557 } 1559 }
1558 return (error); 1560 return error;
1559} 1561}
1560 1562
1561/* 1563/*
@@ -1584,7 +1586,7 @@ xfs_qm_dqiterate(
1584 * happens only at mount time which is single threaded. 1586 * happens only at mount time which is single threaded.
1585 */ 1587 */
1586 if (qip->i_d.di_nblocks == 0) 1588 if (qip->i_d.di_nblocks == 0)
1587 return (0); 1589 return 0;
1588 1590
1589 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); 1591 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
1590 1592
@@ -1653,7 +1655,7 @@ xfs_qm_dqiterate(
1653 1655
1654 kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map)); 1656 kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map));
1655 1657
1656 return (error); 1658 return error;
1657} 1659}
1658 1660
1659/* 1661/*
@@ -1713,7 +1715,7 @@ xfs_qm_get_rtblks(
1713 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1715 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1714 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 1716 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
1715 if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) 1717 if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
1716 return (error); 1718 return error;
1717 } 1719 }
1718 rtblks = 0; 1720 rtblks = 0;
1719 nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); 1721 nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
@@ -1721,7 +1723,7 @@ xfs_qm_get_rtblks(
1721 for (ep = base; ep < &base[nextents]; ep++) 1723 for (ep = base; ep < &base[nextents]; ep++)
1722 rtblks += xfs_bmbt_get_blockcount(ep); 1724 rtblks += xfs_bmbt_get_blockcount(ep);
1723 *O_rtblks = (xfs_qcnt_t)rtblks; 1725 *O_rtblks = (xfs_qcnt_t)rtblks;
1724 return (0); 1726 return 0;
1725} 1727}
1726 1728
1727/* 1729/*
@@ -1765,7 +1767,7 @@ xfs_qm_dqusage_adjust(
1765 */ 1767 */
1766 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) { 1768 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) {
1767 *res = BULKSTAT_RV_NOTHING; 1769 *res = BULKSTAT_RV_NOTHING;
1768 return (error); 1770 return error;
1769 } 1771 }
1770 1772
1771 if (ip->i_d.di_mode == 0) { 1773 if (ip->i_d.di_mode == 0) {
@@ -1783,7 +1785,7 @@ xfs_qm_dqusage_adjust(
1783 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) { 1785 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
1784 xfs_iput(ip, XFS_ILOCK_EXCL); 1786 xfs_iput(ip, XFS_ILOCK_EXCL);
1785 *res = BULKSTAT_RV_GIVEUP; 1787 *res = BULKSTAT_RV_GIVEUP;
1786 return (error); 1788 return error;
1787 } 1789 }
1788 1790
1789 rtblks = 0; 1791 rtblks = 0;
@@ -1800,7 +1802,7 @@ xfs_qm_dqusage_adjust(
1800 if (gdqp) 1802 if (gdqp)
1801 xfs_qm_dqput(gdqp); 1803 xfs_qm_dqput(gdqp);
1802 *res = BULKSTAT_RV_GIVEUP; 1804 *res = BULKSTAT_RV_GIVEUP;
1803 return (error); 1805 return error;
1804 } 1806 }
1805 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; 1807 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1806 } 1808 }
@@ -1845,7 +1847,7 @@ xfs_qm_dqusage_adjust(
1845 * Goto next inode. 1847 * Goto next inode.
1846 */ 1848 */
1847 *res = BULKSTAT_RV_DIDONE; 1849 *res = BULKSTAT_RV_DIDONE;
1848 return (0); 1850 return 0;
1849} 1851}
1850 1852
1851/* 1853/*
@@ -1918,9 +1920,7 @@ xfs_qm_quotacheck(
1918 * at this point (because we intentionally didn't in dqget_noattach). 1920 * at this point (because we intentionally didn't in dqget_noattach).
1919 */ 1921 */
1920 if (error) { 1922 if (error) {
1921 xfs_qm_dqpurge_all(mp, 1923 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
1922 XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA|
1923 XFS_QMOPT_PQUOTA|XFS_QMOPT_QUOTAOFF);
1924 goto error_return; 1924 goto error_return;
1925 } 1925 }
1926 /* 1926 /*
@@ -2041,7 +2041,7 @@ xfs_qm_init_quotainos(
2041 XFS_QI_UQIP(mp) = uip; 2041 XFS_QI_UQIP(mp) = uip;
2042 XFS_QI_GQIP(mp) = gip; 2042 XFS_QI_GQIP(mp) = gip;
2043 2043
2044 return (0); 2044 return 0;
2045} 2045}
2046 2046
2047 2047
@@ -2062,7 +2062,7 @@ xfs_qm_shake_freelist(
2062 int nflushes; 2062 int nflushes;
2063 2063
2064 if (howmany <= 0) 2064 if (howmany <= 0)
2065 return (0); 2065 return 0;
2066 2066
2067 nreclaimed = 0; 2067 nreclaimed = 0;
2068 restarts = 0; 2068 restarts = 0;
@@ -2088,7 +2088,7 @@ xfs_qm_shake_freelist(
2088 xfs_dqunlock(dqp); 2088 xfs_dqunlock(dqp);
2089 xfs_qm_freelist_unlock(xfs_Gqm); 2089 xfs_qm_freelist_unlock(xfs_Gqm);
2090 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2090 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2091 return (nreclaimed); 2091 return nreclaimed;
2092 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 2092 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2093 goto tryagain; 2093 goto tryagain;
2094 } 2094 }
@@ -2163,7 +2163,7 @@ xfs_qm_shake_freelist(
2163 XFS_DQ_HASH_UNLOCK(hash); 2163 XFS_DQ_HASH_UNLOCK(hash);
2164 xfs_qm_freelist_unlock(xfs_Gqm); 2164 xfs_qm_freelist_unlock(xfs_Gqm);
2165 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2165 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2166 return (nreclaimed); 2166 return nreclaimed;
2167 goto tryagain; 2167 goto tryagain;
2168 } 2168 }
2169 xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING"); 2169 xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING");
@@ -2188,7 +2188,7 @@ xfs_qm_shake_freelist(
2188 dqp = nextdqp; 2188 dqp = nextdqp;
2189 } 2189 }
2190 xfs_qm_freelist_unlock(xfs_Gqm); 2190 xfs_qm_freelist_unlock(xfs_Gqm);
2191 return (nreclaimed); 2191 return nreclaimed;
2192} 2192}
2193 2193
2194 2194
@@ -2197,14 +2197,14 @@ xfs_qm_shake_freelist(
2197 */ 2197 */
2198/* ARGSUSED */ 2198/* ARGSUSED */
2199STATIC int 2199STATIC int
2200xfs_qm_shake(int nr_to_scan, unsigned int gfp_mask) 2200xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2201{ 2201{
2202 int ndqused, nfree, n; 2202 int ndqused, nfree, n;
2203 2203
2204 if (!kmem_shake_allow(gfp_mask)) 2204 if (!kmem_shake_allow(gfp_mask))
2205 return (0); 2205 return 0;
2206 if (!xfs_Gqm) 2206 if (!xfs_Gqm)
2207 return (0); 2207 return 0;
2208 2208
2209 nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ 2209 nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */
2210 /* incore dquots in all f/s's */ 2210 /* incore dquots in all f/s's */
@@ -2213,7 +2213,7 @@ xfs_qm_shake(int nr_to_scan, unsigned int gfp_mask)
2213 ASSERT(ndqused >= 0); 2213 ASSERT(ndqused >= 0);
2214 2214
2215 if (nfree <= ndqused && nfree < ndquot) 2215 if (nfree <= ndqused && nfree < ndquot)
2216 return (0); 2216 return 0;
2217 2217
2218 ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */ 2218 ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */
2219 n = nfree - ndqused - ndquot; /* # over target */ 2219 n = nfree - ndqused - ndquot; /* # over target */
@@ -2257,7 +2257,7 @@ xfs_qm_dqreclaim_one(void)
2257 xfs_dqunlock(dqp); 2257 xfs_dqunlock(dqp);
2258 xfs_qm_freelist_unlock(xfs_Gqm); 2258 xfs_qm_freelist_unlock(xfs_Gqm);
2259 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2259 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2260 return (NULL); 2260 return NULL;
2261 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 2261 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2262 goto startagain; 2262 goto startagain;
2263 } 2263 }
@@ -2333,7 +2333,7 @@ xfs_qm_dqreclaim_one(void)
2333 } 2333 }
2334 2334
2335 xfs_qm_freelist_unlock(xfs_Gqm); 2335 xfs_qm_freelist_unlock(xfs_Gqm);
2336 return (dqpout); 2336 return dqpout;
2337} 2337}
2338 2338
2339 2339
@@ -2369,7 +2369,7 @@ xfs_qm_dqalloc_incore(
2369 */ 2369 */
2370 memset(&dqp->q_core, 0, sizeof(dqp->q_core)); 2370 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
2371 *O_dqpp = dqp; 2371 *O_dqpp = dqp;
2372 return (B_FALSE); 2372 return B_FALSE;
2373 } 2373 }
2374 XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); 2374 XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
2375 } 2375 }
@@ -2382,7 +2382,7 @@ xfs_qm_dqalloc_incore(
2382 *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); 2382 *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
2383 atomic_inc(&xfs_Gqm->qm_totaldquots); 2383 atomic_inc(&xfs_Gqm->qm_totaldquots);
2384 2384
2385 return (B_TRUE); 2385 return B_TRUE;
2386} 2386}
2387 2387
2388 2388
@@ -2407,13 +2407,13 @@ xfs_qm_write_sb_changes(
2407 0, 2407 0,
2408 XFS_DEFAULT_LOG_COUNT))) { 2408 XFS_DEFAULT_LOG_COUNT))) {
2409 xfs_trans_cancel(tp, 0); 2409 xfs_trans_cancel(tp, 0);
2410 return (error); 2410 return error;
2411 } 2411 }
2412 2412
2413 xfs_mod_sb(tp, flags); 2413 xfs_mod_sb(tp, flags);
2414 (void) xfs_trans_commit(tp, 0, NULL); 2414 (void) xfs_trans_commit(tp, 0, NULL);
2415 2415
2416 return (0); 2416 return 0;
2417} 2417}
2418 2418
2419 2419
@@ -2463,7 +2463,7 @@ xfs_qm_vop_dqalloc(
2463 if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC | 2463 if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC |
2464 XFS_QMOPT_ILOCKED))) { 2464 XFS_QMOPT_ILOCKED))) {
2465 xfs_iunlock(ip, lockflags); 2465 xfs_iunlock(ip, lockflags);
2466 return (error); 2466 return error;
2467 } 2467 }
2468 } 2468 }
2469 2469
@@ -2486,7 +2486,7 @@ xfs_qm_vop_dqalloc(
2486 XFS_QMOPT_DOWARN, 2486 XFS_QMOPT_DOWARN,
2487 &uq))) { 2487 &uq))) {
2488 ASSERT(error != ENOENT); 2488 ASSERT(error != ENOENT);
2489 return (error); 2489 return error;
2490 } 2490 }
2491 /* 2491 /*
2492 * Get the ilock in the right order. 2492 * Get the ilock in the right order.
@@ -2517,7 +2517,7 @@ xfs_qm_vop_dqalloc(
2517 if (uq) 2517 if (uq)
2518 xfs_qm_dqrele(uq); 2518 xfs_qm_dqrele(uq);
2519 ASSERT(error != ENOENT); 2519 ASSERT(error != ENOENT);
2520 return (error); 2520 return error;
2521 } 2521 }
2522 xfs_dqunlock(gq); 2522 xfs_dqunlock(gq);
2523 lockflags = XFS_ILOCK_SHARED; 2523 lockflags = XFS_ILOCK_SHARED;
@@ -2565,7 +2565,7 @@ xfs_qm_vop_dqalloc(
2565 *O_gdqpp = gq; 2565 *O_gdqpp = gq;
2566 else if (gq) 2566 else if (gq)
2567 xfs_qm_dqrele(gq); 2567 xfs_qm_dqrele(gq);
2568 return (0); 2568 return 0;
2569} 2569}
2570 2570
2571/* 2571/*
@@ -2608,7 +2608,7 @@ xfs_qm_vop_chown(
2608 xfs_dqunlock(newdq); 2608 xfs_dqunlock(newdq);
2609 *IO_olddq = newdq; 2609 *IO_olddq = newdq;
2610 2610
2611 return (prevdq); 2611 return prevdq;
2612} 2612}
2613 2613
2614/* 2614/*
@@ -2702,12 +2702,12 @@ xfs_qm_vop_rename_dqattach(
2702 ip = i_tab[0]; 2702 ip = i_tab[0];
2703 2703
2704 if (! XFS_IS_QUOTA_ON(ip->i_mount)) 2704 if (! XFS_IS_QUOTA_ON(ip->i_mount))
2705 return (0); 2705 return 0;
2706 2706
2707 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) { 2707 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
2708 error = xfs_qm_dqattach(ip, 0); 2708 error = xfs_qm_dqattach(ip, 0);
2709 if (error) 2709 if (error)
2710 return (error); 2710 return error;
2711 } 2711 }
2712 for (i = 1; (i < 4 && i_tab[i]); i++) { 2712 for (i = 1; (i < 4 && i_tab[i]); i++) {
2713 /* 2713 /*
@@ -2717,11 +2717,11 @@ xfs_qm_vop_rename_dqattach(
2717 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) { 2717 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
2718 error = xfs_qm_dqattach(ip, 0); 2718 error = xfs_qm_dqattach(ip, 0);
2719 if (error) 2719 if (error)
2720 return (error); 2720 return error;
2721 } 2721 }
2722 } 2722 }
2723 } 2723 }
2724 return (0); 2724 return 0;
2725} 2725}
2726 2726
2727void 2727void
@@ -2743,6 +2743,7 @@ xfs_qm_vop_dqattach_and_dqmod_newinode(
2743 xfs_dqunlock(udqp); 2743 xfs_dqunlock(udqp);
2744 ASSERT(ip->i_udquot == NULL); 2744 ASSERT(ip->i_udquot == NULL);
2745 ip->i_udquot = udqp; 2745 ip->i_udquot = udqp;
2746 ASSERT(XFS_IS_UQUOTA_ON(tp->t_mountp));
2746 ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); 2747 ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
2747 xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); 2748 xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
2748 } 2749 }
@@ -2752,7 +2753,10 @@ xfs_qm_vop_dqattach_and_dqmod_newinode(
2752 xfs_dqunlock(gdqp); 2753 xfs_dqunlock(gdqp);
2753 ASSERT(ip->i_gdquot == NULL); 2754 ASSERT(ip->i_gdquot == NULL);
2754 ip->i_gdquot = gdqp; 2755 ip->i_gdquot = gdqp;
2755 ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); 2756 ASSERT(XFS_IS_OQUOTA_ON(tp->t_mountp));
2757 ASSERT((XFS_IS_GQUOTA_ON(tp->t_mountp) ?
2758 ip->i_d.di_gid : ip->i_d.di_projid) ==
2759 be32_to_cpu(gdqp->q_core.d_id));
2756 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2760 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
2757 } 2761 }
2758} 2762}
@@ -2762,7 +2766,7 @@ STATIC void
2762xfs_qm_freelist_init(xfs_frlist_t *ql) 2766xfs_qm_freelist_init(xfs_frlist_t *ql)
2763{ 2767{
2764 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql; 2768 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
2765 mutex_init(&ql->qh_lock, MUTEX_DEFAULT, "dqf"); 2769 mutex_init(&ql->qh_lock);
2766 ql->qh_version = 0; 2770 ql->qh_version = 0;
2767 ql->qh_nelems = 0; 2771 ql->qh_nelems = 0;
2768} 2772}
@@ -2772,7 +2776,7 @@ xfs_qm_freelist_destroy(xfs_frlist_t *ql)
2772{ 2776{
2773 xfs_dquot_t *dqp, *nextdqp; 2777 xfs_dquot_t *dqp, *nextdqp;
2774 2778
2775 mutex_lock(&ql->qh_lock, PINOD); 2779 mutex_lock(&ql->qh_lock);
2776 for (dqp = ql->qh_next; 2780 for (dqp = ql->qh_next;
2777 dqp != (xfs_dquot_t *)ql; ) { 2781 dqp != (xfs_dquot_t *)ql; ) {
2778 xfs_dqlock(dqp); 2782 xfs_dqlock(dqp);
@@ -2830,7 +2834,7 @@ xfs_qm_dqhashlock_nowait(
2830 int locked; 2834 int locked;
2831 2835
2832 locked = mutex_trylock(&((dqp)->q_hash->qh_lock)); 2836 locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
2833 return (locked); 2837 return locked;
2834} 2838}
2835 2839
2836int 2840int
@@ -2840,7 +2844,7 @@ xfs_qm_freelist_lock_nowait(
2840 int locked; 2844 int locked;
2841 2845
2842 locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock)); 2846 locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
2843 return (locked); 2847 return locked;
2844} 2848}
2845 2849
2846STATIC int 2850STATIC int
@@ -2851,5 +2855,5 @@ xfs_qm_mplist_nowait(
2851 2855
2852 ASSERT(mp->m_quotainfo); 2856 ASSERT(mp->m_quotainfo);
2853 locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp))); 2857 locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
2854 return (locked); 2858 return locked;
2855} 2859}
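
Most of this file is the mechanical `return (x)` to `return x` cleanup; the substantive changes are get_thread_id() giving way to current_pid() and xfs_qm_shake() taking a gfp_t. A sketch of a slab-shrinker callback with that era's signature (the function name is hypothetical):

	#include <linux/gfp.h>

	static int example_shake(int nr_to_scan, gfp_t gfp_mask)
	{
		if (!(gfp_mask & __GFP_WAIT))	/* caller cannot sleep here */
			return 0;
		/* ... try to free up to nr_to_scan cached objects ... */
		return 0;
	}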
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 12da259f2fc..4568deb6da8 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct {
165#define XFS_QM_IWARNLIMIT 5 165#define XFS_QM_IWARNLIMIT 5
166#define XFS_QM_RTBWARNLIMIT 5 166#define XFS_QM_RTBWARNLIMIT 5
167 167
168#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock, PINOD)) 168#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock))
169#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock)) 169#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock))
170#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++) 170#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++)
171#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) 171#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index d9d2993de43..90402a1c398 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -363,7 +363,7 @@ xfs_qm_init(void)
363 KERN_INFO "SGI XFS Quota Management subsystem\n"; 363 KERN_INFO "SGI XFS Quota Management subsystem\n";
364 364
365 printk(message); 365 printk(message);
366 mutex_init(&xfs_Gqm_lock, MUTEX_DEFAULT, "xfs_qmlock"); 366 mutex_init(&xfs_Gqm_lock);
367 vfs_bhv_set_custom(&xfs_qmops, &xfs_qmcore_xfs); 367 vfs_bhv_set_custom(&xfs_qmops, &xfs_qmcore_xfs);
368 xfs_qm_init_procfs(); 368 xfs_qm_init_procfs();
369} 369}
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 24690e1af65..676884394aa 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -15,6 +15,9 @@
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18
19#include <linux/capability.h>
20
18#include "xfs.h" 21#include "xfs.h"
19#include "xfs_fs.h" 22#include "xfs_fs.h"
20#include "xfs_bit.h" 23#include "xfs_bit.h"
@@ -233,7 +236,7 @@ xfs_qm_scall_quotaoff(
233 */ 236 */
234 ASSERT(mp->m_quotainfo); 237 ASSERT(mp->m_quotainfo);
235 if (mp->m_quotainfo) 238 if (mp->m_quotainfo)
236 mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); 239 mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
237 240
238 ASSERT(mp->m_quotainfo); 241 ASSERT(mp->m_quotainfo);
239 242
@@ -508,7 +511,7 @@ xfs_qm_scall_quotaon(
508 /* 511 /*
509 * Switch on quota enforcement in core. 512 * Switch on quota enforcement in core.
510 */ 513 */
511 mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); 514 mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
512 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); 515 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
513 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 516 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
514 517
@@ -617,7 +620,7 @@ xfs_qm_scall_setqlim(
617 * a quotaoff from happening). (XXXThis doesn't currently happen 620 * a quotaoff from happening). (XXXThis doesn't currently happen
618 * because we take the vfslock before calling xfs_qm_sysent). 621 * because we take the vfslock before calling xfs_qm_sysent).
619 */ 622 */
620 mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); 623 mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
621 624
622 /* 625 /*
623 * Get the dquot (locked), and join it to the transaction. 626 * Get the dquot (locked), and join it to the transaction.
@@ -1426,7 +1429,7 @@ xfs_qm_internalqcheck(
1426 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1429 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1427 XFS_bflush(mp->m_ddev_targp); 1430 XFS_bflush(mp->m_ddev_targp);
1428 1431
1429 mutex_lock(&qcheck_lock, PINOD); 1432 mutex_lock(&qcheck_lock);
1430 /* There should be absolutely no quota activity while this 1433 /* There should be absolutely no quota activity while this
1431 is going on. */ 1434 is going on. */
1432 qmtest_udqtab = kmem_zalloc(qmtest_hashmask * 1435 qmtest_udqtab = kmem_zalloc(qmtest_hashmask *
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 7a9f3beb818..b7ddd04aae3 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -51,7 +51,7 @@
51#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) 51#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
52#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) 52#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
53 53
54#define XQMLCK(h) (mutex_lock(&((h)->qh_lock), PINOD)) 54#define XQMLCK(h) (mutex_lock(&((h)->qh_lock)))
55#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock))) 55#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock)))
56#ifdef DEBUG 56#ifdef DEBUG
57struct xfs_dqhash; 57struct xfs_dqhash;
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index bb6dc91ea26..b08b3d9345b 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -27,45 +27,12 @@ static DEFINE_SPINLOCK(xfs_err_lock);
27/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ 27/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
28#define XFS_MAX_ERR_LEVEL 7 28#define XFS_MAX_ERR_LEVEL 7
29#define XFS_ERR_MASK ((1 << 3) - 1) 29#define XFS_ERR_MASK ((1 << 3) - 1)
30static char *err_level[XFS_MAX_ERR_LEVEL+1] = 30static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
31 {KERN_EMERG, KERN_ALERT, KERN_CRIT, 31 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
32 KERN_ERR, KERN_WARNING, KERN_NOTICE, 32 KERN_ERR, KERN_WARNING, KERN_NOTICE,
33 KERN_INFO, KERN_DEBUG}; 33 KERN_INFO, KERN_DEBUG};
34 34
35void 35void
36assfail(char *a, char *f, int l)
37{
38 printk("XFS assertion failed: %s, file: %s, line: %d\n", a, f, l);
39 BUG();
40}
41
42#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM))
43
44unsigned long
45random(void)
46{
47 static unsigned long RandomValue = 1;
48 /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */
49 register long rv = RandomValue;
50 register long lo;
51 register long hi;
52
53 hi = rv / 127773;
54 lo = rv % 127773;
55 rv = 16807 * lo - 2836 * hi;
56 if( rv <= 0 ) rv += 2147483647;
57 return( RandomValue = rv );
58}
59
60int
61get_thread_id(void)
62{
63 return current->pid;
64}
65
66#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */
67
68void
69cmn_err(register int level, char *fmt, ...) 36cmn_err(register int level, char *fmt, ...)
70{ 37{
71 char *fp = fmt; 38 char *fp = fmt;
@@ -90,7 +57,6 @@ cmn_err(register int level, char *fmt, ...)
90 BUG(); 57 BUG();
91} 58}
92 59
93
94void 60void
95icmn_err(register int level, char *fmt, va_list ap) 61icmn_err(register int level, char *fmt, va_list ap)
96{ 62{
@@ -109,3 +75,27 @@ icmn_err(register int level, char *fmt, va_list ap)
109 if (level == CE_PANIC) 75 if (level == CE_PANIC)
110 BUG(); 76 BUG();
111} 77}
78
79void
80assfail(char *expr, char *file, int line)
81{
82 printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line);
83 BUG();
84}
85
86#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM))
87unsigned long random(void)
88{
89 static unsigned long RandomValue = 1;
90 /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */
91 register long rv = RandomValue;
92 register long lo;
93 register long hi;
94
95 hi = rv / 127773;
96 lo = rv % 127773;
97 rv = 16807 * lo - 2836 * hi;
98 if (rv <= 0) rv += 2147483647;
99 return RandomValue = rv;
100}
101#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */
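
The relocated random() is the Park-Miller minimal-standard generator, next = (16807 * seed) mod (2^31 - 1), using Schrage's factorization to avoid overflow: 2147483647 = 16807 * 127773 + 2836. A standalone restatement of one step:

	/* One Lehmer step in 32-bit signed arithmetic (Schrage's method). */
	static long lehmer_next(long seed)
	{
		long hi = seed / 127773;		/* 127773 = m / a */
		long lo = seed % 127773;
		long rv = 16807 * lo - 2836 * hi;	/* 2836 = m % a */

		return rv > 0 ? rv : rv + 2147483647;
	}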
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index aff558664c3..e3bf58112e7 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -31,24 +31,23 @@ extern void icmn_err(int, char *, va_list)
31 __attribute__ ((format (printf, 2, 0))); 31 __attribute__ ((format (printf, 2, 0)));
32extern void cmn_err(int, char *, ...) 32extern void cmn_err(int, char *, ...)
33 __attribute__ ((format (printf, 2, 3))); 33 __attribute__ ((format (printf, 2, 3)));
34extern void assfail(char *expr, char *f, int l);
34 35
35#ifndef STATIC 36#define prdev(fmt,targ,args...) \
36# define STATIC static 37 printk("Device %s - " fmt "\n", XFS_BUFTARG_NAME(targ), ## args)
37#endif
38 38
39#ifdef DEBUG 39#define ASSERT_ALWAYS(expr) \
40# define ASSERT(EX) ((EX) ? ((void)0) : assfail(#EX, __FILE__, __LINE__)) 40 (unlikely((expr) != 0) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
41#else
42# define ASSERT(x) ((void)0)
43#endif
44 41
45extern void assfail(char *, char *, int); 42#ifndef DEBUG
46#ifdef DEBUG 43# define ASSERT(expr) ((void)0)
44#else
45# define ASSERT(expr) ASSERT_ALWAYS(expr)
47extern unsigned long random(void); 46extern unsigned long random(void);
48extern int get_thread_id(void);
49#endif 47#endif
50 48
51#define ASSERT_ALWAYS(EX) ((EX)?((void)0):assfail(#EX, __FILE__, __LINE__)) 49#ifndef STATIC
52#define debug_stop_all_cpus(param) /* param is "cpumask_t *" */ 50# define STATIC static
51#endif
53 52
54#endif /* __XFS_SUPPORT_DEBUG_H__ */ 53#endif /* __XFS_SUPPORT_DEBUG_H__ */
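
After the reshuffle, ASSERT() compiles to nothing outside DEBUG builds while ASSERT_ALWAYS() fires in every build; illustrative usage (the wrapper function is hypothetical):

	static void sanity_check(xfs_mount_t *mp)
	{
		ASSERT_ALWAYS(mp != NULL);	/* enforced in all builds */
		ASSERT(mp->m_quotainfo);	/* enforced only when DEBUG is set */
	}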
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
index 70ce40914c8..a3d565a6773 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/support/uuid.c
@@ -24,9 +24,19 @@ static uuid_t *uuid_table;
24void 24void
25uuid_init(void) 25uuid_init(void)
26{ 26{
27 mutex_init(&uuid_monitor, MUTEX_DEFAULT, "uuid_monitor"); 27 mutex_init(&uuid_monitor);
28} 28}
29 29
30
31/* IRIX interpretation of an uuid_t */
32typedef struct {
33 __be32 uu_timelow;
34 __be16 uu_timemid;
35 __be16 uu_timehi;
36 __be16 uu_clockseq;
37 __be16 uu_node[3];
38} xfs_uu_t;
39
30/* 40/*
31 * uuid_getnodeuniq - obtain the node unique fields of a UUID. 41 * uuid_getnodeuniq - obtain the node unique fields of a UUID.
32 * 42 *
@@ -36,16 +46,11 @@ uuid_init(void)
36void 46void
37uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) 47uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
38{ 48{
39 char *uu = (char *)uuid; 49 xfs_uu_t *uup = (xfs_uu_t *)uuid;
40
41 /* on IRIX, this function assumes big-endian fields within
42 * the uuid, so we use INT_GET to get the same result on
43 * little-endian systems
44 */
45 50
46 fsid[0] = (INT_GET(*(u_int16_t*)(uu+8), ARCH_CONVERT) << 16) + 51 fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) |
47 INT_GET(*(u_int16_t*)(uu+4), ARCH_CONVERT); 52 be16_to_cpu(uup->uu_timemid);
48 fsid[1] = INT_GET(*(u_int32_t*)(uu ), ARCH_CONVERT); 53 fsid[1] = be32_to_cpu(uup->uu_timelow);
49} 54}
50 55
51void 56void
@@ -94,7 +99,7 @@ uuid_table_insert(uuid_t *uuid)
94{ 99{
95 int i, hole; 100 int i, hole;
96 101
97 mutex_lock(&uuid_monitor, PVFS); 102 mutex_lock(&uuid_monitor);
98 for (i = 0, hole = -1; i < uuid_table_size; i++) { 103 for (i = 0, hole = -1; i < uuid_table_size; i++) {
99 if (uuid_is_nil(&uuid_table[i])) { 104 if (uuid_is_nil(&uuid_table[i])) {
100 hole = i; 105 hole = i;
@@ -122,7 +127,7 @@ uuid_table_remove(uuid_t *uuid)
122{ 127{
123 int i; 128 int i;
124 129
125 mutex_lock(&uuid_monitor, PVFS); 130 mutex_lock(&uuid_monitor);
126 for (i = 0; i < uuid_table_size; i++) { 131 for (i = 0; i < uuid_table_size; i++) {
127 if (uuid_is_nil(&uuid_table[i])) 132 if (uuid_is_nil(&uuid_table[i]))
128 continue; 133 continue;
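
For example (field values hypothetical): a UUID whose big-endian on-disk fields are uu_timelow = 0x00112233, uu_timemid = 0x4455 and uu_clockseq = 0x8899 yields fsid[0] = 0x88994455 and fsid[1] = 0x00112233 on hosts of either endianness, matching what the old INT_GET/ARCH_CONVERT arithmetic produced. Note that uu_timelow is a __be32, so it must be converted with be32_to_cpu, not be16_to_cpu.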
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index cc9c91b9e77..4ff0f4e41c6 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -36,6 +36,7 @@
36#include "xfs_mac.h" 36#include "xfs_mac.h"
37#include "xfs_attr.h" 37#include "xfs_attr.h"
38 38
39#include <linux/capability.h>
39#include <linux/posix_acl_xattr.h> 40#include <linux/posix_acl_xattr.h>
40 41
41STATIC int xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *); 42STATIC int xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *);
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index 68e5051d8e2..c4836890b72 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -40,6 +40,22 @@
40#undef XFS_NATIVE_HOST 40#undef XFS_NATIVE_HOST
41#endif 41#endif
42 42
43#ifdef XFS_NATIVE_HOST
44#define cpu_to_be16(val) ((__be16)(val))
45#define cpu_to_be32(val) ((__be32)(val))
46#define cpu_to_be64(val) ((__be64)(val))
47#define be16_to_cpu(val) ((__uint16_t)(val))
48#define be32_to_cpu(val) ((__uint32_t)(val))
49#define be64_to_cpu(val) ((__uint64_t)(val))
50#else
51#define cpu_to_be16(val) (__swab16((__uint16_t)(val)))
52#define cpu_to_be32(val) (__swab32((__uint32_t)(val)))
53#define cpu_to_be64(val) (__swab64((__uint64_t)(val)))
54#define be16_to_cpu(val) (__swab16((__be16)(val)))
55#define be32_to_cpu(val) (__swab32((__be32)(val)))
56#define be64_to_cpu(val) (__swab64((__be64)(val)))
57#endif
58
43#endif /* __KERNEL__ */ 59#endif /* __KERNEL__ */
44 60
45/* do we need conversion? */ 61/* do we need conversion? */
@@ -186,7 +202,7 @@ static inline void be64_add(__be64 *a, __s64 b)
186 */ 202 */
187 203
188#define XFS_GET_DIR_INO4(di) \ 204#define XFS_GET_DIR_INO4(di) \
189 (((u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) 205 (((__u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3]))
190 206
191#define XFS_PUT_DIR_INO4(from, di) \ 207#define XFS_PUT_DIR_INO4(from, di) \
192do { \ 208do { \
@@ -197,9 +213,9 @@ do { \
197} while (0) 213} while (0)
198 214
199#define XFS_DI_HI(di) \ 215#define XFS_DI_HI(di) \
200 (((u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) 216 (((__u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3]))
201#define XFS_DI_LO(di) \ 217#define XFS_DI_LO(di) \
202 (((u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7])) 218 (((__u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7]))
203 219
204#define XFS_GET_DIR_INO8(di) \ 220#define XFS_GET_DIR_INO8(di) \
205 (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \ 221 (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \
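
The XFS_NATIVE_HOST branch makes these conversions identity casts on big-endian builds and byte swaps elsewhere. A minimal round-trip sketch (the function and test value are illustrative):

	static __uint32_t endian_roundtrip(void)
	{
		__be32 disk = cpu_to_be32(0x01020304);	/* on-disk order: big-endian */

		return be32_to_cpu(disk);	/* 0x01020304 again on any host */
	}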
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 5484eeb460c..e5e91e9c7e8 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -15,6 +15,9 @@
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18
19#include <linux/capability.h>
20
18#include "xfs.h" 21#include "xfs.h"
19#include "xfs_fs.h" 22#include "xfs_fs.h"
20#include "xfs_types.h" 23#include "xfs_types.h"
@@ -117,11 +120,6 @@ xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen,
117 ip->i_d.di_anextents == 0)) 120 ip->i_d.di_anextents == 0))
118 return(ENOATTR); 121 return(ENOATTR);
119 122
120 if (!(flags & (ATTR_KERNACCESS|ATTR_SECURE))) {
121 if ((error = xfs_iaccess(ip, S_IRUSR, cred)))
122 return(XFS_ERROR(error));
123 }
124
125 /* 123 /*
126 * Fill in the arg structure for this request. 124 * Fill in the arg structure for this request.
127 */ 125 */
@@ -425,7 +423,7 @@ xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int f
425 struct cred *cred) 423 struct cred *cred)
426{ 424{
427 xfs_inode_t *dp; 425 xfs_inode_t *dp;
428 int namelen, error; 426 int namelen;
429 427
430 namelen = strlen(name); 428 namelen = strlen(name);
431 if (namelen >= MAXNAMELEN) 429 if (namelen >= MAXNAMELEN)
@@ -437,14 +435,6 @@ xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int f
437 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 435 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
438 return (EIO); 436 return (EIO);
439 437
440 xfs_ilock(dp, XFS_ILOCK_SHARED);
441 if (!(flags & ATTR_SECURE) &&
442 (error = xfs_iaccess(dp, S_IWUSR, cred))) {
443 xfs_iunlock(dp, XFS_ILOCK_SHARED);
444 return(XFS_ERROR(error));
445 }
446 xfs_iunlock(dp, XFS_ILOCK_SHARED);
447
448 return xfs_attr_set_int(dp, name, namelen, value, valuelen, flags); 438 return xfs_attr_set_int(dp, name, namelen, value, valuelen, flags);
449} 439}
450 440
@@ -579,7 +569,7 @@ int
579xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred) 569xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred)
580{ 570{
581 xfs_inode_t *dp; 571 xfs_inode_t *dp;
582 int namelen, error; 572 int namelen;
583 573
584 namelen = strlen(name); 574 namelen = strlen(name);
585 if (namelen >= MAXNAMELEN) 575 if (namelen >= MAXNAMELEN)
@@ -592,11 +582,7 @@ xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred)
592 return (EIO); 582 return (EIO);
593 583
594 xfs_ilock(dp, XFS_ILOCK_SHARED); 584 xfs_ilock(dp, XFS_ILOCK_SHARED);
595 if (!(flags & ATTR_SECURE) && 585 if (XFS_IFORK_Q(dp) == 0 ||
596 (error = xfs_iaccess(dp, S_IWUSR, cred))) {
597 xfs_iunlock(dp, XFS_ILOCK_SHARED);
598 return(XFS_ERROR(error));
599 } else if (XFS_IFORK_Q(dp) == 0 ||
600 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && 586 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
601 dp->i_d.di_anextents == 0)) { 587 dp->i_d.di_anextents == 0)) {
602 xfs_iunlock(dp, XFS_ILOCK_SHARED); 588 xfs_iunlock(dp, XFS_ILOCK_SHARED);
@@ -668,12 +654,6 @@ xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags,
668 return (EIO); 654 return (EIO);
669 655
670 xfs_ilock(dp, XFS_ILOCK_SHARED); 656 xfs_ilock(dp, XFS_ILOCK_SHARED);
671 if (!(flags & ATTR_SECURE) &&
672 (error = xfs_iaccess(dp, S_IRUSR, cred))) {
673 xfs_iunlock(dp, XFS_ILOCK_SHARED);
674 return(XFS_ERROR(error));
675 }
676
677 /* 657 /*
678 * Decide on what work routines to call based on the inode size. 658 * Decide on what work routines to call based on the inode size.
679 */ 659 */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 35e557b00db..fe91eac4e2a 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -128,7 +128,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
128 return (offset >= minforkoff) ? minforkoff : 0; 128 return (offset >= minforkoff) ? minforkoff : 0;
129 } 129 }
130 130
131 if (unlikely(mp->m_flags & XFS_MOUNT_COMPAT_ATTR)) { 131 if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
132 if (bytes <= XFS_IFORK_ASIZE(dp)) 132 if (bytes <= XFS_IFORK_ASIZE(dp))
133 return mp->m_attroffset >> 3; 133 return mp->m_attroffset >> 3;
134 return 0; 134 return 0;
@@ -157,7 +157,7 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
157{ 157{
158 unsigned long s; 158 unsigned long s;
159 159
160 if (!(mp->m_flags & XFS_MOUNT_COMPAT_ATTR) && 160 if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
161 !(XFS_SB_VERSION_HASATTR2(&mp->m_sb))) { 161 !(XFS_SB_VERSION_HASATTR2(&mp->m_sb))) {
162 s = XFS_SB_LOCK(mp); 162 s = XFS_SB_LOCK(mp);
163 if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb)) { 163 if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb)) {
@@ -310,7 +310,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
310 * Fix up the start offset of the attribute fork 310 * Fix up the start offset of the attribute fork
311 */ 311 */
312 totsize -= size; 312 totsize -= size;
313 if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname) { 313 if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname &&
314 (mp->m_flags & XFS_MOUNT_ATTR2)) {
314 /* 315 /*
315 * Last attribute now removed, revert to original 316 * Last attribute now removed, revert to original
316 * inode format making all literal area available 317 * inode format making all literal area available
@@ -328,7 +329,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
328 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); 329 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
329 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); 330 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
330 ASSERT(dp->i_d.di_forkoff); 331 ASSERT(dp->i_d.di_forkoff);
331 ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname); 332 ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname ||
333 !(mp->m_flags & XFS_MOUNT_ATTR2));
332 dp->i_afp->if_ext_max = 334 dp->i_afp->if_ext_max =
333 XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); 335 XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
334 dp->i_df.if_ext_max = 336 dp->i_df.if_ext_max =
@@ -737,7 +739,8 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
737 + name_loc->namelen 739 + name_loc->namelen
738 + INT_GET(name_loc->valuelen, ARCH_CONVERT); 740 + INT_GET(name_loc->valuelen, ARCH_CONVERT);
739 } 741 }
740 if (bytes == sizeof(struct xfs_attr_sf_hdr)) 742 if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
743 (bytes == sizeof(struct xfs_attr_sf_hdr)))
741 return(-1); 744 return(-1);
742 return(xfs_attr_shortform_bytesfit(dp, bytes)); 745 return(xfs_attr_shortform_bytesfit(dp, bytes));
743} 746}
@@ -775,6 +778,8 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
775 goto out; 778 goto out;
776 779
777 if (forkoff == -1) { 780 if (forkoff == -1) {
781 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
782
778 /* 783 /*
779 * Last attribute was removed, revert to original 784 * Last attribute was removed, revert to original
780 * inode format making all literal area available 785 * inode format making all literal area available
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index f6143ff251a..541e34109bb 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -63,7 +63,7 @@ struct xfs_trans;
63 * the leaf_entry. The namespaces are independent only because we also look 63 * the leaf_entry. The namespaces are independent only because we also look
64 * at the namespace bit when we are looking for a matching attribute name. 64 * at the namespace bit when we are looking for a matching attribute name.
65 * 65 *
66 * We also store a "incomplete" bit in the leaf_entry. It shows that an 66 * We also store an "incomplete" bit in the leaf_entry. It shows that an
67 * attribute is in the middle of being created and should not be shown to 67 * attribute is in the middle of being created and should not be shown to
68 * the user if we crash during the time that the bit is set. We clear the 68 * the user if we crash during the time that the bit is set. We clear the
69 * bit when we have finished setting up the attribute. We do this because 69 * bit when we have finished setting up the attribute. We do this because
@@ -72,42 +72,48 @@ struct xfs_trans;
72 */ 72 */
73#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ 73#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
74 74
75typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
76 __uint16_t base; /* base of free region */
77 __uint16_t size; /* length of free region */
78} xfs_attr_leaf_map_t;
79
80typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
81 xfs_da_blkinfo_t info; /* block type, links, etc. */
82 __uint16_t count; /* count of active leaf_entry's */
83 __uint16_t usedbytes; /* num bytes of names/values stored */
84 __uint16_t firstused; /* first used byte in name area */
85 __uint8_t holes; /* != 0 if blk needs compaction */
86 __uint8_t pad1;
87 xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
88 /* N largest free regions */
89} xfs_attr_leaf_hdr_t;
90
91typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
92 xfs_dahash_t hashval; /* hash value of name */
93 __uint16_t nameidx; /* index into buffer of name/value */
94 __uint8_t flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
95 __uint8_t pad2; /* unused pad byte */
96} xfs_attr_leaf_entry_t;
97
98typedef struct xfs_attr_leaf_name_local {
99 __uint16_t valuelen; /* number of bytes in value */
100 __uint8_t namelen; /* length of name bytes */
101 __uint8_t nameval[1]; /* name/value bytes */
102} xfs_attr_leaf_name_local_t;
103
104typedef struct xfs_attr_leaf_name_remote {
105 xfs_dablk_t valueblk; /* block number of value bytes */
106 __uint32_t valuelen; /* number of bytes in value */
107 __uint8_t namelen; /* length of name bytes */
108 __uint8_t name[1]; /* name bytes */
109} xfs_attr_leaf_name_remote_t;
110
75typedef struct xfs_attr_leafblock { 111typedef struct xfs_attr_leafblock {
76 struct xfs_attr_leaf_hdr { /* constant-structure header block */ 112 xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
77 xfs_da_blkinfo_t info; /* block type, links, etc. */ 113 xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
78 __uint16_t count; /* count of active leaf_entry's */ 114 xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
79 __uint16_t usedbytes; /* num bytes of names/values stored */ 115 xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
80 __uint16_t firstused; /* first used byte in name area */
81 __uint8_t holes; /* != 0 if blk needs compaction */
82 __uint8_t pad1;
83 struct xfs_attr_leaf_map { /* RLE map of free bytes */
84 __uint16_t base; /* base of free region */
85 __uint16_t size; /* length of free region */
86 } freemap[XFS_ATTR_LEAF_MAPSIZE]; /* N largest free regions */
87 } hdr;
88 struct xfs_attr_leaf_entry { /* sorted on key, not name */
89 xfs_dahash_t hashval; /* hash value of name */
90 __uint16_t nameidx; /* index into buffer of name/value */
91 __uint8_t flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
92 __uint8_t pad2; /* unused pad byte */
93 } entries[1]; /* variable sized array */
94 struct xfs_attr_leaf_name_local {
95 __uint16_t valuelen; /* number of bytes in value */
96 __uint8_t namelen; /* length of name bytes */
97 __uint8_t nameval[1]; /* name/value bytes */
98 } namelist; /* grows from bottom of buf */
99 struct xfs_attr_leaf_name_remote {
100 xfs_dablk_t valueblk; /* block number of value bytes */
101 __uint32_t valuelen; /* number of bytes in value */
102 __uint8_t namelen; /* length of name bytes */
103 __uint8_t name[1]; /* name bytes */
104 } valuelist; /* grows from bottom of buf */
105} xfs_attr_leafblock_t; 116} xfs_attr_leafblock_t;
106typedef struct xfs_attr_leaf_hdr xfs_attr_leaf_hdr_t;
107typedef struct xfs_attr_leaf_map xfs_attr_leaf_map_t;
108typedef struct xfs_attr_leaf_entry xfs_attr_leaf_entry_t;
109typedef struct xfs_attr_leaf_name_local xfs_attr_leaf_name_local_t;
110typedef struct xfs_attr_leaf_name_remote xfs_attr_leaf_name_remote_t;
111 117
112/* 118/*
113 * Flags used in the leaf_entry[i].flags field. 119 * Flags used in the leaf_entry[i].flags field.
@@ -150,7 +156,8 @@ xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
150 (leafp))[INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT)]; 156 (leafp))[INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT)];
151} 157}
152 158
153#define XFS_ATTR_LEAF_NAME(leafp,idx) xfs_attr_leaf_name(leafp,idx) 159#define XFS_ATTR_LEAF_NAME(leafp,idx) \
160 xfs_attr_leaf_name(leafp,idx)
154static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx) 161static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
155{ 162{
156 return (&((char *) 163 return (&((char *)
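
To see what the hoisted attribute-leaf structures imply in practice, here is a minimal user-space sketch of the local-entry size arithmetic: the struct counts one nameval byte via the name[1] trick, so the real footprint is the struct size minus one plus the actual name and value bytes. The type and helper names and the size cutoff below are illustrative stand-ins, not part of the patch; the kernel's local-vs-remote decision also depends on leaf free space.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for xfs_attr_leaf_name_local_t above: a 16-bit value
 * length, an 8-bit name length, then name and value bytes packed
 * back to back starting at nameval[]. */
typedef struct attr_leaf_name_local {
	uint16_t valuelen;
	uint8_t  namelen;
	uint8_t  nameval[1];
} attr_leaf_name_local_t;

/* sizeof() already charges one nameval byte, so subtract it before
 * adding the real name and value lengths. */
static size_t local_entsize(size_t namelen, size_t valuelen)
{
	return sizeof(attr_leaf_name_local_t) - 1 + namelen + valuelen;
}

int main(void)
{
	size_t cutoff = 256;	/* illustrative cutoff only */
	size_t sz = local_entsize(4, 100);

	printf("entry needs %zu bytes -> store %s\n",
	       sz, sz <= cutoff ? "locally" : "remotely (via valueblk)");
	return 0;
}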
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index e415a4698e9..70625e577c7 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2146,13 +2146,176 @@ xfs_bmap_add_extent_hole_real(
2146 return 0; /* keep gcc quiet */ 2146 return 0; /* keep gcc quiet */
2147} 2147}
2148 2148
2149/*
2150 * Adjust the size of the new extent based on di_extsize and rt extsize.
2151 */
2152STATIC int
2153xfs_bmap_extsize_align(
2154 xfs_mount_t *mp,
2155 xfs_bmbt_irec_t *gotp, /* next extent pointer */
2156 xfs_bmbt_irec_t *prevp, /* previous extent pointer */
2157 xfs_extlen_t extsz, /* align to this extent size */
2158 int rt, /* is this a realtime inode? */
2159 int eof, /* is extent at end-of-file? */
2160 int delay, /* creating delalloc extent? */
2161 int convert, /* overwriting unwritten extent? */
2162 xfs_fileoff_t *offp, /* in/out: aligned offset */
2163 xfs_extlen_t *lenp) /* in/out: aligned length */
2164{
2165 xfs_fileoff_t orig_off; /* original offset */
2166 xfs_extlen_t orig_alen; /* original length */
2167 xfs_fileoff_t orig_end; /* original off+len */
2168 xfs_fileoff_t nexto; /* next file offset */
2169 xfs_fileoff_t prevo; /* previous file offset */
2170 xfs_fileoff_t align_off; /* temp for offset */
2171 xfs_extlen_t align_alen; /* temp for length */
2172 xfs_extlen_t temp; /* temp for calculations */
2173
2174 if (convert)
2175 return 0;
2176
2177 orig_off = align_off = *offp;
2178 orig_alen = align_alen = *lenp;
2179 orig_end = orig_off + orig_alen;
2180
2181 /*
2182 * If this request overlaps an existing extent, then don't
2183 * attempt to perform any additional alignment.
2184 */
2185 if (!delay && !eof &&
2186 (orig_off >= gotp->br_startoff) &&
2187 (orig_end <= gotp->br_startoff + gotp->br_blockcount)) {
2188 return 0;
2189 }
2190
2191 /*
2192 * If the file offset is unaligned vs. the extent size
2193 * we need to align it. This will be possible unless
2194 * the file was previously written with a kernel that didn't
2195 * perform this alignment, or if a truncate shot us in the
2196 * foot.
2197 */
2198 temp = do_mod(orig_off, extsz);
2199 if (temp) {
2200 align_alen += temp;
2201 align_off -= temp;
2202 }
2203 /*
2204 * Same adjustment for the end of the requested area.
2205 */
2206 if ((temp = (align_alen % extsz))) {
2207 align_alen += extsz - temp;
2208 }
2209 /*
2210 * If the previous block overlaps with this proposed allocation
2211 * then move the start forward without adjusting the length.
2212 */
2213 if (prevp->br_startoff != NULLFILEOFF) {
2214 if (prevp->br_startblock == HOLESTARTBLOCK)
2215 prevo = prevp->br_startoff;
2216 else
2217 prevo = prevp->br_startoff + prevp->br_blockcount;
2218 } else
2219 prevo = 0;
2220 if (align_off != orig_off && align_off < prevo)
2221 align_off = prevo;
2222 /*
2223 * If the next block overlaps with this proposed allocation
2224 * then move the start back without adjusting the length,
2225 * but not before offset 0.
2226 * This may of course make the start overlap the previous block,
2227 * and if we hit the offset 0 limit then the next block
2228 * can still overlap too.
2229 */
2230 if (!eof && gotp->br_startoff != NULLFILEOFF) {
2231 if ((delay && gotp->br_startblock == HOLESTARTBLOCK) ||
2232 (!delay && gotp->br_startblock == DELAYSTARTBLOCK))
2233 nexto = gotp->br_startoff + gotp->br_blockcount;
2234 else
2235 nexto = gotp->br_startoff;
2236 } else
2237 nexto = NULLFILEOFF;
2238 if (!eof &&
2239 align_off + align_alen != orig_end &&
2240 align_off + align_alen > nexto)
2241 align_off = nexto > align_alen ? nexto - align_alen : 0;
2242 /*
2243 * If we're now overlapping the next or previous extent that
2244 * means we can't fit an extsz piece in this hole. Just move
2245 * the start forward to the first valid spot and set
2246 * the length so we hit the end.
2247 */
2248 if (align_off != orig_off && align_off < prevo)
2249 align_off = prevo;
2250 if (align_off + align_alen != orig_end &&
2251 align_off + align_alen > nexto &&
2252 nexto != NULLFILEOFF) {
2253 ASSERT(nexto > prevo);
2254 align_alen = nexto - align_off;
2255 }
2256
2257 /*
2258 * If realtime, and the result isn't a multiple of the realtime
2259 * extent size we need to remove blocks until it is.
2260 */
2261 if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
2262 /*
2263 * We're not covering the original request, or
2264 * we won't be able to once we fix the length.
2265 */
2266 if (orig_off < align_off ||
2267 orig_end > align_off + align_alen ||
2268 align_alen - temp < orig_alen)
2269 return XFS_ERROR(EINVAL);
2270 /*
2271 * Try to fix it by moving the start up.
2272 */
2273 if (align_off + temp <= orig_off) {
2274 align_alen -= temp;
2275 align_off += temp;
2276 }
2277 /*
2278 * Try to fix it by moving the end in.
2279 */
2280 else if (align_off + align_alen - temp >= orig_end)
2281 align_alen -= temp;
2282 /*
2283 * Set the start to the minimum then trim the length.
2284 */
2285 else {
2286 align_alen -= orig_off - align_off;
2287 align_off = orig_off;
2288 align_alen -= align_alen % mp->m_sb.sb_rextsize;
2289 }
2290 /*
2291 * Result doesn't cover the request, fail it.
2292 */
2293 if (orig_off < align_off || orig_end > align_off + align_alen)
2294 return XFS_ERROR(EINVAL);
2295 } else {
2296 ASSERT(orig_off >= align_off);
2297 ASSERT(orig_end <= align_off + align_alen);
2298 }
2299
2300#ifdef DEBUG
2301 if (!eof && gotp->br_startoff != NULLFILEOFF)
2302 ASSERT(align_off + align_alen <= gotp->br_startoff);
2303 if (prevp->br_startoff != NULLFILEOFF)
2304 ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount);
2305#endif
2306
2307 *lenp = align_alen;
2308 *offp = align_off;
2309 return 0;
2310}
2311
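/*
 * A minimal user-space sketch of the first rounding step in
 * xfs_bmap_extsize_align() above: do_mod() becomes plain %, and the
 * sample numbers are illustrative.  Neighbour-extent trimming and the
 * realtime fixups are left out.
 */
#include <stdint.h>
#include <stdio.h>

static void extsize_round(uint64_t extsz, uint64_t *off, uint64_t *len)
{
	uint64_t temp = *off % extsz;		/* do_mod(orig_off, extsz) */

	if (temp) {
		*len += temp;			/* still cover the old start */
		*off -= temp;			/* pull start back to a boundary */
	}
	temp = *len % extsz;
	if (temp)
		*len += extsz - temp;		/* push the end up to a boundary */
}

int main(void)
{
	uint64_t off = 21, len = 10;		/* request, in blocks */

	extsize_round(16, &off, &len);		/* -> off = 16, len = 16 */
	printf("aligned: off=%llu len=%llu\n",
	       (unsigned long long)off, (unsigned long long)len);
	return 0;
}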
2149#define XFS_ALLOC_GAP_UNITS 4 2312#define XFS_ALLOC_GAP_UNITS 4
2150 2313
2151/* 2314/*
2152 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. 2315 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
2153 * It figures out where to ask the underlying allocator to put the new extent. 2316 * It figures out where to ask the underlying allocator to put the new extent.
2154 */ 2317 */
2155STATIC int /* error */ 2318STATIC int
2156xfs_bmap_alloc( 2319xfs_bmap_alloc(
2157 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 2320 xfs_bmalloca_t *ap) /* bmap alloc argument struct */
2158{ 2321{
@@ -2163,10 +2326,10 @@ xfs_bmap_alloc(
2163 xfs_mount_t *mp; /* mount point structure */ 2326 xfs_mount_t *mp; /* mount point structure */
2164 int nullfb; /* true if ap->firstblock isn't set */ 2327 int nullfb; /* true if ap->firstblock isn't set */
2165 int rt; /* true if inode is realtime */ 2328 int rt; /* true if inode is realtime */
2166#ifdef __KERNEL__ 2329 xfs_extlen_t prod = 0; /* product factor for allocators */
2167 xfs_extlen_t prod=0; /* product factor for allocators */ 2330 xfs_extlen_t ralen = 0; /* realtime allocation length */
2168 xfs_extlen_t ralen=0; /* realtime allocation length */ 2331 xfs_extlen_t align; /* minimum allocation alignment */
2169#endif 2332 xfs_rtblock_t rtx;
2170 2333
2171#define ISVALID(x,y) \ 2334#define ISVALID(x,y) \
2172 (rt ? \ 2335 (rt ? \
@@ -2182,125 +2345,25 @@ xfs_bmap_alloc(
2182 nullfb = ap->firstblock == NULLFSBLOCK; 2345 nullfb = ap->firstblock == NULLFSBLOCK;
2183 rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; 2346 rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
2184 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); 2347 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
2185#ifdef __KERNEL__
2186 if (rt) { 2348 if (rt) {
2187 xfs_extlen_t extsz; /* file extent size for rt */ 2349 align = ap->ip->i_d.di_extsize ?
2188 xfs_fileoff_t nexto; /* next file offset */ 2350 ap->ip->i_d.di_extsize : mp->m_sb.sb_rextsize;
2189 xfs_extlen_t orig_alen; /* original ap->alen */ 2351 /* Set prod to match the extent size */
2190 xfs_fileoff_t orig_end; /* original off+len */ 2352 prod = align / mp->m_sb.sb_rextsize;
2191 xfs_fileoff_t orig_off; /* original ap->off */ 2353
2192 xfs_extlen_t mod_off; /* modulus calculations */ 2354 error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
2193 xfs_fileoff_t prevo; /* previous file offset */ 2355 align, rt, ap->eof, 0,
2194 xfs_rtblock_t rtx; /* realtime extent number */ 2356 ap->conv, &ap->off, &ap->alen);
2195 xfs_extlen_t temp; /* temp for rt calculations */ 2357 if (error)
2196 2358 return error;
2197 /* 2359 ASSERT(ap->alen);
2198 * Set prod to match the realtime extent size.
2199 */
2200 if (!(extsz = ap->ip->i_d.di_extsize))
2201 extsz = mp->m_sb.sb_rextsize;
2202 prod = extsz / mp->m_sb.sb_rextsize;
2203 orig_off = ap->off;
2204 orig_alen = ap->alen;
2205 orig_end = orig_off + orig_alen;
2206 /*
2207 * If the file offset is unaligned vs. the extent size
2208 * we need to align it. This will be possible unless
2209 * the file was previously written with a kernel that didn't
2210 * perform this alignment.
2211 */
2212 mod_off = do_mod(orig_off, extsz);
2213 if (mod_off) {
2214 ap->alen += mod_off;
2215 ap->off -= mod_off;
2216 }
2217 /*
2218 * Same adjustment for the end of the requested area.
2219 */
2220 if ((temp = (ap->alen % extsz)))
2221 ap->alen += extsz - temp;
2222 /*
2223 * If the previous block overlaps with this proposed allocation
2224 * then move the start forward without adjusting the length.
2225 */
2226 prevo =
2227 ap->prevp->br_startoff == NULLFILEOFF ?
2228 0 :
2229 (ap->prevp->br_startoff +
2230 ap->prevp->br_blockcount);
2231 if (ap->off != orig_off && ap->off < prevo)
2232 ap->off = prevo;
2233 /*
2234 * If the next block overlaps with this proposed allocation
2235 * then move the start back without adjusting the length,
2236 * but not before offset 0.
2237 * This may of course make the start overlap previous block,
2238 * and if we hit the offset 0 limit then the next block
2239 * can still overlap too.
2240 */
2241 nexto = (ap->eof || ap->gotp->br_startoff == NULLFILEOFF) ?
2242 NULLFILEOFF : ap->gotp->br_startoff;
2243 if (!ap->eof &&
2244 ap->off + ap->alen != orig_end &&
2245 ap->off + ap->alen > nexto)
2246 ap->off = nexto > ap->alen ? nexto - ap->alen : 0;
2247 /*
2248 * If we're now overlapping the next or previous extent that
2249 * means we can't fit an extsz piece in this hole. Just move
2250 * the start forward to the first valid spot and set
2251 * the length so we hit the end.
2252 */
2253 if ((ap->off != orig_off && ap->off < prevo) ||
2254 (ap->off + ap->alen != orig_end &&
2255 ap->off + ap->alen > nexto)) {
2256 ap->off = prevo;
2257 ap->alen = nexto - prevo;
2258 }
2259 /*
2260 * If the result isn't a multiple of rtextents we need to
2261 * remove blocks until it is.
2262 */
2263 if ((temp = (ap->alen % mp->m_sb.sb_rextsize))) {
2264 /*
2265 * We're not covering the original request, or
2266 * we won't be able to once we fix the length.
2267 */
2268 if (orig_off < ap->off ||
2269 orig_end > ap->off + ap->alen ||
2270 ap->alen - temp < orig_alen)
2271 return XFS_ERROR(EINVAL);
2272 /*
2273 * Try to fix it by moving the start up.
2274 */
2275 if (ap->off + temp <= orig_off) {
2276 ap->alen -= temp;
2277 ap->off += temp;
2278 }
2279 /*
2280 * Try to fix it by moving the end in.
2281 */
2282 else if (ap->off + ap->alen - temp >= orig_end)
2283 ap->alen -= temp;
2284 /*
2285 * Set the start to the minimum then trim the length.
2286 */
2287 else {
2288 ap->alen -= orig_off - ap->off;
2289 ap->off = orig_off;
2290 ap->alen -= ap->alen % mp->m_sb.sb_rextsize;
2291 }
2292 /*
2293 * Result doesn't cover the request, fail it.
2294 */
2295 if (orig_off < ap->off || orig_end > ap->off + ap->alen)
2296 return XFS_ERROR(EINVAL);
2297 }
2298 ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); 2360 ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0);
2361
2299 /* 2362 /*
2300 * If the offset & length are not perfectly aligned 2363 * If the offset & length are not perfectly aligned
2301 * then kill prod, it will just get us in trouble. 2364 * then kill prod, it will just get us in trouble.
2302 */ 2365 */
2303 if (do_mod(ap->off, extsz) || ap->alen % extsz) 2366 if (do_mod(ap->off, align) || ap->alen % align)
2304 prod = 1; 2367 prod = 1;
2305 /* 2368 /*
2306 * Set ralen to be the actual requested length in rtextents. 2369 * Set ralen to be the actual requested length in rtextents.
@@ -2326,15 +2389,24 @@ xfs_bmap_alloc(
2326 ap->rval = rtx * mp->m_sb.sb_rextsize; 2389 ap->rval = rtx * mp->m_sb.sb_rextsize;
2327 } else 2390 } else
2328 ap->rval = 0; 2391 ap->rval = 0;
2392 } else {
2393 align = (ap->userdata && ap->ip->i_d.di_extsize &&
2394 (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)) ?
2395 ap->ip->i_d.di_extsize : 0;
2396 if (unlikely(align)) {
2397 error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
2398 align, rt,
2399 ap->eof, 0, ap->conv,
2400 &ap->off, &ap->alen);
2401 ASSERT(!error);
2402 ASSERT(ap->alen);
2403 }
2404 if (nullfb)
2405 ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
2406 else
2407 ap->rval = ap->firstblock;
2329 } 2408 }
2330#else 2409
2331 if (rt)
2332 ap->rval = 0;
2333#endif /* __KERNEL__ */
2334 else if (nullfb)
2335 ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
2336 else
2337 ap->rval = ap->firstblock;
2338 /* 2410 /*
2339 * If allocating at eof, and there's a previous real block, 2411 * If allocating at eof, and there's a previous real block,
2340 * try to use its last block as our starting point. 2412 * try to use its last block as our starting point.
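
As a rough illustration of the rewritten realtime branch, once the request has been aligned the allocator's product factor and the request length in rt extents fall out of simple division. A sketch with made-up sizes; the variable names track the patch but the program itself is illustrative:

#include <stdio.h>

int main(void)
{
	unsigned int rextsize = 4;	/* rt extent size, in blocks */
	unsigned int extsize  = 16;	/* inode extent size hint (di_extsize) */
	unsigned int alen     = 32;	/* already-aligned request length */

	/* align to the inode hint if set, else to the rt extent size */
	unsigned int align = extsize ? extsize : rextsize;
	unsigned int prod  = align / rextsize;		/* -> 4 */
	unsigned int ralen = alen / rextsize;		/* -> 8 rtextents */

	/* mirrors "kill prod" above when alignment is imperfect */
	if (alen % align)
		prod = 1;
	printf("prod=%u ralen=%u\n", prod, ralen);
	return 0;
}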
@@ -2598,11 +2670,12 @@ xfs_bmap_alloc(
2598 args.total = ap->total; 2670 args.total = ap->total;
2599 args.minlen = ap->minlen; 2671 args.minlen = ap->minlen;
2600 } 2672 }
2601 if (ap->ip->i_d.di_extsize) { 2673 if (unlikely(ap->userdata && ap->ip->i_d.di_extsize &&
2674 (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE))) {
2602 args.prod = ap->ip->i_d.di_extsize; 2675 args.prod = ap->ip->i_d.di_extsize;
2603 if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) 2676 if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
2604 args.mod = (xfs_extlen_t)(args.prod - args.mod); 2677 args.mod = (xfs_extlen_t)(args.prod - args.mod);
2605 } else if (mp->m_sb.sb_blocksize >= NBPP) { 2678 } else if (unlikely(mp->m_sb.sb_blocksize >= NBPP)) {
2606 args.prod = 1; 2679 args.prod = 1;
2607 args.mod = 0; 2680 args.mod = 0;
2608 } else { 2681 } else {
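
The prod/mod pair handed to the allocator just above encodes "land on an extent-size boundary": mod is the distance from the requested file offset up to the next multiple of prod, so that file-offset alignment carries over to the disk block chosen. A hedged arithmetic sketch, with illustrative numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long off = 21;	/* file offset, in blocks */
	unsigned int prod = 8;		/* di_extsize, per the hunk above */
	unsigned int mod = off % prod;	/* do_mod(ap->off, args.prod) */

	if (mod)
		mod = prod - mod;	/* distance up to the next boundary */
	printf("prod=%u mod=%u\n", prod, mod);	/* -> prod=8 mod=3 */
	return 0;
}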
@@ -3580,14 +3653,16 @@ xfs_bmap_search_extents(
3580 3653
3581 ep = xfs_bmap_do_search_extents(base, lastx, nextents, bno, eofp, 3654 ep = xfs_bmap_do_search_extents(base, lastx, nextents, bno, eofp,
3582 lastxp, gotp, prevp); 3655 lastxp, gotp, prevp);
3583 rt = ip->i_d.di_flags & XFS_DIFLAG_REALTIME; 3656 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
3584 if(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM)) { 3657 if (unlikely(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM))) {
3585 cmn_err(CE_PANIC,"Access to block zero: fs: <%s> inode: %lld " 3658 cmn_err(CE_PANIC,"Access to block zero: fs: <%s> inode: %lld "
3586 "start_block : %llx start_off : %llx blkcnt : %llx " 3659 "start_block : %llx start_off : %llx blkcnt : %llx "
3587 "extent-state : %x \n", 3660 "extent-state : %x \n",
3588 (ip->i_mount)->m_fsname,(long long)ip->i_ino, 3661 (ip->i_mount)->m_fsname, (long long)ip->i_ino,
3589 gotp->br_startblock, gotp->br_startoff, 3662 (unsigned long long)gotp->br_startblock,
3590 gotp->br_blockcount,gotp->br_state); 3663 (unsigned long long)gotp->br_startoff,
3664 (unsigned long long)gotp->br_blockcount,
3665 gotp->br_state);
3591 } 3666 }
3592 return ep; 3667 return ep;
3593} 3668}
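
The reworked cmn_err() call also illustrates a general portability rule: fields whose underlying width varies by platform are cast to unsigned long long so they match %llx/%llu unconditionally. A user-space equivalent:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t startblock = 0x1234abcdULL;	/* e.g. br_startblock */

	/* without the cast, %llx can mismatch on platforms where the
	 * underlying typedef is not long long */
	printf("start_block : %llx\n", (unsigned long long)startblock);
	return 0;
}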
@@ -3875,7 +3950,7 @@ xfs_bmap_add_attrfork(
3875 ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); 3950 ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
3876 if (!ip->i_d.di_forkoff) 3951 if (!ip->i_d.di_forkoff)
3877 ip->i_d.di_forkoff = mp->m_attroffset >> 3; 3952 ip->i_d.di_forkoff = mp->m_attroffset >> 3;
3878 else if (!(mp->m_flags & XFS_MOUNT_COMPAT_ATTR)) 3953 else if (mp->m_flags & XFS_MOUNT_ATTR2)
3879 version = 2; 3954 version = 2;
3880 break; 3955 break;
3881 default: 3956 default:
@@ -4023,13 +4098,13 @@ xfs_bmap_compute_maxlevels(
4023 */ 4098 */
4024 if (whichfork == XFS_DATA_FORK) { 4099 if (whichfork == XFS_DATA_FORK) {
4025 maxleafents = MAXEXTNUM; 4100 maxleafents = MAXEXTNUM;
4026 sz = (mp->m_flags & XFS_MOUNT_COMPAT_ATTR) ? 4101 sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
4027 mp->m_attroffset : XFS_BMDR_SPACE_CALC(MINDBTPTRS); 4102 XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset;
4028 } else { 4103 } else {
4029 maxleafents = MAXAEXTNUM; 4104 maxleafents = MAXAEXTNUM;
4030 sz = (mp->m_flags & XFS_MOUNT_COMPAT_ATTR) ? 4105 sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
4031 mp->m_sb.sb_inodesize - mp->m_attroffset : 4106 XFS_BMDR_SPACE_CALC(MINABTPTRS) :
4032 XFS_BMDR_SPACE_CALC(MINABTPTRS); 4107 mp->m_sb.sb_inodesize - mp->m_attroffset;
4033 } 4108 }
4034 maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); 4109 maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
4035 minleafrecs = mp->m_bmap_dmnr[0]; 4110 minleafrecs = mp->m_bmap_dmnr[0];
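
The sz and maxleafents values computed above feed a worst-case btree-height calculation. A hedged sketch of that shape, mirroring the usual "divide by fanout until one block remains" pattern rather than quoting the rest of the function; the record counts in main() are made up:

#include <stdint.h>
#include <stdio.h>

static int compute_maxlevels(uint64_t maxleafents, unsigned int minleafrecs,
			     unsigned int minnoderecs, unsigned int maxrootrecs)
{
	uint64_t maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
	int level;

	for (level = 1; maxblocks > 1; level++) {
		if (maxblocks <= maxrootrecs)
			maxblocks = 1;	/* fits under the inode-rooted block */
		else
			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
	}
	return level;
}

int main(void)
{
	printf("max levels: %d\n",
	       compute_maxlevels(1 << 21, 16, 8, 4));
	return 0;
}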
@@ -4418,8 +4493,8 @@ xfs_bmap_read_extents(
4418 num_recs = be16_to_cpu(block->bb_numrecs); 4493 num_recs = be16_to_cpu(block->bb_numrecs);
4419 if (unlikely(i + num_recs > room)) { 4494 if (unlikely(i + num_recs > room)) {
4420 ASSERT(i + num_recs <= room); 4495 ASSERT(i + num_recs <= room);
4421 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 4496 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
4422 "corrupt dinode %Lu, (btree extents). Unmount and run xfs_repair.", 4497 "corrupt dinode %Lu, (btree extents).",
4423 (unsigned long long) ip->i_ino); 4498 (unsigned long long) ip->i_ino);
4424 XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", 4499 XFS_ERROR_REPORT("xfs_bmap_read_extents(1)",
4425 XFS_ERRLEVEL_LOW, 4500 XFS_ERRLEVEL_LOW,
@@ -4590,6 +4665,7 @@ xfs_bmapi(
4590 char contig; /* allocation must be one extent */ 4665 char contig; /* allocation must be one extent */
4591 char delay; /* this request is for delayed alloc */ 4666 char delay; /* this request is for delayed alloc */
4592 char exact; /* don't do all of wasdelayed extent */ 4667 char exact; /* don't do all of wasdelayed extent */
4668 char convert; /* unwritten extent I/O completion */
4593 xfs_bmbt_rec_t *ep; /* extent list entry pointer */ 4669 xfs_bmbt_rec_t *ep; /* extent list entry pointer */
4594 int error; /* error return */ 4670 int error; /* error return */
4595 xfs_bmbt_irec_t got; /* current extent list record */ 4671 xfs_bmbt_irec_t got; /* current extent list record */
@@ -4643,7 +4719,7 @@ xfs_bmapi(
4643 } 4719 }
4644 if (XFS_FORCED_SHUTDOWN(mp)) 4720 if (XFS_FORCED_SHUTDOWN(mp))
4645 return XFS_ERROR(EIO); 4721 return XFS_ERROR(EIO);
4646 rt = XFS_IS_REALTIME_INODE(ip); 4722 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
4647 ifp = XFS_IFORK_PTR(ip, whichfork); 4723 ifp = XFS_IFORK_PTR(ip, whichfork);
4648 ASSERT(ifp->if_ext_max == 4724 ASSERT(ifp->if_ext_max ==
4649 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); 4725 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
@@ -4654,6 +4730,7 @@ xfs_bmapi(
4654 delay = (flags & XFS_BMAPI_DELAY) != 0; 4730 delay = (flags & XFS_BMAPI_DELAY) != 0;
4655 trim = (flags & XFS_BMAPI_ENTIRE) == 0; 4731 trim = (flags & XFS_BMAPI_ENTIRE) == 0;
4656 userdata = (flags & XFS_BMAPI_METADATA) == 0; 4732 userdata = (flags & XFS_BMAPI_METADATA) == 0;
4733 convert = (flags & XFS_BMAPI_CONVERT) != 0;
4657 exact = (flags & XFS_BMAPI_EXACT) != 0; 4734 exact = (flags & XFS_BMAPI_EXACT) != 0;
4658 rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; 4735 rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
4659 contig = (flags & XFS_BMAPI_CONTIG) != 0; 4736 contig = (flags & XFS_BMAPI_CONTIG) != 0;
@@ -4748,15 +4825,25 @@ xfs_bmapi(
4748 } 4825 }
4749 minlen = contig ? alen : 1; 4826 minlen = contig ? alen : 1;
4750 if (delay) { 4827 if (delay) {
4751 xfs_extlen_t extsz = 0; 4828 xfs_extlen_t extsz;
4752 4829
4753 /* Figure out the extent size, adjust alen */ 4830 /* Figure out the extent size, adjust alen */
4754 if (rt) { 4831 if (rt) {
4755 if (!(extsz = ip->i_d.di_extsize)) 4832 if (!(extsz = ip->i_d.di_extsize))
4756 extsz = mp->m_sb.sb_rextsize; 4833 extsz = mp->m_sb.sb_rextsize;
4757 alen = roundup(alen, extsz); 4834 } else {
4758 extsz = alen / mp->m_sb.sb_rextsize; 4835 extsz = ip->i_d.di_extsize;
4759 } 4836 }
4837 if (extsz) {
4838 error = xfs_bmap_extsize_align(mp,
4839 &got, &prev, extsz,
4840 rt, eof, delay, convert,
4841 &aoff, &alen);
4842 ASSERT(!error);
4843 }
4844
4845 if (rt)
4846 extsz = alen / mp->m_sb.sb_rextsize;
4760 4847
4761 /* 4848 /*
4762 * Make a transaction-less quota reservation for 4849 * Make a transaction-less quota reservation for
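
In the delayed-allocation path the extent size now comes from one place for both inode types, with a realtime inode falling back to the superblock rt extent size when the per-inode hint is unset. Roughly, as a sketch with illustrative names and values (the alignment call that would consume extsz is elided):

#include <stdio.h>

int main(void)
{
	int rt = 1;			/* realtime inode? */
	unsigned int di_extsize = 0;	/* per-inode hint (0 = none) */
	unsigned int sb_rextsize = 4;	/* rt extent size from superblock */
	unsigned int alen = 32;		/* aligned request length */
	unsigned int extsz;

	if (rt)
		extsz = di_extsize ? di_extsize : sb_rextsize;
	else
		extsz = di_extsize;	/* may legitimately stay 0 */

	/* only rt converts the reservation into units of rtextents */
	if (rt)
		extsz = alen / sb_rextsize;
	printf("reserve %u %s\n", rt ? extsz : alen,
	       rt ? "rtextents" : "blocks");
	return 0;
}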
@@ -4785,32 +4872,33 @@ xfs_bmapi(
4785 xfs_bmap_worst_indlen(ip, alen); 4872 xfs_bmap_worst_indlen(ip, alen);
4786 ASSERT(indlen > 0); 4873 ASSERT(indlen > 0);
4787 4874
4788 if (rt) 4875 if (rt) {
4789 error = xfs_mod_incore_sb(mp, 4876 error = xfs_mod_incore_sb(mp,
4790 XFS_SBS_FREXTENTS, 4877 XFS_SBS_FREXTENTS,
4791 -(extsz), rsvd); 4878 -(extsz), rsvd);
4792 else 4879 } else {
4793 error = xfs_mod_incore_sb(mp, 4880 error = xfs_mod_incore_sb(mp,
4794 XFS_SBS_FDBLOCKS, 4881 XFS_SBS_FDBLOCKS,
4795 -(alen), rsvd); 4882 -(alen), rsvd);
4883 }
4796 if (!error) { 4884 if (!error) {
4797 error = xfs_mod_incore_sb(mp, 4885 error = xfs_mod_incore_sb(mp,
4798 XFS_SBS_FDBLOCKS, 4886 XFS_SBS_FDBLOCKS,
4799 -(indlen), rsvd); 4887 -(indlen), rsvd);
4800 if (error && rt) { 4888 if (error && rt)
4801 xfs_mod_incore_sb(ip->i_mount, 4889 xfs_mod_incore_sb(mp,
4802 XFS_SBS_FREXTENTS, 4890 XFS_SBS_FREXTENTS,
4803 extsz, rsvd); 4891 extsz, rsvd);
4804 } else if (error) { 4892 else if (error)
4805 xfs_mod_incore_sb(ip->i_mount, 4893 xfs_mod_incore_sb(mp,
4806 XFS_SBS_FDBLOCKS, 4894 XFS_SBS_FDBLOCKS,
4807 alen, rsvd); 4895 alen, rsvd);
4808 }
4809 } 4896 }
4810 4897
4811 if (error) { 4898 if (error) {
4812 if (XFS_IS_QUOTA_ON(ip->i_mount)) 4899 if (XFS_IS_QUOTA_ON(mp))
4813 /* unreserve the blocks now */ 4900 /* unreserve the blocks now */
4901 (void)
4814 XFS_TRANS_UNRESERVE_QUOTA_NBLKS( 4902 XFS_TRANS_UNRESERVE_QUOTA_NBLKS(
4815 mp, NULL, ip, 4903 mp, NULL, ip,
4816 (long)alen, 0, rt ? 4904 (long)alen, 0, rt ?
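
The reservation sequence above is a classic two-step acquire with rollback: take the data blocks (or rt extents) first, then the worst-case indirect blocks, and undo the first step if the second fails. A self-contained sketch with stand-in counters; every name here is illustrative, and mod_ctr() merely approximates xfs_mod_incore_sb() by refusing to go negative:

#include <stdio.h>

static long free_rtext = 100, free_blocks = 1000;

static int mod_ctr(long *ctr, long delta)
{
	if (*ctr + delta < 0)
		return -1;	/* ENOSPC stand-in */
	*ctr += delta;
	return 0;
}

static int reserve(int rt, long extsz, long alen, long indlen)
{
	int error = rt ? mod_ctr(&free_rtext, -extsz)
		       : mod_ctr(&free_blocks, -alen);

	if (error)
		return error;
	/* indirect blocks always come from the data pool */
	error = mod_ctr(&free_blocks, -indlen);
	if (error) {
		/* roll the first reservation back, as the code above does */
		if (rt)
			mod_ctr(&free_rtext, extsz);
		else
			mod_ctr(&free_blocks, alen);
	}
	return error;
}

int main(void)
{
	printf("reserve: %s\n", reserve(0, 0, 800, 300) ? "ENOSPC" : "ok");
	printf("blocks left: %ld\n", free_blocks);	/* 1000: rolled back */
	return 0;
}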
@@ -4849,6 +4937,7 @@ xfs_bmapi(
4849 bma.firstblock = *firstblock; 4937 bma.firstblock = *firstblock;
4850 bma.alen = alen; 4938 bma.alen = alen;
4851 bma.off = aoff; 4939 bma.off = aoff;
4940 bma.conv = convert;
4852 bma.wasdel = wasdelay; 4941 bma.wasdel = wasdelay;
4853 bma.minlen = minlen; 4942 bma.minlen = minlen;
4854 bma.low = flist->xbf_low; 4943 bma.low = flist->xbf_low;
@@ -5270,8 +5359,7 @@ xfs_bunmapi(
5270 return 0; 5359 return 0;
5271 } 5360 }
5272 XFS_STATS_INC(xs_blk_unmap); 5361 XFS_STATS_INC(xs_blk_unmap);
5273 isrt = (whichfork == XFS_DATA_FORK) && 5362 isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
5274 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
5275 start = bno; 5363 start = bno;
5276 bno = start + len - 1; 5364 bno = start + len - 1;
5277 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, 5365 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
@@ -5443,7 +5531,7 @@ xfs_bunmapi(
5443 } 5531 }
5444 if (wasdel) { 5532 if (wasdel) {
5445 ASSERT(STARTBLOCKVAL(del.br_startblock) > 0); 5533 ASSERT(STARTBLOCKVAL(del.br_startblock) > 0);
5446 /* Update realtim/data freespace, unreserve quota */ 5534 /* Update realtime/data freespace, unreserve quota */
5447 if (isrt) { 5535 if (isrt) {
5448 xfs_filblks_t rtexts; 5536 xfs_filblks_t rtexts;
5449 5537
@@ -5451,14 +5539,14 @@ xfs_bunmapi(
5451 do_div(rtexts, mp->m_sb.sb_rextsize); 5539 do_div(rtexts, mp->m_sb.sb_rextsize);
5452 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 5540 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
5453 (int)rtexts, rsvd); 5541 (int)rtexts, rsvd);
5454 XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, 5542 (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
5455 -((long)del.br_blockcount), 0, 5543 NULL, ip, -((long)del.br_blockcount), 0,
5456 XFS_QMOPT_RES_RTBLKS); 5544 XFS_QMOPT_RES_RTBLKS);
5457 } else { 5545 } else {
5458 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, 5546 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
5459 (int)del.br_blockcount, rsvd); 5547 (int)del.br_blockcount, rsvd);
5460 XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, 5548 (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
5461 -((long)del.br_blockcount), 0, 5549 NULL, ip, -((long)del.br_blockcount), 0,
5462 XFS_QMOPT_RES_REGBLKS); 5550 XFS_QMOPT_RES_REGBLKS);
5463 } 5551 }
5464 ip->i_delayed_blks -= del.br_blockcount; 5552 ip->i_delayed_blks -= del.br_blockcount;
@@ -5652,7 +5740,9 @@ xfs_getbmap(
5652 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) 5740 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
5653 return XFS_ERROR(EINVAL); 5741 return XFS_ERROR(EINVAL);
5654 if (whichfork == XFS_DATA_FORK) { 5742 if (whichfork == XFS_DATA_FORK) {
5655 if (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) { 5743 if ((ip->i_d.di_extsize && (ip->i_d.di_flags &
5744 (XFS_DIFLAG_REALTIME|XFS_DIFLAG_EXTSIZE))) ||
5745 ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
5656 prealloced = 1; 5746 prealloced = 1;
5657 fixlen = XFS_MAXIOFFSET(mp); 5747 fixlen = XFS_MAXIOFFSET(mp);
5658 } else { 5748 } else {
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 2e0717a0130..12cc63dfc2c 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -62,6 +62,10 @@ typedef struct xfs_bmap_free
62#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */ 62#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */
63 /* combine contig. space */ 63 /* combine contig. space */
64#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */ 64#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */
65/* XFS_BMAPI_DIRECT_IO 0x800 */
66#define XFS_BMAPI_CONVERT 0x1000 /* unwritten extent conversion - */
67 /* need write cache flushing and no */
68 /* additional allocation alignments */
65 69
66#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w) 70#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w)
67static inline int xfs_bmapi_aflag(int w) 71static inline int xfs_bmapi_aflag(int w)
@@ -101,7 +105,8 @@ typedef struct xfs_bmalloca {
101 char wasdel; /* replacing a delayed allocation */ 105 char wasdel; /* replacing a delayed allocation */
102 char userdata;/* set if is user data */ 106 char userdata;/* set if is user data */
103 char low; /* low on space, using seq'l ags */ 107 char low; /* low on space, using seq'l ags */
104 char aeof; /* allocated space at eof */ 108 char aeof; /* allocated space at eof */
109 char conv; /* overwriting unwritten extents */
105} xfs_bmalloca_t; 110} xfs_bmalloca_t;
106 111
107#ifdef __KERNEL__ 112#ifdef __KERNEL__
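
The new XFS_BMAPI_CONVERT bit slots into the same flags word as the others; callers OR bits together and the function decodes each with a != 0 test, as in the xfs_bmapi() hunks earlier. A small sketch; only the CONVERT value is taken from the hunk, the other values are illustrative:

#include <stdio.h>

#define BMAPI_WRITE	0x001	/* illustrative values */
#define BMAPI_DELAY	0x002
#define BMAPI_CONVERT	0x1000	/* value matches the hunk above */

int main(void)
{
	int flags = BMAPI_WRITE | BMAPI_CONVERT;

	int delay   = (flags & BMAPI_DELAY) != 0;	/* 0 */
	int convert = (flags & BMAPI_CONVERT) != 0;	/* 1 */

	printf("delay=%d convert=%d\n", delay, convert);
	return 0;
}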
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
index 328a528b926..f57cc9ac875 100644
--- a/fs/xfs/xfs_clnt.h
+++ b/fs/xfs/xfs_clnt.h
@@ -57,7 +57,7 @@ struct xfs_mount_args {
57/* 57/*
58 * XFS mount option flags -- args->flags1 58 * XFS mount option flags -- args->flags1
59 */ 59 */
60#define XFSMNT_COMPAT_ATTR 0x00000001 /* do not use ATTR2 format */ 60#define XFSMNT_ATTR2 0x00000001 /* allow ATTR2 EA format */
61#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount 61#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount
62 * compatible */ 62 * compatible */
63#define XFSMNT_INO64 0x00000004 /* move inode numbers up 63#define XFSMNT_INO64 0x00000004 /* move inode numbers up
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 070259a4254..c6191d00ad2 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -60,8 +60,6 @@ xfs_swapext(
60 xfs_bstat_t *sbp; 60 xfs_bstat_t *sbp;
61 struct file *fp = NULL, *tfp = NULL; 61 struct file *fp = NULL, *tfp = NULL;
62 vnode_t *vp, *tvp; 62 vnode_t *vp, *tvp;
63 bhv_desc_t *bdp, *tbdp;
64 vn_bhv_head_t *bhp, *tbhp;
65 static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; 63 static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
66 int ilf_fields, tilf_fields; 64 int ilf_fields, tilf_fields;
67 int error = 0; 65 int error = 0;
@@ -90,13 +88,10 @@ xfs_swapext(
90 goto error0; 88 goto error0;
91 } 89 }
92 90
93 bhp = VN_BHV_HEAD(vp); 91 ip = xfs_vtoi(vp);
94 bdp = vn_bhv_lookup(bhp, &xfs_vnodeops); 92 if (ip == NULL) {
95 if (bdp == NULL) {
96 error = XFS_ERROR(EBADF); 93 error = XFS_ERROR(EBADF);
97 goto error0; 94 goto error0;
98 } else {
99 ip = XFS_BHVTOI(bdp);
100 } 95 }
101 96
102 if (((tfp = fget((int)sxp->sx_fdtmp)) == NULL) || 97 if (((tfp = fget((int)sxp->sx_fdtmp)) == NULL) ||
@@ -105,13 +100,10 @@ xfs_swapext(
105 goto error0; 100 goto error0;
106 } 101 }
107 102
108 tbhp = VN_BHV_HEAD(tvp); 103 tip = xfs_vtoi(tvp);
109 tbdp = vn_bhv_lookup(tbhp, &xfs_vnodeops); 104 if (tip == NULL) {
110 if (tbdp == NULL) {
111 error = XFS_ERROR(EBADF); 105 error = XFS_ERROR(EBADF);
112 goto error0; 106 goto error0;
113 } else {
114 tip = XFS_BHVTOI(tbdp);
115 } 107 }
116 108
117 if (ip->i_mount != tip->i_mount) { 109 if (ip->i_mount != tip->i_mount) {
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index c5a0e537ff1..79d0d9e1fba 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -199,10 +199,16 @@ typedef enum xfs_dinode_fmt
199 199
200#define XFS_DFORK_DSIZE(dip,mp) \ 200#define XFS_DFORK_DSIZE(dip,mp) \
201 XFS_CFORK_DSIZE_DISK(&(dip)->di_core, mp) 201 XFS_CFORK_DSIZE_DISK(&(dip)->di_core, mp)
202#define XFS_DFORK_DSIZE_HOST(dip,mp) \
203 XFS_CFORK_DSIZE(&(dip)->di_core, mp)
202#define XFS_DFORK_ASIZE(dip,mp) \ 204#define XFS_DFORK_ASIZE(dip,mp) \
203 XFS_CFORK_ASIZE_DISK(&(dip)->di_core, mp) 205 XFS_CFORK_ASIZE_DISK(&(dip)->di_core, mp)
206#define XFS_DFORK_ASIZE_HOST(dip,mp) \
207 XFS_CFORK_ASIZE(&(dip)->di_core, mp)
204#define XFS_DFORK_SIZE(dip,mp,w) \ 208#define XFS_DFORK_SIZE(dip,mp,w) \
205 XFS_CFORK_SIZE_DISK(&(dip)->di_core, mp, w) 209 XFS_CFORK_SIZE_DISK(&(dip)->di_core, mp, w)
210#define XFS_DFORK_SIZE_HOST(dip,mp,w) \
211 XFS_CFORK_SIZE(&(dip)->di_core, mp, w)
206 212
207#define XFS_DFORK_Q(dip) XFS_CFORK_Q_DISK(&(dip)->di_core) 213#define XFS_DFORK_Q(dip) XFS_CFORK_Q_DISK(&(dip)->di_core)
208#define XFS_DFORK_BOFF(dip) XFS_CFORK_BOFF_DISK(&(dip)->di_core) 214#define XFS_DFORK_BOFF(dip) XFS_CFORK_BOFF_DISK(&(dip)->di_core)
@@ -216,6 +222,7 @@ typedef enum xfs_dinode_fmt
216#define XFS_CFORK_FMT_SET(dcp,w,n) \ 222#define XFS_CFORK_FMT_SET(dcp,w,n) \
217 ((w) == XFS_DATA_FORK ? \ 223 ((w) == XFS_DATA_FORK ? \
218 ((dcp)->di_format = (n)) : ((dcp)->di_aformat = (n))) 224 ((dcp)->di_format = (n)) : ((dcp)->di_aformat = (n)))
225#define XFS_DFORK_FORMAT(dip,w) XFS_CFORK_FORMAT(&(dip)->di_core, w)
219 226
220#define XFS_CFORK_NEXTENTS_DISK(dcp,w) \ 227#define XFS_CFORK_NEXTENTS_DISK(dcp,w) \
221 ((w) == XFS_DATA_FORK ? \ 228 ((w) == XFS_DATA_FORK ? \
@@ -223,13 +230,13 @@ typedef enum xfs_dinode_fmt
223 INT_GET((dcp)->di_anextents, ARCH_CONVERT)) 230 INT_GET((dcp)->di_anextents, ARCH_CONVERT))
224#define XFS_CFORK_NEXTENTS(dcp,w) \ 231#define XFS_CFORK_NEXTENTS(dcp,w) \
225 ((w) == XFS_DATA_FORK ? (dcp)->di_nextents : (dcp)->di_anextents) 232 ((w) == XFS_DATA_FORK ? (dcp)->di_nextents : (dcp)->di_anextents)
233#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w)
234#define XFS_DFORK_NEXTENTS_HOST(dip,w) XFS_CFORK_NEXTENTS(&(dip)->di_core, w)
226 235
227#define XFS_CFORK_NEXT_SET(dcp,w,n) \ 236#define XFS_CFORK_NEXT_SET(dcp,w,n) \
228 ((w) == XFS_DATA_FORK ? \ 237 ((w) == XFS_DATA_FORK ? \
229 ((dcp)->di_nextents = (n)) : ((dcp)->di_anextents = (n))) 238 ((dcp)->di_nextents = (n)) : ((dcp)->di_anextents = (n)))
230 239
231#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w)
232
233#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) 240#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp))
234 241
235/* 242/*
@@ -246,8 +253,10 @@ typedef enum xfs_dinode_fmt
246#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ 253#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */
247#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ 254#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */
248#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ 255#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */
249#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ 256#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */
250#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ 257#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
258#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
259#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
251#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) 260#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
252#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) 261#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
253#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) 262#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
@@ -259,11 +268,14 @@ typedef enum xfs_dinode_fmt
259#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) 268#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT)
260#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) 269#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT)
261#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) 270#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
271#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
272#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
262 273
263#define XFS_DIFLAG_ANY \ 274#define XFS_DIFLAG_ANY \
264 (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ 275 (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
265 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ 276 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
266 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ 277 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
267 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS) 278 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
279 XFS_DIFLAG_EXTSZINHERIT)
268 280
269#endif /* __XFS_DINODE_H__ */ 281#endif /* __XFS_DINODE_H__ */
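
The two new flag bits work as a pair: EXTSIZE marks di_extsize as an allocation hint on a file, while EXTSZINHERIT on a directory asks that new children receive the hint. A hedged sketch of the inheritance step; the bit values follow the hunk, but the copying logic is a plausible reading rather than code quoted from this patch:

#include <stdint.h>
#include <stdio.h>

#define DIFLAG_EXTSIZE		(1 << 11)	/* bits per the hunk above */
#define DIFLAG_EXTSZINHERIT	(1 << 12)

int main(void)
{
	uint16_t dir_flags = DIFLAG_EXTSZINHERIT;
	uint32_t dir_extsize = 16;

	uint16_t child_flags = 0;
	uint32_t child_extsize = 0;

	if (dir_flags & DIFLAG_EXTSZINHERIT) {
		child_extsize = dir_extsize;	/* copy the hint */
		child_flags |= DIFLAG_EXTSIZE;	/* and make it effective */
	}
	printf("child: extsize=%u flags=%#x\n", child_extsize, child_flags);
	return 0;
}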
diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c
index 3dd30391f55..bb87d2a700a 100644
--- a/fs/xfs/xfs_dir.c
+++ b/fs/xfs/xfs_dir.c
@@ -176,7 +176,7 @@ xfs_dir_mount(xfs_mount_t *mp)
176 uint shortcount, leafcount, count; 176 uint shortcount, leafcount, count;
177 177
178 mp->m_dirversion = 1; 178 mp->m_dirversion = 1;
179 if (mp->m_flags & XFS_MOUNT_COMPAT_ATTR) { 179 if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
180 shortcount = (mp->m_attroffset - 180 shortcount = (mp->m_attroffset -
181 (uint)sizeof(xfs_dir_sf_hdr_t)) / 181 (uint)sizeof(xfs_dir_sf_hdr_t)) /
182 (uint)sizeof(xfs_dir_sf_entry_t); 182 (uint)sizeof(xfs_dir_sf_entry_t);
diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h
index 488defe86ba..8cc8afb9f6c 100644
--- a/fs/xfs/xfs_dir.h
+++ b/fs/xfs/xfs_dir.h
@@ -135,6 +135,8 @@ void xfs_dir_startup(void); /* called exactly once */
135 ((mp)->m_dirops.xd_shortform_to_single(args)) 135 ((mp)->m_dirops.xd_shortform_to_single(args))
136 136
137#define XFS_DIR_IS_V1(mp) ((mp)->m_dirversion == 1) 137#define XFS_DIR_IS_V1(mp) ((mp)->m_dirversion == 1)
138#define XFS_DIR_IS_V2(mp) ((mp)->m_dirversion == 2)
138extern xfs_dirops_t xfsv1_dirops; 139extern xfs_dirops_t xfsv1_dirops;
140extern xfs_dirops_t xfsv2_dirops;
139 141
140#endif /* __XFS_DIR_H__ */ 142#endif /* __XFS_DIR_H__ */
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 7e24ffeda9e..3158f5dc431 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -72,9 +72,6 @@ typedef struct xfs_dir2_put_args {
72 struct uio *uio; /* uio control structure */ 72 struct uio *uio; /* uio control structure */
73} xfs_dir2_put_args_t; 73} xfs_dir2_put_args_t;
74 74
75#define XFS_DIR_IS_V2(mp) ((mp)->m_dirversion == 2)
76extern xfs_dirops_t xfsv2_dirops;
77
78/* 75/*
79 * Other interfaces used by the rest of the dir v2 code. 76 * Other interfaces used by the rest of the dir v2 code.
80 */ 77 */
diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c
index 950df31efc4..e83074016ab 100644
--- a/fs/xfs/xfs_dir_leaf.c
+++ b/fs/xfs/xfs_dir_leaf.c
@@ -147,7 +147,7 @@ xfs_dir_shortform_create(xfs_da_args_t *args, xfs_ino_t parent)
147 hdr->count = 0; 147 hdr->count = 0;
148 dp->i_d.di_size = sizeof(*hdr); 148 dp->i_d.di_size = sizeof(*hdr);
149 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); 149 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
150 return(0); 150 return 0;
151} 151}
152 152
153/* 153/*
@@ -180,7 +180,7 @@ xfs_dir_shortform_addname(xfs_da_args_t *args)
180 if (sfe->namelen == args->namelen && 180 if (sfe->namelen == args->namelen &&
181 args->name[0] == sfe->name[0] && 181 args->name[0] == sfe->name[0] &&
182 memcmp(args->name, sfe->name, args->namelen) == 0) 182 memcmp(args->name, sfe->name, args->namelen) == 0)
183 return(XFS_ERROR(EEXIST)); 183 return XFS_ERROR(EEXIST);
184 sfe = XFS_DIR_SF_NEXTENTRY(sfe); 184 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
185 } 185 }
186 186
@@ -198,7 +198,7 @@ xfs_dir_shortform_addname(xfs_da_args_t *args)
198 dp->i_d.di_size += size; 198 dp->i_d.di_size += size;
199 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); 199 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
200 200
201 return(0); 201 return 0;
202} 202}
203 203
204/* 204/*
@@ -238,7 +238,7 @@ xfs_dir_shortform_removename(xfs_da_args_t *args)
238 } 238 }
239 if (i < 0) { 239 if (i < 0) {
240 ASSERT(args->oknoent); 240 ASSERT(args->oknoent);
241 return(XFS_ERROR(ENOENT)); 241 return XFS_ERROR(ENOENT);
242 } 242 }
243 243
244 if ((base + size) != dp->i_d.di_size) { 244 if ((base + size) != dp->i_d.di_size) {
@@ -251,7 +251,7 @@ xfs_dir_shortform_removename(xfs_da_args_t *args)
251 dp->i_d.di_size -= size; 251 dp->i_d.di_size -= size;
252 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); 252 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
253 253
254 return(0); 254 return 0;
255} 255}
256 256
257/* 257/*
@@ -390,7 +390,7 @@ xfs_dir_shortform_to_leaf(xfs_da_args_t *iargs)
390 390
391out: 391out:
392 kmem_free(tmpbuffer, size); 392 kmem_free(tmpbuffer, size);
393 return(retval); 393 return retval;
394} 394}
395 395
396STATIC int 396STATIC int
@@ -596,7 +596,7 @@ xfs_dir_shortform_replace(xfs_da_args_t *args)
596 /* XXX - replace assert? */ 596 /* XXX - replace assert? */
597 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sf->hdr.parent); 597 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sf->hdr.parent);
598 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); 598 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
599 return(0); 599 return 0;
600 } 600 }
601 ASSERT(args->namelen != 1 || args->name[0] != '.'); 601 ASSERT(args->namelen != 1 || args->name[0] != '.');
602 sfe = &sf->list[0]; 602 sfe = &sf->list[0];
@@ -608,12 +608,12 @@ xfs_dir_shortform_replace(xfs_da_args_t *args)
608 (char *)&sfe->inumber, sizeof(xfs_ino_t))); 608 (char *)&sfe->inumber, sizeof(xfs_ino_t)));
609 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber); 609 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
610 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); 610 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
611 return(0); 611 return 0;
612 } 612 }
613 sfe = XFS_DIR_SF_NEXTENTRY(sfe); 613 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
614 } 614 }
615 ASSERT(args->oknoent); 615 ASSERT(args->oknoent);
616 return(XFS_ERROR(ENOENT)); 616 return XFS_ERROR(ENOENT);
617} 617}
618 618
619/* 619/*
@@ -695,7 +695,7 @@ xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
695 695
696out: 696out:
697 kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount)); 697 kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
698 return(retval); 698 return retval;
699} 699}
700 700
701/* 701/*
@@ -715,17 +715,17 @@ xfs_dir_leaf_to_node(xfs_da_args_t *args)
715 retval = xfs_da_grow_inode(args, &blkno); 715 retval = xfs_da_grow_inode(args, &blkno);
716 ASSERT(blkno == 1); 716 ASSERT(blkno == 1);
717 if (retval) 717 if (retval)
718 return(retval); 718 return retval;
719 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, 719 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
720 XFS_DATA_FORK); 720 XFS_DATA_FORK);
721 if (retval) 721 if (retval)
722 return(retval); 722 return retval;
723 ASSERT(bp1 != NULL); 723 ASSERT(bp1 != NULL);
724 retval = xfs_da_get_buf(args->trans, args->dp, 1, -1, &bp2, 724 retval = xfs_da_get_buf(args->trans, args->dp, 1, -1, &bp2,
725 XFS_DATA_FORK); 725 XFS_DATA_FORK);
726 if (retval) { 726 if (retval) {
727 xfs_da_buf_done(bp1); 727 xfs_da_buf_done(bp1);
728 return(retval); 728 return retval;
729 } 729 }
730 ASSERT(bp2 != NULL); 730 ASSERT(bp2 != NULL);
731 memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount)); 731 memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
@@ -738,7 +738,7 @@ xfs_dir_leaf_to_node(xfs_da_args_t *args)
738 retval = xfs_da_node_create(args, 0, 1, &bp1, XFS_DATA_FORK); 738 retval = xfs_da_node_create(args, 0, 1, &bp1, XFS_DATA_FORK);
739 if (retval) { 739 if (retval) {
740 xfs_da_buf_done(bp2); 740 xfs_da_buf_done(bp2);
741 return(retval); 741 return retval;
742 } 742 }
743 node = bp1->data; 743 node = bp1->data;
744 leaf = bp2->data; 744 leaf = bp2->data;
@@ -751,7 +751,7 @@ xfs_dir_leaf_to_node(xfs_da_args_t *args)
751 XFS_DA_LOGRANGE(node, &node->btree[0], sizeof(node->btree[0]))); 751 XFS_DA_LOGRANGE(node, &node->btree[0], sizeof(node->btree[0])));
752 xfs_da_buf_done(bp1); 752 xfs_da_buf_done(bp1);
753 753
754 return(retval); 754 return retval;
755} 755}
756 756
757 757
@@ -776,7 +776,7 @@ xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
776 ASSERT(dp != NULL); 776 ASSERT(dp != NULL);
777 retval = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp, XFS_DATA_FORK); 777 retval = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp, XFS_DATA_FORK);
778 if (retval) 778 if (retval)
779 return(retval); 779 return retval;
780 ASSERT(bp != NULL); 780 ASSERT(bp != NULL);
781 leaf = bp->data; 781 leaf = bp->data;
782 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); 782 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
@@ -791,7 +791,7 @@ xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
791 xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); 791 xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
792 792
793 *bpp = bp; 793 *bpp = bp;
794 return(0); 794 return 0;
795} 795}
796 796
797/* 797/*
@@ -813,10 +813,10 @@ xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
813 ASSERT(oldblk->magic == XFS_DIR_LEAF_MAGIC); 813 ASSERT(oldblk->magic == XFS_DIR_LEAF_MAGIC);
814 error = xfs_da_grow_inode(args, &blkno); 814 error = xfs_da_grow_inode(args, &blkno);
815 if (error) 815 if (error)
816 return(error); 816 return error;
817 error = xfs_dir_leaf_create(args, blkno, &newblk->bp); 817 error = xfs_dir_leaf_create(args, blkno, &newblk->bp);
818 if (error) 818 if (error)
819 return(error); 819 return error;
820 newblk->blkno = blkno; 820 newblk->blkno = blkno;
821 newblk->magic = XFS_DIR_LEAF_MAGIC; 821 newblk->magic = XFS_DIR_LEAF_MAGIC;
822 822
@@ -826,7 +826,7 @@ xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
826 xfs_dir_leaf_rebalance(state, oldblk, newblk); 826 xfs_dir_leaf_rebalance(state, oldblk, newblk);
827 error = xfs_da_blk_link(state, oldblk, newblk); 827 error = xfs_da_blk_link(state, oldblk, newblk);
828 if (error) 828 if (error)
829 return(error); 829 return error;
830 830
831 /* 831 /*
832 * Insert the new entry in the correct block. 832 * Insert the new entry in the correct block.
@@ -842,7 +842,7 @@ xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
842 */ 842 */
843 oldblk->hashval = xfs_dir_leaf_lasthash(oldblk->bp, NULL); 843 oldblk->hashval = xfs_dir_leaf_lasthash(oldblk->bp, NULL);
844 newblk->hashval = xfs_dir_leaf_lasthash(newblk->bp, NULL); 844 newblk->hashval = xfs_dir_leaf_lasthash(newblk->bp, NULL);
845 return(error); 845 return error;
846} 846}
847 847
848/* 848/*
@@ -885,7 +885,7 @@ xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
885 if (INT_GET(map->size, ARCH_CONVERT) >= tmp) { 885 if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
886 if (!args->justcheck) 886 if (!args->justcheck)
887 xfs_dir_leaf_add_work(bp, args, index, i); 887 xfs_dir_leaf_add_work(bp, args, index, i);
888 return(0); 888 return 0;
889 } 889 }
890 sum += INT_GET(map->size, ARCH_CONVERT); 890 sum += INT_GET(map->size, ARCH_CONVERT);
891 } 891 }
@@ -896,7 +896,7 @@ xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
896 * no good and we should just give up. 896 * no good and we should just give up.
897 */ 897 */
898 if (!hdr->holes && (sum < entsize)) 898 if (!hdr->holes && (sum < entsize))
899 return(XFS_ERROR(ENOSPC)); 899 return XFS_ERROR(ENOSPC);
900 900
901 /* 901 /*
902 * Compact the entries to coalesce free space. 902 * Compact the entries to coalesce free space.
@@ -909,18 +909,18 @@ xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
909 (uint)sizeof(xfs_dir_leaf_entry_t) : 0, 909 (uint)sizeof(xfs_dir_leaf_entry_t) : 0,
910 args->justcheck); 910 args->justcheck);
911 if (error) 911 if (error)
912 return(error); 912 return error;
913 /* 913 /*
914 * After compaction, the block is guaranteed to have only one 914 * After compaction, the block is guaranteed to have only one
915 * free region, in freemap[0]. If it is not big enough, give up. 915 * free region, in freemap[0]. If it is not big enough, give up.
916 */ 916 */
917 if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) < 917 if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) <
918 (entsize + (uint)sizeof(xfs_dir_leaf_entry_t))) 918 (entsize + (uint)sizeof(xfs_dir_leaf_entry_t)))
919 return(XFS_ERROR(ENOSPC)); 919 return XFS_ERROR(ENOSPC);
920 920
921 if (!args->justcheck) 921 if (!args->justcheck)
922 xfs_dir_leaf_add_work(bp, args, index, 0); 922 xfs_dir_leaf_add_work(bp, args, index, 0);
923 return(0); 923 return 0;
924} 924}
925 925
926/* 926/*
@@ -1072,7 +1072,7 @@ xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp, int musthave,
1072 kmem_free(tmpbuffer, lbsize); 1072 kmem_free(tmpbuffer, lbsize);
1073 if (musthave || justcheck) 1073 if (musthave || justcheck)
1074 kmem_free(tmpbuffer2, lbsize); 1074 kmem_free(tmpbuffer2, lbsize);
1075 return(rval); 1075 return rval;
1076} 1076}
1077 1077
1078/* 1078/*
@@ -1292,7 +1292,7 @@ xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
1292 1292
1293 *countarg = count; 1293 *countarg = count;
1294 *namebytesarg = totallen; 1294 *namebytesarg = totallen;
1295 return(foundit); 1295 return foundit;
1296} 1296}
1297 1297
1298/*======================================================================== 1298/*========================================================================
@@ -1334,7 +1334,7 @@ xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
1334 INT_GET(leaf->hdr.namebytes, ARCH_CONVERT); 1334 INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
1335 if (bytes > (state->blocksize >> 1)) { 1335 if (bytes > (state->blocksize >> 1)) {
1336 *action = 0; /* blk over 50%, don't try to join */ 1336 *action = 0; /* blk over 50%, don't try to join */
1337 return(0); 1337 return 0;
1338 } 1338 }
1339 1339
1340 /* 1340 /*
@@ -1353,13 +1353,13 @@ xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
1353 error = xfs_da_path_shift(state, &state->altpath, forward, 1353 error = xfs_da_path_shift(state, &state->altpath, forward,
1354 0, &retval); 1354 0, &retval);
1355 if (error) 1355 if (error)
1356 return(error); 1356 return error;
1357 if (retval) { 1357 if (retval) {
1358 *action = 0; 1358 *action = 0;
1359 } else { 1359 } else {
1360 *action = 2; 1360 *action = 2;
1361 } 1361 }
1362 return(0); 1362 return 0;
1363 } 1363 }
1364 1364
1365 /* 1365 /*
@@ -1381,7 +1381,7 @@ xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
1381 blkno, -1, &bp, 1381 blkno, -1, &bp,
1382 XFS_DATA_FORK); 1382 XFS_DATA_FORK);
1383 if (error) 1383 if (error)
1384 return(error); 1384 return error;
1385 ASSERT(bp != NULL); 1385 ASSERT(bp != NULL);
1386 1386
1387 leaf = (xfs_dir_leafblock_t *)info; 1387 leaf = (xfs_dir_leafblock_t *)info;
@@ -1402,7 +1402,7 @@ xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
1402 } 1402 }
1403 if (i >= 2) { 1403 if (i >= 2) {
1404 *action = 0; 1404 *action = 0;
1405 return(0); 1405 return 0;
1406 } 1406 }
1407 xfs_da_buf_done(bp); 1407 xfs_da_buf_done(bp);
1408 1408
@@ -1419,13 +1419,13 @@ xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
1419 0, &retval); 1419 0, &retval);
1420 } 1420 }
1421 if (error) 1421 if (error)
1422 return(error); 1422 return error;
1423 if (retval) { 1423 if (retval) {
1424 *action = 0; 1424 *action = 0;
1425 } else { 1425 } else {
1426 *action = 1; 1426 *action = 1;
1427 } 1427 }
1428 return(0); 1428 return 0;
1429} 1429}
1430 1430
1431/* 1431/*
@@ -1575,8 +1575,8 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
1575 tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1); 1575 tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
1576 tmp += INT_GET(leaf->hdr.namebytes, ARCH_CONVERT); 1576 tmp += INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
1577 if (tmp < mp->m_dir_magicpct) 1577 if (tmp < mp->m_dir_magicpct)
1578 return(1); /* leaf is < 37% full */ 1578 return 1; /* leaf is < 37% full */
1579 return(0); 1579 return 0;
1580} 1580}
1581 1581
1582/* 1582/*
@@ -1732,7 +1732,7 @@ xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
1732 if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) { 1732 if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
1733 *index = probe; 1733 *index = probe;
1734 ASSERT(args->oknoent); 1734 ASSERT(args->oknoent);
1735 return(XFS_ERROR(ENOENT)); 1735 return XFS_ERROR(ENOENT);
1736 } 1736 }
1737 1737
1738 /* 1738 /*
@@ -1745,14 +1745,14 @@ xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
1745 memcmp(args->name, namest->name, args->namelen) == 0) { 1745 memcmp(args->name, namest->name, args->namelen) == 0) {
1746 XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args->inumber); 1746 XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args->inumber);
1747 *index = probe; 1747 *index = probe;
1748 return(XFS_ERROR(EEXIST)); 1748 return XFS_ERROR(EEXIST);
1749 } 1749 }
1750 entry++; 1750 entry++;
1751 probe++; 1751 probe++;
1752 } 1752 }
1753 *index = probe; 1753 *index = probe;
1754 ASSERT(probe == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent); 1754 ASSERT(probe == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
1755 return(XFS_ERROR(ENOENT)); 1755 return XFS_ERROR(ENOENT);
1756} 1756}
1757 1757
1758/*======================================================================== 1758/*========================================================================
@@ -1890,9 +1890,9 @@ xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
1890 INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) || 1890 INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) ||
1891 (INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) < 1891 (INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
1892 INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) { 1892 INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
1893 return(1); 1893 return 1;
1894 } 1894 }
1895 return(0); 1895 return 0;
1896} 1896}
1897 1897
1898/* 1898/*
@@ -1942,7 +1942,7 @@ xfs_dir_leaf_getdents_int(
1942 leaf = bp->data; 1942 leaf = bp->data;
1943 if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) { 1943 if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
1944 *eobp = 1; 1944 *eobp = 1;
1945 return(XFS_ERROR(ENOENT)); /* XXX wrong code */ 1945 return XFS_ERROR(ENOENT); /* XXX wrong code */
1946 } 1946 }
1947 1947
1948 want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset); 1948 want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
@@ -2000,7 +2000,7 @@ xfs_dir_leaf_getdents_int(
2000 * the node code will be setting uio_offset anyway. 2000 * the node code will be setting uio_offset anyway.
2001 */ 2001 */
2002 *eobp = 0; 2002 *eobp = 0;
2003 return(0); 2003 return 0;
2004 } 2004 }
2005 xfs_dir_trace_g_due("leaf: hash found", dp, uio, entry); 2005 xfs_dir_trace_g_due("leaf: hash found", dp, uio, entry);
2006 2006
@@ -2057,7 +2057,7 @@ xfs_dir_leaf_getdents_int(
2057 retval = xfs_da_read_buf(dp->i_transp, dp, thishash, 2057 retval = xfs_da_read_buf(dp->i_transp, dp, thishash,
2058 nextda, &bp2, XFS_DATA_FORK); 2058 nextda, &bp2, XFS_DATA_FORK);
2059 if (retval) 2059 if (retval)
2060 return(retval); 2060 return retval;
2061 2061
2062 ASSERT(bp2 != NULL); 2062 ASSERT(bp2 != NULL);
2063 2063
@@ -2073,7 +2073,7 @@ xfs_dir_leaf_getdents_int(
2073 leaf2); 2073 leaf2);
2074 xfs_da_brelse(dp->i_transp, bp2); 2074 xfs_da_brelse(dp->i_transp, bp2);
2075 2075
2076 return(XFS_ERROR(EFSCORRUPTED)); 2076 return XFS_ERROR(EFSCORRUPTED);
2077 } 2077 }
2078 2078
2079 nexthash = INT_GET(leaf2->entries[0].hashval, 2079 nexthash = INT_GET(leaf2->entries[0].hashval,
@@ -2139,7 +2139,7 @@ xfs_dir_leaf_getdents_int(
2139 2139
2140 xfs_dir_trace_g_du("leaf: E-O-B", dp, uio); 2140 xfs_dir_trace_g_du("leaf: E-O-B", dp, uio);
2141 2141
2142 return(retval); 2142 return retval;
2143 } 2143 }
2144 } 2144 }
2145 2145
@@ -2149,7 +2149,7 @@ xfs_dir_leaf_getdents_int(
2149 2149
2150 xfs_dir_trace_g_du("leaf: E-O-F", dp, uio); 2150 xfs_dir_trace_g_du("leaf: E-O-F", dp, uio);
2151 2151
2152 return(0); 2152 return 0;
2153} 2153}
2154 2154
2155/* 2155/*
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
index ab6b09eef9a..eb8cd9a4667 100644
--- a/fs/xfs/xfs_dir_leaf.h
+++ b/fs/xfs/xfs_dir_leaf.h
@@ -67,34 +67,38 @@ struct xfs_trans;
67 */ 67 */
68#define XFS_DIR_LEAF_MAPSIZE 3 /* how many freespace slots */ 68#define XFS_DIR_LEAF_MAPSIZE 3 /* how many freespace slots */
69 69
70typedef struct xfs_dir_leaf_map { /* RLE map of free bytes */
71 __uint16_t base; /* base of free region */
72 __uint16_t size; /* run length of free region */
73} xfs_dir_leaf_map_t;
74
75typedef struct xfs_dir_leaf_hdr { /* constant-structure header block */
76 xfs_da_blkinfo_t info; /* block type, links, etc. */
77 __uint16_t count; /* count of active leaf_entry's */
78 __uint16_t namebytes; /* num bytes of name strings stored */
79 __uint16_t firstused; /* first used byte in name area */
80 __uint8_t holes; /* != 0 if blk needs compaction */
81 __uint8_t pad1;
82 xfs_dir_leaf_map_t freemap[XFS_DIR_LEAF_MAPSIZE];
83} xfs_dir_leaf_hdr_t;
84
85typedef struct xfs_dir_leaf_entry { /* sorted on key, not name */
86 xfs_dahash_t hashval; /* hash value of name */
87 __uint16_t nameidx; /* index into buffer of name */
88 __uint8_t namelen; /* length of name string */
89 __uint8_t pad2;
90} xfs_dir_leaf_entry_t;
91
92typedef struct xfs_dir_leaf_name {
93 xfs_dir_ino_t inumber; /* inode number for this key */
94 __uint8_t name[1]; /* name string itself */
95} xfs_dir_leaf_name_t;
96
70typedef struct xfs_dir_leafblock { 97typedef struct xfs_dir_leafblock {
71 struct xfs_dir_leaf_hdr { /* constant-structure header block */ 98 xfs_dir_leaf_hdr_t hdr; /* constant-structure header block */
72 xfs_da_blkinfo_t info; /* block type, links, etc. */ 99 xfs_dir_leaf_entry_t entries[1]; /* var sized array */
73 __uint16_t count; /* count of active leaf_entry's */ 100 xfs_dir_leaf_name_t namelist[1]; /* grows from bottom of buf */
74 __uint16_t namebytes; /* num bytes of name strings stored */
75 __uint16_t firstused; /* first used byte in name area */
76 __uint8_t holes; /* != 0 if blk needs compaction */
77 __uint8_t pad1;
78 struct xfs_dir_leaf_map {/* RLE map of free bytes */
79 __uint16_t base; /* base of free region */
80 __uint16_t size; /* run length of free region */
81 } freemap[XFS_DIR_LEAF_MAPSIZE]; /* N largest free regions */
82 } hdr;
83 struct xfs_dir_leaf_entry { /* sorted on key, not name */
84 xfs_dahash_t hashval; /* hash value of name */
85 __uint16_t nameidx; /* index into buffer of name */
86 __uint8_t namelen; /* length of name string */
87 __uint8_t pad2;
88 } entries[1]; /* var sized array */
89 struct xfs_dir_leaf_name {
90 xfs_dir_ino_t inumber; /* inode number for this key */
91 __uint8_t name[1]; /* name string itself */
92 } namelist[1]; /* grows from bottom of buf */
93} xfs_dir_leafblock_t; 101} xfs_dir_leafblock_t;
94typedef struct xfs_dir_leaf_hdr xfs_dir_leaf_hdr_t;
95typedef struct xfs_dir_leaf_map xfs_dir_leaf_map_t;
96typedef struct xfs_dir_leaf_entry xfs_dir_leaf_entry_t;
97typedef struct xfs_dir_leaf_name xfs_dir_leaf_name_t;
98 102
99/* 103/*
100 * Length of name for which a 512-byte block filesystem 104 * Length of name for which a 512-byte block filesystem
@@ -126,11 +130,10 @@ typedef union {
126#define XFS_PUT_COOKIE(c,mp,bno,entry,hash) \ 130#define XFS_PUT_COOKIE(c,mp,bno,entry,hash) \
127 ((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash)) 131 ((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash))
128 132
129typedef struct xfs_dir_put_args 133typedef struct xfs_dir_put_args {
130{
131 xfs_dircook_t cook; /* cookie of (next) entry */ 134 xfs_dircook_t cook; /* cookie of (next) entry */
132 xfs_intino_t ino; /* inode number */ 135 xfs_intino_t ino; /* inode number */
133 struct xfs_dirent *dbp; /* buffer pointer */ 136 struct xfs_dirent *dbp; /* buffer pointer */
134 char *name; /* directory entry name */ 137 char *name; /* directory entry name */
135 int namelen; /* length of name */ 138 int namelen; /* length of name */
136 int done; /* output: set if value was stored */ 139 int done; /* output: set if value was stored */
@@ -138,7 +141,8 @@ typedef struct xfs_dir_put_args
138 struct uio *uio; /* uio control structure */ 141 struct uio *uio; /* uio control structure */
139} xfs_dir_put_args_t; 142} xfs_dir_put_args_t;
140 143
141#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) xfs_dir_leaf_entsize_byname(len) 144#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) \
145 xfs_dir_leaf_entsize_byname(len)
142static inline int xfs_dir_leaf_entsize_byname(int len) 146static inline int xfs_dir_leaf_entsize_byname(int len)
143{ 147{
144 return (uint)sizeof(xfs_dir_leaf_name_t)-1 + len; 148 return (uint)sizeof(xfs_dir_leaf_name_t)-1 + len;
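The xfs_dir_leaf.h hunk above un-nests the header, free-map, entry and name structures from xfs_dir_leafblock so each typedef names a struct visible at file scope; the on-disk layout is unchanged. The entry-size inline at the end relies on the C89-era name[1] flexible-array idiom. A self-contained sketch with a stand-in type (on-disk XFS fields are byte arrays, so there is no compiler padding here) shows why the -1 is there:

    #include <stdio.h>

    typedef struct dir_leaf_name_demo {
        unsigned char inumber[8];  /* inode number for this key */
        unsigned char name[1];     /* name string grows past the struct */
    } dir_leaf_name_demo_t;

    /* sizeof() already counts one name byte; subtract it back out
     * before adding the real name length, as the inline above does. */
    static int entsize_byname_demo(int len)
    {
        return (int)sizeof(dir_leaf_name_demo_t) - 1 + len;
    }

    int main(void)
    {
        printf("%d\n", entsize_byname_demo(5));   /* 8 + 5 = 13 */
        return 0;
    }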
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index 864bf695568..b4c7f2bc55a 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -152,7 +152,7 @@ typedef enum {
152 152
153#define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */ 153#define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */
154#define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */ 154#define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */
155#define DM_FLAGS_ISEM 0x004 /* thread holds i_sem */ 155#define DM_FLAGS_IMUX 0x004 /* thread holds i_mutex */
156#define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */ 156#define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */
157#define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */ 157#define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */
158 158
@@ -161,21 +161,21 @@ typedef enum {
161 */ 161 */
162#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) 162#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0)
163#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ 163#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
164 DM_FLAGS_ISEM : 0) 164 DM_FLAGS_IMUX : 0)
165#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM) 165#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
166#endif 166#endif
167 167
168#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) && \ 168#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) && \
169 (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,22)) 169 (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,22))
170#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ 170#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
171 DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_ISEM) 171 DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_IMUX)
172#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM) 172#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
173#endif 173#endif
174 174
175#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,21) 175#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,21)
176#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ 176#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
177 0 : DM_FLAGS_ISEM) 177 0 : DM_FLAGS_IMUX)
178#define DM_SEM_FLAG_WR (DM_FLAGS_ISEM) 178#define DM_SEM_FLAG_WR (DM_FLAGS_IMUX)
179#endif 179#endif
180 180
181 181
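The DM_FLAGS_ISEM to DM_FLAGS_IMUX rename tracks the kernel-wide conversion of inode->i_sem (a semaphore) to inode->i_mutex; the locking rules the DMAPI flags describe do not change. A compile-and-run sketch of how the post-2.6.0 branch of the macros resolves; the IO_ISDIRECT value is made up for the demo:

    #include <stdio.h>

    #define IO_ISDIRECT            0x01    /* assumed value, demo only */
    #define DM_FLAGS_IMUX          0x004   /* thread holds i_mutex */
    #define DM_FLAGS_IALLOCSEM_WR  0x020

    #define DM_SEM_FLAG_RD(ioflags) \
            (((ioflags) & IO_ISDIRECT) ? DM_FLAGS_IMUX : 0)
    #define DM_SEM_FLAG_WR  (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)

    int main(void)
    {
        printf("direct read:   %#x\n", DM_SEM_FLAG_RD(IO_ISDIRECT)); /* 0x4  */
        printf("buffered read: %#x\n", DM_SEM_FLAG_RD(0));           /* 0    */
        printf("write:         %#x\n", DM_SEM_FLAG_WR);              /* 0x24 */
        return 0;
    }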
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index d7b6b5d1670..2a21c502401 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -54,7 +54,6 @@ xfs_error_trap(int e)
54 if (e != xfs_etrap[i]) 54 if (e != xfs_etrap[i])
55 continue; 55 continue;
56 cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); 56 cmn_err(CE_NOTE, "xfs_error_trap: error %d", e);
57 debug_stop_all_cpus((void *)-1LL);
58 BUG(); 57 BUG();
59 break; 58 break;
60 } 59 }
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 06d8a8426c1..26b8e709a56 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -18,9 +18,6 @@
18#ifndef __XFS_ERROR_H__ 18#ifndef __XFS_ERROR_H__
19#define __XFS_ERROR_H__ 19#define __XFS_ERROR_H__
20 20
21#define prdev(fmt,targ,args...) \
22 printk("XFS: device %s - " fmt "\n", XFS_BUFTARG_NAME(targ), ## args)
23
24#define XFS_ERECOVER 1 /* Failure to recover log */ 21#define XFS_ERECOVER 1 /* Failure to recover log */
25#define XFS_ELOGSTAT 2 /* Failure to stat log in user space */ 22#define XFS_ELOGSTAT 2 /* Failure to stat log in user space */
26#define XFS_ENOLOGSPACE 3 /* Reservation too large */ 23#define XFS_ENOLOGSPACE 3 /* Reservation too large */
@@ -182,8 +179,11 @@ extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud);
182struct xfs_mount; 179struct xfs_mount;
183/* PRINTFLIKE4 */ 180/* PRINTFLIKE4 */
184extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, 181extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
185 char *fmt, ...); 182 char *fmt, ...);
186/* PRINTFLIKE3 */ 183/* PRINTFLIKE3 */
187extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); 184extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...);
188 185
186#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
187 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
188
189#endif /* __XFS_ERROR_H__ */ 189#endif /* __XFS_ERROR_H__ */
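The new xfs_fs_repair_cmn_err() wrapper lets the many corruption messages in xfs_inode.c below stop repeating the "Unmount and run xfs_repair." advice by hand: the caller's fmt and the fixed suffix are adjacent string literals, pasted together at compile time. A user-space model, with fprintf standing in for the real xfs_fs_cmn_err():

    #include <stdio.h>

    /* stand-in for xfs_fs_cmn_err(), which logs through cmn_err() */
    #define fs_cmn_err(level, fs, fmt, args...) \
            fprintf(stderr, "Filesystem \"%s\": " fmt "\n", (fs), ## args)

    /* same shape as the macro added above */
    #define fs_repair_cmn_err(level, fs, fmt, args...) \
            fs_cmn_err(level, fs, fmt " Unmount and run xfs_repair.", ## args)

    int main(void)
    {
        fs_repair_cmn_err(0, "sda1", "corrupt inode %llu (btree).", 128ULL);
        return 0;
    }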
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index ba096f80f48..14010f1fa82 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -3,15 +3,15 @@
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU Lesser General Public License
7 * published by the Free Software Foundation. 7 * as published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU Lesser General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU Lesser General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
@@ -65,6 +65,8 @@ struct fsxattr {
65#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ 65#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */
66#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ 66#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */
67#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ 67#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */
68#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */
69#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */
68#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ 70#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
69 71
70/* 72/*
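XFS_XFLAG_EXTSIZE and XFS_XFLAG_EXTSZINHERIT expose the new extent-size-hint inode flags through struct fsxattr, so user space can tell whether a file carries its own allocation hint or a directory passes one down to new children. A sketch of testing a decoded flags word; the values are copied from the hunk above:

    #include <stdio.h>
    #include <stdint.h>

    #define XFS_XFLAG_EXTSIZE      0x00000800  /* extent size allocator hint */
    #define XFS_XFLAG_EXTSZINHERIT 0x00001000  /* inherit inode extent size */

    static const char *extsize_kind(uint32_t fsx_xflags)
    {
        if (fsx_xflags & XFS_XFLAG_EXTSIZE)
            return "file has its own extent size hint";
        if (fsx_xflags & XFS_XFLAG_EXTSZINHERIT)
            return "directory hands a hint to new children";
        return "no extent size hint";
    }

    int main(void)
    {
        printf("%s\n", extsize_kind(XFS_XFLAG_EXTSIZE));
        return 0;
    }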
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 7ceabd0e2d9..b4d971b0158 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -501,7 +501,7 @@ xfs_reserve_blocks(
501 if (inval == (__uint64_t *)NULL) { 501 if (inval == (__uint64_t *)NULL) {
502 outval->resblks = mp->m_resblks; 502 outval->resblks = mp->m_resblks;
503 outval->resblks_avail = mp->m_resblks_avail; 503 outval->resblks_avail = mp->m_resblks_avail;
504 return(0); 504 return 0;
505 } 505 }
506 506
507 request = *inval; 507 request = *inval;
@@ -537,7 +537,33 @@ xfs_reserve_blocks(
537 outval->resblks = mp->m_resblks; 537 outval->resblks = mp->m_resblks;
538 outval->resblks_avail = mp->m_resblks_avail; 538 outval->resblks_avail = mp->m_resblks_avail;
539 XFS_SB_UNLOCK(mp, s); 539 XFS_SB_UNLOCK(mp, s);
540 return(0); 540 return 0;
541}
542
543void
544xfs_fs_log_dummy(xfs_mount_t *mp)
545{
546 xfs_trans_t *tp;
547 xfs_inode_t *ip;
548
549
550 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
551 atomic_inc(&mp->m_active_trans);
552 if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) {
553 xfs_trans_cancel(tp, 0);
554 return;
555 }
556
557 ip = mp->m_rootip;
558 xfs_ilock(ip, XFS_ILOCK_EXCL);
559
560 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
561 xfs_trans_ihold(tp, ip);
562 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
563 xfs_trans_set_sync(tp);
564 xfs_trans_commit(tp, 0, NULL);
565
566 xfs_iunlock(ip, XFS_ILOCK_EXCL);
541} 567}
542 568
543int 569int
@@ -550,7 +576,7 @@ xfs_fs_goingdown(
550 struct vfs *vfsp = XFS_MTOVFS(mp); 576 struct vfs *vfsp = XFS_MTOVFS(mp);
551 struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev); 577 struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev);
552 578
553 if (sb) { 579 if (sb && !IS_ERR(sb)) {
554 xfs_force_shutdown(mp, XFS_FORCE_UMOUNT); 580 xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
555 thaw_bdev(sb->s_bdev, sb); 581 thaw_bdev(sb->s_bdev, sb);
556 } 582 }
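Two changes land in xfs_fsops.c: xfs_fs_log_dummy() is added (a synchronous transaction that logs the root inode core without changing it, useful for nudging the log forward), and xfs_fs_goingdown() stops trusting freeze_bdev() blindly. freeze_bdev() can hand back NULL or an ERR_PTR()-encoded errno, so the old `if (sb)` test let encoded errors through. A user-space model of the error-pointer convention behind the new `sb && !IS_ERR(sb)` guard:

    #include <stdio.h>

    #define MAX_ERRNO 4095

    /* the test the kernel's IS_ERR() performs: errno values are
     * encoded into the last 4095 addresses of the pointer space */
    static int is_err(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    int main(void)
    {
        int  obj;
        void *ok   = &obj;
        void *fail = (void *)(unsigned long)-5;  /* ERR_PTR(-EIO) style */
        void *none = NULL;

        printf("ok:   %d\n", ok   && !is_err(ok));    /* 1: safe to use */
        printf("fail: %d\n", fail && !is_err(fail));  /* 0: rejected */
        printf("none: %d\n", none && !is_err(none));  /* 0: rejected */
        return 0;
    }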
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index f32713f14f9..300d0c9d61a 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,5 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern void xfs_fs_log_dummy(xfs_mount_t *mp);
28 29
29#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index fc19eedbd11..8e380a1fb79 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -493,7 +493,6 @@ xfs_iget(
493 493
494retry: 494retry:
495 if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) { 495 if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) {
496 bhv_desc_t *bdp;
497 xfs_inode_t *ip; 496 xfs_inode_t *ip;
498 497
499 vp = LINVFS_GET_VP(inode); 498 vp = LINVFS_GET_VP(inode);
@@ -517,14 +516,12 @@ retry:
517 * to wait for the inode to go away. 516 * to wait for the inode to go away.
518 */ 517 */
519 if (is_bad_inode(inode) || 518 if (is_bad_inode(inode) ||
520 ((bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), 519 ((ip = xfs_vtoi(vp)) == NULL)) {
521 &xfs_vnodeops)) == NULL)) {
522 iput(inode); 520 iput(inode);
523 delay(1); 521 delay(1);
524 goto retry; 522 goto retry;
525 } 523 }
526 524
527 ip = XFS_BHVTOI(bdp);
528 if (lock_flags != 0) 525 if (lock_flags != 0)
529 xfs_ilock(ip, lock_flags); 526 xfs_ilock(ip, lock_flags);
530 XFS_STATS_INC(xs_ig_found); 527 XFS_STATS_INC(xs_ig_found);
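The xfs_iget() retry loop now calls xfs_vtoi() instead of open-coding the vn_bhv_lookup() plus XFS_BHVTOI() sequence; the helper's body is not in this hunk, only its prototype (see the xfs_inode.h hunk further down). The general pattern, mapping a generic VFS object back to the filesystem-private structure that owns it, is the pointer arithmetic container_of() performs. A user-space analogue; the embedded layout here is purely illustrative, not how the 2.6 behavior-chain code actually stores it:

    #include <stddef.h>
    #include <stdio.h>

    struct vnode_demo { int v_flag; };

    struct xfs_inode_demo {
        unsigned long long i_ino;
        struct vnode_demo  i_vnode;     /* hypothetical embedding */
    };

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    static struct xfs_inode_demo *vtoi_demo(struct vnode_demo *vp)
    {
        return container_of(vp, struct xfs_inode_demo, i_vnode);
    }

    int main(void)
    {
        struct xfs_inode_demo xi = { .i_ino = 128 };
        printf("%llu\n", vtoi_demo(&xi.i_vnode)->i_ino);   /* 128 */
        return 0;
    }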
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index df0d4572d70..1d7f5a7e063 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -404,9 +404,8 @@ xfs_iformat(
404 INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) + 404 INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) +
405 INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) > 405 INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) >
406 INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) { 406 INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) {
407 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 407 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
408 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu." 408 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
409 " Unmount and run xfs_repair.",
410 (unsigned long long)ip->i_ino, 409 (unsigned long long)ip->i_ino,
411 (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) 410 (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT)
412 + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)), 411 + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)),
@@ -418,9 +417,8 @@ xfs_iformat(
418 } 417 }
419 418
420 if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) { 419 if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) {
421 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 420 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
422 "corrupt dinode %Lu, forkoff = 0x%x." 421 "corrupt dinode %Lu, forkoff = 0x%x.",
423 " Unmount and run xfs_repair.",
424 (unsigned long long)ip->i_ino, 422 (unsigned long long)ip->i_ino,
425 (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT))); 423 (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT)));
426 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 424 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
@@ -451,8 +449,9 @@ xfs_iformat(
451 * no local regular files yet 449 * no local regular files yet
452 */ 450 */
453 if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) { 451 if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) {
454 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 452 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
455 "corrupt inode (local format for regular file) %Lu. Unmount and run xfs_repair.", 453 "corrupt inode %Lu "
454 "(local format for regular file).",
456 (unsigned long long) ip->i_ino); 455 (unsigned long long) ip->i_ino);
457 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 456 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
458 XFS_ERRLEVEL_LOW, 457 XFS_ERRLEVEL_LOW,
@@ -462,8 +461,9 @@ xfs_iformat(
462 461
463 di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT); 462 di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT);
464 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 463 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
465 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 464 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
466 "corrupt inode %Lu (bad size %Ld for local inode). Unmount and run xfs_repair.", 465 "corrupt inode %Lu "
466 "(bad size %Ld for local inode).",
467 (unsigned long long) ip->i_ino, 467 (unsigned long long) ip->i_ino,
468 (long long) di_size); 468 (long long) di_size);
469 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 469 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
@@ -551,8 +551,9 @@ xfs_iformat_local(
551 * kmem_alloc() or memcpy() below. 551 * kmem_alloc() or memcpy() below.
552 */ 552 */
553 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 553 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
554 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 554 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
555 "corrupt inode %Lu (bad size %d for local fork, size = %d). Unmount and run xfs_repair.", 555 "corrupt inode %Lu "
556 "(bad size %d for local fork, size = %d).",
556 (unsigned long long) ip->i_ino, size, 557 (unsigned long long) ip->i_ino, size,
557 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 558 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
558 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 559 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
@@ -610,8 +611,8 @@ xfs_iformat_extents(
610 * kmem_alloc() or memcpy() below. 611 * kmem_alloc() or memcpy() below.
611 */ 612 */
612 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 613 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
613 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 614 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
614 "corrupt inode %Lu ((a)extents = %d). Unmount and run xfs_repair.", 615 "corrupt inode %Lu ((a)extents = %d).",
615 (unsigned long long) ip->i_ino, nex); 616 (unsigned long long) ip->i_ino, nex);
616 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 617 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
617 ip->i_mount, dip); 618 ip->i_mount, dip);
@@ -692,8 +693,8 @@ xfs_iformat_btree(
692 || XFS_BMDR_SPACE_CALC(nrecs) > 693 || XFS_BMDR_SPACE_CALC(nrecs) >
693 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 694 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
694 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 695 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
695 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 696 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
696 "corrupt inode %Lu (btree). Unmount and run xfs_repair.", 697 "corrupt inode %Lu (btree).",
697 (unsigned long long) ip->i_ino); 698 (unsigned long long) ip->i_ino);
698 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 699 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
699 ip->i_mount); 700 ip->i_mount);
@@ -809,6 +810,10 @@ _xfs_dic2xflags(
809 flags |= XFS_XFLAG_PROJINHERIT; 810 flags |= XFS_XFLAG_PROJINHERIT;
810 if (di_flags & XFS_DIFLAG_NOSYMLINKS) 811 if (di_flags & XFS_DIFLAG_NOSYMLINKS)
811 flags |= XFS_XFLAG_NOSYMLINKS; 812 flags |= XFS_XFLAG_NOSYMLINKS;
813 if (di_flags & XFS_DIFLAG_EXTSIZE)
814 flags |= XFS_XFLAG_EXTSIZE;
815 if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
816 flags |= XFS_XFLAG_EXTSZINHERIT;
812 } 817 }
813 818
814 return flags; 819 return flags;
@@ -1192,11 +1197,19 @@ xfs_ialloc(
1192 if ((mode & S_IFMT) == S_IFDIR) { 1197 if ((mode & S_IFMT) == S_IFDIR) {
1193 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) 1198 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1194 di_flags |= XFS_DIFLAG_RTINHERIT; 1199 di_flags |= XFS_DIFLAG_RTINHERIT;
1195 } else { 1200 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1201 di_flags |= XFS_DIFLAG_EXTSZINHERIT;
1202 ip->i_d.di_extsize = pip->i_d.di_extsize;
1203 }
1204 } else if ((mode & S_IFMT) == S_IFREG) {
1196 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) { 1205 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) {
1197 di_flags |= XFS_DIFLAG_REALTIME; 1206 di_flags |= XFS_DIFLAG_REALTIME;
1198 ip->i_iocore.io_flags |= XFS_IOCORE_RT; 1207 ip->i_iocore.io_flags |= XFS_IOCORE_RT;
1199 } 1208 }
1209 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1210 di_flags |= XFS_DIFLAG_EXTSIZE;
1211 ip->i_d.di_extsize = pip->i_d.di_extsize;
1212 }
1200 } 1213 }
1201 if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && 1214 if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
1202 xfs_inherit_noatime) 1215 xfs_inherit_noatime)
@@ -1262,7 +1275,7 @@ xfs_isize_check(
1262 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) 1275 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
1263 return; 1276 return;
1264 1277
1265 if ( ip->i_d.di_flags & XFS_DIFLAG_REALTIME ) 1278 if (ip->i_d.di_flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_EXTSIZE))
1266 return; 1279 return;
1267 1280
1268 nimaps = 2; 1281 nimaps = 2;
@@ -1765,22 +1778,19 @@ xfs_igrow_start(
1765 xfs_fsize_t new_size, 1778 xfs_fsize_t new_size,
1766 cred_t *credp) 1779 cred_t *credp)
1767{ 1780{
1768 xfs_fsize_t isize;
1769 int error; 1781 int error;
1770 1782
1771 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); 1783 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
1772 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); 1784 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
1773 ASSERT(new_size > ip->i_d.di_size); 1785 ASSERT(new_size > ip->i_d.di_size);
1774 1786
1775 error = 0;
1776 isize = ip->i_d.di_size;
1777 /* 1787 /*
1778 * Zero any pages that may have been created by 1788 * Zero any pages that may have been created by
1779 * xfs_write_file() beyond the end of the file 1789 * xfs_write_file() beyond the end of the file
1780 * and any blocks between the old and new file sizes. 1790 * and any blocks between the old and new file sizes.
1781 */ 1791 */
1782 error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize, 1792 error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size,
1783 new_size); 1793 ip->i_d.di_size, new_size);
1784 return error; 1794 return error;
1785} 1795}
1786 1796
@@ -3355,6 +3365,11 @@ xfs_iflush_int(
3355 ip->i_update_core = 0; 3365 ip->i_update_core = 0;
3356 SYNCHRONIZE(); 3366 SYNCHRONIZE();
3357 3367
3368 /*
3369 * Make sure to get the latest atime from the Linux inode.
3370 */
3371 xfs_synchronize_atime(ip);
3372
3358 if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC, 3373 if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC,
3359 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 3374 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3360 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3375 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
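Besides converting its corruption messages to xfs_fs_repair_cmn_err() and syncing atime from the Linux inode before a flush, xfs_inode.c gains the inheritance side of the extent-size hint in xfs_ialloc(): a directory flagged XFS_DIFLAG_EXTSZINHERIT passes the flag and its di_extsize value to new subdirectories, while a new regular file converts the inherited hint into its own XFS_DIFLAG_EXTSIZE. The branching, reduced to plain C with illustrative flag values:

    #include <stdio.h>
    #include <stdint.h>

    #define DIFLAG_EXTSIZE      0x0800   /* demo values */
    #define DIFLAG_EXTSZINHERIT 0x1000

    struct dinode_demo { uint16_t di_flags; uint32_t di_extsize; };

    static void inherit_extsize(struct dinode_demo *child,
                                const struct dinode_demo *parent, int is_dir)
    {
        if (!(parent->di_flags & DIFLAG_EXTSZINHERIT))
            return;
        /* directories keep propagating; regular files start using it */
        child->di_flags |= is_dir ? DIFLAG_EXTSZINHERIT : DIFLAG_EXTSIZE;
        child->di_extsize = parent->di_extsize;
    }

    int main(void)
    {
        struct dinode_demo dir  = { DIFLAG_EXTSZINHERIT, 64 };
        struct dinode_demo file = { 0, 0 };

        inherit_extsize(&file, &dir, 0);
        printf("%#x %u\n", (unsigned)file.di_flags,
               (unsigned)file.di_extsize);   /* 0x800 64 */
        return 0;
    }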
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 124d30e6143..1cfbcf18ce8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -436,6 +436,10 @@ void xfs_ichgtime(xfs_inode_t *, int);
436xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); 436xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
437void xfs_lock_inodes(xfs_inode_t **, int, int, uint); 437void xfs_lock_inodes(xfs_inode_t **, int, int, uint);
438 438
439xfs_inode_t *xfs_vtoi(struct vnode *vp);
440
441void xfs_synchronize_atime(xfs_inode_t *);
442
439#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 443#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
440 444
441#ifdef DEBUG 445#ifdef DEBUG
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7f3363c621e..36aa1fcb90a 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -271,6 +271,11 @@ xfs_inode_item_format(
271 if (ip->i_update_size) 271 if (ip->i_update_size)
272 ip->i_update_size = 0; 272 ip->i_update_size = 0;
273 273
274 /*
275 * Make sure to get the latest atime from the Linux inode.
276 */
277 xfs_synchronize_atime(ip);
278
274 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 279 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
275 vecp->i_len = sizeof(xfs_dinode_core_t); 280 vecp->i_len = sizeof(xfs_dinode_core_t);
276 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); 281 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
@@ -603,7 +608,7 @@ xfs_inode_item_trylock(
603 if (iip->ili_pushbuf_flag == 0) { 608 if (iip->ili_pushbuf_flag == 0) {
604 iip->ili_pushbuf_flag = 1; 609 iip->ili_pushbuf_flag = 1;
605#ifdef DEBUG 610#ifdef DEBUG
606 iip->ili_push_owner = get_thread_id(); 611 iip->ili_push_owner = current_pid();
607#endif 612#endif
608 /* 613 /*
609 * Inode is left locked in shared mode. 614 * Inode is left locked in shared mode.
@@ -782,7 +787,7 @@ xfs_inode_item_pushbuf(
782 * trying to duplicate our effort. 787 * trying to duplicate our effort.
783 */ 788 */
784 ASSERT(iip->ili_pushbuf_flag != 0); 789 ASSERT(iip->ili_pushbuf_flag != 0);
785 ASSERT(iip->ili_push_owner == get_thread_id()); 790 ASSERT(iip->ili_push_owner == current_pid());
786 791
787 /* 792 /*
788 * If flushlock isn't locked anymore, chances are that the 793 * If flushlock isn't locked anymore, chances are that the
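The get_thread_id() to current_pid() switch only affects the DEBUG-build bookkeeping: the task that sets ili_pushbuf_flag records its id so the task clearing the flag can assert it is the same one. The record-then-assert pattern in miniature:

    #include <assert.h>
    #include <unistd.h>

    static pid_t push_owner;          /* recorded when the flag is taken */

    static void take_flag(void)  { push_owner = getpid(); }
    static void check_flag(void) { assert(push_owner == getpid()); }

    int main(void)
    {
        take_flag();
        check_flag();                 /* passes: same task both times */
        return 0;
    }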
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 45a77a3a6c0..788917f355c 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -40,7 +40,6 @@
40#include "xfs_ialloc.h" 40#include "xfs_ialloc.h"
41#include "xfs_btree.h" 41#include "xfs_btree.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include "xfs_bit.h"
44#include "xfs_rtalloc.h" 43#include "xfs_rtalloc.h"
45#include "xfs_error.h" 44#include "xfs_error.h"
46#include "xfs_itable.h" 45#include "xfs_itable.h"
@@ -263,7 +262,7 @@ phase2:
263 case BMAPI_WRITE: 262 case BMAPI_WRITE:
264 /* If we found an extent, return it */ 263 /* If we found an extent, return it */
265 if (nimaps && 264 if (nimaps &&
266 (imap.br_startblock != HOLESTARTBLOCK) && 265 (imap.br_startblock != HOLESTARTBLOCK) &&
267 (imap.br_startblock != DELAYSTARTBLOCK)) { 266 (imap.br_startblock != DELAYSTARTBLOCK)) {
268 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io, 267 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io,
269 offset, count, iomapp, &imap, flags); 268 offset, count, iomapp, &imap, flags);
@@ -318,6 +317,58 @@ out:
318} 317}
319 318
320STATIC int 319STATIC int
320xfs_iomap_eof_align_last_fsb(
321 xfs_mount_t *mp,
322 xfs_iocore_t *io,
323 xfs_fsize_t isize,
324 xfs_extlen_t extsize,
325 xfs_fileoff_t *last_fsb)
326{
327 xfs_fileoff_t new_last_fsb = 0;
328 xfs_extlen_t align;
329 int eof, error;
330
331 if (io->io_flags & XFS_IOCORE_RT)
332 ;
333 /*
334 * If mounted with the "-o swalloc" option, roundup the allocation
335 * request to a stripe width boundary if the file size is >=
336 * stripe width and we are allocating past the allocation eof.
337 */
338 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
339 (isize >= XFS_FSB_TO_B(mp, mp->m_swidth)))
340 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
341 /*
342 * Roundup the allocation request to a stripe unit (m_dalign) boundary
343 * if the file size is >= stripe unit size, and we are allocating past
344 * the allocation eof.
345 */
346 else if (mp->m_dalign && (isize >= XFS_FSB_TO_B(mp, mp->m_dalign)))
347 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
348
349 /*
350 * Always round up the allocation request to an extent boundary
351 * (when file on a real-time subvolume or has di_extsize hint).
352 */
353 if (extsize) {
354 if (new_last_fsb)
355 align = roundup_64(new_last_fsb, extsize);
356 else
357 align = extsize;
358 new_last_fsb = roundup_64(*last_fsb, align);
359 }
360
361 if (new_last_fsb) {
362 error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
363 if (error)
364 return error;
365 if (eof)
366 *last_fsb = new_last_fsb;
367 }
368 return 0;
369}
370
371STATIC int
321xfs_flush_space( 372xfs_flush_space(
322 xfs_inode_t *ip, 373 xfs_inode_t *ip,
323 int *fsynced, 374 int *fsynced,
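xfs_iomap_eof_align_last_fsb() centralizes the EOF rounding that xfs_iomap_write_delay() used to do inline (compare the large removal further down): realtime files are left to the extent-size path, otherwise stripe-width or stripe-unit alignment applies, and an extent-size hint is then layered on top of whatever the stripe rounding produced. A worked example with made-up geometry, using the same roundup semantics as the kernel's roundup_64():

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t roundup_64(uint64_t x, uint64_t y)
    {
        return ((x + y - 1) / y) * y;
    }

    int main(void)
    {
        uint64_t last_fsb = 21, dalign = 16, extsize = 6;

        uint64_t new_last = roundup_64(last_fsb, dalign);    /* 32 */
        uint64_t align    = roundup_64(new_last, extsize);   /* 36 */
        uint64_t aligned  = roundup_64(last_fsb, align);     /* 36 */

        printf("%llu -> %llu\n", (unsigned long long)last_fsb,
               (unsigned long long)aligned);
        return 0;
    }

The new last block is only used when it still lies beyond the current allocation EOF, which is what the XFS_BMAP_EOF() check in the helper verifies.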
@@ -363,19 +414,20 @@ xfs_iomap_write_direct(
363 xfs_iocore_t *io = &ip->i_iocore; 414 xfs_iocore_t *io = &ip->i_iocore;
364 xfs_fileoff_t offset_fsb; 415 xfs_fileoff_t offset_fsb;
365 xfs_fileoff_t last_fsb; 416 xfs_fileoff_t last_fsb;
366 xfs_filblks_t count_fsb; 417 xfs_filblks_t count_fsb, resaligned;
367 xfs_fsblock_t firstfsb; 418 xfs_fsblock_t firstfsb;
419 xfs_extlen_t extsz, temp;
420 xfs_fsize_t isize;
368 int nimaps; 421 int nimaps;
369 int error;
370 int bmapi_flag; 422 int bmapi_flag;
371 int quota_flag; 423 int quota_flag;
372 int rt; 424 int rt;
373 xfs_trans_t *tp; 425 xfs_trans_t *tp;
374 xfs_bmbt_irec_t imap; 426 xfs_bmbt_irec_t imap;
375 xfs_bmap_free_t free_list; 427 xfs_bmap_free_t free_list;
376 xfs_filblks_t qblocks, resblks; 428 uint qblocks, resblks, resrtextents;
377 int committed; 429 int committed;
378 int resrtextents; 430 int error;
379 431
380 /* 432 /*
381 * Make sure that the dquots are there. This doesn't hold 433 * Make sure that the dquots are there. This doesn't hold
@@ -385,38 +437,53 @@ xfs_iomap_write_direct(
385 if (error) 437 if (error)
386 return XFS_ERROR(error); 438 return XFS_ERROR(error);
387 439
388 offset_fsb = XFS_B_TO_FSBT(mp, offset); 440 rt = XFS_IS_REALTIME_INODE(ip);
389 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 441 if (unlikely(rt)) {
390 count_fsb = last_fsb - offset_fsb; 442 if (!(extsz = ip->i_d.di_extsize))
391 if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) { 443 extsz = mp->m_sb.sb_rextsize;
392 xfs_fileoff_t map_last_fsb; 444 } else {
393 445 extsz = ip->i_d.di_extsize;
394 map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff;
395 if (map_last_fsb < last_fsb) {
396 last_fsb = map_last_fsb;
397 count_fsb = last_fsb - offset_fsb;
398 }
399 ASSERT(count_fsb > 0);
400 } 446 }
401 447
402 /* 448 isize = ip->i_d.di_size;
403 * Determine if reserving space on the data or realtime partition. 449 if (io->io_new_size > isize)
404 */ 450 isize = io->io_new_size;
405 if ((rt = XFS_IS_REALTIME_INODE(ip))) {
406 xfs_extlen_t extsz;
407 451
408 if (!(extsz = ip->i_d.di_extsize)) 452 offset_fsb = XFS_B_TO_FSBT(mp, offset);
409 extsz = mp->m_sb.sb_rextsize; 453 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
410 resrtextents = qblocks = (count_fsb + extsz - 1); 454 if ((offset + count) > isize) {
411 do_div(resrtextents, mp->m_sb.sb_rextsize); 455 error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz,
412 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 456 &last_fsb);
413 quota_flag = XFS_QMOPT_RES_RTBLKS; 457 if (error)
458 goto error_out;
414 } else { 459 } else {
415 resrtextents = 0; 460 if (found && (ret_imap->br_startblock == HOLESTARTBLOCK))
416 resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, count_fsb); 461 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
417 quota_flag = XFS_QMOPT_RES_REGBLKS; 462 ret_imap->br_blockcount +
463 ret_imap->br_startoff);
464 }
465 count_fsb = last_fsb - offset_fsb;
466 ASSERT(count_fsb > 0);
467
468 resaligned = count_fsb;
469 if (unlikely(extsz)) {
470 if ((temp = do_mod(offset_fsb, extsz)))
471 resaligned += temp;
472 if ((temp = do_mod(resaligned, extsz)))
473 resaligned += extsz - temp;
418 } 474 }
419 475
476 if (unlikely(rt)) {
477 resrtextents = qblocks = resaligned;
478 resrtextents /= mp->m_sb.sb_rextsize;
479 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
480 quota_flag = XFS_QMOPT_RES_RTBLKS;
481 } else {
482 resrtextents = 0;
483 resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
484 quota_flag = XFS_QMOPT_RES_REGBLKS;
485 }
486
420 /* 487 /*
421 * Allocate and setup the transaction 488 * Allocate and setup the transaction
422 */ 489 */
@@ -426,7 +493,6 @@ xfs_iomap_write_direct(
426 XFS_WRITE_LOG_RES(mp), resrtextents, 493 XFS_WRITE_LOG_RES(mp), resrtextents,
427 XFS_TRANS_PERM_LOG_RES, 494 XFS_TRANS_PERM_LOG_RES,
428 XFS_WRITE_LOG_COUNT); 495 XFS_WRITE_LOG_COUNT);
429
430 /* 496 /*
431 * Check for running out of space, note: need lock to return 497 * Check for running out of space, note: need lock to return
432 */ 498 */
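The rewritten reservation logic in xfs_iomap_write_direct() sizes the block and quota reservations from "resaligned": the requested range is padded at both ends so a write straddling extent-size boundaries reserves whole extents. Worked example for a 7-block write at block 10 with an 8-block hint:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t offset_fsb = 10, count_fsb = 7, extsz = 8;
        uint64_t resaligned = count_fsb;
        uint64_t temp;

        if ((temp = offset_fsb % extsz))     /* do_mod() in the kernel */
            resaligned += temp;              /* 7 + 2 = 9  */
        if ((temp = resaligned % extsz))
            resaligned += extsz - temp;      /* 9 + 7 = 16 */

        /* blocks 10..16 touch extents [8,16) and [16,24): 16 blocks */
        printf("%llu\n", (unsigned long long)resaligned);
        return 0;
    }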
@@ -436,20 +502,20 @@ xfs_iomap_write_direct(
436 if (error) 502 if (error)
437 goto error_out; 503 goto error_out;
438 504
439 if (XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag)) { 505 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
440 error = (EDQUOT); 506 qblocks, 0, quota_flag);
507 if (error)
441 goto error1; 508 goto error1;
442 }
443 509
444 bmapi_flag = XFS_BMAPI_WRITE;
445 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 510 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
446 xfs_trans_ihold(tp, ip); 511 xfs_trans_ihold(tp, ip);
447 512
448 if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt)) 513 bmapi_flag = XFS_BMAPI_WRITE;
514 if ((flags & BMAPI_DIRECT) && (offset < ip->i_d.di_size || extsz))
449 bmapi_flag |= XFS_BMAPI_PREALLOC; 515 bmapi_flag |= XFS_BMAPI_PREALLOC;
450 516
451 /* 517 /*
452 * Issue the bmapi() call to allocate the blocks 518 * Issue the xfs_bmapi() call to allocate the blocks
453 */ 519 */
454 XFS_BMAP_INIT(&free_list, &firstfsb); 520 XFS_BMAP_INIT(&free_list, &firstfsb);
455 nimaps = 1; 521 nimaps = 1;
@@ -484,8 +550,10 @@ xfs_iomap_write_direct(
484 "extent-state : %x \n", 550 "extent-state : %x \n",
485 (ip->i_mount)->m_fsname, 551 (ip->i_mount)->m_fsname,
486 (long long)ip->i_ino, 552 (long long)ip->i_ino,
487 ret_imap->br_startblock, ret_imap->br_startoff, 553 (unsigned long long)ret_imap->br_startblock,
488 ret_imap->br_blockcount,ret_imap->br_state); 554 (unsigned long long)ret_imap->br_startoff,
555 (unsigned long long)ret_imap->br_blockcount,
556 ret_imap->br_state);
489 } 557 }
490 return 0; 558 return 0;
491 559
@@ -501,6 +569,63 @@ error_out:
501 return XFS_ERROR(error); 569 return XFS_ERROR(error);
502} 570}
503 571
572/*
573 * If the caller is doing a write at the end of the file,
574 * then extend the allocation out to the file system's write
575 * iosize. We clean up any extra space left over when the
576 * file is closed in xfs_inactive().
577 *
578 * For sync writes, we are flushing delayed allocate space to
579 * try to make additional space available for allocation near
580 * the filesystem full boundary - preallocation hurts in that
581 * situation, of course.
582 */
583STATIC int
584xfs_iomap_eof_want_preallocate(
585 xfs_mount_t *mp,
586 xfs_iocore_t *io,
587 xfs_fsize_t isize,
588 xfs_off_t offset,
589 size_t count,
590 int ioflag,
591 xfs_bmbt_irec_t *imap,
592 int nimaps,
593 int *prealloc)
594{
595 xfs_fileoff_t start_fsb;
596 xfs_filblks_t count_fsb;
597 xfs_fsblock_t firstblock;
598 int n, error, imaps;
599
600 *prealloc = 0;
601 if ((ioflag & BMAPI_SYNC) || (offset + count) <= isize)
602 return 0;
603
604 /*
605 * If there are any real blocks past eof, then don't
606 * do any speculative allocation.
607 */
608 start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
609 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
610 while (count_fsb > 0) {
611 imaps = nimaps;
612 firstblock = NULLFSBLOCK;
613 error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
614 0, &firstblock, 0, imap, &imaps, NULL);
615 if (error)
616 return error;
617 for (n = 0; n < imaps; n++) {
618 if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
619 (imap[n].br_startblock != DELAYSTARTBLOCK))
620 return 0;
621 start_fsb += imap[n].br_blockcount;
622 count_fsb -= imap[n].br_blockcount;
623 }
624 }
625 *prealloc = 1;
626 return 0;
627}
628
504int 629int
505xfs_iomap_write_delay( 630xfs_iomap_write_delay(
506 xfs_inode_t *ip, 631 xfs_inode_t *ip,
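xfs_iomap_eof_want_preallocate() is the second helper split out of xfs_iomap_write_delay(): speculative EOF preallocation only pays off when the write extends the file and nothing real is already mapped past it, so the helper scans forward from the end of the write and vetoes preallocation on the first block that is neither a hole nor delayed allocation. The decision logic in miniature, with sentinels standing in for HOLESTARTBLOCK and DELAYSTARTBLOCK:

    #include <stdio.h>

    #define HOLE_DEMO  (-1L)
    #define DELAY_DEMO (-2L)

    static int want_prealloc_demo(const long *startblock, int n)
    {
        for (int i = 0; i < n; i++)
            if (startblock[i] != HOLE_DEMO && startblock[i] != DELAY_DEMO)
                return 0;   /* real blocks past EOF: don't speculate */
        return 1;
    }

    int main(void)
    {
        long past_eof[] = { HOLE_DEMO, DELAY_DEMO, 4096L };
        printf("%d\n", want_prealloc_demo(past_eof, 3));   /* 0 */
        return 0;
    }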
@@ -514,13 +639,15 @@ xfs_iomap_write_delay(
514 xfs_iocore_t *io = &ip->i_iocore; 639 xfs_iocore_t *io = &ip->i_iocore;
515 xfs_fileoff_t offset_fsb; 640 xfs_fileoff_t offset_fsb;
516 xfs_fileoff_t last_fsb; 641 xfs_fileoff_t last_fsb;
517 xfs_fsize_t isize; 642 xfs_off_t aligned_offset;
643 xfs_fileoff_t ioalign;
518 xfs_fsblock_t firstblock; 644 xfs_fsblock_t firstblock;
645 xfs_extlen_t extsz;
646 xfs_fsize_t isize;
519 int nimaps; 647 int nimaps;
520 int error;
521 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 648 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
522 int aeof; 649 int prealloc, fsynced = 0;
523 int fsynced = 0; 650 int error;
524 651
525 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); 652 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
526 653
@@ -528,152 +655,57 @@ xfs_iomap_write_delay(
528 * Make sure that the dquots are there. This doesn't hold 655 * Make sure that the dquots are there. This doesn't hold
529 * the ilock across a disk read. 656 * the ilock across a disk read.
530 */ 657 */
531
532 error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED); 658 error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
533 if (error) 659 if (error)
534 return XFS_ERROR(error); 660 return XFS_ERROR(error);
535 661
662 if (XFS_IS_REALTIME_INODE(ip)) {
663 if (!(extsz = ip->i_d.di_extsize))
664 extsz = mp->m_sb.sb_rextsize;
665 } else {
666 extsz = ip->i_d.di_extsize;
667 }
668
669 offset_fsb = XFS_B_TO_FSBT(mp, offset);
670
536retry: 671retry:
537 isize = ip->i_d.di_size; 672 isize = ip->i_d.di_size;
538 if (io->io_new_size > isize) { 673 if (io->io_new_size > isize)
539 isize = io->io_new_size; 674 isize = io->io_new_size;
540 }
541 675
542 aeof = 0; 676 error = xfs_iomap_eof_want_preallocate(mp, io, isize, offset, count,
543 offset_fsb = XFS_B_TO_FSBT(mp, offset); 677 ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
544 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 678 if (error)
545 /* 679 return error;
546 * If the caller is doing a write at the end of the file,
547 * then extend the allocation (and the buffer used for the write)
548 * out to the file system's write iosize. We clean up any extra
549 * space left over when the file is closed in xfs_inactive().
550 *
551 * For sync writes, we are flushing delayed allocate space to
552 * try to make additional space available for allocation near
553 * the filesystem full boundary - preallocation hurts in that
554 * situation, of course.
555 */
556 if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) {
557 xfs_off_t aligned_offset;
558 xfs_filblks_t count_fsb;
559 unsigned int iosize;
560 xfs_fileoff_t ioalign;
561 int n;
562 xfs_fileoff_t start_fsb;
563 680
564 /* 681 if (prealloc) {
565 * If there are any real blocks past eof, then don't
566 * do any speculative allocation.
567 */
568 start_fsb = XFS_B_TO_FSBT(mp,
569 ((xfs_ufsize_t)(offset + count - 1)));
570 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
571 while (count_fsb > 0) {
572 nimaps = XFS_WRITE_IMAPS;
573 error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
574 0, &firstblock, 0, imap, &nimaps, NULL);
575 if (error) {
576 return error;
577 }
578 for (n = 0; n < nimaps; n++) {
579 if ( !(io->io_flags & XFS_IOCORE_RT) &&
580 !imap[n].br_startblock) {
581 cmn_err(CE_PANIC,"Access to block "
582 "zero: fs <%s> inode: %lld "
583 "start_block : %llx start_off "
584 ": %llx blkcnt : %llx "
585 "extent-state : %x \n",
586 (ip->i_mount)->m_fsname,
587 (long long)ip->i_ino,
588 imap[n].br_startblock,
589 imap[n].br_startoff,
590 imap[n].br_blockcount,
591 imap[n].br_state);
592 }
593 if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
594 (imap[n].br_startblock != DELAYSTARTBLOCK)) {
595 goto write_map;
596 }
597 start_fsb += imap[n].br_blockcount;
598 count_fsb -= imap[n].br_blockcount;
599 }
600 }
601 iosize = mp->m_writeio_blocks;
602 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 682 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
603 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 683 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
604 last_fsb = ioalign + iosize; 684 last_fsb = ioalign + mp->m_writeio_blocks;
605 aeof = 1; 685 } else {
686 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
606 } 687 }
607write_map:
608 nimaps = XFS_WRITE_IMAPS;
609 firstblock = NULLFSBLOCK;
610 688
611 /* 689 if (prealloc || extsz) {
612 * If mounted with the "-o swalloc" option, roundup the allocation 690 error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz,
613 * request to a stripe width boundary if the file size is >= 691 &last_fsb);
614 * stripe width and we are allocating past the allocation eof. 692 if (error)
615 */
616 if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_swidth
617 && (mp->m_flags & XFS_MOUNT_SWALLOC)
618 && (isize >= XFS_FSB_TO_B(mp, mp->m_swidth)) && aeof) {
619 int eof;
620 xfs_fileoff_t new_last_fsb;
621
622 new_last_fsb = roundup_64(last_fsb, mp->m_swidth);
623 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
624 if (error) {
625 return error;
626 }
627 if (eof) {
628 last_fsb = new_last_fsb;
629 }
630 /*
631 * Roundup the allocation request to a stripe unit (m_dalign) boundary
632 * if the file size is >= stripe unit size, and we are allocating past
633 * the allocation eof.
634 */
635 } else if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_dalign &&
636 (isize >= XFS_FSB_TO_B(mp, mp->m_dalign)) && aeof) {
637 int eof;
638 xfs_fileoff_t new_last_fsb;
639 new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
640 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
641 if (error) {
642 return error;
643 }
644 if (eof) {
645 last_fsb = new_last_fsb;
646 }
647 /*
648 * Round up the allocation request to a real-time extent boundary
649 * if the file is on the real-time subvolume.
650 */
651 } else if (io->io_flags & XFS_IOCORE_RT && aeof) {
652 int eof;
653 xfs_fileoff_t new_last_fsb;
654
655 new_last_fsb = roundup_64(last_fsb, mp->m_sb.sb_rextsize);
656 error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
657 if (error) {
658 return error; 693 return error;
659 }
660 if (eof)
661 last_fsb = new_last_fsb;
662 } 694 }
695
696 nimaps = XFS_WRITE_IMAPS;
697 firstblock = NULLFSBLOCK;
663 error = xfs_bmapi(NULL, ip, offset_fsb, 698 error = xfs_bmapi(NULL, ip, offset_fsb,
664 (xfs_filblks_t)(last_fsb - offset_fsb), 699 (xfs_filblks_t)(last_fsb - offset_fsb),
665 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 700 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
666 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 701 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
667 &nimaps, NULL); 702 &nimaps, NULL);
668 /* 703 if (error && (error != ENOSPC))
669 * This can be EDQUOT, if nimaps == 0
670 */
671 if (error && (error != ENOSPC)) {
672 return XFS_ERROR(error); 704 return XFS_ERROR(error);
673 } 705
674 /* 706 /*
675 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 707 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
676 * then we must have run out of space. 708 * then we must have run out of space - flush delalloc, and retry..
677 */ 709 */
678 if (nimaps == 0) { 710 if (nimaps == 0) {
679 xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, 711 xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
@@ -685,17 +717,21 @@ write_map:
685 goto retry; 717 goto retry;
686 } 718 }
687 719
688 *ret_imap = imap[0]; 720 if (!(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) {
689 *nmaps = 1;
690 if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) {
691 cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " 721 cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld "
692 "start_block : %llx start_off : %llx blkcnt : %llx " 722 "start_block : %llx start_off : %llx blkcnt : %llx "
693 "extent-state : %x \n", 723 "extent-state : %x \n",
694 (ip->i_mount)->m_fsname, 724 (ip->i_mount)->m_fsname,
695 (long long)ip->i_ino, 725 (long long)ip->i_ino,
696 ret_imap->br_startblock, ret_imap->br_startoff, 726 (unsigned long long)ret_imap->br_startblock,
697 ret_imap->br_blockcount,ret_imap->br_state); 727 (unsigned long long)ret_imap->br_startoff,
728 (unsigned long long)ret_imap->br_blockcount,
729 ret_imap->br_state);
698 } 730 }
731
732 *ret_imap = imap[0];
733 *nmaps = 1;
734
699 return 0; 735 return 0;
700} 736}
701 737
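With both helpers in place, the body of xfs_iomap_write_delay() reduces to the choice visible above: when preallocating, extend the delayed mapping out to a write-iosize boundary past the write; otherwise map exactly what the write needs. In file-system-block terms, for a 3-block write at block 100 with a 16-block write iosize:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t offset_fsb = 100, count_fsb = 3, writeio_blocks = 16;
        uint64_t end_fsb = offset_fsb + count_fsb;                  /* 103 */

        /* XFS_WRITEIO_ALIGN() truncates to the iosize boundary */
        uint64_t ioalign = ((end_fsb - 1) / writeio_blocks) * writeio_blocks;

        uint64_t last_prealloc = ioalign + writeio_blocks;          /* 112 */
        uint64_t last_exact    = end_fsb;                           /* 103 */

        printf("prealloc to %llu, exact to %llu\n",
               (unsigned long long)last_prealloc,
               (unsigned long long)last_exact);
        return 0;
    }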
@@ -821,17 +857,21 @@ xfs_iomap_write_allocate(
821 */ 857 */
822 858
823 for (i = 0; i < nimaps; i++) { 859 for (i = 0; i < nimaps; i++) {
824 if ( !(io->io_flags & XFS_IOCORE_RT) && 860 if (!(io->io_flags & XFS_IOCORE_RT) &&
825 !imap[i].br_startblock) { 861 !imap[i].br_startblock) {
826 cmn_err(CE_PANIC,"Access to block zero: " 862 cmn_err(CE_PANIC,"Access to block zero: "
827 "fs <%s> inode: %lld " 863 "fs <%s> inode: %lld "
828 "start_block : %llx start_off : %llx " 864 "start_block : %llx start_off : %llx "
829 "blkcnt : %llx extent-state : %x \n", 865 "blkcnt : %llx extent-state : %x \n",
830 (ip->i_mount)->m_fsname, 866 (ip->i_mount)->m_fsname,
831 (long long)ip->i_ino, 867 (long long)ip->i_ino,
832 imap[i].br_startblock, 868 (unsigned long long)
833 imap[i].br_startoff, 869 imap[i].br_startblock,
834 imap[i].br_blockcount,imap[i].br_state); 870 (unsigned long long)
871 imap[i].br_startoff,
872 (unsigned long long)
873 imap[i].br_blockcount,
874 imap[i].br_state);
835 } 875 }
836 if ((offset_fsb >= imap[i].br_startoff) && 876 if ((offset_fsb >= imap[i].br_startoff) &&
837 (offset_fsb < (imap[i].br_startoff + 877 (offset_fsb < (imap[i].br_startoff +
@@ -868,17 +908,17 @@ xfs_iomap_write_unwritten(
868{ 908{
869 xfs_mount_t *mp = ip->i_mount; 909 xfs_mount_t *mp = ip->i_mount;
870 xfs_iocore_t *io = &ip->i_iocore; 910 xfs_iocore_t *io = &ip->i_iocore;
871 xfs_trans_t *tp;
872 xfs_fileoff_t offset_fsb; 911 xfs_fileoff_t offset_fsb;
873 xfs_filblks_t count_fsb; 912 xfs_filblks_t count_fsb;
874 xfs_filblks_t numblks_fsb; 913 xfs_filblks_t numblks_fsb;
875 xfs_bmbt_irec_t imap; 914 xfs_fsblock_t firstfsb;
915 int nimaps;
916 xfs_trans_t *tp;
917 xfs_bmbt_irec_t imap;
918 xfs_bmap_free_t free_list;
919 uint resblks;
876 int committed; 920 int committed;
877 int error; 921 int error;
878 int nres;
879 int nimaps;
880 xfs_fsblock_t firstfsb;
881 xfs_bmap_free_t free_list;
882 922
883 xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, 923 xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN,
884 &ip->i_iocore, offset, count); 924 &ip->i_iocore, offset, count);
@@ -887,9 +927,9 @@ xfs_iomap_write_unwritten(
887 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); 927 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
888 count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); 928 count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
889 929
890 do { 930 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
891 nres = XFS_DIOSTRAT_SPACE_RES(mp, 0);
892 931
932 do {
893 /* 933 /*
894 * set up a transaction to convert the range of extents 934 * set up a transaction to convert the range of extents
895 * from unwritten to real. Do allocations in a loop until 935 * from unwritten to real. Do allocations in a loop until
@@ -897,7 +937,7 @@ xfs_iomap_write_unwritten(
897 */ 937 */
898 938
899 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); 939 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
900 error = xfs_trans_reserve(tp, nres, 940 error = xfs_trans_reserve(tp, resblks,
901 XFS_WRITE_LOG_RES(mp), 0, 941 XFS_WRITE_LOG_RES(mp), 0,
902 XFS_TRANS_PERM_LOG_RES, 942 XFS_TRANS_PERM_LOG_RES,
903 XFS_WRITE_LOG_COUNT); 943 XFS_WRITE_LOG_COUNT);
@@ -916,7 +956,7 @@ xfs_iomap_write_unwritten(
916 XFS_BMAP_INIT(&free_list, &firstfsb); 956 XFS_BMAP_INIT(&free_list, &firstfsb);
917 nimaps = 1; 957 nimaps = 1;
918 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, 958 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
919 XFS_BMAPI_WRITE, &firstfsb, 959 XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
920 1, &imap, &nimaps, &free_list); 960 1, &imap, &nimaps, &free_list);
921 if (error) 961 if (error)
922 goto error_on_bmapi_transaction; 962 goto error_on_bmapi_transaction;
@@ -930,15 +970,17 @@ xfs_iomap_write_unwritten(
930 xfs_iunlock(ip, XFS_ILOCK_EXCL); 970 xfs_iunlock(ip, XFS_ILOCK_EXCL);
931 if (error) 971 if (error)
932 goto error0; 972 goto error0;
933 973
934 if ( !(io->io_flags & XFS_IOCORE_RT) && !imap.br_startblock) { 974 if ( !(io->io_flags & XFS_IOCORE_RT) && !imap.br_startblock) {
935 cmn_err(CE_PANIC,"Access to block zero: fs <%s> " 975 cmn_err(CE_PANIC,"Access to block zero: fs <%s> "
936 "inode: %lld start_block : %llx start_off : " 976 "inode: %lld start_block : %llx start_off : "
937 "%llx blkcnt : %llx extent-state : %x \n", 977 "%llx blkcnt : %llx extent-state : %x \n",
938 (ip->i_mount)->m_fsname, 978 (ip->i_mount)->m_fsname,
939 (long long)ip->i_ino, 979 (long long)ip->i_ino,
940 imap.br_startblock,imap.br_startoff, 980 (unsigned long long)imap.br_startblock,
941 imap.br_blockcount,imap.br_state); 981 (unsigned long long)imap.br_startoff,
982 (unsigned long long)imap.br_blockcount,
983 imap.br_state);
942 } 984 }
943 985
944 if ((numblks_fsb = imap.br_blockcount) == 0) { 986 if ((numblks_fsb = imap.br_blockcount) == 0) {
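The unwritten-conversion hunk makes two changes: the transaction reservation is computed once up front (doubled with << 1 rather than recomputed each pass), and xfs_bmapi() is now passed XFS_BMAPI_CONVERT alongside XFS_BMAPI_WRITE to state explicitly that existing unwritten blocks are being converted rather than allocated. The surrounding do/while advances by however many blocks each pass actually handled; its shape in plain C:

    #include <stdio.h>
    #include <stdint.h>

    /* stand-in for one xfs_bmapi() pass: converts at most 4 blocks */
    static uint64_t convert_pass(uint64_t cnt)
    {
        return cnt > 4 ? 4 : cnt;
    }

    int main(void)
    {
        uint64_t offset_fsb = 0, count_fsb = 10, done;

        do {
            done = convert_pass(count_fsb);   /* imap.br_blockcount */
            if (done == 0)
                break;                        /* nothing converted: stop */
            offset_fsb += done;
            count_fsb  -= done;
        } while (count_fsb > 0);

        printf("converted through block %llu\n",
               (unsigned long long)offset_fsb);   /* 10 */
        return 0;
    }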
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index fcd6d63bb68..3ce204a524b 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -69,7 +69,7 @@ typedef struct xfs_iomap {
69 xfs_buftarg_t *iomap_target; 69 xfs_buftarg_t *iomap_target;
70 xfs_off_t iomap_offset; /* offset of mapping, bytes */ 70 xfs_off_t iomap_offset; /* offset of mapping, bytes */
71 xfs_off_t iomap_bsize; /* size of mapping, bytes */ 71 xfs_off_t iomap_bsize; /* size of mapping, bytes */
72 size_t iomap_delta; /* offset into mapping, bytes */ 72 xfs_off_t iomap_delta; /* offset into mapping, bytes */
73 iomap_flags_t iomap_flags; 73 iomap_flags_t iomap_flags;
74} xfs_iomap_t; 74} xfs_iomap_t;
75 75
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f63646ead81..c59450e1be4 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -56,6 +56,7 @@ xfs_bulkstat_one_iget(
56{ 56{
57 xfs_dinode_core_t *dic; /* dinode core info pointer */ 57 xfs_dinode_core_t *dic; /* dinode core info pointer */
58 xfs_inode_t *ip; /* incore inode pointer */ 58 xfs_inode_t *ip; /* incore inode pointer */
59 vnode_t *vp;
59 int error; 60 int error;
60 61
61 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno); 62 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno);
@@ -72,6 +73,7 @@ xfs_bulkstat_one_iget(
72 goto out_iput; 73 goto out_iput;
73 } 74 }
74 75
76 vp = XFS_ITOV(ip);
75 dic = &ip->i_d; 77 dic = &ip->i_d;
76 78
77 /* xfs_iget returns the following without needing 79 /* xfs_iget returns the following without needing
@@ -84,8 +86,7 @@ xfs_bulkstat_one_iget(
84 buf->bs_uid = dic->di_uid; 86 buf->bs_uid = dic->di_uid;
85 buf->bs_gid = dic->di_gid; 87 buf->bs_gid = dic->di_gid;
86 buf->bs_size = dic->di_size; 88 buf->bs_size = dic->di_size;
87 buf->bs_atime.tv_sec = dic->di_atime.t_sec; 89 vn_atime_to_bstime(vp, &buf->bs_atime);
88 buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
89 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; 90 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
90 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; 91 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
91 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; 92 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
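The bulkstat change belongs to the same atime consolidation as the xfs_synchronize_atime() calls earlier in this merge: atime now lives authoritatively in the Linux inode, so xfs_bulkstat_one_iget() reads it from the vnode through vn_atime_to_bstime() instead of from the dinode core. That helper's body is not in this diff; whatever it does internally, the observable effect is a timestamp copy from the VFS object, roughly:

    #include <stdio.h>

    struct bstime_demo { long tv_sec; long tv_nsec; };
    struct vnode_demo  { struct bstime_demo v_atime; };  /* assumed layout */

    /* model of a vn_atime_to_bstime()-style helper */
    static void vn_atime_to_bstime_demo(const struct vnode_demo *vp,
                                        struct bstime_demo *bs)
    {
        *bs = vp->v_atime;
    }

    int main(void)
    {
        struct vnode_demo  vp = { { 1138000000L, 0L } };
        struct bstime_demo bs;

        vn_atime_to_bstime_demo(&vp, &bs);
        printf("%ld.%09ld\n", bs.tv_sec, bs.tv_nsec);
        return 0;
    }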
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 29af51275ca..9176995160e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -178,6 +178,83 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
178#define xlog_trace_iclog(iclog,state) 178#define xlog_trace_iclog(iclog,state)
179#endif /* XFS_LOG_TRACE */ 179#endif /* XFS_LOG_TRACE */
180 180
181
182static void
183xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
184{
185 if (*qp) {
186 tic->t_next = (*qp);
187 tic->t_prev = (*qp)->t_prev;
188 (*qp)->t_prev->t_next = tic;
189 (*qp)->t_prev = tic;
190 } else {
191 tic->t_prev = tic->t_next = tic;
192 *qp = tic;
193 }
194
195 tic->t_flags |= XLOG_TIC_IN_Q;
196}
197
198static void
199xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
200{
201 if (tic == tic->t_next) {
202 *qp = NULL;
203 } else {
204 *qp = tic->t_next;
205 tic->t_next->t_prev = tic->t_prev;
206 tic->t_prev->t_next = tic->t_next;
207 }
208
209 tic->t_next = tic->t_prev = NULL;
210 tic->t_flags &= ~XLOG_TIC_IN_Q;
211}
212
213static void
214xlog_grant_sub_space(struct log *log, int bytes)
215{
216 log->l_grant_write_bytes -= bytes;
217 if (log->l_grant_write_bytes < 0) {
218 log->l_grant_write_bytes += log->l_logsize;
219 log->l_grant_write_cycle--;
220 }
221
222 log->l_grant_reserve_bytes -= bytes;
223 if ((log)->l_grant_reserve_bytes < 0) {
224 log->l_grant_reserve_bytes += log->l_logsize;
225 log->l_grant_reserve_cycle--;
226 }
227
228}
229
230static void
231xlog_grant_add_space_write(struct log *log, int bytes)
232{
233 log->l_grant_write_bytes += bytes;
234 if (log->l_grant_write_bytes > log->l_logsize) {
235 log->l_grant_write_bytes -= log->l_logsize;
236 log->l_grant_write_cycle++;
237 }
238}
239
240static void
241xlog_grant_add_space_reserve(struct log *log, int bytes)
242{
243 log->l_grant_reserve_bytes += bytes;
244 if (log->l_grant_reserve_bytes > log->l_logsize) {
245 log->l_grant_reserve_bytes -= log->l_logsize;
246 log->l_grant_reserve_cycle++;
247 }
248}
249
250static inline void
251xlog_grant_add_space(struct log *log, int bytes)
252{
253 xlog_grant_add_space_write(log, bytes);
254 xlog_grant_add_space_reserve(log, bytes);
255}
256
257
181/* 258/*
182 * NOTES: 259 * NOTES:
183 * 260 *
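These static helpers replace the XLOG_GRANT_{ADD,SUB}_SPACE and XLOG_{INS,DEL}_TICKETQ macros whose call sites change in the hunks below. The ticket queue is a circular doubly linked list, and the grant heads track reserved log space as a (cycle, bytes) pair over a circular log: adding space past the physical end wraps the byte count and bumps the cycle, exactly as in xlog_grant_add_space_write() above. Worked example with a 1000-byte log:

    #include <stdio.h>

    struct grant_demo { int cycle; int bytes; };

    static void grant_add_space(struct grant_demo *g, int logsize, int bytes)
    {
        g->bytes += bytes;
        if (g->bytes > logsize) {   /* wrapped past the physical end */
            g->bytes -= logsize;
            g->cycle++;
        }
    }

    int main(void)
    {
        struct grant_demo g = { 3, 900 };

        grant_add_space(&g, 1000, 250);
        printf("cycle=%d bytes=%d\n", g.cycle, g.bytes);  /* cycle=4 bytes=150 */
        return 0;
    }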
@@ -326,7 +403,7 @@ xfs_log_release_iclog(xfs_mount_t *mp,
326 403
327 if (xlog_state_release_iclog(log, iclog)) { 404 if (xlog_state_release_iclog(log, iclog)) {
328 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR); 405 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
329 return(EIO); 406 return EIO;
330 } 407 }
331 408
332 return 0; 409 return 0;
@@ -428,7 +505,7 @@ xfs_log_mount(xfs_mount_t *mp,
428 if (readonly) 505 if (readonly)
429 vfsp->vfs_flag &= ~VFS_RDONLY; 506 vfsp->vfs_flag &= ~VFS_RDONLY;
430 507
431 error = xlog_recover(mp->m_log, readonly); 508 error = xlog_recover(mp->m_log);
432 509
433 if (readonly) 510 if (readonly)
434 vfsp->vfs_flag |= VFS_RDONLY; 511 vfsp->vfs_flag |= VFS_RDONLY;
@@ -479,7 +556,7 @@ xfs_log_unmount(xfs_mount_t *mp)
479 556
480 error = xfs_log_unmount_write(mp); 557 error = xfs_log_unmount_write(mp);
481 xfs_log_unmount_dealloc(mp); 558 xfs_log_unmount_dealloc(mp);
482 return (error); 559 return error;
483} 560}
484 561
485/* 562/*
@@ -651,7 +728,7 @@ xfs_log_write(xfs_mount_t * mp,
651 if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) { 728 if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
652 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR); 729 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
653 } 730 }
654 return (error); 731 return error;
655} /* xfs_log_write */ 732} /* xfs_log_write */
656 733
657 734
@@ -759,7 +836,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
759 needed = 1; 836 needed = 1;
760 } 837 }
761 LOG_UNLOCK(log, s); 838 LOG_UNLOCK(log, s);
762 return(needed); 839 return needed;
763} 840}
764 841
765/****************************************************************************** 842/******************************************************************************
@@ -926,7 +1003,7 @@ xlog_bdstrat_cb(struct xfs_buf *bp)
926 XFS_BUF_ERROR(bp, EIO); 1003 XFS_BUF_ERROR(bp, EIO);
927 XFS_BUF_STALE(bp); 1004 XFS_BUF_STALE(bp);
928 xfs_biodone(bp); 1005 xfs_biodone(bp);
929 return (XFS_ERROR(EIO)); 1006 return XFS_ERROR(EIO);
930 1007
931 1008
932} 1009}
@@ -1186,7 +1263,7 @@ xlog_commit_record(xfs_mount_t *mp,
1186 iclog, XLOG_COMMIT_TRANS))) { 1263 iclog, XLOG_COMMIT_TRANS))) {
1187 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR); 1264 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
1188 } 1265 }
1189 return (error); 1266 return error;
1190} /* xlog_commit_record */ 1267} /* xlog_commit_record */
1191 1268
1192 1269
@@ -1320,8 +1397,7 @@ xlog_sync(xlog_t *log,
1320 1397
1321 /* move grant heads by roundoff in sync */ 1398 /* move grant heads by roundoff in sync */
1322 s = GRANT_LOCK(log); 1399 s = GRANT_LOCK(log);
1323 XLOG_GRANT_ADD_SPACE(log, roundoff, 'w'); 1400 xlog_grant_add_space(log, roundoff);
1324 XLOG_GRANT_ADD_SPACE(log, roundoff, 'r');
1325 GRANT_UNLOCK(log, s); 1401 GRANT_UNLOCK(log, s);
1326 1402
1327 /* put cycle number in every block */ 1403 /* put cycle number in every block */
@@ -1384,7 +1460,7 @@ xlog_sync(xlog_t *log,
1384 if ((error = XFS_bwrite(bp))) { 1460 if ((error = XFS_bwrite(bp))) {
1385 xfs_ioerror_alert("xlog_sync", log->l_mp, bp, 1461 xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
1386 XFS_BUF_ADDR(bp)); 1462 XFS_BUF_ADDR(bp));
1387 return (error); 1463 return error;
1388 } 1464 }
1389 if (split) { 1465 if (split) {
1390 bp = iclog->ic_log->l_xbuf; 1466 bp = iclog->ic_log->l_xbuf;
@@ -1422,10 +1498,10 @@ xlog_sync(xlog_t *log,
1422 if ((error = XFS_bwrite(bp))) { 1498 if ((error = XFS_bwrite(bp))) {
1423 xfs_ioerror_alert("xlog_sync (split)", log->l_mp, 1499 xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
1424 bp, XFS_BUF_ADDR(bp)); 1500 bp, XFS_BUF_ADDR(bp));
1425 return (error); 1501 return error;
1426 } 1502 }
1427 } 1503 }
1428 return (0); 1504 return 0;
1429} /* xlog_sync */ 1505} /* xlog_sync */
1430 1506
1431 1507
@@ -1515,7 +1591,6 @@ xlog_state_finish_copy(xlog_t *log,
1515 * print out info relating to regions written which consume 1591 * print out info relating to regions written which consume
1516 * the reservation 1592 * the reservation
1517 */ 1593 */
1518#if defined(XFS_LOG_RES_DEBUG)
1519STATIC void 1594STATIC void
1520xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1595xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1521{ 1596{
@@ -1605,11 +1680,11 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1605 ticket->t_res_arr_sum, ticket->t_res_o_flow, 1680 ticket->t_res_arr_sum, ticket->t_res_o_flow,
1606 ticket->t_res_num_ophdrs, ophdr_spc, 1681 ticket->t_res_num_ophdrs, ophdr_spc,
1607 ticket->t_res_arr_sum + 1682 ticket->t_res_arr_sum +
1608 ticket->t_res_o_flow + ophdr_spc, 1683 ticket->t_res_o_flow + ophdr_spc,
1609 ticket->t_res_num); 1684 ticket->t_res_num);
1610 1685
1611 for (i = 0; i < ticket->t_res_num; i++) { 1686 for (i = 0; i < ticket->t_res_num; i++) {
1612 uint r_type = ticket->t_res_arr[i].r_type; 1687 uint r_type = ticket->t_res_arr[i].r_type;
1613 cmn_err(CE_WARN, 1688 cmn_err(CE_WARN,
1614 "region[%u]: %s - %u bytes\n", 1689 "region[%u]: %s - %u bytes\n",
1615 i, 1690 i,
@@ -1618,9 +1693,6 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1618 ticket->t_res_arr[i].r_len); 1693 ticket->t_res_arr[i].r_len);
1619 } 1694 }
1620} 1695}
1621#else
1622#define xlog_print_tic_res(mp, ticket)
1623#endif
1624 1696
1625/* 1697/*
1626 * Write some region out to in-core log 1698 * Write some region out to in-core log
@@ -1726,7 +1798,7 @@ xlog_write(xfs_mount_t * mp,
1726 for (index = 0; index < nentries; ) { 1798 for (index = 0; index < nentries; ) {
1727 if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, 1799 if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
1728 &contwr, &log_offset))) 1800 &contwr, &log_offset)))
1729 return (error); 1801 return error;
1730 1802
1731 ASSERT(log_offset <= iclog->ic_size - 1); 1803 ASSERT(log_offset <= iclog->ic_size - 1);
1732 ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); 1804 ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset);
@@ -1831,7 +1903,7 @@ xlog_write(xfs_mount_t * mp,
1831 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); 1903 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1832 record_cnt = data_cnt = 0; 1904 record_cnt = data_cnt = 0;
1833 if ((error = xlog_state_release_iclog(log, iclog))) 1905 if ((error = xlog_state_release_iclog(log, iclog)))
1834 return (error); 1906 return error;
1835 break; /* don't increment index */ 1907 break; /* don't increment index */
1836 } else { /* copied entire region */ 1908 } else { /* copied entire region */
1837 index++; 1909 index++;
@@ -1845,7 +1917,7 @@ xlog_write(xfs_mount_t * mp,
1845 ASSERT(flags & XLOG_COMMIT_TRANS); 1917 ASSERT(flags & XLOG_COMMIT_TRANS);
1846 *commit_iclog = iclog; 1918 *commit_iclog = iclog;
1847 } else if ((error = xlog_state_release_iclog(log, iclog))) 1919 } else if ((error = xlog_state_release_iclog(log, iclog)))
1848 return (error); 1920 return error;
1849 if (index == nentries) 1921 if (index == nentries)
1850 return 0; /* we are done */ 1922 return 0; /* we are done */
1851 else 1923 else
@@ -1862,7 +1934,7 @@ xlog_write(xfs_mount_t * mp,
1862 *commit_iclog = iclog; 1934 *commit_iclog = iclog;
1863 return 0; 1935 return 0;
1864 } 1936 }
1865 return (xlog_state_release_iclog(log, iclog)); 1937 return xlog_state_release_iclog(log, iclog);
1866} /* xlog_write */ 1938} /* xlog_write */
1867 1939
1868 1940
@@ -1978,7 +2050,7 @@ xlog_get_lowest_lsn(
1978 } 2050 }
1979 lsn_log = lsn_log->ic_next; 2051 lsn_log = lsn_log->ic_next;
1980 } while (lsn_log != log->l_iclog); 2052 } while (lsn_log != log->l_iclog);
1981 return(lowest_lsn); 2053 return lowest_lsn;
1982} 2054}
1983 2055
1984 2056
@@ -2330,7 +2402,7 @@ restart:
2330 if (iclog->ic_refcnt == 1) { 2402 if (iclog->ic_refcnt == 1) {
2331 LOG_UNLOCK(log, s); 2403 LOG_UNLOCK(log, s);
2332 if ((error = xlog_state_release_iclog(log, iclog))) 2404 if ((error = xlog_state_release_iclog(log, iclog)))
2333 return (error); 2405 return error;
2334 } else { 2406 } else {
2335 iclog->ic_refcnt--; 2407 iclog->ic_refcnt--;
2336 LOG_UNLOCK(log, s); 2408 LOG_UNLOCK(log, s);
@@ -2389,7 +2461,7 @@ xlog_grant_log_space(xlog_t *log,
2389 2461
2390 /* something is already sleeping; insert new transaction at end */ 2462 /* something is already sleeping; insert new transaction at end */
2391 if (log->l_reserve_headq) { 2463 if (log->l_reserve_headq) {
2392 XLOG_INS_TICKETQ(log->l_reserve_headq, tic); 2464 xlog_ins_ticketq(&log->l_reserve_headq, tic);
2393 xlog_trace_loggrant(log, tic, 2465 xlog_trace_loggrant(log, tic,
2394 "xlog_grant_log_space: sleep 1"); 2466 "xlog_grant_log_space: sleep 1");
2395 /* 2467 /*
@@ -2422,7 +2494,7 @@ redo:
2422 log->l_grant_reserve_bytes); 2494 log->l_grant_reserve_bytes);
2423 if (free_bytes < need_bytes) { 2495 if (free_bytes < need_bytes) {
2424 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2496 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2425 XLOG_INS_TICKETQ(log->l_reserve_headq, tic); 2497 xlog_ins_ticketq(&log->l_reserve_headq, tic);
2426 xlog_trace_loggrant(log, tic, 2498 xlog_trace_loggrant(log, tic,
2427 "xlog_grant_log_space: sleep 2"); 2499 "xlog_grant_log_space: sleep 2");
2428 XFS_STATS_INC(xs_sleep_logspace); 2500 XFS_STATS_INC(xs_sleep_logspace);
@@ -2439,11 +2511,10 @@ redo:
2439 s = GRANT_LOCK(log); 2511 s = GRANT_LOCK(log);
2440 goto redo; 2512 goto redo;
2441 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2513 } else if (tic->t_flags & XLOG_TIC_IN_Q)
2442 XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); 2514 xlog_del_ticketq(&log->l_reserve_headq, tic);
2443 2515
2444 /* we've got enough space */ 2516 /* we've got enough space */
2445 XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); 2517 xlog_grant_add_space(log, need_bytes);
2446 XLOG_GRANT_ADD_SPACE(log, need_bytes, 'r');
2447#ifdef DEBUG 2518#ifdef DEBUG
2448 tail_lsn = log->l_tail_lsn; 2519 tail_lsn = log->l_tail_lsn;
2449 /* 2520 /*
@@ -2464,7 +2535,7 @@ redo:
2464 2535
2465 error_return: 2536 error_return:
2466 if (tic->t_flags & XLOG_TIC_IN_Q) 2537 if (tic->t_flags & XLOG_TIC_IN_Q)
2467 XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); 2538 xlog_del_ticketq(&log->l_reserve_headq, tic);
2468 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret"); 2539 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret");
2469 /* 2540 /*
2470 * If we are failing, make sure the ticket doesn't have any 2541 * If we are failing, make sure the ticket doesn't have any
@@ -2498,7 +2569,7 @@ xlog_regrant_write_log_space(xlog_t *log,
2498 XLOG_TIC_RESET_RES(tic); 2569 XLOG_TIC_RESET_RES(tic);
2499 2570
2500 if (tic->t_cnt > 0) 2571 if (tic->t_cnt > 0)
2501 return (0); 2572 return 0;
2502 2573
2503#ifdef DEBUG 2574#ifdef DEBUG
2504 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2575 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
@@ -2533,7 +2604,7 @@ xlog_regrant_write_log_space(xlog_t *log,
2533 2604
2534 if (ntic != log->l_write_headq) { 2605 if (ntic != log->l_write_headq) {
2535 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2606 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2536 XLOG_INS_TICKETQ(log->l_write_headq, tic); 2607 xlog_ins_ticketq(&log->l_write_headq, tic);
2537 2608
2538 xlog_trace_loggrant(log, tic, 2609 xlog_trace_loggrant(log, tic,
2539 "xlog_regrant_write_log_space: sleep 1"); 2610 "xlog_regrant_write_log_space: sleep 1");
@@ -2565,7 +2636,7 @@ redo:
2565 log->l_grant_write_bytes); 2636 log->l_grant_write_bytes);
2566 if (free_bytes < need_bytes) { 2637 if (free_bytes < need_bytes) {
2567 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2638 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2568 XLOG_INS_TICKETQ(log->l_write_headq, tic); 2639 xlog_ins_ticketq(&log->l_write_headq, tic);
2569 XFS_STATS_INC(xs_sleep_logspace); 2640 XFS_STATS_INC(xs_sleep_logspace);
2570 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); 2641 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
2571 2642
@@ -2581,9 +2652,10 @@ redo:
2581 s = GRANT_LOCK(log); 2652 s = GRANT_LOCK(log);
2582 goto redo; 2653 goto redo;
2583 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2654 } else if (tic->t_flags & XLOG_TIC_IN_Q)
2584 XLOG_DEL_TICKETQ(log->l_write_headq, tic); 2655 xlog_del_ticketq(&log->l_write_headq, tic);
2585 2656
2586 XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); /* we've got enough space */ 2657 /* we've got enough space */
2658 xlog_grant_add_space_write(log, need_bytes);
2587#ifdef DEBUG 2659#ifdef DEBUG
2588 tail_lsn = log->l_tail_lsn; 2660 tail_lsn = log->l_tail_lsn;
2589 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { 2661 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
@@ -2595,12 +2667,12 @@ redo:
2595 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit"); 2667 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit");
2596 xlog_verify_grant_head(log, 1); 2668 xlog_verify_grant_head(log, 1);
2597 GRANT_UNLOCK(log, s); 2669 GRANT_UNLOCK(log, s);
2598 return (0); 2670 return 0;
2599 2671
2600 2672
2601 error_return: 2673 error_return:
2602 if (tic->t_flags & XLOG_TIC_IN_Q) 2674 if (tic->t_flags & XLOG_TIC_IN_Q)
2603 XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); 2675 xlog_del_ticketq(&log->l_reserve_headq, tic);
2604 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret"); 2676 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret");
2605 /* 2677 /*
2606 * If we are failing, make sure the ticket doesn't have any 2678 * If we are failing, make sure the ticket doesn't have any
@@ -2633,8 +2705,7 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2633 ticket->t_cnt--; 2705 ticket->t_cnt--;
2634 2706
2635 s = GRANT_LOCK(log); 2707 s = GRANT_LOCK(log);
2636 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w'); 2708 xlog_grant_sub_space(log, ticket->t_curr_res);
2637 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
2638 ticket->t_curr_res = ticket->t_unit_res; 2709 ticket->t_curr_res = ticket->t_unit_res;
2639 XLOG_TIC_RESET_RES(ticket); 2710 XLOG_TIC_RESET_RES(ticket);
2640 xlog_trace_loggrant(log, ticket, 2711 xlog_trace_loggrant(log, ticket,
@@ -2647,7 +2718,7 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2647 return; 2718 return;
2648 } 2719 }
2649 2720
2650 XLOG_GRANT_ADD_SPACE(log, ticket->t_unit_res, 'r'); 2721 xlog_grant_add_space_reserve(log, ticket->t_unit_res);
2651 xlog_trace_loggrant(log, ticket, 2722 xlog_trace_loggrant(log, ticket,
2652 "xlog_regrant_reserve_log_space: exit"); 2723 "xlog_regrant_reserve_log_space: exit");
2653 xlog_verify_grant_head(log, 0); 2724 xlog_verify_grant_head(log, 0);
@@ -2683,8 +2754,7 @@ xlog_ungrant_log_space(xlog_t *log,
2683 s = GRANT_LOCK(log); 2754 s = GRANT_LOCK(log);
2684 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter"); 2755 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter");
2685 2756
2686 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w'); 2757 xlog_grant_sub_space(log, ticket->t_curr_res);
2687 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
2688 2758
2689 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current"); 2759 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current");
2690 2760
@@ -2693,8 +2763,7 @@ xlog_ungrant_log_space(xlog_t *log,
2693 */ 2763 */
2694 if (ticket->t_cnt > 0) { 2764 if (ticket->t_cnt > 0) {
2695 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2765 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2696 XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'w'); 2766 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt);
2697 XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'r');
2698 } 2767 }
2699 2768
2700 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit"); 2769 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit");
@@ -2768,7 +2837,7 @@ xlog_state_release_iclog(xlog_t *log,
2768 if (sync) { 2837 if (sync) {
2769 return xlog_sync(log, iclog); 2838 return xlog_sync(log, iclog);
2770 } 2839 }
2771 return (0); 2840 return 0;
2772 2841
2773} /* xlog_state_release_iclog */ 2842} /* xlog_state_release_iclog */
2774 2843
@@ -3058,7 +3127,7 @@ try_again:
3058 } while (iclog != log->l_iclog); 3127 } while (iclog != log->l_iclog);
3059 3128
3060 LOG_UNLOCK(log, s); 3129 LOG_UNLOCK(log, s);
3061 return (0); 3130 return 0;
3062} /* xlog_state_sync */ 3131} /* xlog_state_sync */
3063 3132
3064 3133
@@ -3476,12 +3545,12 @@ xlog_state_ioerror(
3476 ic->ic_state = XLOG_STATE_IOERROR; 3545 ic->ic_state = XLOG_STATE_IOERROR;
3477 ic = ic->ic_next; 3546 ic = ic->ic_next;
3478 } while (ic != iclog); 3547 } while (ic != iclog);
3479 return (0); 3548 return 0;
3480 } 3549 }
3481 /* 3550 /*
3482 * Return non-zero, if state transition has already happened. 3551 * Return non-zero, if state transition has already happened.
3483 */ 3552 */
3484 return (1); 3553 return 1;
3485} 3554}
3486 3555
3487/* 3556/*
@@ -3518,7 +3587,7 @@ xfs_log_force_umount(
3518 log->l_flags & XLOG_ACTIVE_RECOVERY) { 3587 log->l_flags & XLOG_ACTIVE_RECOVERY) {
3519 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3588 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3520 XFS_BUF_DONE(mp->m_sb_bp); 3589 XFS_BUF_DONE(mp->m_sb_bp);
3521 return (0); 3590 return 0;
3522 } 3591 }
3523 3592
3524 /* 3593 /*
@@ -3527,7 +3596,7 @@ xfs_log_force_umount(
3527 */ 3596 */
3528 if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { 3597 if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
3529 ASSERT(XLOG_FORCED_SHUTDOWN(log)); 3598 ASSERT(XLOG_FORCED_SHUTDOWN(log));
3530 return (1); 3599 return 1;
3531 } 3600 }
3532 retval = 0; 3601 retval = 0;
3533 /* 3602 /*
@@ -3609,7 +3678,7 @@ xfs_log_force_umount(
3609 } 3678 }
3610#endif 3679#endif
3611 /* return non-zero if log IOERROR transition had already happened */ 3680 /* return non-zero if log IOERROR transition had already happened */
3612 return (retval); 3681 return retval;
3613} 3682}
3614 3683
3615STATIC int 3684STATIC int
@@ -3623,8 +3692,8 @@ xlog_iclogs_empty(xlog_t *log)
3623 * any language. 3692 * any language.
3624 */ 3693 */
3625 if (iclog->ic_header.h_num_logops) 3694 if (iclog->ic_header.h_num_logops)
3626 return(0); 3695 return 0;
3627 iclog = iclog->ic_next; 3696 iclog = iclog->ic_next;
3628 } while (iclog != log->l_iclog); 3697 } while (iclog != log->l_iclog);
3629 return(1); 3698 return 1;
3630} 3699}
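
The xfs_log.c hunks above do two things: a style sweep that drops redundant parentheses from return statements, and a conversion of the grant-head accounting from the old XLOG_GRANT_ADD_SPACE/XLOG_GRANT_SUB_SPACE macros, which picked the write or reserve head with a 'w'/'r' character argument, to named helpers such as xlog_grant_add_space() and xlog_grant_add_space_write(). The arithmetic those helpers inherit is the wrap-around bookkeeping visible in the macro bodies deleted from xfs_log_priv.h further down. A minimal self-contained sketch of that logic, with illustrative names (struct grant_head, logsize) standing in for the l_grant_write_*/l_grant_reserve_* fields and l_logsize of struct log:

/*
 * Sketch of the grant-head wrap arithmetic; names are stand-ins,
 * not the kernel's.
 */
struct grant_head {
        int     bytes;  /* byte offset of the head within the log */
        int     cycle;  /* number of times the head has lapped the log */
};

static void
grant_add_space(struct grant_head *head, int logsize, int bytes)
{
        head->bytes += bytes;
        if (head->bytes > logsize) {
                /* ran off the end of the log: wrap, start a new cycle */
                head->bytes -= logsize;
                head->cycle++;
        }
}

static void
grant_sub_space(struct grant_head *head, int logsize, int bytes)
{
        head->bytes -= bytes;
        if (head->bytes < 0) {
                /* backed up past the start: wrap to the previous cycle */
                head->bytes += logsize;
                head->cycle--;
        }
}

The plain xlog_grant_add_space()/xlog_grant_sub_space() calls in the hunks replace paired 'w' and 'r' macro invocations, so they presumably apply this step to both heads at once, while the _write/_reserve variants touch a single head.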
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 158829ca56f..4b2ac88dbb8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -30,13 +30,7 @@
30 * By comparing each component, we don't have to worry about extra 30 * By comparing each component, we don't have to worry about extra
31 * endian issues in treating two 32 bit numbers as one 64 bit number 31 * endian issues in treating two 32 bit numbers as one 64 bit number
32 */ 32 */
33static 33static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
34#if defined(__GNUC__) && (__GNUC__ == 2) && ( (__GNUC_MINOR__ == 95) || (__GNUC_MINOR__ == 96))
35__attribute__((unused)) /* gcc 2.95, 2.96 miscompile this when inlined */
36#else
37__inline__
38#endif
39xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
40{ 34{
41 if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2)) 35 if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2))
42 return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999; 36 return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999;
@@ -102,7 +96,6 @@ xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
102 96
103 97
104/* Region types for iovec's i_type */ 98/* Region types for iovec's i_type */
105#if defined(XFS_LOG_RES_DEBUG)
106#define XLOG_REG_TYPE_BFORMAT 1 99#define XLOG_REG_TYPE_BFORMAT 1
107#define XLOG_REG_TYPE_BCHUNK 2 100#define XLOG_REG_TYPE_BCHUNK 2
108#define XLOG_REG_TYPE_EFI_FORMAT 3 101#define XLOG_REG_TYPE_EFI_FORMAT 3
@@ -123,21 +116,13 @@ xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
123#define XLOG_REG_TYPE_COMMIT 18 116#define XLOG_REG_TYPE_COMMIT 18
124#define XLOG_REG_TYPE_TRANSHDR 19 117#define XLOG_REG_TYPE_TRANSHDR 19
125#define XLOG_REG_TYPE_MAX 19 118#define XLOG_REG_TYPE_MAX 19
126#endif
127 119
128#if defined(XFS_LOG_RES_DEBUG)
129#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t)) 120#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t))
130#else
131#define XLOG_VEC_SET_TYPE(vecp, t)
132#endif
133
134 121
135typedef struct xfs_log_iovec { 122typedef struct xfs_log_iovec {
136 xfs_caddr_t i_addr; /* beginning address of region */ 123 xfs_caddr_t i_addr; /* beginning address of region */
137 int i_len; /* length in bytes of region */ 124 int i_len; /* length in bytes of region */
138#if defined(XFS_LOG_RES_DEBUG) 125 uint i_type; /* type of region */
139 uint i_type; /* type of region */
140#endif
141} xfs_log_iovec_t; 126} xfs_log_iovec_t;
142 127
143typedef void* xfs_log_ticket_t; 128typedef void* xfs_log_ticket_t;
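
In xfs_log.h, _lsn_cmp() loses its gcc 2.95/2.96 workaround and becomes a plain static inline, and the XLOG_REG_TYPE_* tags plus the iovec i_type field stop being conditional on XFS_LOG_RES_DEBUG. For reference, a standalone sketch of the comparison _lsn_cmp() performs; only the cycle comparison is visible in the hunk, so the block comparison is reconstructed from the surrounding comment about comparing each component, and CYCLE_LSN/BLOCK_LSN are assumed here to be the high and low halves of the 64-bit LSN:

#include <stdint.h>

/* Assumed split: cycle in the high 32 bits, block in the low 32 bits. */
#define CYCLE_LSN(lsn)  ((uint32_t)((lsn) >> 32))
#define BLOCK_LSN(lsn)  ((uint32_t)(lsn))

static inline int
lsn_cmp(uint64_t lsn1, uint64_t lsn2)
{
        if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2))
                return (CYCLE_LSN(lsn1) < CYCLE_LSN(lsn2)) ? -999 : 999;
        /* same cycle: fall back to comparing the block halves */
        if (BLOCK_LSN(lsn1) != BLOCK_LSN(lsn2))
                return (BLOCK_LSN(lsn1) < BLOCK_LSN(lsn2)) ? -999 : 999;
        return 0;
}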
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8f285149681..34bcbf50789 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -253,7 +253,6 @@ typedef __uint32_t xlog_tid_t;
253 253
254 254
255/* Ticket reservation region accounting */ 255/* Ticket reservation region accounting */
256#if defined(XFS_LOG_RES_DEBUG)
257#define XLOG_TIC_LEN_MAX 15 256#define XLOG_TIC_LEN_MAX 15
258#define XLOG_TIC_RESET_RES(t) ((t)->t_res_num = \ 257#define XLOG_TIC_RESET_RES(t) ((t)->t_res_num = \
259 (t)->t_res_arr_sum = (t)->t_res_num_ophdrs = 0) 258 (t)->t_res_arr_sum = (t)->t_res_num_ophdrs = 0)
@@ -278,15 +277,9 @@ typedef __uint32_t xlog_tid_t;
278 * we don't care about. 277 * we don't care about.
279 */ 278 */
280typedef struct xlog_res { 279typedef struct xlog_res {
281 uint r_len; 280 uint r_len; /* region length :4 */
282 uint r_type; 281 uint r_type; /* region's transaction type :4 */
283} xlog_res_t; 282} xlog_res_t;
284#else
285#define XLOG_TIC_RESET_RES(t)
286#define XLOG_TIC_ADD_OPHDR(t)
287#define XLOG_TIC_ADD_REGION(t, len, type)
288#endif
289
290 283
291typedef struct xlog_ticket { 284typedef struct xlog_ticket {
292 sv_t t_sema; /* sleep on this semaphore : 20 */ 285 sv_t t_sema; /* sleep on this semaphore : 20 */
@@ -301,14 +294,12 @@ typedef struct xlog_ticket {
301 char t_flags; /* properties of reservation : 1 */ 294 char t_flags; /* properties of reservation : 1 */
302 uint t_trans_type; /* transaction type : 4 */ 295 uint t_trans_type; /* transaction type : 4 */
303 296
304#if defined (XFS_LOG_RES_DEBUG)
305 /* reservation array fields */ 297 /* reservation array fields */
306 uint t_res_num; /* num in array : 4 */ 298 uint t_res_num; /* num in array : 4 */
307 xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : X */
308 uint t_res_num_ophdrs; /* num op hdrs : 4 */ 299 uint t_res_num_ophdrs; /* num op hdrs : 4 */
309 uint t_res_arr_sum; /* array sum : 4 */ 300 uint t_res_arr_sum; /* array sum : 4 */
310 uint t_res_o_flow; /* sum overflow : 4 */ 301 uint t_res_o_flow; /* sum overflow : 4 */
311#endif 302 xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
312} xlog_ticket_t; 303} xlog_ticket_t;
313 304
314#endif 305#endif
@@ -494,83 +485,13 @@ typedef struct log {
494 485
495#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 486#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
496 487
497#define XLOG_GRANT_SUB_SPACE(log,bytes,type) \
498 xlog_grant_sub_space(log,bytes,type)
499static inline void xlog_grant_sub_space(struct log *log, int bytes, int type)
500{
501 if (type == 'w') { \
502 (log)->l_grant_write_bytes -= (bytes); \
503 if ((log)->l_grant_write_bytes < 0) { \
504 (log)->l_grant_write_bytes += (log)->l_logsize; \
505 (log)->l_grant_write_cycle--; \
506 } \
507 } else { \
508 (log)->l_grant_reserve_bytes -= (bytes); \
509 if ((log)->l_grant_reserve_bytes < 0) { \
510 (log)->l_grant_reserve_bytes += (log)->l_logsize;\
511 (log)->l_grant_reserve_cycle--; \
512 } \
513 } \
514}
515
516#define XLOG_GRANT_ADD_SPACE(log,bytes,type) \
517 xlog_grant_add_space(log,bytes,type)
518static inline void
519xlog_grant_add_space(struct log *log, int bytes, int type)
520{
521 if (type == 'w') { \
522 (log)->l_grant_write_bytes += (bytes); \
523 if ((log)->l_grant_write_bytes > (log)->l_logsize) { \
524 (log)->l_grant_write_bytes -= (log)->l_logsize; \
525 (log)->l_grant_write_cycle++; \
526 } \
527 } else { \
528 (log)->l_grant_reserve_bytes += (bytes); \
529 if ((log)->l_grant_reserve_bytes > (log)->l_logsize) { \
530 (log)->l_grant_reserve_bytes -= (log)->l_logsize;\
531 (log)->l_grant_reserve_cycle++; \
532 } \
533 } \
534}
535
536#define XLOG_INS_TICKETQ(q, tic) xlog_ins_ticketq(q, tic)
537static inline void
538xlog_ins_ticketq(struct xlog_ticket *q, struct xlog_ticket *tic)
539{ \
540 if (q) { \
541 (tic)->t_next = (q); \
542 (tic)->t_prev = (q)->t_prev; \
543 (q)->t_prev->t_next = (tic); \
544 (q)->t_prev = (tic); \
545 } else { \
546 (tic)->t_prev = (tic)->t_next = (tic); \
547 (q) = (tic); \
548 } \
549 (tic)->t_flags |= XLOG_TIC_IN_Q; \
550}
551
552#define XLOG_DEL_TICKETQ(q, tic) xlog_del_ticketq(q, tic)
553static inline void
554xlog_del_ticketq(struct xlog_ticket *q, struct xlog_ticket *tic)
555{ \
556 if ((tic) == (tic)->t_next) { \
557 (q) = NULL; \
558 } else { \
559 (q) = (tic)->t_next; \
560 (tic)->t_next->t_prev = (tic)->t_prev; \
561 (tic)->t_prev->t_next = (tic)->t_next; \
562 } \
563 (tic)->t_next = (tic)->t_prev = NULL; \
564 (tic)->t_flags &= ~XLOG_TIC_IN_Q; \
565}
566 488
567/* common routines */ 489/* common routines */
568extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 490extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
569extern int xlog_find_tail(xlog_t *log, 491extern int xlog_find_tail(xlog_t *log,
570 xfs_daddr_t *head_blk, 492 xfs_daddr_t *head_blk,
571 xfs_daddr_t *tail_blk, 493 xfs_daddr_t *tail_blk);
572 int readonly); 494extern int xlog_recover(xlog_t *log);
573extern int xlog_recover(xlog_t *log, int readonly);
574extern int xlog_recover_finish(xlog_t *log, int mfsi_flags); 495extern int xlog_recover_finish(xlog_t *log, int mfsi_flags);
575extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 496extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
576extern void xlog_recover_process_iunlinks(xlog_t *log); 497extern void xlog_recover_process_iunlinks(xlog_t *log);
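
The block deleted from xfs_log_priv.h above is the interesting one: XLOG_INS_TICKETQ and XLOG_DEL_TICKETQ had been half-converted from macros to inline functions, still carrying the macro backslash continuations and still assigning through the q parameter as if it were the caller's variable, which only ever worked through the macro wrappers. The replacement helpers in xfs_log.c now take the queue head by address (&log->l_reserve_headq, &log->l_write_headq). A self-contained sketch of the same circular doubly-linked ticket queue with the head passed correctly; names are illustrative:

#include <stddef.h>

#define TIC_IN_Q        0x1     /* stand-in for XLOG_TIC_IN_Q */

struct ticket {
        struct ticket   *t_next;
        struct ticket   *t_prev;
        int             t_flags;
};

/* Insert tic at the tail of the circular queue headed by *headp. */
static void
ticketq_ins(struct ticket **headp, struct ticket *tic)
{
        struct ticket *head = *headp;

        if (head) {
                tic->t_next = head;
                tic->t_prev = head->t_prev;
                head->t_prev->t_next = tic;
                head->t_prev = tic;
        } else {
                /* empty queue: the sole element points at itself */
                tic->t_prev = tic->t_next = tic;
                *headp = tic;
        }
        tic->t_flags |= TIC_IN_Q;
}

/*
 * Unlink tic.  Mirrors the removed macro, which always re-points the
 * head at tic->t_next (or NULL when tic was the last element).
 */
static void
ticketq_del(struct ticket **headp, struct ticket *tic)
{
        if (tic == tic->t_next) {
                *headp = NULL;
        } else {
                *headp = tic->t_next;
                tic->t_next->t_prev = tic->t_prev;
                tic->t_prev->t_next = tic->t_next;
        }
        tic->t_next = tic->t_prev = NULL;
        tic->t_flags &= ~TIC_IN_Q;
}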
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 8ab7df76806..7d46cbd6a07 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -783,8 +783,7 @@ int
783xlog_find_tail( 783xlog_find_tail(
784 xlog_t *log, 784 xlog_t *log,
785 xfs_daddr_t *head_blk, 785 xfs_daddr_t *head_blk,
786 xfs_daddr_t *tail_blk, 786 xfs_daddr_t *tail_blk)
787 int readonly)
788{ 787{
789 xlog_rec_header_t *rhead; 788 xlog_rec_header_t *rhead;
790 xlog_op_header_t *op_head; 789 xlog_op_header_t *op_head;
@@ -2563,10 +2562,12 @@ xlog_recover_do_quotaoff_trans(
2563 2562
2564 /* 2563 /*
2565 * The logitem format's flag tells us if this was user quotaoff, 2564 * The logitem format's flag tells us if this was user quotaoff,
2566 * group quotaoff or both. 2565 * group/project quotaoff or both.
2567 */ 2566 */
2568 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) 2567 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2569 log->l_quotaoffs_flag |= XFS_DQ_USER; 2568 log->l_quotaoffs_flag |= XFS_DQ_USER;
2569 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2570 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2570 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) 2571 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2571 log->l_quotaoffs_flag |= XFS_DQ_GROUP; 2572 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2572 2573
@@ -3890,14 +3891,13 @@ xlog_do_recover(
3890 */ 3891 */
3891int 3892int
3892xlog_recover( 3893xlog_recover(
3893 xlog_t *log, 3894 xlog_t *log)
3894 int readonly)
3895{ 3895{
3896 xfs_daddr_t head_blk, tail_blk; 3896 xfs_daddr_t head_blk, tail_blk;
3897 int error; 3897 int error;
3898 3898
3899 /* find the tail of the log */ 3899 /* find the tail of the log */
3900 if ((error = xlog_find_tail(log, &head_blk, &tail_blk, readonly))) 3900 if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
3901 return error; 3901 return error;
3902 3902
3903 if (tail_blk != head_blk) { 3903 if (tail_blk != head_blk) {
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 541d5dd474b..62188ea392c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -51,7 +51,7 @@ STATIC int xfs_uuid_mount(xfs_mount_t *);
51STATIC void xfs_uuid_unmount(xfs_mount_t *mp); 51STATIC void xfs_uuid_unmount(xfs_mount_t *mp);
52STATIC void xfs_unmountfs_wait(xfs_mount_t *); 52STATIC void xfs_unmountfs_wait(xfs_mount_t *);
53 53
54static struct { 54static const struct {
55 short offset; 55 short offset;
56 short type; /* 0 = integer 56 short type; /* 0 = integer
57 * 1 = binary / string (no translation) 57 * 1 = binary / string (no translation)
@@ -117,7 +117,7 @@ xfs_mount_init(void)
117 117
118 AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail"); 118 AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
119 spinlock_init(&mp->m_sb_lock, "xfs_sb"); 119 spinlock_init(&mp->m_sb_lock, "xfs_sb");
120 mutex_init(&mp->m_ilock, MUTEX_DEFAULT, "xfs_ilock"); 120 mutex_init(&mp->m_ilock);
121 initnsema(&mp->m_growlock, 1, "xfs_grow"); 121 initnsema(&mp->m_growlock, 1, "xfs_grow");
122 /* 122 /*
123 * Initialize the AIL. 123 * Initialize the AIL.
@@ -646,7 +646,7 @@ xfs_mountfs(
646 646
647 if (mp->m_sb_bp == NULL) { 647 if (mp->m_sb_bp == NULL) {
648 if ((error = xfs_readsb(mp))) { 648 if ((error = xfs_readsb(mp))) {
649 return (error); 649 return error;
650 } 650 }
651 } 651 }
652 xfs_mount_common(mp, sbp); 652 xfs_mount_common(mp, sbp);
@@ -889,7 +889,7 @@ xfs_mountfs(
889 * For client case we are done now 889 * For client case we are done now
890 */ 890 */
891 if (mfsi_flags & XFS_MFSI_CLIENT) { 891 if (mfsi_flags & XFS_MFSI_CLIENT) {
892 return(0); 892 return 0;
893 } 893 }
894 894
895 /* 895 /*
@@ -1077,8 +1077,7 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
1077 1077
1078 xfs_iflush_all(mp); 1078 xfs_iflush_all(mp);
1079 1079
1080 XFS_QM_DQPURGEALL(mp, 1080 XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
1081 XFS_QMOPT_UQUOTA | XFS_QMOPT_GQUOTA | XFS_QMOPT_UMOUNTING);
1082 1081
1083 /* 1082 /*
1084 * Flush out the log synchronously so that we know for sure 1083 * Flush out the log synchronously so that we know for sure
@@ -1183,7 +1182,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1183 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly"); 1182 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
1184 } 1183 }
1185 xfs_buf_relse(sbp); 1184 xfs_buf_relse(sbp);
1186 return (error); 1185 return error;
1187} 1186}
1188 1187
1189/* 1188/*
@@ -1258,19 +1257,19 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
1258 lcounter += delta; 1257 lcounter += delta;
1259 if (lcounter < 0) { 1258 if (lcounter < 0) {
1260 ASSERT(0); 1259 ASSERT(0);
1261 return (XFS_ERROR(EINVAL)); 1260 return XFS_ERROR(EINVAL);
1262 } 1261 }
1263 mp->m_sb.sb_icount = lcounter; 1262 mp->m_sb.sb_icount = lcounter;
1264 return (0); 1263 return 0;
1265 case XFS_SBS_IFREE: 1264 case XFS_SBS_IFREE:
1266 lcounter = (long long)mp->m_sb.sb_ifree; 1265 lcounter = (long long)mp->m_sb.sb_ifree;
1267 lcounter += delta; 1266 lcounter += delta;
1268 if (lcounter < 0) { 1267 if (lcounter < 0) {
1269 ASSERT(0); 1268 ASSERT(0);
1270 return (XFS_ERROR(EINVAL)); 1269 return XFS_ERROR(EINVAL);
1271 } 1270 }
1272 mp->m_sb.sb_ifree = lcounter; 1271 mp->m_sb.sb_ifree = lcounter;
1273 return (0); 1272 return 0;
1274 case XFS_SBS_FDBLOCKS: 1273 case XFS_SBS_FDBLOCKS:
1275 1274
1276 lcounter = (long long)mp->m_sb.sb_fdblocks; 1275 lcounter = (long long)mp->m_sb.sb_fdblocks;
@@ -1297,101 +1296,101 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
1297 if (rsvd) { 1296 if (rsvd) {
1298 lcounter = (long long)mp->m_resblks_avail + delta; 1297 lcounter = (long long)mp->m_resblks_avail + delta;
1299 if (lcounter < 0) { 1298 if (lcounter < 0) {
1300 return (XFS_ERROR(ENOSPC)); 1299 return XFS_ERROR(ENOSPC);
1301 } 1300 }
1302 mp->m_resblks_avail = lcounter; 1301 mp->m_resblks_avail = lcounter;
1303 return (0); 1302 return 0;
1304 } else { /* not reserved */ 1303 } else { /* not reserved */
1305 return (XFS_ERROR(ENOSPC)); 1304 return XFS_ERROR(ENOSPC);
1306 } 1305 }
1307 } 1306 }
1308 } 1307 }
1309 1308
1310 mp->m_sb.sb_fdblocks = lcounter; 1309 mp->m_sb.sb_fdblocks = lcounter;
1311 return (0); 1310 return 0;
1312 case XFS_SBS_FREXTENTS: 1311 case XFS_SBS_FREXTENTS:
1313 lcounter = (long long)mp->m_sb.sb_frextents; 1312 lcounter = (long long)mp->m_sb.sb_frextents;
1314 lcounter += delta; 1313 lcounter += delta;
1315 if (lcounter < 0) { 1314 if (lcounter < 0) {
1316 return (XFS_ERROR(ENOSPC)); 1315 return XFS_ERROR(ENOSPC);
1317 } 1316 }
1318 mp->m_sb.sb_frextents = lcounter; 1317 mp->m_sb.sb_frextents = lcounter;
1319 return (0); 1318 return 0;
1320 case XFS_SBS_DBLOCKS: 1319 case XFS_SBS_DBLOCKS:
1321 lcounter = (long long)mp->m_sb.sb_dblocks; 1320 lcounter = (long long)mp->m_sb.sb_dblocks;
1322 lcounter += delta; 1321 lcounter += delta;
1323 if (lcounter < 0) { 1322 if (lcounter < 0) {
1324 ASSERT(0); 1323 ASSERT(0);
1325 return (XFS_ERROR(EINVAL)); 1324 return XFS_ERROR(EINVAL);
1326 } 1325 }
1327 mp->m_sb.sb_dblocks = lcounter; 1326 mp->m_sb.sb_dblocks = lcounter;
1328 return (0); 1327 return 0;
1329 case XFS_SBS_AGCOUNT: 1328 case XFS_SBS_AGCOUNT:
1330 scounter = mp->m_sb.sb_agcount; 1329 scounter = mp->m_sb.sb_agcount;
1331 scounter += delta; 1330 scounter += delta;
1332 if (scounter < 0) { 1331 if (scounter < 0) {
1333 ASSERT(0); 1332 ASSERT(0);
1334 return (XFS_ERROR(EINVAL)); 1333 return XFS_ERROR(EINVAL);
1335 } 1334 }
1336 mp->m_sb.sb_agcount = scounter; 1335 mp->m_sb.sb_agcount = scounter;
1337 return (0); 1336 return 0;
1338 case XFS_SBS_IMAX_PCT: 1337 case XFS_SBS_IMAX_PCT:
1339 scounter = mp->m_sb.sb_imax_pct; 1338 scounter = mp->m_sb.sb_imax_pct;
1340 scounter += delta; 1339 scounter += delta;
1341 if (scounter < 0) { 1340 if (scounter < 0) {
1342 ASSERT(0); 1341 ASSERT(0);
1343 return (XFS_ERROR(EINVAL)); 1342 return XFS_ERROR(EINVAL);
1344 } 1343 }
1345 mp->m_sb.sb_imax_pct = scounter; 1344 mp->m_sb.sb_imax_pct = scounter;
1346 return (0); 1345 return 0;
1347 case XFS_SBS_REXTSIZE: 1346 case XFS_SBS_REXTSIZE:
1348 scounter = mp->m_sb.sb_rextsize; 1347 scounter = mp->m_sb.sb_rextsize;
1349 scounter += delta; 1348 scounter += delta;
1350 if (scounter < 0) { 1349 if (scounter < 0) {
1351 ASSERT(0); 1350 ASSERT(0);
1352 return (XFS_ERROR(EINVAL)); 1351 return XFS_ERROR(EINVAL);
1353 } 1352 }
1354 mp->m_sb.sb_rextsize = scounter; 1353 mp->m_sb.sb_rextsize = scounter;
1355 return (0); 1354 return 0;
1356 case XFS_SBS_RBMBLOCKS: 1355 case XFS_SBS_RBMBLOCKS:
1357 scounter = mp->m_sb.sb_rbmblocks; 1356 scounter = mp->m_sb.sb_rbmblocks;
1358 scounter += delta; 1357 scounter += delta;
1359 if (scounter < 0) { 1358 if (scounter < 0) {
1360 ASSERT(0); 1359 ASSERT(0);
1361 return (XFS_ERROR(EINVAL)); 1360 return XFS_ERROR(EINVAL);
1362 } 1361 }
1363 mp->m_sb.sb_rbmblocks = scounter; 1362 mp->m_sb.sb_rbmblocks = scounter;
1364 return (0); 1363 return 0;
1365 case XFS_SBS_RBLOCKS: 1364 case XFS_SBS_RBLOCKS:
1366 lcounter = (long long)mp->m_sb.sb_rblocks; 1365 lcounter = (long long)mp->m_sb.sb_rblocks;
1367 lcounter += delta; 1366 lcounter += delta;
1368 if (lcounter < 0) { 1367 if (lcounter < 0) {
1369 ASSERT(0); 1368 ASSERT(0);
1370 return (XFS_ERROR(EINVAL)); 1369 return XFS_ERROR(EINVAL);
1371 } 1370 }
1372 mp->m_sb.sb_rblocks = lcounter; 1371 mp->m_sb.sb_rblocks = lcounter;
1373 return (0); 1372 return 0;
1374 case XFS_SBS_REXTENTS: 1373 case XFS_SBS_REXTENTS:
1375 lcounter = (long long)mp->m_sb.sb_rextents; 1374 lcounter = (long long)mp->m_sb.sb_rextents;
1376 lcounter += delta; 1375 lcounter += delta;
1377 if (lcounter < 0) { 1376 if (lcounter < 0) {
1378 ASSERT(0); 1377 ASSERT(0);
1379 return (XFS_ERROR(EINVAL)); 1378 return XFS_ERROR(EINVAL);
1380 } 1379 }
1381 mp->m_sb.sb_rextents = lcounter; 1380 mp->m_sb.sb_rextents = lcounter;
1382 return (0); 1381 return 0;
1383 case XFS_SBS_REXTSLOG: 1382 case XFS_SBS_REXTSLOG:
1384 scounter = mp->m_sb.sb_rextslog; 1383 scounter = mp->m_sb.sb_rextslog;
1385 scounter += delta; 1384 scounter += delta;
1386 if (scounter < 0) { 1385 if (scounter < 0) {
1387 ASSERT(0); 1386 ASSERT(0);
1388 return (XFS_ERROR(EINVAL)); 1387 return XFS_ERROR(EINVAL);
1389 } 1388 }
1390 mp->m_sb.sb_rextslog = scounter; 1389 mp->m_sb.sb_rextslog = scounter;
1391 return (0); 1390 return 0;
1392 default: 1391 default:
1393 ASSERT(0); 1392 ASSERT(0);
1394 return (XFS_ERROR(EINVAL)); 1393 return XFS_ERROR(EINVAL);
1395 } 1394 }
1396} 1395}
1397 1396
@@ -1410,7 +1409,7 @@ xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd)
1410 s = XFS_SB_LOCK(mp); 1409 s = XFS_SB_LOCK(mp);
1411 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); 1410 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1412 XFS_SB_UNLOCK(mp, s); 1411 XFS_SB_UNLOCK(mp, s);
1413 return (status); 1412 return status;
1414} 1413}
1415 1414
1416/* 1415/*
@@ -1471,7 +1470,7 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
1471 } 1470 }
1472 } 1471 }
1473 XFS_SB_UNLOCK(mp, s); 1472 XFS_SB_UNLOCK(mp, s);
1474 return (status); 1473 return status;
1475} 1474}
1476 1475
1477/* 1476/*
@@ -1501,7 +1500,7 @@ xfs_getsb(
1501 } 1500 }
1502 XFS_BUF_HOLD(bp); 1501 XFS_BUF_HOLD(bp);
1503 ASSERT(XFS_BUF_ISDONE(bp)); 1502 ASSERT(XFS_BUF_ISDONE(bp));
1504 return (bp); 1503 return bp;
1505} 1504}
1506 1505
1507/* 1506/*
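
In xfs_mount.c, besides the return-statement sweep, the superblock field descriptor table becomes static const, so the offset/type pairs can live in read-only data and any accidental write to the table becomes a compile error. A minimal sketch of the pattern, with a stand-in struct in place of xfs_sb_t:

#include <stddef.h>

typedef struct sb {
        unsigned int            sb_magicnum;
        unsigned long long      sb_dblocks;
} sb_t;

/*
 * const descriptor table in the style of the offset/type array in
 * xfs_mount.c: each entry records where a field lives and how to
 * translate it.
 */
static const struct {
        short   offset;
        short   type;   /* 0 = integer, 1 = binary / string */
} sb_info[] = {
        { offsetof(sb_t, sb_magicnum), 0 },
        { offsetof(sb_t, sb_dblocks),  0 },
};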
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 08b2e0a5d80..cd3cf9613a0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -308,7 +308,6 @@ typedef struct xfs_mount {
308 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ 308 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
309 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ 309 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
310 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ 310 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
311#define m_dev m_ddev_targp->pbr_dev
312 __uint8_t m_dircook_elog; /* log d-cookie entry bits */ 311 __uint8_t m_dircook_elog; /* log d-cookie entry bits */
313 __uint8_t m_blkbit_log; /* blocklog + NBBY */ 312 __uint8_t m_blkbit_log; /* blocklog + NBBY */
314 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ 313 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
@@ -393,7 +392,7 @@ typedef struct xfs_mount {
393 user */ 392 user */
394#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment 393#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
395 allocations */ 394 allocations */
396#define XFS_MOUNT_COMPAT_ATTR (1ULL << 8) /* do not use attr2 format */ 395#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
397 /* (1ULL << 9) -- currently unused */ 396 /* (1ULL << 9) -- currently unused */
398#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ 397#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
399#define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */ 398#define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */
@@ -533,7 +532,7 @@ typedef struct xfs_mod_sb {
533 int msb_delta; /* Change to make to specified field */ 532 int msb_delta; /* Change to make to specified field */
534} xfs_mod_sb_t; 533} xfs_mod_sb_t;
535 534
536#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock), PINOD) 535#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock))
537#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) 536#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
538#define XFS_SB_LOCK(mp) mutex_spinlock(&(mp)->m_sb_lock) 537#define XFS_SB_LOCK(mp) mutex_spinlock(&(mp)->m_sb_lock)
539#define XFS_SB_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_sb_lock,(s)) 538#define XFS_SB_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_sb_lock,(s))
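
The XFS_MOUNT_ILOCK change here tracks the mutex_init() change in xfs_mount.c: with the kernel's struct mutex API, initialization takes no type or name arguments and locking takes no priority argument. A short usage sketch of that API (struct foo is a stand-in):

#include <linux/mutex.h>

struct foo {
        struct mutex    lock;
};

static void foo_init(struct foo *f)
{
        mutex_init(&f->lock);           /* no MUTEX_DEFAULT, no name string */
}

static void foo_use(struct foo *f)
{
        mutex_lock(&f->lock);           /* no PINOD priority argument */
        /* ... critical section ... */
        mutex_unlock(&f->lock);
}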
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 4d4e8f4e768..81a05cfd77d 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -243,7 +243,6 @@ xfs_rename(
243 xfs_inode_t *inodes[4]; 243 xfs_inode_t *inodes[4];
244 int target_ip_dropped = 0; /* dropped target_ip link? */ 244 int target_ip_dropped = 0; /* dropped target_ip link? */
245 vnode_t *src_dir_vp; 245 vnode_t *src_dir_vp;
246 bhv_desc_t *target_dir_bdp;
247 int spaceres; 246 int spaceres;
248 int target_link_zero = 0; 247 int target_link_zero = 0;
249 int num_inodes; 248 int num_inodes;
@@ -260,14 +259,12 @@ xfs_rename(
260 * Find the XFS behavior descriptor for the target directory 259 * Find the XFS behavior descriptor for the target directory
261 * vnode since it was not handed to us. 260 * vnode since it was not handed to us.
262 */ 261 */
263 target_dir_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(target_dir_vp), 262 target_dp = xfs_vtoi(target_dir_vp);
264 &xfs_vnodeops); 263 if (target_dp == NULL) {
265 if (target_dir_bdp == NULL) {
266 return XFS_ERROR(EXDEV); 264 return XFS_ERROR(EXDEV);
267 } 265 }
268 266
269 src_dp = XFS_BHVTOI(src_dir_bdp); 267 src_dp = XFS_BHVTOI(src_dir_bdp);
270 target_dp = XFS_BHVTOI(target_dir_bdp);
271 mp = src_dp->i_mount; 268 mp = src_dp->i_mount;
272 269
273 if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) || 270 if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) ||
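
Here and in xfs_utils.c below, the open-coded behavior-descriptor lookup plus XFS_BHVTOI() is collapsed into a single xfs_vtoi() call that yields NULL when the vnode does not belong to XFS. The helper's definition is not part of this diff; the following is a reconstruction inferred from the two converted call sites, offered as a sketch rather than the actual implementation:

/*
 * Hedged reconstruction of xfs_vtoi(): resolve a vnode to its XFS
 * inode via the behavior chain, or NULL for a non-XFS vnode.  The
 * real definition lives elsewhere in the XFS tree and may differ.
 */
static inline xfs_inode_t *
xfs_vtoi(vnode_t *vp)
{
        bhv_desc_t      *bdp;

        bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops);
        if (bdp == NULL)
                return NULL;
        return XFS_BHVTOI(bdp);
}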
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index c4b20872f07..a59c102cf21 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -238,6 +238,7 @@ xfs_bioerror_relse(
238 } 238 }
239 return (EIO); 239 return (EIO);
240} 240}
241
241/* 242/*
242 * Prints out an ALERT message about I/O error. 243 * Prints out an ALERT message about I/O error.
243 */ 244 */
@@ -252,11 +253,9 @@ xfs_ioerror_alert(
252 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" 253 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx"
253 " (\"%s\") error %d buf count %zd", 254 " (\"%s\") error %d buf count %zd",
254 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname, 255 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
255 XFS_BUFTARG_NAME(bp->pb_target), 256 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
256 (__uint64_t)blkno, 257 (__uint64_t)blkno, func,
257 func, 258 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
258 XFS_BUF_GETERROR(bp),
259 XFS_BUF_COUNT(bp));
260} 259}
261 260
262/* 261/*
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 4a17d335f89..bf168a91ddb 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -68,18 +68,6 @@ struct xfs_mount;
68 (XFS_SB_VERSION_NUMBITS | \ 68 (XFS_SB_VERSION_NUMBITS | \
69 XFS_SB_VERSION_OKREALFBITS | \ 69 XFS_SB_VERSION_OKREALFBITS | \
70 XFS_SB_VERSION_OKSASHFBITS) 70 XFS_SB_VERSION_OKSASHFBITS)
71#define XFS_SB_VERSION_MKFS(ia,dia,extflag,dirv2,na,sflag,morebits) \
72 (((ia) || (dia) || (extflag) || (dirv2) || (na) || (sflag) || \
73 (morebits)) ? \
74 (XFS_SB_VERSION_4 | \
75 ((ia) ? XFS_SB_VERSION_ALIGNBIT : 0) | \
76 ((dia) ? XFS_SB_VERSION_DALIGNBIT : 0) | \
77 ((extflag) ? XFS_SB_VERSION_EXTFLGBIT : 0) | \
78 ((dirv2) ? XFS_SB_VERSION_DIRV2BIT : 0) | \
79 ((na) ? XFS_SB_VERSION_LOGV2BIT : 0) | \
80 ((sflag) ? XFS_SB_VERSION_SECTORBIT : 0) | \
81 ((morebits) ? XFS_SB_VERSION_MOREBITSBIT : 0)) : \
82 XFS_SB_VERSION_1)
83 71
84/* 72/*
85 * There are two words to hold XFS "feature" bits: the original 73 * There are two words to hold XFS "feature" bits: the original
@@ -105,11 +93,6 @@ struct xfs_mount;
105 (XFS_SB_VERSION2_OKREALFBITS | \ 93 (XFS_SB_VERSION2_OKREALFBITS | \
106 XFS_SB_VERSION2_OKSASHFBITS ) 94 XFS_SB_VERSION2_OKSASHFBITS )
107 95
108/*
109 * mkfs macro to set up sb_features2 word
110 */
111#define XFS_SB_VERSION2_MKFS(resvd1, sbcntr) 0
112
113typedef struct xfs_sb 96typedef struct xfs_sb
114{ 97{
115 __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ 98 __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 279e043d732..d3d714e6b32 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1014,6 +1014,7 @@ xfs_trans_cancel(
1014 xfs_log_item_t *lip; 1014 xfs_log_item_t *lip;
1015 int i; 1015 int i;
1016#endif 1016#endif
1017 xfs_mount_t *mp = tp->t_mountp;
1017 1018
1018 /* 1019 /*
1019 * See if the caller is being too lazy to figure out if 1020 * See if the caller is being too lazy to figure out if
@@ -1026,9 +1027,10 @@ xfs_trans_cancel(
1026 * filesystem. This happens in paths where we detect 1027 * filesystem. This happens in paths where we detect
1027 * corruption and decide to give up. 1028 * corruption and decide to give up.
1028 */ 1029 */
1029 if ((tp->t_flags & XFS_TRANS_DIRTY) && 1030 if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
1030 !XFS_FORCED_SHUTDOWN(tp->t_mountp)) 1031 XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
1031 xfs_force_shutdown(tp->t_mountp, XFS_CORRUPT_INCORE); 1032 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
1033 }
1032#ifdef DEBUG 1034#ifdef DEBUG
1033 if (!(flags & XFS_TRANS_ABORT)) { 1035 if (!(flags & XFS_TRANS_ABORT)) {
1034 licp = &(tp->t_items); 1036 licp = &(tp->t_items);
@@ -1040,7 +1042,7 @@ xfs_trans_cancel(
1040 } 1042 }
1041 1043
1042 lip = lidp->lid_item; 1044 lip = lidp->lid_item;
1043 if (!XFS_FORCED_SHUTDOWN(tp->t_mountp)) 1045 if (!XFS_FORCED_SHUTDOWN(mp))
1044 ASSERT(!(lip->li_type == XFS_LI_EFD)); 1046 ASSERT(!(lip->li_type == XFS_LI_EFD));
1045 } 1047 }
1046 licp = licp->lic_next; 1048 licp = licp->lic_next;
@@ -1048,7 +1050,7 @@ xfs_trans_cancel(
1048 } 1050 }
1049#endif 1051#endif
1050 xfs_trans_unreserve_and_mod_sb(tp); 1052 xfs_trans_unreserve_and_mod_sb(tp);
1051 XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp); 1053 XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
1052 1054
1053 if (tp->t_ticket) { 1055 if (tp->t_ticket) {
1054 if (flags & XFS_TRANS_RELEASE_LOG_RES) { 1056 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
@@ -1057,7 +1059,7 @@ xfs_trans_cancel(
1057 } else { 1059 } else {
1058 log_flags = 0; 1060 log_flags = 0;
1059 } 1061 }
1060 xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); 1062 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
1061 } 1063 }
1062 1064
1063 /* mark this thread as no longer being in a transaction */ 1065 /* mark this thread as no longer being in a transaction */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a889963fdd1..d77901c07f6 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -973,7 +973,6 @@ void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
973void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); 973void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
974void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); 974void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
975void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); 975void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
976void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
977void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); 976void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
978void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 977void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
979void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 978void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 486147ef0e3..1117d600d74 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -78,7 +78,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
78 lidp->lid_size = 0; 78 lidp->lid_size = 0;
79 lip->li_desc = lidp; 79 lip->li_desc = lidp;
80 lip->li_mountp = tp->t_mountp; 80 lip->li_mountp = tp->t_mountp;
81 return (lidp); 81 return lidp;
82 } 82 }
83 83
84 /* 84 /*
@@ -119,7 +119,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
119 lidp->lid_size = 0; 119 lidp->lid_size = 0;
120 lip->li_desc = lidp; 120 lip->li_desc = lidp;
121 lip->li_mountp = tp->t_mountp; 121 lip->li_mountp = tp->t_mountp;
122 return (lidp); 122 return lidp;
123} 123}
124 124
125/* 125/*
@@ -180,7 +180,7 @@ xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip)
180{ 180{
181 ASSERT(lip->li_desc != NULL); 181 ASSERT(lip->li_desc != NULL);
182 182
183 return (lip->li_desc); 183 return lip->li_desc;
184} 184}
185 185
186 186
@@ -219,10 +219,10 @@ xfs_trans_first_item(xfs_trans_t *tp)
219 continue; 219 continue;
220 } 220 }
221 221
222 return (XFS_LIC_SLOT(licp, i)); 222 return XFS_LIC_SLOT(licp, i);
223 } 223 }
224 cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item"); 224 cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
225 return(NULL); 225 return NULL;
226} 226}
227 227
228 228
@@ -252,7 +252,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
252 continue; 252 continue;
253 } 253 }
254 254
255 return (XFS_LIC_SLOT(licp, i)); 255 return XFS_LIC_SLOT(licp, i);
256 } 256 }
257 257
258 /* 258 /*
@@ -261,7 +261,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
261 * If there is no next chunk, return NULL. 261 * If there is no next chunk, return NULL.
262 */ 262 */
263 if (licp->lic_next == NULL) { 263 if (licp->lic_next == NULL) {
264 return (NULL); 264 return NULL;
265 } 265 }
266 266
267 licp = licp->lic_next; 267 licp = licp->lic_next;
@@ -271,7 +271,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
271 continue; 271 continue;
272 } 272 }
273 273
274 return (XFS_LIC_SLOT(licp, i)); 274 return XFS_LIC_SLOT(licp, i);
275 } 275 }
276 ASSERT(0); 276 ASSERT(0);
277 /* NOTREACHED */ 277 /* NOTREACHED */
@@ -425,7 +425,7 @@ xfs_trans_unlock_chunk(
425 } 425 }
426 } 426 }
427 427
428 return (freed); 428 return freed;
429} 429}
430 430
431 431
@@ -478,7 +478,7 @@ xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
478 */ 478 */
479 lbsp->lbc_ag = ag; 479 lbsp->lbc_ag = ag;
480 lbsp->lbc_idx = idx; 480 lbsp->lbc_idx = idx;
481 return (lbsp); 481 return lbsp;
482 } 482 }
483 483
484 /* 484 /*
@@ -512,7 +512,7 @@ xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
512 tp->t_busy_free--; 512 tp->t_busy_free--;
513 lbsp->lbc_ag = ag; 513 lbsp->lbc_ag = ag;
514 lbsp->lbc_idx = idx; 514 lbsp->lbc_idx = idx;
515 return (lbsp); 515 return lbsp;
516} 516}
517 517
518 518
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index fefe1d60377..34654ec6ae1 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -55,16 +55,13 @@ xfs_get_dir_entry(
55 xfs_inode_t **ipp) 55 xfs_inode_t **ipp)
56{ 56{
57 vnode_t *vp; 57 vnode_t *vp;
58 bhv_desc_t *bdp;
59 58
60 vp = VNAME_TO_VNODE(dentry); 59 vp = VNAME_TO_VNODE(dentry);
61 bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops); 60
62 if (!bdp) { 61 *ipp = xfs_vtoi(vp);
63 *ipp = NULL; 62 if (!*ipp)
64 return XFS_ERROR(ENOENT); 63 return XFS_ERROR(ENOENT);
65 }
66 VN_HOLD(vp); 64 VN_HOLD(vp);
67 *ipp = XFS_BHVTOI(bdp);
68 return 0; 65 return 0;
69} 66}
70 67
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 7bdbd991ab1..b6ad370fab3 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -53,6 +53,7 @@
53#include "xfs_acl.h" 53#include "xfs_acl.h"
54#include "xfs_attr.h" 54#include "xfs_attr.h"
55#include "xfs_clnt.h" 55#include "xfs_clnt.h"
56#include "xfs_fsops.h"
56 57
57STATIC int xfs_sync(bhv_desc_t *, int, cred_t *); 58STATIC int xfs_sync(bhv_desc_t *, int, cred_t *);
58 59
@@ -290,8 +291,8 @@ xfs_start_flags(
290 mp->m_flags |= XFS_MOUNT_IDELETE; 291 mp->m_flags |= XFS_MOUNT_IDELETE;
291 if (ap->flags & XFSMNT_DIRSYNC) 292 if (ap->flags & XFSMNT_DIRSYNC)
292 mp->m_flags |= XFS_MOUNT_DIRSYNC; 293 mp->m_flags |= XFS_MOUNT_DIRSYNC;
293 if (ap->flags & XFSMNT_COMPAT_ATTR) 294 if (ap->flags & XFSMNT_ATTR2)
294 mp->m_flags |= XFS_MOUNT_COMPAT_ATTR; 295 mp->m_flags |= XFS_MOUNT_ATTR2;
295 296
296 if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) 297 if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
297 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 298 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
@@ -312,6 +313,8 @@ xfs_start_flags(
312 mp->m_flags |= XFS_MOUNT_NOUUID; 313 mp->m_flags |= XFS_MOUNT_NOUUID;
313 if (ap->flags & XFSMNT_BARRIER) 314 if (ap->flags & XFSMNT_BARRIER)
314 mp->m_flags |= XFS_MOUNT_BARRIER; 315 mp->m_flags |= XFS_MOUNT_BARRIER;
316 else
317 mp->m_flags &= ~XFS_MOUNT_BARRIER;
315 318
316 return 0; 319 return 0;
317} 320}
@@ -330,10 +333,11 @@ xfs_finish_flags(
330 333
331 /* Fail a mount where the logbuf is smaller than the log stripe */ 334 /* Fail a mount where the logbuf is smaller than the log stripe */
332 if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) { 335 if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
333 if ((ap->logbufsize == -1) && 336 if ((ap->logbufsize <= 0) &&
334 (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { 337 (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) {
335 mp->m_logbsize = mp->m_sb.sb_logsunit; 338 mp->m_logbsize = mp->m_sb.sb_logsunit;
336 } else if (ap->logbufsize < mp->m_sb.sb_logsunit) { 339 } else if (ap->logbufsize > 0 &&
340 ap->logbufsize < mp->m_sb.sb_logsunit) {
337 cmn_err(CE_WARN, 341 cmn_err(CE_WARN,
338 "XFS: logbuf size must be greater than or equal to log stripe size"); 342 "XFS: logbuf size must be greater than or equal to log stripe size");
339 return XFS_ERROR(EINVAL); 343 return XFS_ERROR(EINVAL);
@@ -347,6 +351,10 @@ xfs_finish_flags(
347 } 351 }
348 } 352 }
349 353
354 if (XFS_SB_VERSION_HASATTR2(&mp->m_sb)) {
355 mp->m_flags |= XFS_MOUNT_ATTR2;
356 }
357
350 /* 358 /*
351 * prohibit r/w mounts of read-only filesystems 359 * prohibit r/w mounts of read-only filesystems
352 */ 360 */
@@ -382,10 +390,6 @@ xfs_finish_flags(
382 return XFS_ERROR(EINVAL); 390 return XFS_ERROR(EINVAL);
383 } 391 }
384 392
385 if (XFS_SB_VERSION_HASATTR2(&mp->m_sb)) {
386 mp->m_flags &= ~XFS_MOUNT_COMPAT_ATTR;
387 }
388
389 return 0; 393 return 0;
390} 394}
391 395
@@ -504,13 +508,13 @@ xfs_mount(
504 if (error) 508 if (error)
505 goto error2; 509 goto error2;
506 510
511 if ((mp->m_flags & XFS_MOUNT_BARRIER) && !(vfsp->vfs_flag & VFS_RDONLY))
512 xfs_mountfs_check_barriers(mp);
513
507 error = XFS_IOINIT(vfsp, args, flags); 514 error = XFS_IOINIT(vfsp, args, flags);
508 if (error) 515 if (error)
509 goto error2; 516 goto error2;
510 517
511 if ((args->flags & XFSMNT_BARRIER) &&
512 !(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY))
513 xfs_mountfs_check_barriers(mp);
514 return 0; 518 return 0;
515 519
516error2: 520error2:
@@ -655,6 +659,11 @@ xfs_mntupdate(
655 else 659 else
656 mp->m_flags &= ~XFS_MOUNT_NOATIME; 660 mp->m_flags &= ~XFS_MOUNT_NOATIME;
657 661
662 if (args->flags & XFSMNT_BARRIER)
663 mp->m_flags |= XFS_MOUNT_BARRIER;
664 else
665 mp->m_flags &= ~XFS_MOUNT_BARRIER;
666
658 if ((vfsp->vfs_flag & VFS_RDONLY) && 667 if ((vfsp->vfs_flag & VFS_RDONLY) &&
659 !(*flags & MS_RDONLY)) { 668 !(*flags & MS_RDONLY)) {
660 vfsp->vfs_flag &= ~VFS_RDONLY; 669 vfsp->vfs_flag &= ~VFS_RDONLY;
@@ -1634,6 +1643,7 @@ xfs_vget(
1634#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ 1643#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */
1635#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and 1644#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and
1636 * unwritten extent conversion */ 1645 * unwritten extent conversion */
1646#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
1637#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ 1647#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
1638#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ 1648#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
1639#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ 1649#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
@@ -1680,7 +1690,6 @@ xfs_parseargs(
1680 int iosize; 1690 int iosize;
1681 1691
1682 args->flags2 |= XFSMNT2_COMPAT_IOSIZE; 1692 args->flags2 |= XFSMNT2_COMPAT_IOSIZE;
1683 args->flags |= XFSMNT_COMPAT_ATTR;
1684 1693
1685#if 0 /* XXX: off by default, until some remaining issues ironed out */ 1694#if 0 /* XXX: off by default, until some remaining issues ironed out */
1686 args->flags |= XFSMNT_IDELETE; /* default to on */ 1695 args->flags |= XFSMNT_IDELETE; /* default to on */
@@ -1806,6 +1815,8 @@ xfs_parseargs(
1806 args->flags |= XFSMNT_NOUUID; 1815 args->flags |= XFSMNT_NOUUID;
1807 } else if (!strcmp(this_char, MNTOPT_BARRIER)) { 1816 } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
1808 args->flags |= XFSMNT_BARRIER; 1817 args->flags |= XFSMNT_BARRIER;
1818 } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
1819 args->flags &= ~XFSMNT_BARRIER;
1809 } else if (!strcmp(this_char, MNTOPT_IKEEP)) { 1820 } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
1810 args->flags &= ~XFSMNT_IDELETE; 1821 args->flags &= ~XFSMNT_IDELETE;
1811 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { 1822 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
@@ -1815,9 +1826,9 @@ xfs_parseargs(
1815 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { 1826 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
1816 args->flags2 |= XFSMNT2_COMPAT_IOSIZE; 1827 args->flags2 |= XFSMNT2_COMPAT_IOSIZE;
1817 } else if (!strcmp(this_char, MNTOPT_ATTR2)) { 1828 } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
1818 args->flags &= ~XFSMNT_COMPAT_ATTR; 1829 args->flags |= XFSMNT_ATTR2;
1819 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { 1830 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
1820 args->flags |= XFSMNT_COMPAT_ATTR; 1831 args->flags &= ~XFSMNT_ATTR2;
1821 } else if (!strcmp(this_char, "osyncisdsync")) { 1832 } else if (!strcmp(this_char, "osyncisdsync")) {
1822 /* no-op, this is now the default */ 1833 /* no-op, this is now the default */
1823printk("XFS: osyncisdsync is now the default, option is deprecated.\n"); 1834printk("XFS: osyncisdsync is now the default, option is deprecated.\n");
@@ -1892,7 +1903,6 @@ xfs_showargs(
1892 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, 1903 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
1893 { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, 1904 { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY },
1894 { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC }, 1905 { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC },
1895 { XFS_MOUNT_BARRIER, "," MNTOPT_BARRIER },
1896 { XFS_MOUNT_IDELETE, "," MNTOPT_NOIKEEP }, 1906 { XFS_MOUNT_IDELETE, "," MNTOPT_NOIKEEP },
1897 { 0, NULL } 1907 { 0, NULL }
1898 }; 1908 };
@@ -1914,33 +1924,28 @@ xfs_showargs(
1914 1924
1915 if (mp->m_logbufs > 0) 1925 if (mp->m_logbufs > 0)
1916 seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); 1926 seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs);
1917
1918 if (mp->m_logbsize > 0) 1927 if (mp->m_logbsize > 0)
1919 seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); 1928 seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
1920 1929
1921 if (mp->m_logname) 1930 if (mp->m_logname)
1922 seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); 1931 seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
1923
1924 if (mp->m_rtname) 1932 if (mp->m_rtname)
1925 seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); 1933 seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
1926 1934
1927 if (mp->m_dalign > 0) 1935 if (mp->m_dalign > 0)
1928 seq_printf(m, "," MNTOPT_SUNIT "=%d", 1936 seq_printf(m, "," MNTOPT_SUNIT "=%d",
1929 (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); 1937 (int)XFS_FSB_TO_BB(mp, mp->m_dalign));
1930
1931 if (mp->m_swidth > 0) 1938 if (mp->m_swidth > 0)
1932 seq_printf(m, "," MNTOPT_SWIDTH "=%d", 1939 seq_printf(m, "," MNTOPT_SWIDTH "=%d",
1933 (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); 1940 (int)XFS_FSB_TO_BB(mp, mp->m_swidth));
1934 1941
1935 if (!(mp->m_flags & XFS_MOUNT_COMPAT_ATTR))
1936 seq_printf(m, "," MNTOPT_ATTR2);
1937
1938 if (!(mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)) 1942 if (!(mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE))
1939 seq_printf(m, "," MNTOPT_LARGEIO); 1943 seq_printf(m, "," MNTOPT_LARGEIO);
1944 if (mp->m_flags & XFS_MOUNT_BARRIER)
1945 seq_printf(m, "," MNTOPT_BARRIER);
1940 1946
1941 if (!(vfsp->vfs_flag & VFS_32BITINODES)) 1947 if (!(vfsp->vfs_flag & VFS_32BITINODES))
1942 seq_printf(m, "," MNTOPT_64BITINODE); 1948 seq_printf(m, "," MNTOPT_64BITINODE);
1943
1944 if (vfsp->vfs_flag & VFS_GRPID) 1949 if (vfsp->vfs_flag & VFS_GRPID)
1945 seq_printf(m, "," MNTOPT_GRPID); 1950 seq_printf(m, "," MNTOPT_GRPID);
1946 1951
@@ -1959,6 +1964,7 @@ xfs_freeze(
1959 /* Push the superblock and write an unmount record */ 1964 /* Push the superblock and write an unmount record */
1960 xfs_log_unmount_write(mp); 1965 xfs_log_unmount_write(mp);
1961 xfs_unmountfs_writesb(mp); 1966 xfs_unmountfs_writesb(mp);
1967 xfs_fs_log_dummy(mp);
1962} 1968}
1963 1969
1964 1970
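
The xfs_vfsops.c hunks above make write barriers a live, toggleable option: a nobarrier keyword is parsed alongside barrier, remount (xfs_mntupdate) sets or clears XFS_MOUNT_BARRIER, and xfs_showargs() reports the option from the current m_flags bit instead of the static table, so /proc/mounts follows the effective state. A self-contained sketch of that parse/report round-trip, with stand-in names:

#include <stdio.h>
#include <string.h>

#define MNT_BARRIER     0x1     /* stand-in for XFS_MOUNT_BARRIER */

/* Parse one option token; the last occurrence wins, as in xfs_parseargs(). */
static void parse_opt(const char *opt, unsigned int *flags)
{
        if (!strcmp(opt, "barrier"))
                *flags |= MNT_BARRIER;
        else if (!strcmp(opt, "nobarrier"))
                *flags &= ~MNT_BARRIER;
}

/* Report options from the live flags word, as xfs_showargs() now does. */
static void show_opts(unsigned int flags, FILE *m)
{
        if (flags & MNT_BARRIER)
                fprintf(m, ",barrier");
}

int main(void)
{
        unsigned int flags = 0;

        parse_opt("barrier", &flags);
        parse_opt("nobarrier", &flags); /* later option overrides */
        show_opts(flags, stdout);       /* prints nothing: barrier is off */
        return 0;
}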
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 7c1f7453146..eaab355f5a8 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -15,6 +15,9 @@
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18
19#include <linux/capability.h>
20
18#include "xfs.h" 21#include "xfs.h"
19#include "xfs_fs.h" 22#include "xfs_fs.h"
20#include "xfs_types.h" 23#include "xfs_types.h"
@@ -182,8 +185,7 @@ xfs_getattr(
182 break; 185 break;
183 } 186 }
184 187
185 vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec; 188 vn_atime_to_timespec(vp, &vap->va_atime);
186 vap->va_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
187 vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec; 189 vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
188 vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; 190 vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
189 vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec; 191 vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
@@ -336,7 +338,7 @@ xfs_setattr(
 		code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
 					 &udqp, &gdqp);
 		if (code)
-			return (code);
+			return code;
 	}
 
 	/*
@@ -541,24 +543,6 @@ xfs_setattr(
 	}
 
 	/*
-	 * Can't set extent size unless the file is marked, or
-	 * about to be marked as a realtime file.
-	 *
-	 * This check will be removed when fixed size extents
-	 * with buffered data writes is implemented.
-	 *
-	 */
-	if ((mask & XFS_AT_EXTSIZE) &&
-	    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
-	     vap->va_extsize) &&
-	    (!((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
-	       ((mask & XFS_AT_XFLAGS) &&
-		(vap->va_xflags & XFS_XFLAG_REALTIME))))) {
-		code = XFS_ERROR(EINVAL);
-		goto error_return;
-	}
-
-	/*
 	 * Can't change realtime flag if any extents are allocated.
 	 */
 	if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
@@ -820,13 +804,17 @@ xfs_setattr(
 				di_flags |= XFS_DIFLAG_RTINHERIT;
 			if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
 				di_flags |= XFS_DIFLAG_NOSYMLINKS;
-		} else {
+			if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
+				di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+		} else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
 			if (vap->va_xflags & XFS_XFLAG_REALTIME) {
 				di_flags |= XFS_DIFLAG_REALTIME;
 				ip->i_iocore.io_flags |= XFS_IOCORE_RT;
 			} else {
 				ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
 			}
+			if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
+				di_flags |= XFS_DIFLAG_EXTSIZE;
 		}
 		ip->i_d.di_flags = di_flags;
 	}
@@ -996,10 +984,6 @@ xfs_readlink(
 		goto error_return;
 	}
 
-	if (!(ioflags & IO_INVIS)) {
-		xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
-	}
-
 	/*
 	 * See if the symlink is stored inline.
 	 */
@@ -1043,11 +1027,8 @@ xfs_readlink(
 
 	}
 
-
 error_return:
-
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
 	return error;
 }
 
@@ -1222,7 +1203,7 @@ xfs_inactive_free_eofblocks(
 	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
 	map_len = last_fsb - end_fsb;
 	if (map_len <= 0)
-		return (0);
+		return 0;
 
 	nimaps = 1;
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -1231,12 +1212,13 @@ xfs_inactive_free_eofblocks(
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (!error && (nimaps != 0) &&
-	    (imap.br_startblock != HOLESTARTBLOCK)) {
+	    (imap.br_startblock != HOLESTARTBLOCK ||
+	     ip->i_delayed_blks)) {
 		/*
 		 * Attach the dquots to the inode up front.
 		 */
 		if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
-			return (error);
+			return error;
 
 		/*
 		 * There are blocks after the end of file.
@@ -1264,7 +1246,7 @@ xfs_inactive_free_eofblocks(
 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
 			xfs_trans_cancel(tp, 0);
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-			return (error);
+			return error;
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1292,7 +1274,7 @@ xfs_inactive_free_eofblocks(
 		}
 		xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 	}
-	return (error);
+	return error;
 }
 
 /*
@@ -1470,7 +1452,7 @@ xfs_inactive_symlink_local(
 	if (error) {
 		xfs_trans_cancel(*tpp, 0);
 		*tpp = NULL;
-		return (error);
+		return error;
 	}
 	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
@@ -1483,7 +1465,7 @@ xfs_inactive_symlink_local(
 				  XFS_DATA_FORK);
 		ASSERT(ip->i_df.if_bytes == 0);
 	}
-	return (0);
+	return 0;
 }
 
 /*
@@ -1509,7 +1491,7 @@ xfs_inactive_attrs(
 	if (error) {
 		*tpp = NULL;
 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		return (error); /* goto out*/
+		return error;	/* goto out */
 	}
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
@@ -1522,7 +1504,7 @@ xfs_inactive_attrs(
 		xfs_trans_cancel(tp, 0);
 		*tpp = NULL;
 		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		return (error);
+		return error;
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1533,7 +1515,7 @@ xfs_inactive_attrs(
 	ASSERT(ip->i_d.di_anextents == 0);
 
 	*tpp = tp;
-	return (0);
+	return 0;
 }
 
 STATIC int
@@ -1566,11 +1548,13 @@ xfs_release(
 
 	if (ip->i_d.di_nlink != 0) {
 		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
+		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
+		       ip->i_delayed_blks > 0)) &&
 		     (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
-		    (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)))) {
+		    (!(ip->i_d.di_flags &
+				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
 			if ((error = xfs_inactive_free_eofblocks(mp, ip)))
-				return (error);
+				return error;
 			/* Update linux inode block count after free above */
 			LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
 				ip->i_d.di_nblocks + ip->i_delayed_blks);
@@ -1625,7 +1609,8 @@ xfs_inactive(
 	 * only one with a reference to the inode.
 	 */
 	truncate = ((ip->i_d.di_nlink == 0) &&
-	    ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) &&
+	    ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0) ||
+	     (ip->i_delayed_blks > 0)) &&
 	    ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
 
 	mp = ip->i_mount;
@@ -1643,12 +1628,14 @@ xfs_inactive(
 
 	if (ip->i_d.di_nlink != 0) {
 		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
-		     (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
-		    (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)) ||
-		     (ip->i_delayed_blks != 0))) {
+		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
+		       ip->i_delayed_blks > 0)) &&
+		     (ip->i_df.if_flags & XFS_IFEXTENTS) &&
+		    (!(ip->i_d.di_flags &
+				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
+		     (ip->i_delayed_blks != 0)))) {
 			if ((error = xfs_inactive_free_eofblocks(mp, ip)))
-				return (VN_INACTIVE_CACHE);
+				return VN_INACTIVE_CACHE;
 			/* Update linux inode block count after free above */
 			LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
 				ip->i_d.di_nblocks + ip->i_delayed_blks);
@@ -1659,7 +1646,7 @@ xfs_inactive(
 	ASSERT(ip->i_d.di_nlink == 0);
 
 	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
-		return (VN_INACTIVE_CACHE);
+		return VN_INACTIVE_CACHE;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 	if (truncate) {
@@ -1682,7 +1669,7 @@ xfs_inactive(
 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
 			xfs_trans_cancel(tp, 0);
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-			return (VN_INACTIVE_CACHE);
+			return VN_INACTIVE_CACHE;
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1703,7 +1690,7 @@ xfs_inactive(
 			xfs_trans_cancel(tp,
 				XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-			return (VN_INACTIVE_CACHE);
+			return VN_INACTIVE_CACHE;
 		}
 	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
 
@@ -1717,7 +1704,7 @@ xfs_inactive(
 
 		if (error) {
 			ASSERT(tp == NULL);
-			return (VN_INACTIVE_CACHE);
+			return VN_INACTIVE_CACHE;
 		}
 
 		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
@@ -1730,7 +1717,7 @@ xfs_inactive(
 		if (error) {
 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
 			xfs_trans_cancel(tp, 0);
-			return (VN_INACTIVE_CACHE);
+			return VN_INACTIVE_CACHE;
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
@@ -1752,7 +1739,7 @@ xfs_inactive(
 		 * cancelled, and the inode is unlocked.  Just get out.
 		 */
 		if (error)
-			return (VN_INACTIVE_CACHE);
+			return VN_INACTIVE_CACHE;
 	} else if (ip->i_afp) {
 		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 	}
@@ -2059,8 +2046,8 @@ std_return:
 abort_return:
 	cancel_flags |= XFS_TRANS_ABORT;
 	/* FALLTHROUGH */
-error_return:
 
+error_return:
 	if (tp != NULL)
 		xfs_trans_cancel(tp, cancel_flags);
 
@@ -2590,7 +2577,6 @@ xfs_link(
 	int			cancel_flags;
 	int			committed;
 	vnode_t			*target_dir_vp;
-	bhv_desc_t		*src_bdp;
 	int			resblks;
 	char			*target_name = VNAME(dentry);
 	int			target_namelen;
@@ -2603,8 +2589,7 @@ xfs_link(
 	if (VN_ISDIR(src_vp))
 		return XFS_ERROR(EPERM);
 
-	src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops);
-	sip = XFS_BHVTOI(src_bdp);
+	sip = xfs_vtoi(src_vp);
 	tdp = XFS_BHVTOI(target_dir_bdp);
 	mp = tdp->i_mount;
 	if (XFS_FORCED_SHUTDOWN(mp))
@@ -2736,9 +2721,9 @@ std_return:
 abort_return:
 	cancel_flags |= XFS_TRANS_ABORT;
 	/* FALLTHROUGH */
+
 error_return:
 	xfs_trans_cancel(tp, cancel_flags);
-
 	goto std_return;
 }
 /*
@@ -3211,10 +3196,12 @@ std_return:
 	}
 	return error;
 
- error1:
+error1:
 	xfs_bmap_cancel(&free_list);
 	cancel_flags |= XFS_TRANS_ABORT;
- error_return:
+	/* FALLTHROUGH */
+
+error_return:
 	xfs_trans_cancel(tp, cancel_flags);
 	goto std_return;
 }
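Both relabelings above restore the layered-unwind idiom: the deeper failure path (error1) undoes its extra step, then falls through into the common error_return cleanup. A standalone sketch of the same shape; all names here are illustrative stand-ins, not code from the tree:

/* Illustrative only: models the error1 -> error_return fallthrough. */
#include <stdlib.h>

static int do_op(int fail_deep)
{
	char *trans = malloc(64);	/* stands in for xfs_trans_alloc() */
	char *map = NULL;

	if (!trans)
		return -1;
	map = malloc(64);		/* stands in for the bmap free list */
	if (!map)
		goto error_return;
	if (fail_deep)			/* stands in for xfs_bmapi() failing */
		goto error1;

	free(map);
	free(trans);
	return 0;

error1:					/* undo the deeper step first */
	free(map);
	/* FALLTHROUGH */

error_return:				/* common cleanup */
	free(trans);
	return -1;
}

int main(void)
{
	return do_op(1) == -1 ? 0 : 1;
}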
@@ -3237,7 +3224,6 @@ xfs_readdir(
 	xfs_trans_t	*tp = NULL;
 	int		error = 0;
 	uint		lock_mode;
-	xfs_off_t	start_offset;
 
 	vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
 					       (inst_t *)__return_address);
@@ -3248,11 +3234,7 @@ xfs_readdir(
 	}
 
 	lock_mode = xfs_ilock_map_shared(dp);
-	start_offset = uiop->uio_offset;
 	error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
-	if (start_offset != uiop->uio_offset) {
-		xfs_ichgtime(dp, XFS_ICHGTIME_ACC);
-	}
 	xfs_iunlock_map_shared(dp, lock_mode);
 	return error;
 }
@@ -3635,9 +3617,9 @@ xfs_rwlock(
 	if (locktype == VRWLOCK_WRITE) {
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 	} else if (locktype == VRWLOCK_TRY_READ) {
-		return (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED));
+		return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
 	} else if (locktype == VRWLOCK_TRY_WRITE) {
-		return (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL));
+		return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
 	} else {
 		ASSERT((locktype == VRWLOCK_READ) ||
 		       (locktype == VRWLOCK_WRITE_DIRECT));
@@ -3829,7 +3811,12 @@ xfs_reclaim(
 	vn_iowait(vp);
 
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
-	ASSERT(VN_CACHED(vp) == 0);
+
+	/*
+	 * Make sure the atime in the XFS inode is correct before freeing the
+	 * Linux inode.
+	 */
+	xfs_synchronize_atime(ip);
 
 	/* If we have nothing to flush with this inode then complete the
 	 * teardown now, otherwise break the link between the xfs inode
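xfs_synchronize_atime() is introduced elsewhere in this merge; the hunk only shows its call site. A hedged sketch of what it plausibly does, copying the authoritative Linux-inode atime back into the on-disk fields before the Linux inode is torn down. XFS_ITOV_NULL and the exact body are assumptions based on surrounding code; di_atime.t_sec/t_nsec and LINVFS_GET_IP appear in the hunks above:

/* Assumed implementation sketch, not quoted from the tree. */
STATIC void
xfs_synchronize_atime_sketch(xfs_inode_t *ip)
{
	vnode_t		*vp = XFS_ITOV_NULL(ip);

	if (vp) {
		struct inode	*inode = LINVFS_GET_IP(vp);

		ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
		ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
	}
}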
@@ -3880,7 +3867,7 @@ xfs_finish_reclaim(
 			xfs_ifunlock(ip);
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		}
-		return(1);
+		return 1;
 	}
 	ip->i_flags |= XFS_IRECLAIM;
 	write_unlock(&ih->ih_lock);
@@ -3958,8 +3945,9 @@ xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
 				}
 			}
 			XFS_MOUNT_IUNLOCK(mp);
-			xfs_finish_reclaim(ip, noblock,
-				XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+			if (xfs_finish_reclaim(ip, noblock,
+					XFS_IFLUSH_DELWRI_ELSE_ASYNC))
+				delay(1);
 			purged = 1;
 			break;
 		}
@@ -3998,42 +3986,36 @@ xfs_alloc_file_space(
 	int			alloc_type,
 	int			attr_flags)
 {
+	xfs_mount_t		*mp = ip->i_mount;
+	xfs_off_t		count;
 	xfs_filblks_t		allocated_fsb;
 	xfs_filblks_t		allocatesize_fsb;
-	int			committed;
-	xfs_off_t		count;
-	xfs_filblks_t		datablocks;
-	int			error;
+	xfs_extlen_t		extsz, temp;
+	xfs_fileoff_t		startoffset_fsb;
 	xfs_fsblock_t		firstfsb;
-	xfs_bmap_free_t		free_list;
-	xfs_bmbt_irec_t		*imapp;
-	xfs_bmbt_irec_t		imaps[1];
-	xfs_mount_t		*mp;
-	int			numrtextents;
-	int			reccount;
-	uint			resblks;
+	int			nimaps;
+	int			bmapi_flag;
+	int			quota_flag;
 	int			rt;
-	int			rtextsize;
-	xfs_fileoff_t		startoffset_fsb;
 	xfs_trans_t		*tp;
-	int			xfs_bmapi_flags;
+	xfs_bmbt_irec_t		imaps[1], *imapp;
+	xfs_bmap_free_t		free_list;
+	uint			qblocks, resblks, resrtextents;
+	int			committed;
+	int			error;
 
 	vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
-	mp = ip->i_mount;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	/*
-	 * determine if this is a realtime file
-	 */
-	if ((rt = XFS_IS_REALTIME_INODE(ip)) != 0) {
-		if (ip->i_d.di_extsize)
-			rtextsize = ip->i_d.di_extsize;
-		else
-			rtextsize = mp->m_sb.sb_rextsize;
-	} else
-		rtextsize = 0;
+	rt = XFS_IS_REALTIME_INODE(ip);
+	if (unlikely(rt)) {
+		if (!(extsz = ip->i_d.di_extsize))
+			extsz = mp->m_sb.sb_rextsize;
+	} else {
+		extsz = ip->i_d.di_extsize;
+	}
 
 	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
 		return error;
@@ -4044,8 +4026,8 @@ xfs_alloc_file_space(
 	count = len;
 	error = 0;
 	imapp = &imaps[0];
-	reccount = 1;
-	xfs_bmapi_flags = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
+	nimaps = 1;
+	bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
 	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
@@ -4062,47 +4044,55 @@ xfs_alloc_file_space(
 				offset, end_dmi_offset - offset,
 				0, NULL);
 		if (error)
-			return(error);
+			return error;
 	}
 
 	/*
-	 * allocate file space until done or until there is an error
+	 * Allocate file space until done or until there is an error
 	 */
 retry:
 	while (allocatesize_fsb && !error) {
+		xfs_fileoff_t	s, e;
+
 		/*
-		 * determine if reserving space on
-		 * the data or realtime partition.
+		 * Determine space reservations for data/realtime.
 		 */
-		if (rt) {
-			xfs_fileoff_t s, e;
-
+		if (unlikely(extsz)) {
 			s = startoffset_fsb;
-			do_div(s, rtextsize);
-			s *= rtextsize;
-			e = roundup_64(startoffset_fsb + allocatesize_fsb,
-				rtextsize);
-			numrtextents = (int)(e - s) / mp->m_sb.sb_rextsize;
-			datablocks = 0;
+			do_div(s, extsz);
+			s *= extsz;
+			e = startoffset_fsb + allocatesize_fsb;
+			if ((temp = do_mod(startoffset_fsb, extsz)))
+				e += temp;
+			if ((temp = do_mod(e, extsz)))
+				e += extsz - temp;
 		} else {
-			datablocks = allocatesize_fsb;
-			numrtextents = 0;
+			s = 0;
+			e = allocatesize_fsb;
+		}
+
+		if (unlikely(rt)) {
+			resrtextents = qblocks = (uint)(e - s);
+			resrtextents /= mp->m_sb.sb_rextsize;
+			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+			quota_flag = XFS_QMOPT_RES_RTBLKS;
+		} else {
+			resrtextents = 0;
+			resblks = qblocks = \
+				XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
+			quota_flag = XFS_QMOPT_RES_REGBLKS;
 		}
 
 		/*
-		 * allocate and setup the transaction
+		 * Allocate and setup the transaction.
 		 */
 		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-		resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
-		error = xfs_trans_reserve(tp,
-					  resblks,
-					  XFS_WRITE_LOG_RES(mp),
-					  numrtextents,
+		error = xfs_trans_reserve(tp, resblks,
+					  XFS_WRITE_LOG_RES(mp), resrtextents,
 					  XFS_TRANS_PERM_LOG_RES,
 					  XFS_WRITE_LOG_COUNT);
-
 		/*
-		 * check for running out of space
+		 * Check for running out of space
 		 */
 		if (error) {
 			/*
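The reworked reservation math above rounds the requested range out to whole extsz units: the start is rounded down (the do_div/multiply pair), the end rounded up (the two do_mod adjustments), and [s, e) then sizes either the realtime-extent or the data-block reservation. A standalone model of that arithmetic with sample numbers, mirroring the patch's steps in plain C:

/* Model of the extent-size rounding; extsz is assumed to be the
 * allocation granularity in filesystem blocks. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t startoffset_fsb = 10, allocatesize_fsb = 25, extsz = 8;
	uint64_t s, e, temp;

	s = startoffset_fsb;
	s -= startoffset_fsb % extsz;		/* do_div/multiply in the patch */
	e = startoffset_fsb + allocatesize_fsb;
	if ((temp = startoffset_fsb % extsz))	/* do_mod() in the patch */
		e += temp;
	if ((temp = e % extsz))
		e += extsz - temp;

	/* request [10, 35) rounds out to [8, 40) with extsz = 8 */
	printf("reserve [%llu, %llu) = %llu blocks\n",
	       (unsigned long long)s, (unsigned long long)e,
	       (unsigned long long)(e - s));
	return 0;
}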
@@ -4113,8 +4103,8 @@ retry:
 			break;
 		}
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
-				ip->i_udquot, ip->i_gdquot, resblks, 0, 0);
+		error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
+						      qblocks, 0, quota_flag);
 		if (error)
 			goto error1;
 
@@ -4122,19 +4112,19 @@ retry:
 		xfs_trans_ihold(tp, ip);
 
 		/*
-		 * issue the bmapi() call to allocate the blocks
+		 * Issue the xfs_bmapi() call to allocate the blocks
 		 */
 		XFS_BMAP_INIT(&free_list, &firstfsb);
 		error = xfs_bmapi(tp, ip, startoffset_fsb,
-				  allocatesize_fsb, xfs_bmapi_flags,
-				  &firstfsb, 0, imapp, &reccount,
+				  allocatesize_fsb, bmapi_flag,
+				  &firstfsb, 0, imapp, &nimaps,
 				  &free_list);
 		if (error) {
 			goto error0;
 		}
 
 		/*
-		 * complete the transaction
+		 * Complete the transaction
 		 */
 		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
 		if (error) {
@@ -4149,7 +4139,7 @@ retry:
 
 		allocated_fsb = imapp->br_blockcount;
 
-		if (reccount == 0) {
+		if (nimaps == 0) {
 			error = XFS_ERROR(ENOSPC);
 			break;
 		}
@@ -4172,9 +4162,11 @@ dmapi_enospc_check:
 
 	return error;
 
- error0:
+error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 	xfs_bmap_cancel(&free_list);
- error1:
+	XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
+
+error1:	/* Just cancel transaction */
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	goto dmapi_enospc_check;
@@ -4312,7 +4304,7 @@ xfs_free_file_space(
 				offset, end_dmi_offset - offset,
 				AT_DELAY_FLAG(attr_flags), NULL);
 		if (error)
-			return(error);
+			return error;
 	}
 
 	ASSERT(attr_flags & ATTR_NOLOCK ? attr_flags & ATTR_DMI : 1);
@@ -4419,8 +4411,8 @@ xfs_free_file_space(
 		}
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
-				ip->i_udquot, ip->i_gdquot, resblks, 0, rt ?
-				XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+				ip->i_udquot, ip->i_gdquot, resblks, 0,
+				XFS_QMOPT_RES_REGBLKS);
 		if (error)
 			goto error1;
 