author    Ingo Molnar <mingo@elte.hu>  2009-04-06 03:02:57 -0400
committer Ingo Molnar <mingo@elte.hu>  2009-04-06 03:02:57 -0400
commit    f541ae326fa120fa5c57433e4d9a133df212ce41 (patch)
tree      bdbd94ec72cfc601118051cb35e8617d55510177 /fs
parent    e255357764f92afcafafbd4879b222b8c752065a (diff)
parent    0221c81b1b8eb0cbb6b30a0ced52ead32d2b4e4c (diff)
Merge branch 'linus' into perfcounters/core-v2
Merge reason: we have gathered quite a few conflicts, need to merge upstream

Conflicts:
	arch/powerpc/kernel/Makefile
	arch/x86/ia32/ia32entry.S
	arch/x86/include/asm/hardirq.h
	arch/x86/include/asm/unistd_32.h
	arch/x86/include/asm/unistd_64.h
	arch/x86/kernel/cpu/common.c
	arch/x86/kernel/irq.c
	arch/x86/kernel/syscall_table_32.S
	arch/x86/mm/iomap_32.c
	include/linux/sched.h
	kernel/Makefile

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs_vfs.h | 4
-rw-r--r--  fs/9p/vfs_dentry.c | 4
-rw-r--r--  fs/9p/vfs_super.c | 5
-rw-r--r--  fs/Kconfig | 65
-rw-r--r--  fs/Makefile | 11
-rw-r--r--  fs/adfs/adfs.h | 2
-rw-r--r--  fs/adfs/dir.c | 2
-rw-r--r--  fs/adfs/super.c | 16
-rw-r--r--  fs/affs/affs.h | 3
-rw-r--r--  fs/affs/amigaffs.c | 8
-rw-r--r--  fs/affs/namei.c | 4
-rw-r--r--  fs/affs/super.c | 4
-rw-r--r--  fs/afs/Kconfig | 8
-rw-r--r--  fs/afs/Makefile | 3
-rw-r--r--  fs/afs/cache.c | 503
-rw-r--r--  fs/afs/cache.h | 15
-rw-r--r--  fs/afs/cell.c | 16
-rw-r--r--  fs/afs/dir.c | 2
-rw-r--r--  fs/afs/file.c | 220
-rw-r--r--  fs/afs/inode.c | 31
-rw-r--r--  fs/afs/internal.h | 53
-rw-r--r--  fs/afs/main.c | 27
-rw-r--r--  fs/afs/mntpt.c | 4
-rw-r--r--  fs/afs/proc.c | 1
-rw-r--r--  fs/afs/vlocation.c | 25
-rw-r--r--  fs/afs/volume.c | 14
-rw-r--r--  fs/afs/write.c | 21
-rw-r--r--  fs/aio.c | 42
-rw-r--r--  fs/anon_inodes.c | 2
-rw-r--r--  fs/attr.c | 3
-rw-r--r--  fs/autofs/root.c | 2
-rw-r--r--  fs/autofs4/autofs_i.h | 2
-rw-r--r--  fs/autofs4/dev-ioctl.c | 29
-rw-r--r--  fs/autofs4/expire.c | 27
-rw-r--r--  fs/autofs4/inode.c | 2
-rw-r--r--  fs/autofs4/root.c | 45
-rw-r--r--  fs/befs/linuxvfs.c | 3
-rw-r--r--  fs/binfmt_elf.c | 22
-rw-r--r--  fs/binfmt_elf_fdpic.c | 25
-rw-r--r--  fs/binfmt_som.c | 7
-rw-r--r--  fs/bio-integrity.c | 86
-rw-r--r--  fs/bio.c | 94
-rw-r--r--  fs/block_dev.c | 147
-rw-r--r--  fs/btrfs/Makefile | 2
-rw-r--r--  fs/btrfs/acl.c | 2
-rw-r--r--  fs/btrfs/async-thread.c | 7
-rw-r--r--  fs/btrfs/btrfs_inode.h | 31
-rw-r--r--  fs/btrfs/ctree.c | 904
-rw-r--r--  fs/btrfs/ctree.h | 164
-rw-r--r--  fs/btrfs/delayed-ref.c | 668
-rw-r--r--  fs/btrfs/delayed-ref.h | 193
-rw-r--r--  fs/btrfs/dir-item.c | 3
-rw-r--r--  fs/btrfs/disk-io.c | 95
-rw-r--r--  fs/btrfs/disk-io.h | 1
-rw-r--r--  fs/btrfs/extent-tree.c | 2111
-rw-r--r--  fs/btrfs/extent_io.c | 67
-rw-r--r--  fs/btrfs/extent_io.h | 3
-rw-r--r--  fs/btrfs/extent_map.c | 1
-rw-r--r--  fs/btrfs/file-item.c | 7
-rw-r--r--  fs/btrfs/file.c | 50
-rw-r--r--  fs/btrfs/free-space-cache.c | 530
-rw-r--r--  fs/btrfs/free-space-cache.h | 44
-rw-r--r--  fs/btrfs/inode-item.c | 3
-rw-r--r--  fs/btrfs/inode.c | 211
-rw-r--r--  fs/btrfs/ioctl.c | 2
-rw-r--r--  fs/btrfs/locking.c | 31
-rw-r--r--  fs/btrfs/locking.h | 2
-rw-r--r--  fs/btrfs/ordered-data.c | 118
-rw-r--r--  fs/btrfs/ordered-data.h | 4
-rw-r--r--  fs/btrfs/super.c | 54
-rw-r--r--  fs/btrfs/transaction.c | 158
-rw-r--r--  fs/btrfs/transaction.h | 8
-rw-r--r--  fs/btrfs/tree-defrag.c | 2
-rw-r--r--  fs/btrfs/tree-log.c | 456
-rw-r--r--  fs/btrfs/tree-log.h | 17
-rw-r--r--  fs/btrfs/volumes.c | 49
-rw-r--r--  fs/btrfs/volumes.h | 2
-rw-r--r--  fs/buffer.c | 236
-rw-r--r--  fs/cachefiles/Kconfig | 39
-rw-r--r--  fs/cachefiles/Makefile | 18
-rw-r--r--  fs/cachefiles/bind.c | 286
-rw-r--r--  fs/cachefiles/daemon.c | 755
-rw-r--r--  fs/cachefiles/interface.c | 449
-rw-r--r--  fs/cachefiles/internal.h | 360
-rw-r--r--  fs/cachefiles/key.c | 159
-rw-r--r--  fs/cachefiles/main.c | 106
-rw-r--r--  fs/cachefiles/namei.c | 771
-rw-r--r--  fs/cachefiles/proc.c | 134
-rw-r--r--  fs/cachefiles/rdwr.c | 879
-rw-r--r--  fs/cachefiles/security.c | 116
-rw-r--r--  fs/cachefiles/xattr.c | 291
-rw-r--r--  fs/cifs/CHANGES | 11
-rw-r--r--  fs/cifs/Kconfig | 21
-rw-r--r--  fs/cifs/README | 22
-rw-r--r--  fs/cifs/cifs_debug.c | 3
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 36
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 1
-rw-r--r--  fs/cifs/cifsfs.c | 3
-rw-r--r--  fs/cifs/cifsfs.h | 4
-rw-r--r--  fs/cifs/cifsglob.h | 2
-rw-r--r--  fs/cifs/cifspdu.h | 76
-rw-r--r--  fs/cifs/cifsproto.h | 9
-rw-r--r--  fs/cifs/cifssmb.c | 27
-rw-r--r--  fs/cifs/connect.c | 9
-rw-r--r--  fs/cifs/dir.c | 14
-rw-r--r--  fs/cifs/file.c | 199
-rw-r--r--  fs/cifs/inode.c | 7
-rw-r--r--  fs/cifs/smbfsctl.h | 84
-rw-r--r--  fs/coda/dir.c | 2
-rw-r--r--  fs/compat.c | 148
-rw-r--r--  fs/compat_ioctl.c | 10
-rw-r--r--  fs/configfs/dir.c | 2
-rw-r--r--  fs/cramfs/inode.c | 39
-rw-r--r--  fs/cramfs/uncompress.c | 2
-rw-r--r--  fs/dcache.c | 50
-rw-r--r--  fs/debugfs/inode.c | 16
-rw-r--r--  fs/devpts/inode.c | 193
-rw-r--r--  fs/dlm/dir.c | 18
-rw-r--r--  fs/dlm/dlm_internal.h | 2
-rw-r--r--  fs/dlm/lock.c | 60
-rw-r--r--  fs/dlm/lockspace.c | 2
-rw-r--r--  fs/dlm/lowcomms.c | 181
-rw-r--r--  fs/dlm/user.c | 24
-rw-r--r--  fs/drop_caches.c | 2
-rw-r--r--  fs/ecryptfs/crypto.c | 51
-rw-r--r--  fs/ecryptfs/dentry.c | 2
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 6
-rw-r--r--  fs/ecryptfs/inode.c | 32
-rw-r--r--  fs/ecryptfs/keystore.c | 6
-rw-r--r--  fs/ecryptfs/main.c | 5
-rw-r--r--  fs/ecryptfs/messaging.c | 3
-rw-r--r--  fs/efs/super.c | 20
-rw-r--r--  fs/eventfd.c | 26
-rw-r--r--  fs/eventpoll.c | 626
-rw-r--r--  fs/exec.c | 52
-rw-r--r--  fs/exofs/BUGS | 3
-rw-r--r--  fs/exofs/Kbuild | 16
-rw-r--r--  fs/exofs/Kconfig | 13
-rw-r--r--  fs/exofs/common.h | 184
-rw-r--r--  fs/exofs/dir.c | 672
-rw-r--r--  fs/exofs/exofs.h | 180
-rw-r--r--  fs/exofs/file.c | 87
-rw-r--r--  fs/exofs/inode.c | 1303
-rw-r--r--  fs/exofs/namei.c | 342
-rw-r--r--  fs/exofs/osd.c | 153
-rw-r--r--  fs/exofs/super.c | 584
-rw-r--r--  fs/exofs/symlink.c | 57
-rw-r--r--  fs/ext2/acl.c | 2
-rw-r--r--  fs/ext2/balloc.c | 8
-rw-r--r--  fs/ext2/ialloc.c | 10
-rw-r--r--  fs/ext2/inode.c | 2
-rw-r--r--  fs/ext2/super.c | 1
-rw-r--r--  fs/ext2/xattr.c | 8
-rw-r--r--  fs/ext3/acl.c | 2
-rw-r--r--  fs/ext3/balloc.c | 8
-rw-r--r--  fs/ext3/dir.c | 2
-rw-r--r--  fs/ext3/file.c | 6
-rw-r--r--  fs/ext3/ialloc.c | 12
-rw-r--r--  fs/ext3/inode.c | 160
-rw-r--r--  fs/ext3/ioctl.c | 59
-rw-r--r--  fs/ext3/namei.c | 41
-rw-r--r--  fs/ext3/super.c | 48
-rw-r--r--  fs/ext3/xattr.c | 6
-rw-r--r--  fs/ext4/Kconfig | 2
-rw-r--r--  fs/ext4/acl.c | 2
-rw-r--r--  fs/ext4/balloc.c | 16
-rw-r--r--  fs/ext4/dir.c | 16
-rw-r--r--  fs/ext4/ext4.h | 95
-rw-r--r--  fs/ext4/ext4_extents.h | 1
-rw-r--r--  fs/ext4/ext4_i.h | 6
-rw-r--r--  fs/ext4/ext4_sb.h | 14
-rw-r--r--  fs/ext4/extents.c | 131
-rw-r--r--  fs/ext4/file.c | 7
-rw-r--r--  fs/ext4/ialloc.c | 297
-rw-r--r--  fs/ext4/inode.c | 467
-rw-r--r--  fs/ext4/ioctl.c | 17
-rw-r--r--  fs/ext4/mballoc.c | 213
-rw-r--r--  fs/ext4/mballoc.h | 8
-rw-r--r--  fs/ext4/namei.c | 170
-rw-r--r--  fs/ext4/resize.c | 8
-rw-r--r--  fs/ext4/super.c | 381
-rw-r--r--  fs/ext4/xattr.c | 6
-rw-r--r--  fs/fat/inode.c | 12
-rw-r--r--  fs/fat/namei_msdos.c | 2
-rw-r--r--  fs/fat/namei_vfat.c | 4
-rw-r--r--  fs/fcntl.c | 37
-rw-r--r--  fs/file_table.c | 4
-rw-r--r--  fs/fs-writeback.c | 38
-rw-r--r--  fs/fs_struct.c | 177
-rw-r--r--  fs/fscache/Kconfig | 56
-rw-r--r--  fs/fscache/Makefile | 19
-rw-r--r--  fs/fscache/cache.c | 415
-rw-r--r--  fs/fscache/cookie.c | 500
-rw-r--r--  fs/fscache/fsdef.c | 144
-rw-r--r--  fs/fscache/histogram.c | 109
-rw-r--r--  fs/fscache/internal.h | 380
-rw-r--r--  fs/fscache/main.c | 124
-rw-r--r--  fs/fscache/netfs.c | 103
-rw-r--r--  fs/fscache/object.c | 810
-rw-r--r--  fs/fscache/operation.c | 459
-rw-r--r--  fs/fscache/page.c | 816
-rw-r--r--  fs/fscache/proc.c | 68
-rw-r--r--  fs/fscache/stats.c | 212
-rw-r--r--  fs/fuse/dir.c | 3
-rw-r--r--  fs/fuse/file.c | 60
-rw-r--r--  fs/fuse/fuse_i.h | 2
-rw-r--r--  fs/generic_acl.c | 2
-rw-r--r--  fs/gfs2/Kconfig | 17
-rw-r--r--  fs/gfs2/Makefile | 4
-rw-r--r--  fs/gfs2/acl.c | 3
-rw-r--r--  fs/gfs2/bmap.c | 1
-rw-r--r--  fs/gfs2/dir.c | 1
-rw-r--r--  fs/gfs2/eaops.c | 1
-rw-r--r--  fs/gfs2/eattr.c | 1
-rw-r--r--  fs/gfs2/glock.c | 268
-rw-r--r--  fs/gfs2/glock.h | 127
-rw-r--r--  fs/gfs2/glops.c | 160
-rw-r--r--  fs/gfs2/glops.h | 1
-rw-r--r--  fs/gfs2/incore.h | 71
-rw-r--r--  fs/gfs2/inode.c | 13
-rw-r--r--  fs/gfs2/inode.h | 22
-rw-r--r--  fs/gfs2/lock_dlm.c | 241
-rw-r--r--  fs/gfs2/locking.c | 232
-rw-r--r--  fs/gfs2/locking/dlm/Makefile | 3
-rw-r--r--  fs/gfs2/locking/dlm/lock.c | 708
-rw-r--r--  fs/gfs2/locking/dlm/lock_dlm.h | 166
-rw-r--r--  fs/gfs2/locking/dlm/main.c | 48
-rw-r--r--  fs/gfs2/locking/dlm/mount.c | 276
-rw-r--r--  fs/gfs2/locking/dlm/sysfs.c | 226
-rw-r--r--  fs/gfs2/locking/dlm/thread.c | 68
-rw-r--r--  fs/gfs2/log.c | 1
-rw-r--r--  fs/gfs2/lops.c | 1
-rw-r--r--  fs/gfs2/main.c | 13
-rw-r--r--  fs/gfs2/meta_io.c | 22
-rw-r--r--  fs/gfs2/meta_io.h | 1
-rw-r--r--  fs/gfs2/mount.c | 128
-rw-r--r--  fs/gfs2/mount.h | 17
-rw-r--r--  fs/gfs2/ops_address.c | 5
-rw-r--r--  fs/gfs2/ops_dentry.c | 3
-rw-r--r--  fs/gfs2/ops_export.c | 1
-rw-r--r--  fs/gfs2/ops_file.c | 81
-rw-r--r--  fs/gfs2/ops_fstype.c | 156
-rw-r--r--  fs/gfs2/ops_inode.c | 1
-rw-r--r--  fs/gfs2/ops_super.c | 44
-rw-r--r--  fs/gfs2/quota.c | 203
-rw-r--r--  fs/gfs2/quota.h | 2
-rw-r--r--  fs/gfs2/recovery.c | 28
-rw-r--r--  fs/gfs2/rgrp.c | 189
-rw-r--r--  fs/gfs2/super.c | 3
-rw-r--r--  fs/gfs2/super.h | 28
-rw-r--r--  fs/gfs2/sys.c | 236
-rw-r--r--  fs/gfs2/trans.c | 19
-rw-r--r--  fs/gfs2/util.c | 11
-rw-r--r--  fs/hfs/hfs_fs.h | 2
-rw-r--r--  fs/hfs/super.c | 3
-rw-r--r--  fs/hfs/sysdep.c | 2
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 2
-rw-r--r--  fs/hfsplus/inode.c | 2
-rw-r--r--  fs/hfsplus/options.c | 2
-rw-r--r--  fs/hfsplus/super.c | 3
-rw-r--r--  fs/hostfs/hostfs_kern.c | 4
-rw-r--r--  fs/hpfs/dentry.c | 2
-rw-r--r--  fs/hpfs/super.c | 5
-rw-r--r--  fs/hppfs/hppfs.c | 7
-rw-r--r--  fs/hugetlbfs/inode.c | 21
-rw-r--r--  fs/inode.c | 84
-rw-r--r--  fs/internal.h | 8
-rw-r--r--  fs/ioctl.c | 18
-rw-r--r--  fs/isofs/inode.c | 5
-rw-r--r--  fs/jbd/commit.c | 23
-rw-r--r--  fs/jbd/journal.c | 34
-rw-r--r--  fs/jbd/transaction.c | 2
-rw-r--r--  fs/jbd2/commit.c | 5
-rw-r--r--  fs/jbd2/revoke.c | 24
-rw-r--r--  fs/jbd2/transaction.c | 2
-rw-r--r--  fs/jffs2/acl.c | 2
-rw-r--r--  fs/jfs/Kconfig | 1
-rw-r--r--  fs/jfs/acl.c | 4
-rw-r--r--  fs/jfs/inode.c | 6
-rw-r--r--  fs/jfs/jfs_debug.c | 1
-rw-r--r--  fs/jfs/jfs_dtree.c | 18
-rw-r--r--  fs/jfs/jfs_extent.c | 73
-rw-r--r--  fs/jfs/jfs_imap.c | 10
-rw-r--r--  fs/jfs/jfs_inode.c | 4
-rw-r--r--  fs/jfs/jfs_inode.h | 2
-rw-r--r--  fs/jfs/jfs_metapage.c | 18
-rw-r--r--  fs/jfs/jfs_types.h | 29
-rw-r--r--  fs/jfs/jfs_xtree.c | 277
-rw-r--r--  fs/jfs/jfs_xtree.h | 2
-rw-r--r--  fs/jfs/namei.c | 10
-rw-r--r--  fs/jfs/super.c | 4
-rw-r--r--  fs/jfs/xattr.c | 12
-rw-r--r--  fs/libfs.c | 5
-rw-r--r--  fs/lockd/mon.c | 8
-rw-r--r--  fs/lockd/svc.c | 42
-rw-r--r--  fs/minix/inode.c | 13
-rw-r--r--  fs/mpage.c | 13
-rw-r--r--  fs/namei.c | 70
-rw-r--r--  fs/namespace.c | 75
-rw-r--r--  fs/ncpfs/dir.c | 4
-rw-r--r--  fs/nfs/Kconfig | 8
-rw-r--r--  fs/nfs/Makefile | 1
-rw-r--r--  fs/nfs/callback.c | 31
-rw-r--r--  fs/nfs/callback.h | 1
-rw-r--r--  fs/nfs/client.c | 147
-rw-r--r--  fs/nfs/dir.c | 21
-rw-r--r--  fs/nfs/file.c | 75
-rw-r--r--  fs/nfs/fscache-index.c | 337
-rw-r--r--  fs/nfs/fscache.c | 523
-rw-r--r--  fs/nfs/fscache.h | 220
-rw-r--r--  fs/nfs/getroot.c | 4
-rw-r--r--  fs/nfs/inode.c | 323
-rw-r--r--  fs/nfs/internal.h | 8
-rw-r--r--  fs/nfs/iostat.h | 18
-rw-r--r--  fs/nfs/nfs2xdr.c | 9
-rw-r--r--  fs/nfs/nfs3acl.c | 27
-rw-r--r--  fs/nfs/nfs3proc.c | 7
-rw-r--r--  fs/nfs/nfs3xdr.c | 71
-rw-r--r--  fs/nfs/nfs4_fs.h | 2
-rw-r--r--  fs/nfs/nfs4namespace.c | 15
-rw-r--r--  fs/nfs/nfs4proc.c | 49
-rw-r--r--  fs/nfs/nfs4state.c | 10
-rw-r--r--  fs/nfs/nfs4xdr.c | 213
-rw-r--r--  fs/nfs/pagelist.c | 11
-rw-r--r--  fs/nfs/proc.c | 1
-rw-r--r--  fs/nfs/read.c | 27
-rw-r--r--  fs/nfs/super.c | 49
-rw-r--r--  fs/nfs/write.c | 53
-rw-r--r--  fs/nfsd/nfs4xdr.c | 1
-rw-r--r--  fs/nfsd/nfsctl.c | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 12
-rw-r--r--  fs/nfsd/vfs.c | 9
-rw-r--r--  fs/notify/inotify/inotify.c | 16
-rw-r--r--  fs/ntfs/dir.c | 4
-rw-r--r--  fs/ntfs/inode.c | 3
-rw-r--r--  fs/ntfs/layout.h | 329
-rw-r--r--  fs/ntfs/logfile.h | 6
-rw-r--r--  fs/ntfs/mft.c | 2
-rw-r--r--  fs/ntfs/super.c | 50
-rw-r--r--  fs/ntfs/usnjrnl.h | 48
-rw-r--r--  fs/ocfs2/acl.c | 2
-rw-r--r--  fs/ocfs2/alloc.c | 60
-rw-r--r--  fs/ocfs2/alloc.h | 3
-rw-r--r--  fs/ocfs2/aops.c | 30
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 96
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 3
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 9
-rw-r--r--  fs/ocfs2/dcache.c | 2
-rw-r--r--  fs/ocfs2/dcache.h | 2
-rw-r--r--  fs/ocfs2/dir.c | 2806
-rw-r--r--  fs/ocfs2/dir.h | 57
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 58
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 87
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 29
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 387
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 20
-rw-r--r--  fs/ocfs2/dlmglue.c | 46
-rw-r--r--  fs/ocfs2/dlmglue.h | 2
-rw-r--r--  fs/ocfs2/export.c | 84
-rw-r--r--  fs/ocfs2/inode.c | 48
-rw-r--r--  fs/ocfs2/inode.h | 5
-rw-r--r--  fs/ocfs2/journal.c | 173
-rw-r--r--  fs/ocfs2/journal.h | 77
-rw-r--r--  fs/ocfs2/localalloc.c | 86
-rw-r--r--  fs/ocfs2/mmap.c | 6
-rw-r--r--  fs/ocfs2/namei.c | 253
-rw-r--r--  fs/ocfs2/ocfs2.h | 76
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 142
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h | 4
-rw-r--r--  fs/ocfs2/suballoc.c | 254
-rw-r--r--  fs/ocfs2/suballoc.h | 4
-rw-r--r--  fs/ocfs2/super.c | 188
-rw-r--r--  fs/ocfs2/xattr.c | 38
-rw-r--r--  fs/ocfs2/xattr.h | 2
-rw-r--r--  fs/omfs/inode.c | 7
-rw-r--r--  fs/open.c | 3
-rw-r--r--  fs/partitions/check.c | 14
-rw-r--r--  fs/partitions/ibm.c | 101
-rw-r--r--  fs/pipe.c | 31
-rw-r--r--  fs/proc/base.c | 73
-rw-r--r--  fs/proc/generic.c | 65
-rw-r--r--  fs/proc/inode-alloc.txt | 14
-rw-r--r--  fs/proc/inode.c | 21
-rw-r--r--  fs/proc/internal.h | 1
-rw-r--r--  fs/proc/meminfo.c | 2
-rw-r--r--  fs/proc/nommu.c | 2
-rw-r--r--  fs/proc/page.c | 2
-rw-r--r--  fs/proc/proc_sysctl.c | 4
-rw-r--r--  fs/proc/proc_tty.c | 13
-rw-r--r--  fs/proc/root.c | 3
-rw-r--r--  fs/proc/task_mmu.c | 8
-rw-r--r--  fs/proc/task_nommu.c | 7
-rw-r--r--  fs/proc/uptime.c | 38
-rw-r--r--  fs/qnx4/inode.c | 3
-rw-r--r--  fs/quota/Kconfig | 59
-rw-r--r--  fs/quota/Makefile | 14
-rw-r--r--  fs/quota/dquot.c (renamed from fs/dquot.c) | 577
-rw-r--r--  fs/quota/quota.c (renamed from fs/quota.c) | 37
-rw-r--r--  fs/quota/quota_tree.c (renamed from fs/quota_tree.c) | 132
-rw-r--r--  fs/quota/quota_tree.h (renamed from fs/quota_tree.h) | 0
-rw-r--r--  fs/quota/quota_v1.c (renamed from fs/quota_v1.c) | 48
-rw-r--r--  fs/quota/quota_v2.c (renamed from fs/quota_v2.c) | 3
-rw-r--r--  fs/quota/quotaio_v1.h (renamed from fs/quotaio_v1.h) | 0
-rw-r--r--  fs/quota/quotaio_v2.h (renamed from fs/quotaio_v2.h) | 0
-rw-r--r--  fs/ramfs/file-nommu.c | 21
-rw-r--r--  fs/ramfs/inode.c | 94
-rw-r--r--  fs/read_write.c | 56
-rw-r--r--  fs/reiserfs/Kconfig | 1
-rw-r--r--  fs/reiserfs/Makefile | 4
-rw-r--r--  fs/reiserfs/README | 4
-rw-r--r--  fs/reiserfs/bitmap.c | 86
-rw-r--r--  fs/reiserfs/dir.c | 28
-rw-r--r--  fs/reiserfs/do_balan.c | 313
-rw-r--r--  fs/reiserfs/file.c | 34
-rw-r--r--  fs/reiserfs/fix_node.c | 1021
-rw-r--r--  fs/reiserfs/hashes.c | 2
-rw-r--r--  fs/reiserfs/ibalance.c | 22
-rw-r--r--  fs/reiserfs/inode.c | 213
-rw-r--r--  fs/reiserfs/ioctl.c | 2
-rw-r--r--  fs/reiserfs/item_ops.c | 68
-rw-r--r--  fs/reiserfs/journal.c | 1077
-rw-r--r--  fs/reiserfs/lbalance.c | 66
-rw-r--r--  fs/reiserfs/namei.c | 186
-rw-r--r--  fs/reiserfs/objectid.c | 12
-rw-r--r--  fs/reiserfs/prints.c | 134
-rw-r--r--  fs/reiserfs/procfs.c | 16
-rw-r--r--  fs/reiserfs/resize.c | 6
-rw-r--r--  fs/reiserfs/stree.c | 1174
-rw-r--r--  fs/reiserfs/super.c | 368
-rw-r--r--  fs/reiserfs/tail_conversion.c | 96
-rw-r--r--  fs/reiserfs/xattr.c | 1377
-rw-r--r--  fs/reiserfs/xattr_acl.c | 259
-rw-r--r--  fs/reiserfs/xattr_security.c | 80
-rw-r--r--  fs/reiserfs/xattr_trusted.c | 45
-rw-r--r--  fs/reiserfs/xattr_user.c | 31
-rw-r--r--  fs/seq_file.c | 2
-rw-r--r--  fs/smbfs/dir.c | 4
-rw-r--r--  fs/splice.c | 3
-rw-r--r--  fs/squashfs/block.c | 18
-rw-r--r--  fs/squashfs/cache.c | 4
-rw-r--r--  fs/squashfs/inode.c | 6
-rw-r--r--  fs/squashfs/squashfs.h | 2
-rw-r--r--  fs/squashfs/super.c | 5
-rw-r--r--  fs/super.c | 34
-rw-r--r--  fs/sync.c | 16
-rw-r--r--  fs/sysfs/bin.c | 253
-rw-r--r--  fs/sysfs/dir.c | 35
-rw-r--r--  fs/sysfs/file.c | 26
-rw-r--r--  fs/sysfs/inode.c | 17
-rw-r--r--  fs/sysfs/mount.c | 6
-rw-r--r--  fs/sysfs/sysfs.h | 3
-rw-r--r--  fs/sysv/inode.c | 3
-rw-r--r--  fs/sysv/namei.c | 2
-rw-r--r--  fs/sysv/sysv.h | 2
-rw-r--r--  fs/ubifs/Kconfig | 4
-rw-r--r--  fs/ubifs/file.c | 9
-rw-r--r--  fs/ubifs/super.c | 3
-rw-r--r--  fs/udf/balloc.c | 164
-rw-r--r--  fs/udf/dir.c | 14
-rw-r--r--  fs/udf/directory.c | 38
-rw-r--r--  fs/udf/ecma_167.h | 416
-rw-r--r--  fs/udf/ialloc.c | 17
-rw-r--r--  fs/udf/inode.c | 213
-rw-r--r--  fs/udf/misc.c | 29
-rw-r--r--  fs/udf/namei.c | 86
-rw-r--r--  fs/udf/osta_udf.h | 22
-rw-r--r--  fs/udf/partition.c | 2
-rw-r--r--  fs/udf/super.c | 605
-rw-r--r--  fs/udf/truncate.c | 44
-rw-r--r--  fs/udf/udf_i.h | 6
-rw-r--r--  fs/udf/udf_sb.h | 9
-rw-r--r--  fs/udf/udfdecl.h | 57
-rw-r--r--  fs/udf/udfend.h | 28
-rw-r--r--  fs/udf/udftime.c | 6
-rw-r--r--  fs/udf/unicode.c | 62
-rw-r--r--  fs/ufs/balloc.c | 12
-rw-r--r--  fs/ufs/ialloc.c | 8
-rw-r--r--  fs/ufs/inode.c | 39
-rw-r--r--  fs/ufs/namei.c | 2
-rw-r--r--  fs/ufs/super.c | 16
-rw-r--r--  fs/ufs/ufs.h | 2
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/mutex.h | 25
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 107
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 37
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 13
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 157
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 147
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.h | 32
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 28
-rw-r--r--  fs/xfs/quota/xfs_dquot.h | 18
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 212
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 26
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c | 1
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 190
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h | 40
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c | 16
-rw-r--r--  fs/xfs/support/debug.c | 1
-rw-r--r--  fs/xfs/support/uuid.c | 71
-rw-r--r--  fs/xfs/support/uuid.h | 4
-rw-r--r--  fs/xfs/xfs_ag.h | 4
-rw-r--r--  fs/xfs/xfs_alloc.c | 26
-rw-r--r--  fs/xfs/xfs_alloc.h | 6
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 58
-rw-r--r--  fs/xfs/xfs_bmap.c | 76
-rw-r--r--  fs/xfs/xfs_bmap.h | 6
-rw-r--r--  fs/xfs/xfs_btree.c | 4
-rw-r--r--  fs/xfs/xfs_btree.h | 2
-rw-r--r--  fs/xfs/xfs_da_btree.c | 2
-rw-r--r--  fs/xfs/xfs_da_btree.h | 9
-rw-r--r--  fs/xfs/xfs_dfrag.c | 68
-rw-r--r--  fs/xfs/xfs_dinode.h | 4
-rw-r--r--  fs/xfs/xfs_dir2.c | 2
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 7
-rw-r--r--  fs/xfs/xfs_dir2_data.h | 2
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 17
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 2
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 13
-rw-r--r--  fs/xfs/xfs_extfree_item.h | 6
-rw-r--r--  fs/xfs/xfs_filestream.c | 9
-rw-r--r--  fs/xfs/xfs_fsops.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 12
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc_btree.h | 22
-rw-r--r--  fs/xfs/xfs_iget.c | 15
-rw-r--r--  fs/xfs/xfs_inode.h | 2
-rw-r--r--  fs/xfs/xfs_inode_item.h | 2
-rw-r--r--  fs/xfs/xfs_iomap.h | 2
-rw-r--r--  fs/xfs/xfs_itable.c | 9
-rw-r--r--  fs/xfs/xfs_log.c | 67
-rw-r--r--  fs/xfs/xfs_log.h | 3
-rw-r--r--  fs/xfs/xfs_log_priv.h | 3
-rw-r--r--  fs/xfs/xfs_log_recover.c | 325
-rw-r--r--  fs/xfs/xfs_mount.c | 253
-rw-r--r--  fs/xfs/xfs_mount.h | 19
-rw-r--r--  fs/xfs/xfs_qmops.c | 1
-rw-r--r--  fs/xfs/xfs_quota.h | 3
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 10
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 8
-rw-r--r--  fs/xfs/xfs_trans.h | 24
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 4
-rw-r--r--  fs/xfs/xfs_trans_item.c | 2
-rw-r--r--  fs/xfs/xfs_trans_space.h | 2
-rw-r--r--  fs/xfs/xfs_types.h | 8
-rw-r--r--  fs/xfs/xfs_utils.c | 2
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 408
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 3
553 files changed, 34873 insertions, 15740 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index c295ba786edd..f0c7de78e205 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -41,8 +41,8 @@ extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
 extern const struct file_operations v9fs_dir_operations;
-extern struct dentry_operations v9fs_dentry_operations;
-extern struct dentry_operations v9fs_cached_dentry_operations;
+extern const struct dentry_operations v9fs_dentry_operations;
+extern const struct dentry_operations v9fs_cached_dentry_operations;
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 06dcc7c4f234..d74325295b1e 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -104,12 +104,12 @@ void v9fs_dentry_release(struct dentry *dentry)
 	}
 }
 
-struct dentry_operations v9fs_cached_dentry_operations = {
+const struct dentry_operations v9fs_cached_dentry_operations = {
 	.d_delete = v9fs_cached_dentry_delete,
 	.d_release = v9fs_dentry_release,
 };
 
-struct dentry_operations v9fs_dentry_operations = {
+const struct dentry_operations v9fs_dentry_operations = {
 	.d_delete = v9fs_dentry_delete,
 	.d_release = v9fs_dentry_release,
 };
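The constification of struct dentry_operations seen above is a theme this merge repeats for adfs, affs and afs below. A minimal sketch of the resulting usage pattern, with illustrative names that are not part of this patch:

static int example_d_delete(struct dentry *dentry)
{
	/* like v9fs_dentry_delete: unhash the dentry on the last dput */
	return 1;
}

/* const-qualified, so the operations table can live in read-only data */
static const struct dentry_operations example_dentry_ops = {
	.d_delete = example_d_delete,
};

static void example_attach_ops(struct dentry *dentry)
{
	/* struct dentry's d_op pointer is const-qualified by this series */
	dentry->d_op = &example_dentry_ops;
}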
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 93212e40221a..5f8ab8adb5f5 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -168,8 +168,9 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	p9stat_free(st);
 	kfree(st);
 
-	P9_DPRINTK(P9_DEBUG_VFS, " return simple set mount\n");
-	return simple_set_mnt(mnt, sb);
+	P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
+	simple_set_mnt(mnt, sb);
+	return 0;
 
 release_sb:
 	if (sb) {
diff --git a/fs/Kconfig b/fs/Kconfig
index 93945dd0b1ae..86b203fc3c56 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -56,61 +56,7 @@ endif # BLOCK
 
 source "fs/notify/Kconfig"
 
-config QUOTA
-	bool "Quota support"
-	help
-	  If you say Y here, you will be able to set per user limits for disk
-	  usage (also called disk quotas). Currently, it works for the
-	  ext2, ext3, and reiserfs file system. ext3 also supports journalled
-	  quotas for which you don't need to run quotacheck(8) after an unclean
-	  shutdown.
-	  For further details, read the Quota mini-HOWTO, available from
-	  <http://www.tldp.org/docs.html#howto>, or the documentation provided
-	  with the quota tools. Probably the quota support is only useful for
-	  multi user systems. If unsure, say N.
-
-config QUOTA_NETLINK_INTERFACE
-	bool "Report quota messages through netlink interface"
-	depends on QUOTA && NET
-	help
-	  If you say Y here, quota warnings (about exceeding softlimit, reaching
-	  hardlimit, etc.) will be reported through netlink interface. If unsure,
-	  say Y.
-
-config PRINT_QUOTA_WARNING
-	bool "Print quota warnings to console (OBSOLETE)"
-	depends on QUOTA
-	default y
-	help
-	  If you say Y here, quota warnings (about exceeding softlimit, reaching
-	  hardlimit, etc.) will be printed to the process' controlling terminal.
-	  Note that this behavior is currently deprecated and may go away in
-	  future. Please use notification via netlink socket instead.
-
-# Generic support for tree structured quota files. Seleted when needed.
-config QUOTA_TREE
-	tristate
-
-config QFMT_V1
-	tristate "Old quota format support"
-	depends on QUOTA
-	help
-	  This quota format was (is) used by kernels earlier than 2.4.22. If
-	  you have quota working and you don't want to convert to new quota
-	  format say Y here.
-
-config QFMT_V2
-	tristate "Quota format v2 support"
-	depends on QUOTA
-	select QUOTA_TREE
-	help
-	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
-	  need this functionality say Y here.
-
-config QUOTACTL
-	bool
-	depends on XFS_QUOTA || QUOTA
-	default y
+source "fs/quota/Kconfig"
 
 source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
@@ -120,6 +66,13 @@ config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
 
+menu "Caches"
+
+source "fs/fscache/Kconfig"
+source "fs/cachefiles/Kconfig"
+
+endmenu
+
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
@@ -223,6 +176,8 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 
+source "fs/exofs/Kconfig"
+
 endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index dc20db348679..70b2aed87133 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 	attr.o bad_inode.o file.o filesystems.o namespace.o \
 	seq_file.o xattr.o libfs.o fs-writeback.o \
 	pnode.o drop_caches.o splice.o sync.o utimes.o \
-	stack.o
+	stack.o fs_struct.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -51,11 +51,7 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
 obj-$(CONFIG_NFS_COMMON) += nfs_common/
 obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
 
-obj-$(CONFIG_QUOTA) += dquot.o
-obj-$(CONFIG_QFMT_V1) += quota_v1.o
-obj-$(CONFIG_QFMT_V2) += quota_v2.o
-obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
-obj-$(CONFIG_QUOTACTL) += quota.o
+obj-y += quota/
 
 obj-$(CONFIG_PROC_FS) += proc/
 obj-y += partitions/
@@ -67,6 +63,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o
 obj-$(CONFIG_DLM) += dlm/
 
 # Do not add any filesystems before this line
+obj-$(CONFIG_FSCACHE) += fscache/
 obj-$(CONFIG_REISERFS_FS) += reiserfs/
 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS) += ext2/
@@ -120,7 +117,9 @@ obj-$(CONFIG_AFS_FS) += afs/
 obj-$(CONFIG_BEFS_FS) += befs/
 obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS) += hppfs/
+obj-$(CONFIG_CACHEFILES) += cachefiles/
 obj-$(CONFIG_DEBUG_FS) += debugfs/
 obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS) += btrfs/
 obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_EXOFS_FS) += exofs/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 831157502d5a..e0a85dbeeb88 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -86,7 +86,7 @@ void __adfs_error(struct super_block *sb, const char *function,
 /* dir_*.c */
 extern const struct inode_operations adfs_dir_inode_operations;
 extern const struct file_operations adfs_dir_operations;
-extern struct dentry_operations adfs_dentry_operations;
+extern const struct dentry_operations adfs_dentry_operations;
 extern struct adfs_dir_ops adfs_f_dir_ops;
 extern struct adfs_dir_ops adfs_fplus_dir_ops;
 
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 85a30e929800..e867ccf37246 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -263,7 +263,7 @@ adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name)
 	return 0;
 }
 
-struct dentry_operations adfs_dentry_operations = {
+const struct dentry_operations adfs_dentry_operations = {
 	.d_hash = adfs_hash,
 	.d_compare = adfs_compare,
 };
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7f83a46f2b7e..dd9becca4241 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -219,16 +219,20 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
 
 static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
+	struct super_block *sb = dentry->d_sb;
+	struct adfs_sb_info *sbi = ADFS_SB(sb);
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = ADFS_SUPER_MAGIC;
-	buf->f_namelen = asb->s_namelen;
-	buf->f_bsize = dentry->d_sb->s_blocksize;
-	buf->f_blocks = asb->s_size;
-	buf->f_files = asb->s_ids_per_zone * asb->s_map_size;
+	buf->f_namelen = sbi->s_namelen;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = sbi->s_size;
+	buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size;
 	buf->f_bavail =
-	buf->f_bfree = adfs_map_free(dentry->d_sb);
+	buf->f_bfree = adfs_map_free(sb);
 	buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 
 	return 0;
 }
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index e9ec915f7553..1a2d5e3c7f4e 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -199,8 +199,7 @@ extern const struct address_space_operations affs_symlink_aops;
 extern const struct address_space_operations affs_aops;
 extern const struct address_space_operations affs_aops_ofs;
 
-extern struct dentry_operations affs_dentry_operations;
-extern struct dentry_operations affs_dentry_operations_intl;
+extern const struct dentry_operations affs_dentry_operations;
 
 static inline void
 affs_set_blocksize(struct super_block *sb, int size)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 805573005de6..7d0f0a30f7a3 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -179,14 +179,18 @@ affs_remove_link(struct dentry *dentry)
 	affs_lock_dir(dir);
 	affs_fix_dcache(dentry, link_ino);
 	retval = affs_remove_hash(dir, link_bh);
-	if (retval)
+	if (retval) {
+		affs_unlock_dir(dir);
 		goto done;
+	}
 	mark_buffer_dirty_inode(link_bh, inode);
 
 	memcpy(AFFS_TAIL(sb, bh)->name, AFFS_TAIL(sb, link_bh)->name, 32);
 	retval = affs_insert_hash(dir, bh);
-	if (retval)
+	if (retval) {
+		affs_unlock_dir(dir);
 		goto done;
+	}
 	mark_buffer_dirty_inode(bh, inode);
 
 	affs_unlock_dir(dir);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index cfcf1b6cf82b..960d336ec694 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -19,12 +19,12 @@ static int affs_intl_toupper(int ch);
 static int affs_intl_hash_dentry(struct dentry *, struct qstr *);
 static int affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
 
-struct dentry_operations affs_dentry_operations = {
+const struct dentry_operations affs_dentry_operations = {
 	.d_hash = affs_hash_dentry,
 	.d_compare = affs_compare_dentry,
 };
 
-static struct dentry_operations affs_intl_dentry_operations = {
+static const struct dentry_operations affs_intl_dentry_operations = {
 	.d_hash = affs_intl_hash_dentry,
 	.d_compare = affs_intl_compare_dentry,
 };
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a19d64b582aa..5ce695e707fe 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -533,6 +533,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	int free;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
 		 AFFS_SB(sb)->s_reserved);
@@ -543,6 +544,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_blocks = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved;
 	buf->f_bfree = free;
 	buf->f_bavail = free;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = 30;
 	return 0;
 }
 
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index e7b522fe15e1..5c4e61d3c772 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -19,3 +19,11 @@ config AFS_DEBUG
 	  See <file:Documentation/filesystems/afs.txt> for more information.
 
 	  If unsure, say N.
+
+config AFS_FSCACHE
+	bool "Provide AFS client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
+	help
+	  Say Y here if you want AFS data to be cached locally on disk through
+	  the generic filesystem cache manager
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index a66671082cfb..4f64b95d57bd 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,7 +2,10 @@
 # Makefile for Red Hat Linux AFS client.
 #
 
+afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
+
 kafs-objs := \
+	$(afs-cache-y) \
 	callback.o \
 	cell.o \
 	cmservice.o \
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index de0d7de69edc..e2b1d3f16519 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -1,6 +1,6 @@
 /* AFS caching stuff
  *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,248 +9,395 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
-	.name = "cell_ix",
-	.data_size = sizeof(struct afs_cache_cell),
-	.keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match = afs_cell_cache_match,
-	.update = afs_cell_cache_update,
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen);
+
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vlocation_cache_check_aux(
+	void *cookie_netfs_data, const void *buffer, uint16_t buflen);
+
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen);
+
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size);
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen);
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
+
+struct fscache_netfs afs_cache_netfs = {
+	.name = "afs",
+	.version = 0,
+};
+
+struct fscache_cookie_def afs_cell_cache_index_def = {
+	.name = "AFS.cell",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_cell_cache_get_key,
+	.get_aux = afs_cell_cache_get_aux,
+	.check_aux = afs_cell_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_vlocation_cache_index_def = {
+	.name = "AFS.vldb",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_vlocation_cache_get_key,
+	.get_aux = afs_vlocation_cache_get_aux,
+	.check_aux = afs_vlocation_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_volume_cache_index_def = {
+	.name = "AFS.volume",
+	.type = FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key = afs_volume_cache_get_key,
+};
+
+struct fscache_cookie_def afs_vnode_cache_index_def = {
+	.name = "AFS.vnode",
+	.type = FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key = afs_vnode_cache_get_key,
+	.get_attr = afs_vnode_cache_get_attr,
+	.get_aux = afs_vnode_cache_get_aux,
+	.check_aux = afs_vnode_cache_check_aux,
+	.now_uncached = afs_vnode_cache_now_uncached,
 };
-#endif
 
 /*
- * match a cell record obtained from the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry)
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = target;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%s},{%s}", ccell->name, cell->name);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = strlen(cell->name);
+	if (klen > bufmax)
+		return 0;
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	memcpy(buffer, cell->name, klen);
+	return klen;
 }
-#endif
 
 /*
- * update a cell record in the cache
+ * provide new auxilliary cache data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+				       void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = source;
+	const struct afs_cell *cell = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("%p,%p", source, entry);
+	_enter("%p,%p,%u", cell, buffer, bufmax);
 
-	strncpy(ccell->name, cell->name, sizeof(ccell->name));
+	dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]);
+	dlen = min(dlen, bufmax);
+	dlen &= ~(sizeof(cell->vl_addrs[0]) - 1);
 
-	memcpy(ccell->vl_servers,
-	       cell->vl_addrs,
-	       min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+	memcpy(buffer, cell->vl_addrs, dlen);
+	return dlen;
+}
 
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+						      const void *buffer,
+						      uint16_t buflen)
+{
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vlocation_cache_index_def = {
-	.name = "vldb",
-	.data_size = sizeof(struct afs_cache_vlocation),
-	.keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match = afs_vlocation_cache_match,
-	.update = afs_vlocation_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a VLDB record stored in the cache
- * - may also load target from entry
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry)
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = target;
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name));
+	if (klen > bufmax)
+		return 0;
 
-	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+	memcpy(buffer, vlocation->vldb.name, klen);
 
-	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
-	    ) {
-		if (!vlocation->valid ||
-		    vlocation->vldb.rtime == vldb->rtime
+	_leave(" = %u", klen);
+	return klen;
+}
+
+/*
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+					    void *buffer, uint16_t bufmax)
+{
+	const struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen);
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static
+enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
+						    const void *buffer,
+						    uint16_t buflen)
+{
+	const struct afs_cache_vlocation *cvldb;
+	struct afs_vlocation *vlocation = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(struct afs_cache_vlocation);
+	dlen -= offsetof(struct afs_cache_vlocation, nservers);
+	if (dlen != buflen)
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	cvldb = container_of(buffer, struct afs_cache_vlocation, nservers);
+
+	/* if what's on disk is more valid than what's in memory, then use the
+	 * VL record from the cache */
+	if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) {
+		memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen);
+		vlocation->valid = 1;
+		_leave(" = SUCCESS [c->m]");
+		return FSCACHE_CHECKAUX_OKAY;
+	}
+
+	/* need to update the cache if the cached info differs */
+	if (memcmp(&vlocation->vldb, buffer, dlen) != 0) {
+		/* delete if the volume IDs for this name differ */
+		if (memcmp(&vlocation->vldb.vid, &cvldb->vid,
+			   sizeof(cvldb->vid)) != 0
 		    ) {
-			vlocation->vldb = *vldb;
-			vlocation->valid = 1;
-			_leave(" = SUCCESS [c->m]");
-			return CACHEFS_MATCH_SUCCESS;
-		} else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-			/* delete if VIDs for this name differ */
-			if (memcmp(&vlocation->vldb.vid,
-				   &vldb->vid,
-				   sizeof(vldb->vid)) != 0) {
-				_leave(" = DELETE");
-				return CACHEFS_MATCH_SUCCESS_DELETE;
-			}
-
-			_leave(" = UPDATE");
-			return CACHEFS_MATCH_SUCCESS_UPDATE;
-		} else {
-			_leave(" = SUCCESS");
-			return CACHEFS_MATCH_SUCCESS;
+			_leave(" = OBSOLETE");
+			return FSCACHE_CHECKAUX_OBSOLETE;
 		}
+
+		_leave(" = UPDATE");
+		return FSCACHE_CHECKAUX_NEEDS_UPDATE;
 	}
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = OKAY");
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
+/*****************************************************************************/
 /*
- * update a VLDB record stored in the cache
+ * set the key for the volume index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t bufmax)
 {
-	struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = source;
+	const struct afs_volume *volume = cookie_netfs_data;
+	uint16_t klen;
+
+	_enter("{%u},%p,%u", volume->type, buffer, bufmax);
+
+	klen = sizeof(volume->type);
+	if (klen > bufmax)
+		return 0;
 
-	_enter("");
+	memcpy(buffer, &volume->type, sizeof(volume->type));
+
+	_leave(" = %u", klen);
+	return klen;
 
-	*vldb = vlocation->vldb;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
-	.name = "volume",
-	.data_size = sizeof(struct afs_cache_vhash),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.keys[1] = { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.match = afs_volume_cache_match,
-	.update = afs_volume_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a volume hash record stored in the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry)
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
 {
-	const struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = target;
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t klen;
 
-	_enter("{%u},{%u}", volume->type, vhash->vtype);
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
 
-	if (volume->type == vhash->vtype) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
+	klen = sizeof(vnode->fid.vnode);
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode));
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
+	_leave(" = %u", klen);
+	return klen;
 }
-#endif
 
 /*
- * update a volume hash record stored in the cache
+ * provide updated file attributes
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+				     uint64_t *size)
{
-	struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = source;
+	const struct afs_vnode *vnode = cookie_netfs_data;
 
-	_enter("");
+	_enter("{%x,%x,%llx},",
+	       vnode->fid.vnode, vnode->fid.unique,
+	       vnode->status.data_version);
 
-	vhash->vtype = volume->type;
+	*size = vnode->status.size;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vnode_cache_index_def = {
-	.name = "vnode",
-	.data_size = sizeof(struct afs_cache_vnode),
-	.keys[0] = { CACHEFS_INDEX_KEYS_BIN, 4 },
-	.match = afs_vnode_cache_match,
-	.update = afs_vnode_cache_update,
-};
-#endif
 
 /*
- * match a vnode record stored in the cache
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+					void *buffer, uint16_t bufmax)
+{
+	const struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
+
+	_enter("{%x,%x,%Lx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, bufmax);
+
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen > bufmax)
+		return 0;
+
+	memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique));
+	buffer += sizeof(vnode->fid.unique);
+	memcpy(buffer, &vnode->status.data_version,
+	       sizeof(vnode->status.data_version));
+
+	_leave(" = %u", dlen);
+	return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry)
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+						       const void *buffer,
+						       uint16_t buflen)
 {
-	const struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = target;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	uint16_t dlen;
 
-	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       vnode->status.version,
-	       cvnode->vnode_id,
-	       cvnode->vnode_unique,
-	       cvnode->data_version);
-
-	if (vnode->fid.vnode != cvnode->vnode_id) {
-		_leave(" = FAILED");
-		return CACHEFS_MATCH_FAILED;
+	_enter("{%x,%x,%llx},%p,%u",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+	       buffer, buflen);
+
+	/* check the size of the data is what we're expecting */
+	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+	if (dlen != buflen) {
+		_leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
-	if (vnode->fid.unique != cvnode->vnode_unique ||
-	    vnode->status.version != cvnode->data_version) {
-		_leave(" = DELETE");
-		return CACHEFS_MATCH_SUCCESS_DELETE;
+	if (memcmp(buffer,
+		   &vnode->fid.unique,
+		   sizeof(vnode->fid.unique)
+	    ) != 0) {
+		unsigned unique;
+
+		memcpy(&unique, buffer, sizeof(unique));
+
+		_leave(" = OBSOLETE [uniq %x != %x]",
+		       unique, vnode->fid.unique);
+		return FSCACHE_CHECKAUX_OBSOLETE;
+	}
+
+	if (memcmp(buffer + sizeof(vnode->fid.unique),
+		   &vnode->status.data_version,
+		   sizeof(vnode->status.data_version)
+	    ) != 0) {
+		afs_dataversion_t version;
+
+		memcpy(&version, buffer + sizeof(vnode->fid.unique),
+		       sizeof(version));
+
+		_leave(" = OBSOLETE [vers %llx != %llx]",
+		       version, vnode->status.data_version);
+		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
 	_leave(" = SUCCESS");
-	return CACHEFS_MATCH_SUCCESS;
+	return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
 /*
- * update a vnode record stored in the cache
+ * indication the cookie is no longer uncached
+ * - this function is called when the backing store currently caching a cookie
+ *   is removed
+ * - the netfs should use this to clean up any markers indicating cached pages
+ * - this is mandatory for any object that may have data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
 {
-	struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = source;
+	struct afs_vnode *vnode = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	_enter("{%x,%x,%Lx}",
+	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		/* grab a bunch of pages to clean */
+		nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
 
-	_enter("");
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
 
-	cvnode->vnode_id = vnode->fid.vnode;
-	cvnode->vnode_unique = vnode->fid.unique;
-	cvnode->data_version = vnode->status.version;
+	_leave("");
 }
-#endif
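For orientation, the callbacks defined above plug into the generic FS-Cache netfs API introduced by this merge. A sketch of how a client filesystem is expected to drive that API, mirroring the afs_cell_create()/afs_cell_destroy() changes below; the function names and error handling here are illustrative assumptions, not part of the patch:

#ifdef CONFIG_AFS_FSCACHE
/* illustrative only: register with FS-Cache, then hang an index cookie
 * off a cell using the cookie definitions from cache.c above */
static int example_cache_init(struct afs_cell *cell)
{
	int ret;

	/* make the netfs known to FS-Cache and obtain the primary index */
	ret = fscache_register_netfs(&afs_cache_netfs);
	if (ret < 0)
		return ret;

	/* acquiring an index cookie never fails, so no error check here */
	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
					     &afs_cell_cache_index_def,
					     cell);
	return 0;
}

static void example_cache_exit(struct afs_cell *cell)
{
	/* 0 => release the cookie but keep the cached data on disk */
	fscache_relinquish_cookie(cell->cache, 0);
	fscache_unregister_netfs(&afs_cache_netfs);
}
#endif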
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 36a3642cf90e..5c4f6b499e90 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,6 +1,6 @@
 /* AFS local cache management interface
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,15 +9,4 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef AFS_CACHE_H
-#define AFS_CACHE_H
-
-#undef AFS_CACHING_SUPPORT
-
-#include <linux/mm.h>
-#ifdef AFS_CACHING_SUPPORT
-#include <linux/cachefs.h>
-#endif
-#include "types.h"
-
-#endif /* AFS_CACHE_H */
+#include <linux/fscache.h>
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 5e1df14e16b1..e19c13f059ed 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -147,12 +147,11 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist)
 	if (ret < 0)
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* put it up for caching */
-	cachefs_acquire_cookie(afs_cache_netfs.primary_index,
-			       &afs_vlocation_cache_index_def,
-			       cell,
-			       &cell->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	/* put it up for caching (this never returns an error) */
+	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
+					     &afs_cell_cache_index_def,
+					     cell);
 #endif
 
 	/* add to the cell lists */
@@ -362,10 +361,9 @@ static void afs_cell_destroy(struct afs_cell *cell)
 	list_del_init(&cell->proc_link);
 	up_write(&afs_proc_cells_sem);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(cell->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(cell->cache, 0);
 #endif
-
 	key_put(cell->anonymous_key);
 	kfree(cell);
 
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 99cf390641f7..9bd757774c9e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -62,7 +62,7 @@ const struct inode_operations afs_dir_inode_operations = {
 	.setattr = afs_setattr,
 };
 
-static struct dentry_operations afs_fs_dentry_operations = {
+static const struct dentry_operations afs_fs_dentry_operations = {
 	.d_revalidate = afs_d_revalidate,
 	.d_delete = afs_d_delete,
 	.d_release = afs_d_release,
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a3901769a96c..7a1d942ef68d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -23,6 +23,9 @@ static void afs_invalidatepage(struct page *page, unsigned long offset);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
+static int afs_readpages(struct file *filp, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages);
+
 const struct file_operations afs_file_operations = {
 	.open = afs_open,
 	.release = afs_release,
@@ -46,6 +49,7 @@ const struct inode_operations afs_file_inode_operations = {
 
 const struct address_space_operations afs_fs_aops = {
 	.readpage = afs_readpage,
+	.readpages = afs_readpages,
 	.set_page_dirty = afs_set_page_dirty,
 	.launder_page = afs_launder_page,
 	.releasepage = afs_releasepage,
@@ -101,37 +105,18 @@ int afs_release(struct inode *inode, struct file *file)
 /*
  * deal with notification that a page was read from the cache
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_read_complete(void *cookie_data,
-				       struct page *page,
-				       void *data,
-				       int error)
+static void afs_file_readpage_read_complete(struct page *page,
+					    void *data,
+					    int error)
 {
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
+	_enter("%p,%p,%d", page, data, error);
 
-	if (error)
-		SetPageError(page);
-	else
+	/* if the read completes with an error, we just unlock the page and let
+	 * the VM reissue the readpage */
+	if (!error)
 		SetPageUptodate(page);
 	unlock_page(page);
-
 }
-#endif
-
-/*
- * deal with notification that a page was written to the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_write_complete(void *cookie_data,
-					struct page *page,
-					void *data,
-					int error)
-{
-	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
-
-	unlock_page(page);
-}
-#endif
 
 /*
  * AFS read page from file, directory or symlink
@@ -161,9 +146,9 @@ static int afs_readpage(struct file *file, struct page *page)
 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 		goto error;
 
-#ifdef AFS_CACHING_SUPPORT
 	/* is it cached? */
-	ret = cachefs_read_or_alloc_page(vnode->cache,
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_page(vnode->cache,
 					 page,
 					 afs_file_readpage_read_complete,
 					 NULL,
@@ -171,20 +156,21 @@ static int afs_readpage(struct file *file, struct page *page)
 #else
 	ret = -ENOBUFS;
 #endif
-
 	switch (ret) {
-		/* read BIO submitted and wb-journal entry found */
-	case 1:
-		BUG(); // TODO - handle wb-journal match
-
 		/* read BIO submitted (page in cache) */
 	case 0:
 		break;
 
-		/* no page available in cache */
-	case -ENOBUFS:
+		/* page not yet cached */
 	case -ENODATA:
+		_debug("cache said ENODATA");
+		goto go_on;
+
+		/* page will not be cached */
+	case -ENOBUFS:
+		_debug("cache said ENOBUFS");
 	default:
+	go_on:
 		offset = page->index << PAGE_CACHE_SHIFT;
 		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
 
@@ -198,27 +184,25 @@ static int afs_readpage(struct file *file, struct page *page)
 				set_bit(AFS_VNODE_DELETED, &vnode->flags);
 				ret = -ESTALE;
 			}
-#ifdef AFS_CACHING_SUPPORT
-			cachefs_uncache_page(vnode->cache, page);
+
+#ifdef CONFIG_AFS_FSCACHE
+			fscache_uncache_page(vnode->cache, page);
 #endif
+			BUG_ON(PageFsCache(page));
 			goto error;
 		}
 
 		SetPageUptodate(page);
 
-#ifdef AFS_CACHING_SUPPORT
-		if (cachefs_write_page(vnode->cache,
-				       page,
-				       afs_file_readpage_write_complete,
-				       NULL,
-				       GFP_KERNEL) != 0
-		    ) {
-			cachefs_uncache_page(vnode->cache, page);
-			unlock_page(page);
+		/* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+		if (PageFsCache(page) &&
+		    fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+			fscache_uncache_page(vnode->cache, page);
+			BUG_ON(PageFsCache(page));
 		}
-#else
-		unlock_page(page);
 #endif
+		unlock_page(page);
 	}
 
 	_leave(" = 0");
@@ -232,34 +216,59 @@ error:
 }
 
 /*
- * invalidate part or all of a page
+ * read a set of pages
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static int afs_readpages(struct file *file, struct address_space *mapping,
+			 struct list_head *pages, unsigned nr_pages)
 {
-	int ret = 1;
+	struct afs_vnode *vnode;
+	int ret = 0;
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
 
-	BUG_ON(!PageLocked(page));
+	vnode = AFS_FS_I(mapping->host);
+	if (vnode->flags & AFS_VNODE_DELETED) {
+		_leave(" = -ESTALE");
+		return -ESTALE;
+	}
 
-	if (PagePrivate(page)) {
-		/* We release buffers only if the entire page is being
-		 * invalidated.
-		 * The get_block cached value has been unconditionally
-		 * invalidated, so real IO is not possible anymore.
-		 */
-		if (offset == 0) {
-			BUG_ON(!PageLocked(page));
-
-			ret = 0;
-			if (!PageWriteback(page))
-				ret = page->mapping->a_ops->releasepage(page,
-									0);
-			/* possibly should BUG_ON(!ret); - neilb */
-		}
+	/* attempt to read as many of the pages as possible */
+#ifdef CONFIG_AFS_FSCACHE
+	ret = fscache_read_or_alloc_pages(vnode->cache,
+					  mapping,
+					  pages,
+					  &nr_pages,
+					  afs_file_readpage_read_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+#else
+	ret = -ENOBUFS;
+#endif
+
+	switch (ret) {
+		/* all pages are being read from the cache */
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(nr_pages != 0);
+		_leave(" = 0 [reading all]");
+		return 0;
+
+		/* there were pages that couldn't be read from the cache */
+	case -ENODATA:
+	case -ENOBUFS:
+		break;
+
+		/* other error */
+	default:
+		_leave(" = %d", ret);
+		return ret;
 	}
 
-	_leave(" = %d", ret);
+	/* load the missing pages from the network */
+	ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+
+	_leave(" = %d [netting]", ret);
+	return ret;
 }
 
 /*
@@ -273,25 +282,82 @@ static int afs_launder_page(struct page *page)
 }
 
 /*
- * release a page and cleanup its private data
+ * invalidate part or all of a page
+ * - release a page and clean up its private data if offset is 0 (indicating
+ *   the entire page)
+ */
+static void afs_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+
+	_enter("{%lu},%lu", page->index, offset);
+
+	BUG_ON(!PageLocked(page));
+
+	/* we clean up only if the entire page is being invalidated */
+	if (offset == 0) {
+#ifdef CONFIG_AFS_FSCACHE
+		if (PageFsCache(page)) {
+			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+			fscache_wait_on_page_write(vnode->cache, page);
+			fscache_uncache_page(vnode->cache, page);
+			ClearPageFsCache(page);
+		}
+#endif
+
+		if (PagePrivate(page)) {
+			if (wb && !PageWriteback(page)) {
+				set_page_private(page, 0);
+				afs_put_writeback(wb);
+			}
+
+			if (!page_private(page))
+				ClearPagePrivate(page);
+		}
+	}
+
+	_leave("");
+}
+
+/*
+ * release a page and clean up its private state if it's not busy
+ * - return true if the page can now be released, false if not
  */
 static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
+	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-	struct afs_writeback *wb;
 
 	_enter("{{%x:%u}[%lu],%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
 	       gfp_flags);
 
+	/* deny if page is being written to the cache and the caller hasn't
+	 * elected to wait */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page)) {
+		if (fscache_check_page_write(vnode->cache, page)) {
+			if (!(gfp_flags & __GFP_WAIT)) {
+				_leave(" = F [cache busy]");
+				return 0;
+			}
+			fscache_wait_on_page_write(vnode->cache, page);
+		}
+
+		fscache_uncache_page(vnode->cache, page);
+		ClearPageFsCache(page);
+	}
+#endif
+
 	if (PagePrivate(page)) {
-		wb = (struct afs_writeback *) page_private(page);
-		ASSERT(wb != NULL);
-		set_page_private(page, 0);
+		if (wb) {
+			set_page_private(page, 0);
+			afs_put_writeback(wb);
+		}
 		ClearPagePrivate(page);
-		afs_put_writeback(wb);
 	}
 
-	_leave(" = 0");
-	return 0;
+	/* indicate that the page can be released */
+	_leave(" = T");
+	return 1;
 }
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index bb47217f6a18..c048f0658751 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -61,6 +61,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 		return -EBADMSG;
 	}
 
+#ifdef CONFIG_AFS_FSCACHE
+	if (vnode->status.size != inode->i_size)
+		fscache_attr_changed(vnode->cache);
+#endif
+
 	inode->i_nlink		= vnode->status.nlink;
 	inode->i_uid		= vnode->status.owner;
 	inode->i_gid		= 0;
@@ -149,15 +154,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		return inode;
 	}
 
-#ifdef AFS_CACHING_SUPPORT
-	/* set up caching before reading the status, as fetch-status reads the
-	 * first page of symlinks to see if they're really mntpts */
-	cachefs_acquire_cookie(vnode->volume->cache,
-			       NULL,
-			       vnode,
-			       &vnode->cache);
-#endif
-
 	if (!status) {
 		/* it's a remotely extant inode */
 		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
@@ -183,6 +179,15 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 		}
 	}
 
+	/* set up caching before mapping the status, as map-status reads the
+	 * first page of symlinks to see if they're really mountpoints */
+	inode->i_size = vnode->status.size;
+#ifdef CONFIG_AFS_FSCACHE
+	vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+					      &afs_vnode_cache_index_def,
+					      vnode);
+#endif
+
 	ret = afs_inode_map_status(vnode, key);
 	if (ret < 0)
 		goto bad_inode;
@@ -196,6 +201,10 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 
 	/* failure */
 bad_inode:
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
+	vnode->cache = NULL;
+#endif
 	iget_failed(inode);
 	_leave(" = %d [bad]", ret);
 	return ERR_PTR(ret);
@@ -340,8 +349,8 @@ void afs_clear_inode(struct inode *inode)
 	ASSERT(list_empty(&vnode->writebacks));
 	ASSERT(!vnode->cb_promised);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vnode->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vnode->cache, 0);
 	vnode->cache = NULL;
 #endif
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 67f259d99cd6..106be66dafd2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
 
 #include "afs.h"
 #include "afs_vl.h"
+#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
@@ -193,8 +194,8 @@ struct afs_cell {
 	struct key	*anonymous_key;	/* anonymous user key for this cell */
 	struct list_head proc_link;	/* /proc cell list link */
 	struct proc_dir_entry *proc_dir; /* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie *cache;	/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie *cache;	/* caching cookie */
 #endif
 
 	/* server record management */
@@ -249,8 +250,8 @@ struct afs_vlocation {
 	struct list_head grave;		/* link in master graveyard list */
 	struct list_head update;	/* link in master update list */
 	struct afs_cell *cell;		/* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie *cache;	/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie *cache;	/* caching cookie */
 #endif
 	struct afs_cache_vlocation vldb; /* volume information DB record */
 	struct afs_volume *vols[3];	/* volume access record pointer (index by type) */
@@ -302,8 +303,8 @@ struct afs_volume {
 	atomic_t		usage;
 	struct afs_cell		*cell;		/* cell to which belongs (unrefd ptr) */
 	struct afs_vlocation	*vlocation;	/* volume location */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	afs_volid_t		vid;		/* volume ID */
 	afs_voltype_t		type;		/* type of volume */
@@ -333,8 +334,8 @@ struct afs_vnode {
 	struct afs_server	*server;	/* server currently supplying this file */
 	struct afs_fid		fid;		/* the file identifier for this inode */
 	struct afs_file_status	status;		/* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
 	struct afs_permits	*permits;	/* cache of permits so far obtained */
 	struct mutex		permits_lock;	/* lock for altering permits list */
@@ -428,6 +429,22 @@ struct afs_uuid {
 
 /*****************************************************************************/
 /*
+ * cache.c
+ */
+#ifdef CONFIG_AFS_FSCACHE
+extern struct fscache_netfs afs_cache_netfs;
+extern struct fscache_cookie_def afs_cell_cache_index_def;
+extern struct fscache_cookie_def afs_vlocation_cache_index_def;
+extern struct fscache_cookie_def afs_volume_cache_index_def;
+extern struct fscache_cookie_def afs_vnode_cache_index_def;
+#else
+#define afs_cell_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vlocation_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_volume_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#define afs_vnode_cache_index_def	(*(struct fscache_cookie_def *) NULL)
+#endif
+
+/*
  * callback.c
  */
 extern void afs_init_callback_state(struct afs_server *);
@@ -446,9 +463,6 @@ extern void afs_callback_update_kill(void);
  */
 extern struct rw_semaphore afs_proc_cells_sem;
 extern struct list_head afs_proc_cells;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_cache_cell_index_def;
-#endif
 
 #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
 extern int afs_cell_init(char *);
@@ -554,9 +568,6 @@ extern void afs_clear_inode(struct inode *);
  * main.c
  */
 extern struct afs_uuid afs_uuid;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_netfs afs_cache_netfs;
-#endif
 
 /*
  * misc.c
@@ -637,10 +648,6 @@ extern int afs_get_MAC_address(u8 *, size_t);
 /*
  * vlclient.c
  */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vlocation_cache_index_def;
-#endif
-
 extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
 				    const char *, struct afs_cache_vlocation *,
 				    const struct afs_wait_mode *);
@@ -664,12 +671,6 @@ extern void afs_vlocation_purge(void);
 /*
  * vnode.c
  */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-
 static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
 {
 	return container_of(inode, struct afs_vnode, vfs_inode);
@@ -711,10 +712,6 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
 /*
  * volume.c
  */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_volume_cache_index_def;
-#endif
-
 #define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
 
 extern void afs_put_volume(struct afs_volume *);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 2d3e5d4fb9f7..66d54d348c55 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,6 +1,6 @@
 /* AFS client file system
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -29,18 +29,6 @@ static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-#ifdef AFS_CACHING_SUPPORT
-static struct cachefs_netfs_operations afs_cache_ops = {
-	.get_page_cookie	= afs_cache_get_page_cookie,
-};
-
-struct cachefs_netfs afs_cache_netfs = {
-	.name			= "afs",
-	.version		= 0,
-	.ops			= &afs_cache_ops,
-};
-#endif
-
 struct afs_uuid afs_uuid;
 
 /*
@@ -104,10 +92,9 @@ static int __init afs_init(void)
 	if (ret < 0)
 		return ret;
 
-#ifdef AFS_CACHING_SUPPORT
+#ifdef CONFIG_AFS_FSCACHE
 	/* we want to be able to cache */
-	ret = cachefs_register_netfs(&afs_cache_netfs,
-				     &afs_cache_cell_index_def);
+	ret = fscache_register_netfs(&afs_cache_netfs);
 	if (ret < 0)
 		goto error_cache;
 #endif
@@ -142,8 +129,8 @@ error_fs:
 error_open_socket:
 error_vl_update_init:
 error_cell_init:
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
 	afs_callback_update_kill();
@@ -175,8 +162,8 @@ static void __exit afs_exit(void)
 	afs_vlocation_purge();
 	flush_scheduled_work();
 	afs_cell_purge();
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_unregister_netfs(&afs_cache_netfs);
 #endif
 	afs_proc_cleanup();
 	rcu_barrier();
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 78db4953a800..2b9e2d03a390 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -173,9 +173,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 		if (PageError(page))
 			goto error;
 
-		buf = kmap(page);
+		buf = kmap_atomic(page, KM_USER0);
 		memcpy(devname, buf, size);
-		kunmap(page);
+		kunmap_atomic(buf, KM_USER0);
 		page_cache_release(page);
 		page = NULL;
 
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 7578c1ab9e0b..8630615e57fe 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -146,7 +146,6 @@ int afs_proc_init(void)
 	proc_afs = proc_mkdir("fs/afs", NULL);
 	if (!proc_afs)
 		goto error_dir;
-	proc_afs->owner = THIS_MODULE;
 
 	p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops);
 	if (!p)
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 849fc3160cb5..ec2a7431e458 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -281,9 +281,8 @@ static void afs_vlocation_apply_update(struct afs_vlocation *vl,
 
 	vl->vldb = *vldb;
 
-#ifdef AFS_CACHING_SUPPORT
-	/* update volume entry in local cache */
-	cachefs_update_cookie(vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
 #endif
 }
 
@@ -304,11 +303,9 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
 	memset(&vldb, 0, sizeof(vldb));
 
 	/* see if we have an in-cache copy (will set vl->valid if there is) */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(cell->cache,
-			       &afs_volume_cache_index_def,
-			       vlocation,
-			       &vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	vl->cache = fscache_acquire_cookie(vl->cell->cache,
+					   &afs_vlocation_cache_index_def, vl);
 #endif
 
 	if (vl->valid) {
@@ -420,6 +417,11 @@ fill_in_record:
 	spin_unlock(&vl->lock);
 	wake_up(&vl->waitq);
 
+	/* update volume entry in local cache */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_update_cookie(vl->cache);
+#endif
+
 	/* schedule for regular updates */
 	afs_vlocation_queue_for_updates(vl);
 	goto success;
@@ -465,7 +467,7 @@ found_in_memory:
 	spin_unlock(&vl->lock);
 
 success:
-	_leave(" = %p",vl);
+	_leave(" = %p", vl);
 	return vl;
 
 error_abandon:
@@ -523,10 +525,9 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl)
 {
 	_enter("%p", vl);
 
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vl->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(vl->cache, 0);
 #endif
-
 	afs_put_cell(vl->cell);
 	kfree(vl);
 }
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 8bab0e3437f9..a353e69e2391 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -124,13 +124,11 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	}
 
 	/* attach the cache and volume location */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_acquire_cookie(vlocation->cache,
-			       &afs_vnode_cache_index_def,
-			       volume,
-			       &volume->cache);
+#ifdef CONFIG_AFS_FSCACHE
+	volume->cache = fscache_acquire_cookie(vlocation->cache,
+					       &afs_volume_cache_index_def,
+					       volume);
 #endif
-
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
 
@@ -194,8 +192,8 @@ void afs_put_volume(struct afs_volume *volume)
 	up_write(&vlocation->cell->vl_sem);
 
 	/* finish cleaning up the volume */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(volume->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_relinquish_cookie(volume->cache, 0);
 #endif
 	afs_put_vlocation(vlocation);
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3fb36d433621..c2e7a7ff0080 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -780,3 +780,24 @@ int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	_leave(" = %d", ret);
 	return ret;
 }
+
+/*
+ * notification that a previously read-only page is about to become writable
+ * - if it returns an error, the caller will deliver a bus error signal
+ */
+int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
+
+	_enter("{{%x:%u}},{%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, page->index);
+
+	/* wait for the page to be written to the cache before we allow it to
+	 * be modified */
+#ifdef CONFIG_AFS_FSCACHE
+	fscache_wait_on_page_write(vnode->cache, page);
+#endif
+
+	_leave(" = 0");
+	return 0;
+}
diff --git a/fs/aio.c b/fs/aio.c
index 8fa77e233944..76da12537956 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -443,7 +443,7 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx)
 	req->private = NULL;
 	req->ki_iovec = NULL;
 	INIT_LIST_HEAD(&req->ki_run_list);
-	req->ki_eventfd = ERR_PTR(-EINVAL);
+	req->ki_eventfd = NULL;
 
 	/* Check if the completion queue has enough free space to
 	 * accept an event from this io.
@@ -485,8 +485,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
-	if (!IS_ERR(req->ki_eventfd))
-		fput(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -508,8 +506,11 @@ static void aio_fput_routine(struct work_struct *data)
 		list_del(&req->ki_list);
 		spin_unlock_irq(&fput_lock);
 
-		/* Complete the fput */
-		__fput(req->ki_filp);
+		/* Complete the fput(s) */
+		if (req->ki_filp != NULL)
+			__fput(req->ki_filp);
+		if (req->ki_eventfd != NULL)
+			__fput(req->ki_eventfd);
 
 		/* Link the iocb into the context's free list */
 		spin_lock_irq(&ctx->ctx_lock);
@@ -527,12 +528,14 @@ static void aio_fput_routine(struct work_struct *data)
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
+	int schedule_putreq = 0;
+
 	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
 		req, atomic_long_read(&req->ki_filp->f_count));
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	req->ki_users --;
+	req->ki_users--;
 	BUG_ON(req->ki_users < 0);
 	if (likely(req->ki_users))
 		return 0;
@@ -540,10 +543,23 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 	req->ki_cancel = NULL;
 	req->ki_retry = NULL;
 
-	/* Must be done under the lock to serialise against cancellation.
-	 * Call this aio_fput as it duplicates fput via the fput_work.
+	/*
+	 * Try to optimize the aio and eventfd file* puts, by avoiding to
+	 * schedule work in case it is not __fput() time. In normal cases,
+	 * we would not be holding the last reference to the file*, so
+	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
+		schedule_putreq++;
+	else
+		req->ki_filp = NULL;
+	if (req->ki_eventfd != NULL) {
+		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
+			schedule_putreq++;
+		else
+			req->ki_eventfd = NULL;
+	}
+	if (unlikely(schedule_putreq)) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
@@ -571,7 +587,7 @@ int aio_put_req(struct kiocb *req)
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
 	struct mm_struct *mm = current->mm;
-	struct kioctx *ctx = NULL;
+	struct kioctx *ctx, *ret = NULL;
 	struct hlist_node *n;
 
 	rcu_read_lock();
@@ -579,12 +595,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
 		if (ctx->user_id == ctx_id && !ctx->dead) {
 			get_ioctx(ctx);
+			ret = ctx;
 			break;
 		}
 	}
 
 	rcu_read_unlock();
-	return ctx;
+	return ret;
 }
 
 /*
@@ -1009,7 +1026,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
 	 * eventfd. The eventfd_signal() function is safe to be called
 	 * from IRQ context.
 	 */
-	if (!IS_ERR(iocb->ki_eventfd))
+	if (iocb->ki_eventfd != NULL)
 		eventfd_signal(iocb->ki_eventfd, 1);
 
 put_rq:
@@ -1608,6 +1625,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
+			req->ki_eventfd = NULL;
 			goto out_put_req;
 		}
 	}
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 3bbdb9d02376..1dd96d4406c0 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -48,7 +48,7 @@ static struct file_system_type anon_inode_fs_type = {
 	.get_sb		= anon_inodefs_get_sb,
 	.kill_sb	= kill_anon_super,
 };
-static struct dentry_operations anon_inodefs_dentry_operations = {
+static const struct dentry_operations anon_inodefs_dentry_operations = {
 	.d_delete	= anon_inodefs_delete_dentry,
 };
 
diff --git a/fs/attr.c b/fs/attr.c
index f4360192a938..9fe1b1bd30a8 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -173,7 +173,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	if (!error) {
 		if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 		    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
-			error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+			error = vfs_dq_transfer(inode, attr) ?
+				-EDQUOT : 0;
 		if (!error)
 			error = inode_setattr(inode, attr);
 	}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8aacade56956..4a1401cea0a1 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -192,7 +192,7 @@ static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
 	return 1;
 }
 
-static struct dentry_operations autofs_dentry_operations = {
+static const struct dentry_operations autofs_dentry_operations = {
 	.d_revalidate = autofs_revalidate,
 };
 
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a76803108d06..b7ff33c63101 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -186,6 +186,8 @@ int autofs4_expire_wait(struct dentry *dentry);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
 		       struct autofs_sb_info *,
 		       struct autofs_packet_expire __user *);
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when);
 int autofs4_expire_multi(struct super_block *, struct vfsmount *,
 			 struct autofs_sb_info *, int __user *);
 struct dentry *autofs4_expire_direct(struct super_block *sb,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 025e105bffea..9e5ae8a4f5c8 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -525,40 +525,13 @@ static int autofs_dev_ioctl_expire(struct file *fp,
 				   struct autofs_sb_info *sbi,
 				   struct autofs_dev_ioctl *param)
 {
-	struct dentry *dentry;
 	struct vfsmount *mnt;
-	int err = -EAGAIN;
 	int how;
 
 	how = param->expire.how;
 	mnt = fp->f_path.mnt;
 
-	if (autofs_type_trigger(sbi->type))
-		dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
-	else
-		dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
-
-	if (dentry) {
-		struct autofs_info *ino = autofs4_dentry_ino(dentry);
-
-		/*
-		 * This is synchronous because it makes the daemon a
-		 * little easier
-		 */
-		err = autofs4_wait(sbi, dentry, NFY_EXPIRE);
-
-		spin_lock(&sbi->fs_lock);
-		if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
-			sbi->sb->s_root->d_mounted++;
-		}
-		ino->flags &= ~AUTOFS_INF_EXPIRING;
-		complete_all(&ino->expire_complete);
-		spin_unlock(&sbi->fs_lock);
-		dput(dentry);
-	}
-
-	return err;
+	return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how);
 }
 
 /* Check if autofs mount point is in use */
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index e3bd50776f9e..75f7ddacf7d6 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -478,22 +478,16 @@ int autofs4_expire_run(struct super_block *sb,
 	return ret;
 }
 
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
-   more to be done */
-int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
-			struct autofs_sb_info *sbi, int __user *arg)
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			    struct autofs_sb_info *sbi, int when)
 {
 	struct dentry *dentry;
 	int ret = -EAGAIN;
-	int do_now = 0;
-
-	if (arg && get_user(do_now, arg))
-		return -EFAULT;
 
 	if (autofs_type_trigger(sbi->type))
-		dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
+		dentry = autofs4_expire_direct(sb, mnt, sbi, when);
 	else
-		dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
+		dentry = autofs4_expire_indirect(sb, mnt, sbi, when);
 
 	if (dentry) {
 		struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -516,3 +510,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	return ret;
 }
 
+/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
+   more to be done */
+int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+			 struct autofs_sb_info *sbi, int __user *arg)
+{
+	int do_now = 0;
+
+	if (arg && get_user(do_now, arg))
+		return -EFAULT;
+
+	return autofs4_do_expire_multi(sb, mnt, sbi, do_now);
+}
+
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 716e12b627b2..69c8142da838 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -310,7 +310,7 @@ static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
 	return ino;
 }
 
-static struct dentry_operations autofs4_sb_dentry_operations = {
+static const struct dentry_operations autofs4_sb_dentry_operations = {
 	.d_release      = autofs4_dentry_release,
 };
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2a41c2a7fc52..e383bf0334f1 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -349,13 +349,13 @@ void autofs4_dentry_release(struct dentry *de)
 }
 
 /* For dentries of directories in the root dir */
-static struct dentry_operations autofs4_root_dentry_operations = {
+static const struct dentry_operations autofs4_root_dentry_operations = {
 	.d_revalidate	= autofs4_revalidate,
 	.d_release	= autofs4_dentry_release,
 };
 
 /* For other dentries */
-static struct dentry_operations autofs4_dentry_operations = {
+static const struct dentry_operations autofs4_dentry_operations = {
 	.d_revalidate	= autofs4_revalidate,
 	.d_release	= autofs4_dentry_release,
 };
@@ -485,22 +485,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
 		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name);
-	if (expiring) {
-		/*
-		 * If we are racing with expire the request might not
-		 * be quite complete but the directory has been removed
-		 * so it must have been successful, so just wait for it.
-		 */
-		ino = autofs4_dentry_ino(expiring);
-		autofs4_expire_wait(expiring);
-		spin_lock(&sbi->lookup_lock);
-		if (!list_empty(&ino->expiring))
-			list_del_init(&ino->expiring);
-		spin_unlock(&sbi->lookup_lock);
-		dput(expiring);
-	}
-
 	unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
 	if (unhashed)
 		dentry = unhashed;
@@ -538,14 +522,31 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	}
 
 	if (!oz_mode) {
+		mutex_unlock(&dir->i_mutex);
+		expiring = autofs4_lookup_expiring(sbi,
+						   dentry->d_parent,
+						   &dentry->d_name);
+		if (expiring) {
+			/*
+			 * If we are racing with expire the request might not
+			 * be quite complete but the directory has been removed
+			 * so it must have been successful, so just wait for it.
+			 */
+			ino = autofs4_dentry_ino(expiring);
+			autofs4_expire_wait(expiring);
+			spin_lock(&sbi->lookup_lock);
+			if (!list_empty(&ino->expiring))
+				list_del_init(&ino->expiring);
+			spin_unlock(&sbi->lookup_lock);
+			dput(expiring);
+		}
+
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
 		spin_unlock(&dentry->d_lock);
-		if (dentry->d_op && dentry->d_op->d_revalidate) {
-			mutex_unlock(&dir->i_mutex);
+		if (dentry->d_op && dentry->d_op->d_revalidate)
 			(dentry->d_op->d_revalidate)(dentry, nd);
-			mutex_lock(&dir->i_mutex);
-		}
+		mutex_lock(&dir->i_mutex);
 	}
 
 	/*
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index d06cb023ad02..76afd0d6b86c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -900,6 +900,7 @@ static int
 befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	befs_debug(sb, "---> befs_statfs()");
 
@@ -910,6 +911,8 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_files = 0;	/* UNKNOWN */
 	buf->f_ffree = 0;	/* UNKNOWN */
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = BEFS_NAME_LEN;
 
 	befs_debug(sb, "<--- befs_statfs()");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 33b7235f853b..40381df34869 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -12,8 +12,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/errno.h>
@@ -21,20 +19,15 @@
 #include <linux/binfmts.h>
 #include <linux/string.h>
 #include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
 #include <linux/slab.h>
-#include <linux/shm.h>
 #include <linux/personality.h>
 #include <linux/elfcore.h>
 #include <linux/init.h>
 #include <linux/highuid.h>
-#include <linux/smp.h>
 #include <linux/compiler.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/security.h>
-#include <linux/syscalls.h>
 #include <linux/random.h>
 #include <linux/elf.h>
 #include <linux/utsname.h>
@@ -576,7 +569,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata;
 	unsigned long elf_bss, elf_brk;
-	int elf_exec_fileno;
 	int retval, i;
 	unsigned int size;
 	unsigned long elf_entry;
@@ -631,12 +623,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto out_free_ph;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free_ph;
-	get_file(bprm->file);
-	fd_install(elf_exec_fileno = retval, bprm->file);
-
 	elf_ppnt = elf_phdata;
 	elf_bss = 0;
 	elf_brk = 0;
@@ -655,13 +641,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			retval = -ENOEXEC;
 			if (elf_ppnt->p_filesz > PATH_MAX ||
 			    elf_ppnt->p_filesz < 2)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = -ENOMEM;
 			elf_interpreter = kmalloc(elf_ppnt->p_filesz,
 						  GFP_KERNEL);
 			if (!elf_interpreter)
-				goto out_free_file;
+				goto out_free_ph;
 
 			retval = kernel_read(bprm->file, elf_ppnt->p_offset,
 					     elf_interpreter,
@@ -956,8 +942,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 	kfree(elf_phdata);
 
-	sys_close(elf_exec_fileno);
-
 	set_binfmt(&elf_format);
 
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
@@ -1028,8 +1012,6 @@ out_free_dentry:
 		fput(interpreter);
 out_free_interp:
 	kfree(elf_interpreter);
-out_free_file:
-	sys_close(elf_exec_fileno);
 out_free_ph:
 	kfree(elf_phdata);
 	goto out;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f3e72c5c19f5..70cfc4b84ae0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -972,9 +972,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 	params->elfhdr_addr = seg->addr;
 
 	/* clear any space allocated but not loaded */
-	if (phdr->p_filesz < phdr->p_memsz)
-		clear_user((void *) (seg->addr + phdr->p_filesz),
+	if (phdr->p_filesz < phdr->p_memsz) {
+		ret = clear_user((void *) (seg->addr + phdr->p_filesz),
 			   phdr->p_memsz - phdr->p_filesz);
+		if (ret)
+			return ret;
+	}
 
 	if (mm) {
 		if (phdr->p_flags & PF_X) {
@@ -1014,7 +1017,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 	struct elf32_fdpic_loadseg *seg;
 	struct elf32_phdr *phdr;
 	unsigned long load_addr, delta_vaddr;
-	int loop, dvset;
+	int loop, dvset, ret;
 
 	load_addr = params->load_addr;
 	delta_vaddr = 0;
@@ -1114,7 +1117,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 			 * PT_LOAD */
 			if (prot & PROT_WRITE && disp > 0) {
 				kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-				clear_user((void __user *) maddr, disp);
+				ret = clear_user((void __user *) maddr, disp);
+				if (ret)
+					return ret;
 				maddr += disp;
 			}
 
@@ -1149,15 +1154,19 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			clear_user((void __user *) maddr + phdr->p_filesz,
-				   excess1);
+			ret = clear_user((void __user *) maddr + phdr->p_filesz,
+					 excess1);
+			if (ret)
+				return ret;
 		}
 
 #else
 		if (excess > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess);
-			clear_user((void *) maddr + phdr->p_filesz, excess);
+			ret = clear_user((void *) maddr + phdr->p_filesz, excess);
+			if (ret)
+				return ret;
 		}
 #endif
 
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 08644a61616e..eff74b9c9e77 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -188,7 +188,6 @@ out:
 static int
 load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 {
-	int som_exec_fileno;
 	int retval;
 	unsigned int size;
 	unsigned long som_entry;
@@ -220,12 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		goto out_free;
 	}
 
-	retval = get_unused_fd();
-	if (retval < 0)
-		goto out_free;
-	get_file(bprm->file);
-	fd_install(som_exec_fileno = retval, bprm->file);
-
 	/* Flush all traces of the currently running executable */
 	retval = flush_old_exec(bprm);
 	if (retval)
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 549b0144da11..31c46a241bac 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -26,23 +26,23 @@
 #include <linux/workqueue.h>
 
 static struct kmem_cache *bio_integrity_slab __read_mostly;
+static mempool_t *bio_integrity_pool;
+static struct bio_set *integrity_bio_set;
 static struct workqueue_struct *kintegrityd_wq;
 
 /**
- * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
  * @bio: bio to attach integrity metadata to
  * @gfp_mask: Memory allocation mask
  * @nr_vecs: Number of integrity metadata scatter-gather elements
- * @bs: bio_set to allocate from
  *
  * Description: This function prepares a bio for attaching integrity
  * metadata. nr_vecs specifies the maximum number of pages containing
  * integrity metadata that can be attached.
  */
-struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
-							 gfp_t gfp_mask,
-							 unsigned int nr_vecs,
-							 struct bio_set *bs)
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+						  gfp_t gfp_mask,
+						  unsigned int nr_vecs)
 {
 	struct bio_integrity_payload *bip;
 	struct bio_vec *iv;
@@ -50,7 +50,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	BUG_ON(bio == NULL);
 
-	bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+	bip = mempool_alloc(bio_integrity_pool, gfp_mask);
 	if (unlikely(bip == NULL)) {
 		printk(KERN_ERR "%s: could not alloc bip\n", __func__);
 		return NULL;
@@ -58,10 +58,10 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	memset(bip, 0, sizeof(*bip));
 
-	iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs);
+	iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, integrity_bio_set);
 	if (unlikely(iv == NULL)) {
 		printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
-		mempool_free(bip, bs->bio_integrity_pool);
+		mempool_free(bip, bio_integrity_pool);
 		return NULL;
 	}
 
@@ -72,35 +72,16 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	return bip;
 }
-EXPORT_SYMBOL(bio_integrity_alloc_bioset);
-
-/**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
- * @bio: bio to attach integrity metadata to
- * @gfp_mask: Memory allocation mask
- * @nr_vecs: Number of integrity metadata scatter-gather elements
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata. nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
-						  gfp_t gfp_mask,
-						  unsigned int nr_vecs)
-{
-	return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
-}
 EXPORT_SYMBOL(bio_integrity_alloc);
 
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio: bio containing bip to be freed
- * @bs: bio_set this bio was allocated from
  *
  * Description: Used to free the integrity portion of a bio. Usually
  * called from bio_free().
  */
-void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+void bio_integrity_free(struct bio *bio)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 
@@ -111,8 +92,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 	    && bip->bip_buf != NULL)
 		kfree(bip->bip_buf);
 
-	bvec_free_bs(bs, bip->bip_vec, bip->bip_pool);
-	mempool_free(bip, bs->bio_integrity_pool);
+	bvec_free_bs(integrity_bio_set, bip->bip_vec, bip->bip_pool);
+	mempool_free(bip, bio_integrity_pool);
 
 	bio->bi_integrity = NULL;
 }
@@ -685,19 +666,18 @@ EXPORT_SYMBOL(bio_integrity_split);
  * bio_integrity_clone - Callback for cloning bios with integrity metadata
  * @bio: New bio
  * @bio_src: Original bio
- * @bs: bio_set to allocate bip from
+ * @gfp_mask: Memory allocation mask
  *
  * Description: Called to allocate a bip when cloning a bio
  */
-int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
-			struct bio_set *bs)
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
 {
 	struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
 	struct bio_integrity_payload *bip;
 
 	BUG_ON(bip_src == NULL);
 
-	bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs);
+	bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
 
 	if (bip == NULL)
 		return -EIO;
@@ -713,37 +693,25 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
713} 693}
714EXPORT_SYMBOL(bio_integrity_clone); 694EXPORT_SYMBOL(bio_integrity_clone);
715 695
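
bio_integrity_clone() now takes the caller's gfp_mask instead of a bio_set,
matching how bio_clone() uses it further down. A sketch of the calling
pattern, assuming kernel context where b is the new clone and bio the
original:

	if (bio_integrity(bio)) {
		if (bio_integrity_clone(b, bio, gfp_mask) < 0) {
			bio_put(b);	/* don't leak the clone on failure */
			return NULL;
		}
	}
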
716int bioset_integrity_create(struct bio_set *bs, int pool_size) 696static int __init bio_integrity_init(void)
717{ 697{
718 bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, 698 kintegrityd_wq = create_workqueue("kintegrityd");
719 bio_integrity_slab);
720 if (!bs->bio_integrity_pool)
721 return -1;
722
723 return 0;
724}
725EXPORT_SYMBOL(bioset_integrity_create);
726 699
727void bioset_integrity_free(struct bio_set *bs) 700 if (!kintegrityd_wq)
728{ 701 panic("Failed to create kintegrityd\n");
729 if (bs->bio_integrity_pool)
730 mempool_destroy(bs->bio_integrity_pool);
731}
732EXPORT_SYMBOL(bioset_integrity_free);
733 702
734void __init bio_integrity_init_slab(void)
735{
736 bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, 703 bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
737 SLAB_HWCACHE_ALIGN|SLAB_PANIC); 704 SLAB_HWCACHE_ALIGN|SLAB_PANIC);
738}
739 705
740static int __init integrity_init(void) 706 bio_integrity_pool = mempool_create_slab_pool(BIO_POOL_SIZE,
741{ 707 bio_integrity_slab);
742 kintegrityd_wq = create_workqueue("kintegrityd"); 708 if (!bio_integrity_pool)
709 panic("bio_integrity: can't allocate bip pool\n");
743 710
744 if (!kintegrityd_wq) 711 integrity_bio_set = bioset_create(BIO_POOL_SIZE, 0);
745 panic("Failed to create kintegrityd\n"); 712 if (!integrity_bio_set)
713 panic("bio_integrity: can't allocate bio_set\n");
746 714
747 return 0; 715 return 0;
748} 716}
749subsys_initcall(integrity_init); 717subsys_initcall(bio_integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 124b95c4d582..e0c9e545bbfa 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -248,7 +248,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
248 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); 248 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
249 249
250 if (bio_integrity(bio)) 250 if (bio_integrity(bio))
251 bio_integrity_free(bio, bs); 251 bio_integrity_free(bio);
252 252
253 /* 253 /*
254 * If we have front padding, adjust the bio pointer before freeing 254 * If we have front padding, adjust the bio pointer before freeing
@@ -301,48 +301,51 @@ void bio_init(struct bio *bio)
301 **/ 301 **/
302struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 302struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
303{ 303{
304 struct bio_vec *bvl = NULL;
304 struct bio *bio = NULL; 305 struct bio *bio = NULL;
305 void *uninitialized_var(p); 306 unsigned long idx = 0;
307 void *p = NULL;
306 308
307 if (bs) { 309 if (bs) {
308 p = mempool_alloc(bs->bio_pool, gfp_mask); 310 p = mempool_alloc(bs->bio_pool, gfp_mask);
309 311 if (!p)
310 if (p) 312 goto err;
311 bio = p + bs->front_pad; 313 bio = p + bs->front_pad;
312 } else 314 } else {
313 bio = kmalloc(sizeof(*bio), gfp_mask); 315 bio = kmalloc(sizeof(*bio), gfp_mask);
316 if (!bio)
317 goto err;
318 }
314 319
315 if (likely(bio)) { 320 bio_init(bio);
316 struct bio_vec *bvl = NULL; 321
317 322 if (unlikely(!nr_iovecs))
318 bio_init(bio); 323 goto out_set;
319 if (likely(nr_iovecs)) { 324
320 unsigned long uninitialized_var(idx); 325 if (nr_iovecs <= BIO_INLINE_VECS) {
321 326 bvl = bio->bi_inline_vecs;
322 if (nr_iovecs <= BIO_INLINE_VECS) { 327 nr_iovecs = BIO_INLINE_VECS;
323 idx = 0; 328 } else {
324 bvl = bio->bi_inline_vecs; 329 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
325 nr_iovecs = BIO_INLINE_VECS; 330 if (unlikely(!bvl))
326 } else { 331 goto err_free;
327 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, 332
328 bs); 333 nr_iovecs = bvec_nr_vecs(idx);
329 nr_iovecs = bvec_nr_vecs(idx);
330 }
331 if (unlikely(!bvl)) {
332 if (bs)
333 mempool_free(p, bs->bio_pool);
334 else
335 kfree(bio);
336 bio = NULL;
337 goto out;
338 }
339 bio->bi_flags |= idx << BIO_POOL_OFFSET;
340 bio->bi_max_vecs = nr_iovecs;
341 }
342 bio->bi_io_vec = bvl;
343 } 334 }
344out: 335 bio->bi_flags |= idx << BIO_POOL_OFFSET;
336 bio->bi_max_vecs = nr_iovecs;
337out_set:
338 bio->bi_io_vec = bvl;
339
345 return bio; 340 return bio;
341
342err_free:
343 if (bs)
344 mempool_free(p, bs->bio_pool);
345 else
346 kfree(bio);
347err:
348 return NULL;
346} 349}
347 350
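
The restructured allocator replaces the nested conditionals with a single
goto-based unwind path but keeps the external contract intact. A usage
sketch, assuming kernel context with caller-owned bdev and page:

	struct bio *bio;

	bio = bio_alloc(GFP_NOIO, 1);	/* room for one biovec */
	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio_add_page(bio, page, PAGE_SIZE, 0);
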
348struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) 351struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
@@ -463,10 +466,12 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
463 if (bio_integrity(bio)) { 466 if (bio_integrity(bio)) {
464 int ret; 467 int ret;
465 468
466 ret = bio_integrity_clone(b, bio, fs_bio_set); 469 ret = bio_integrity_clone(b, bio, gfp_mask);
467 470
468 if (ret < 0) 471 if (ret < 0) {
472 bio_put(b);
469 return NULL; 473 return NULL;
474 }
470 } 475 }
471 476
472 return b; 477 return b;
@@ -1415,8 +1420,7 @@ static void bio_pair_end_2(struct bio *bi, int err)
1415} 1420}
1416 1421
1417/* 1422/*
1418 * split a bio - only worry about a bio with a single page 1423 * split a bio - only worry about a bio with a single page in its iovec
1419 * in it's iovec
1420 */ 1424 */
1421struct bio_pair *bio_split(struct bio *bi, int first_sectors) 1425struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1422{ 1426{
@@ -1524,7 +1528,6 @@ void bioset_free(struct bio_set *bs)
1524 if (bs->bio_pool) 1528 if (bs->bio_pool)
1525 mempool_destroy(bs->bio_pool); 1529 mempool_destroy(bs->bio_pool);
1526 1530
1527 bioset_integrity_free(bs);
1528 biovec_free_pools(bs); 1531 biovec_free_pools(bs);
1529 bio_put_slab(bs); 1532 bio_put_slab(bs);
1530 1533
@@ -1565,9 +1568,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1565 if (!bs->bio_pool) 1568 if (!bs->bio_pool)
1566 goto bad; 1569 goto bad;
1567 1570
1568 if (bioset_integrity_create(bs, pool_size))
1569 goto bad;
1570
1571 if (!biovec_create_pools(bs, pool_size)) 1571 if (!biovec_create_pools(bs, pool_size))
1572 return bs; 1572 return bs;
1573 1573
@@ -1584,6 +1584,13 @@ static void __init biovec_init_slabs(void)
1584 int size; 1584 int size;
1585 struct biovec_slab *bvs = bvec_slabs + i; 1585 struct biovec_slab *bvs = bvec_slabs + i;
1586 1586
1587#ifndef CONFIG_BLK_DEV_INTEGRITY
1588 if (bvs->nr_vecs <= BIO_INLINE_VECS) {
1589 bvs->slab = NULL;
1590 continue;
1591 }
1592#endif
1593
1587 size = bvs->nr_vecs * sizeof(struct bio_vec); 1594 size = bvs->nr_vecs * sizeof(struct bio_vec);
1588 bvs->slab = kmem_cache_create(bvs->name, size, 0, 1595 bvs->slab = kmem_cache_create(bvs->name, size, 0,
1589 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1596 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
@@ -1598,7 +1605,6 @@ static int __init init_bio(void)
1598 if (!bio_slabs) 1605 if (!bio_slabs)
1599 panic("bio: can't allocate bios\n"); 1606 panic("bio: can't allocate bios\n");
1600 1607
1601 bio_integrity_init_slab();
1602 biovec_init_slabs(); 1608 biovec_init_slabs();
1603 1609
1604 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); 1610 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b3c1efff5e1d..f45dbc18dd17 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -18,6 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/blkpg.h> 19#include <linux/blkpg.h>
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/pagevec.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/mpage.h> 23#include <linux/mpage.h>
23#include <linux/mount.h> 24#include <linux/mount.h>
@@ -174,6 +175,152 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
174 iov, offset, nr_segs, blkdev_get_blocks, NULL); 175 iov, offset, nr_segs, blkdev_get_blocks, NULL);
175} 176}
176 177
178/*
179 * Write out and wait upon all the dirty data associated with a block
180 * device via its mapping. Does not take the superblock lock.
181 */
182int sync_blockdev(struct block_device *bdev)
183{
184 int ret = 0;
185
186 if (bdev)
187 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
188 return ret;
189}
190EXPORT_SYMBOL(sync_blockdev);
191
192/*
193 * Write out and wait upon all dirty data associated with this
 194 * device: filesystem data as well as the underlying block
195 * device. Takes the superblock lock.
196 */
197int fsync_bdev(struct block_device *bdev)
198{
199 struct super_block *sb = get_super(bdev);
200 if (sb) {
201 int res = fsync_super(sb);
202 drop_super(sb);
203 return res;
204 }
205 return sync_blockdev(bdev);
206}
207EXPORT_SYMBOL(fsync_bdev);
208
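A usage sketch for the two sync helpers above, assuming kernel context with
a caller-owned block_device; fsync_bdev() covers filesystem data and falls
back to sync_blockdev() when no superblock is mounted on the device:

	int err = fsync_bdev(bdev);
	if (err)
		return err;
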
209/**
210 * freeze_bdev -- lock a filesystem and force it into a consistent state
211 * @bdev: blockdevice to lock
212 *
213 * This takes the block device bd_mount_sem to make sure no new mounts
214 * happen on bdev until thaw_bdev() is called.
215 * If a superblock is found on this device, we take the s_umount semaphore
216 * on it to make sure nobody unmounts until the snapshot creation is done.
217 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 218 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 219 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 220 * counts down in thaw_bdev(). When it reaches 0, thaw_bdev() actually
 221 * unfreezes the filesystem.
222 */
223struct super_block *freeze_bdev(struct block_device *bdev)
224{
225 struct super_block *sb;
226 int error = 0;
227
228 mutex_lock(&bdev->bd_fsfreeze_mutex);
229 if (bdev->bd_fsfreeze_count > 0) {
230 bdev->bd_fsfreeze_count++;
231 sb = get_super(bdev);
232 mutex_unlock(&bdev->bd_fsfreeze_mutex);
233 return sb;
234 }
235 bdev->bd_fsfreeze_count++;
236
237 down(&bdev->bd_mount_sem);
238 sb = get_super(bdev);
239 if (sb && !(sb->s_flags & MS_RDONLY)) {
240 sb->s_frozen = SB_FREEZE_WRITE;
241 smp_wmb();
242
243 __fsync_super(sb);
244
245 sb->s_frozen = SB_FREEZE_TRANS;
246 smp_wmb();
247
248 sync_blockdev(sb->s_bdev);
249
250 if (sb->s_op->freeze_fs) {
251 error = sb->s_op->freeze_fs(sb);
252 if (error) {
253 printk(KERN_ERR
254 "VFS:Filesystem freeze failed\n");
255 sb->s_frozen = SB_UNFROZEN;
256 drop_super(sb);
257 up(&bdev->bd_mount_sem);
258 bdev->bd_fsfreeze_count--;
259 mutex_unlock(&bdev->bd_fsfreeze_mutex);
260 return ERR_PTR(error);
261 }
262 }
263 }
264
265 sync_blockdev(bdev);
266 mutex_unlock(&bdev->bd_fsfreeze_mutex);
267
268 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
269}
270EXPORT_SYMBOL(freeze_bdev);
271
272/**
273 * thaw_bdev -- unlock filesystem
274 * @bdev: blockdevice to unlock
275 * @sb: associated superblock
276 *
277 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
278 */
279int thaw_bdev(struct block_device *bdev, struct super_block *sb)
280{
281 int error = 0;
282
283 mutex_lock(&bdev->bd_fsfreeze_mutex);
284 if (!bdev->bd_fsfreeze_count) {
285 mutex_unlock(&bdev->bd_fsfreeze_mutex);
286 return -EINVAL;
287 }
288
289 bdev->bd_fsfreeze_count--;
290 if (bdev->bd_fsfreeze_count > 0) {
291 if (sb)
292 drop_super(sb);
293 mutex_unlock(&bdev->bd_fsfreeze_mutex);
294 return 0;
295 }
296
297 if (sb) {
298 BUG_ON(sb->s_bdev != bdev);
299 if (!(sb->s_flags & MS_RDONLY)) {
300 if (sb->s_op->unfreeze_fs) {
301 error = sb->s_op->unfreeze_fs(sb);
302 if (error) {
303 printk(KERN_ERR
304 "VFS:Filesystem thaw failed\n");
305 sb->s_frozen = SB_FREEZE_TRANS;
306 bdev->bd_fsfreeze_count++;
307 mutex_unlock(&bdev->bd_fsfreeze_mutex);
308 return error;
309 }
310 }
311 sb->s_frozen = SB_UNFROZEN;
312 smp_wmb();
313 wake_up(&sb->s_wait_unfrozen);
314 }
315 drop_super(sb);
316 }
317
318 up(&bdev->bd_mount_sem);
319 mutex_unlock(&bdev->bd_fsfreeze_mutex);
320 return 0;
321}
322EXPORT_SYMBOL(thaw_bdev);
323
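freeze_bdev() and thaw_bdev() are reference counted, so nested freezes are
safe and only the final thaw really unfreezes. A pairing sketch, assuming
kernel context with a caller-owned block_device:

	struct super_block *sb;

	sb = freeze_bdev(bdev);
	if (IS_ERR(sb))
		return PTR_ERR(sb);

	/* ... take the snapshot while writes are blocked ... */

	thaw_bdev(bdev, sb);	/* drops bd_fsfreeze_count; unfreezes at zero */
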
177static int blkdev_writepage(struct page *page, struct writeback_control *wbc) 324static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
178{ 325{
179 return block_write_full_page(page, blkdev_get_block, wbc); 326 return block_write_full_page(page, blkdev_get_block, wbc);
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b8..9adf5e4f7e96 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ 10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o 11 compression.o delayed-ref.o
12else 12else
13 13
14# Normal Makefile 14# Normal Makefile
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba5..7fdd184a528d 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
256 } 256 }
257 257
258 if (!acl) 258 if (!acl)
259 inode->i_mode &= ~current->fs->umask; 259 inode->i_mode &= ~current_umask();
260 } 260 }
261 261
262 if (IS_POSIXACL(dir) && acl) { 262 if (IS_POSIXACL(dir) && acl) {
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c84ca1f5259a..51bfdfc8fcda 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,7 +20,6 @@
20#include <linux/list.h> 20#include <linux/list.h>
21#include <linux/spinlock.h> 21#include <linux/spinlock.h>
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 23#include "async-thread.h"
25 24
26#define WORK_QUEUED_BIT 0 25#define WORK_QUEUED_BIT 0
@@ -195,6 +194,9 @@ again_locked:
195 if (!list_empty(&worker->pending)) 194 if (!list_empty(&worker->pending))
196 continue; 195 continue;
197 196
197 if (kthread_should_stop())
198 break;
199
 198 /* still no more work? sleep for real */ 200 /* still no more work? sleep for real */
199 spin_lock_irq(&worker->lock); 201 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE); 202 set_current_state(TASK_INTERRUPTIBLE);
@@ -208,7 +210,8 @@ again_locked:
208 worker->working = 0; 210 worker->working = 0;
209 spin_unlock_irq(&worker->lock); 211 spin_unlock_irq(&worker->lock);
210 212
211 schedule(); 213 if (!kthread_should_stop())
214 schedule();
212 } 215 }
213 __set_current_state(TASK_RUNNING); 216 __set_current_state(TASK_RUNNING);
214 } 217 }
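
The two hunks above close a shutdown race: the worker now re-checks
kthread_should_stop() after setting TASK_INTERRUPTIBLE, which is the
canonical kthread sleep idiom (sketch, kernel context):

	set_current_state(TASK_INTERRUPTIBLE);
	if (!kthread_should_stop())
		schedule();	/* a concurrent kthread_stop() wakes us instead */
	__set_current_state(TASK_RUNNING);
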
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74f..b30986f00b9d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
66 */ 66 */
67 struct list_head delalloc_inodes; 67 struct list_head delalloc_inodes;
68 68
69 /*
70 * list for tracking inodes that must be sent to disk before a
71 * rename or truncate commit
72 */
73 struct list_head ordered_operations;
74
69 /* the space_info for where this inode's data allocations are done */ 75 /* the space_info for where this inode's data allocations are done */
70 struct btrfs_space_info *space_info; 76 struct btrfs_space_info *space_info;
71 77
@@ -86,12 +92,6 @@ struct btrfs_inode {
86 */ 92 */
87 u64 logged_trans; 93 u64 logged_trans;
88 94
89 /*
90 * trans that last made a change that should be fully fsync'd. This
91 * gets reset to zero each time the inode is logged
92 */
93 u64 log_dirty_trans;
94
95 /* total number of bytes pending delalloc, used by stat to calc the 95 /* total number of bytes pending delalloc, used by stat to calc the
96 * real block usage of the file 96 * real block usage of the file
97 */ 97 */
@@ -121,6 +121,25 @@ struct btrfs_inode {
121 /* the start of block group preferred for allocations. */ 121 /* the start of block group preferred for allocations. */
122 u64 block_group; 122 u64 block_group;
123 123
124 /* the fsync log has some corner cases that mean we have to check
125 * directories to see if any unlinks have been done before
126 * the directory was logged. See tree-log.c for all the
127 * details
128 */
129 u64 last_unlink_trans;
130
131 /*
132 * ordered_data_close is set by truncate when a file that used
133 * to have good data has been truncated to zero. When it is set
134 * the btrfs file release call will add this inode to the
135 * ordered operations list so that we make sure to flush out any
136 * new data the application may have written before commit.
137 *
 138 * yes, it's silly to have a single bitflag, but we might grow more
139 * of these.
140 */
141 unsigned ordered_data_close:1;
142
124 struct inode vfs_inode; 143 struct inode vfs_inode;
125}; 144};
126 145
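A condensed sketch of the flow the two comments above describe: truncate
sets the flag, and file release queues the inode for flushing before the
next commit. This is a hypothetical shape pieced together from the comments;
btrfs_add_ordered_operation() is assumed to be the new helper from
ordered-data.c:

	/* truncate path, after truncating good data down to zero: */
	BTRFS_I(inode)->ordered_data_close = 1;

	/* file release path: */
	if (BTRFS_I(inode)->ordered_data_close) {
		BTRFS_I(inode)->ordered_data_close = 0;
		btrfs_add_ordered_operation(trans, root, inode);
	}
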
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 42491d728e99..e5b2533b691a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
254 * empty_size -- a hint that you plan on doing more cow. This is the size in 254 * empty_size -- a hint that you plan on doing more cow. This is the size in
255 * bytes the allocator should try to find free next to the block it returns. 255 * bytes the allocator should try to find free next to the block it returns.
256 * This is just a hint and may be ignored by the allocator. 256 * This is just a hint and may be ignored by the allocator.
257 *
258 * prealloc_dest -- if you have already reserved a destination for the cow,
259 * this uses that block instead of allocating a new one.
260 * btrfs_alloc_reserved_extent is used to finish the allocation.
261 */ 257 */
262static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 258static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
263 struct btrfs_root *root, 259 struct btrfs_root *root,
264 struct extent_buffer *buf, 260 struct extent_buffer *buf,
265 struct extent_buffer *parent, int parent_slot, 261 struct extent_buffer *parent, int parent_slot,
266 struct extent_buffer **cow_ret, 262 struct extent_buffer **cow_ret,
267 u64 search_start, u64 empty_size, 263 u64 search_start, u64 empty_size)
268 u64 prealloc_dest)
269{ 264{
270 u64 parent_start; 265 u64 parent_start;
271 struct extent_buffer *cow; 266 struct extent_buffer *cow;
@@ -277,7 +272,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
277 if (*cow_ret == buf) 272 if (*cow_ret == buf)
278 unlock_orig = 1; 273 unlock_orig = 1;
279 274
280 WARN_ON(!btrfs_tree_locked(buf)); 275 btrfs_assert_tree_locked(buf);
281 276
282 if (parent) 277 if (parent)
283 parent_start = parent->start; 278 parent_start = parent->start;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
291 level = btrfs_header_level(buf); 286 level = btrfs_header_level(buf);
292 nritems = btrfs_header_nritems(buf); 287 nritems = btrfs_header_nritems(buf);
293 288
294 if (prealloc_dest) { 289 cow = btrfs_alloc_free_block(trans, root, buf->len,
295 struct btrfs_key ins; 290 parent_start, root->root_key.objectid,
296 291 trans->transid, level,
297 ins.objectid = prealloc_dest; 292 search_start, empty_size);
298 ins.offset = buf->len;
299 ins.type = BTRFS_EXTENT_ITEM_KEY;
300
301 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
302 root->root_key.objectid,
303 trans->transid, level, &ins);
304 BUG_ON(ret);
305 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
306 buf->len, level);
307 } else {
308 cow = btrfs_alloc_free_block(trans, root, buf->len,
309 parent_start,
310 root->root_key.objectid,
311 trans->transid, level,
312 search_start, empty_size);
313 }
314 if (IS_ERR(cow)) 293 if (IS_ERR(cow))
315 return PTR_ERR(cow); 294 return PTR_ERR(cow);
316 295
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
413noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, 392noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
414 struct btrfs_root *root, struct extent_buffer *buf, 393 struct btrfs_root *root, struct extent_buffer *buf,
415 struct extent_buffer *parent, int parent_slot, 394 struct extent_buffer *parent, int parent_slot,
416 struct extent_buffer **cow_ret, u64 prealloc_dest) 395 struct extent_buffer **cow_ret)
417{ 396{
418 u64 search_start; 397 u64 search_start;
419 int ret; 398 int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
436 btrfs_header_owner(buf) == root->root_key.objectid && 415 btrfs_header_owner(buf) == root->root_key.objectid &&
437 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 416 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
438 *cow_ret = buf; 417 *cow_ret = buf;
439 WARN_ON(prealloc_dest);
440 return 0; 418 return 0;
441 } 419 }
442 420
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
447 btrfs_set_lock_blocking(buf); 425 btrfs_set_lock_blocking(buf);
448 426
449 ret = __btrfs_cow_block(trans, root, buf, parent, 427 ret = __btrfs_cow_block(trans, root, buf, parent,
450 parent_slot, cow_ret, search_start, 0, 428 parent_slot, cow_ret, search_start, 0);
451 prealloc_dest);
452 return ret; 429 return ret;
453} 430}
454 431
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
617 err = __btrfs_cow_block(trans, root, cur, parent, i, 594 err = __btrfs_cow_block(trans, root, cur, parent, i,
618 &cur, search_start, 595 &cur, search_start,
619 min(16 * blocksize, 596 min(16 * blocksize,
620 (end_slot - i) * blocksize), 0); 597 (end_slot - i) * blocksize));
621 if (err) { 598 if (err) {
622 btrfs_tree_unlock(cur); 599 btrfs_tree_unlock(cur);
623 free_extent_buffer(cur); 600 free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
937 BUG_ON(!child); 914 BUG_ON(!child);
938 btrfs_tree_lock(child); 915 btrfs_tree_lock(child);
939 btrfs_set_lock_blocking(child); 916 btrfs_set_lock_blocking(child);
940 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 917 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
941 BUG_ON(ret); 918 BUG_ON(ret);
942 919
943 spin_lock(&root->node_lock); 920 spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
945 spin_unlock(&root->node_lock); 922 spin_unlock(&root->node_lock);
946 923
947 ret = btrfs_update_extent_ref(trans, root, child->start, 924 ret = btrfs_update_extent_ref(trans, root, child->start,
925 child->len,
948 mid->start, child->start, 926 mid->start, child->start,
949 root->root_key.objectid, 927 root->root_key.objectid,
950 trans->transid, level - 1); 928 trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
971 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 949 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
972 return 0; 950 return 0;
973 951
952 if (trans->transaction->delayed_refs.flushing &&
953 btrfs_header_nritems(mid) > 2)
954 return 0;
955
974 if (btrfs_header_nritems(mid) < 2) 956 if (btrfs_header_nritems(mid) < 2)
975 err_on_enospc = 1; 957 err_on_enospc = 1;
976 958
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
979 btrfs_tree_lock(left); 961 btrfs_tree_lock(left);
980 btrfs_set_lock_blocking(left); 962 btrfs_set_lock_blocking(left);
981 wret = btrfs_cow_block(trans, root, left, 963 wret = btrfs_cow_block(trans, root, left,
982 parent, pslot - 1, &left, 0); 964 parent, pslot - 1, &left);
983 if (wret) { 965 if (wret) {
984 ret = wret; 966 ret = wret;
985 goto enospc; 967 goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
990 btrfs_tree_lock(right); 972 btrfs_tree_lock(right);
991 btrfs_set_lock_blocking(right); 973 btrfs_set_lock_blocking(right);
992 wret = btrfs_cow_block(trans, root, right, 974 wret = btrfs_cow_block(trans, root, right,
993 parent, pslot + 1, &right, 0); 975 parent, pslot + 1, &right);
994 if (wret) { 976 if (wret) {
995 ret = wret; 977 ret = wret;
996 goto enospc; 978 goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1171 wret = 1; 1153 wret = 1;
1172 } else { 1154 } else {
1173 ret = btrfs_cow_block(trans, root, left, parent, 1155 ret = btrfs_cow_block(trans, root, left, parent,
1174 pslot - 1, &left, 0); 1156 pslot - 1, &left);
1175 if (ret) 1157 if (ret)
1176 wret = 1; 1158 wret = 1;
1177 else { 1159 else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1222 } else { 1204 } else {
1223 ret = btrfs_cow_block(trans, root, right, 1205 ret = btrfs_cow_block(trans, root, right,
1224 parent, pslot + 1, 1206 parent, pslot + 1,
1225 &right, 0); 1207 &right);
1226 if (ret) 1208 if (ret)
1227 wret = 1; 1209 wret = 1;
1228 else { 1210 else {
@@ -1262,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1262 * readahead one full node of leaves, finding things that are close 1244 * readahead one full node of leaves, finding things that are close
1263 * to the block in 'slot', and triggering ra on them. 1245 * to the block in 'slot', and triggering ra on them.
1264 */ 1246 */
1265static noinline void reada_for_search(struct btrfs_root *root, 1247static void reada_for_search(struct btrfs_root *root,
1266 struct btrfs_path *path, 1248 struct btrfs_path *path,
1267 int level, int slot, u64 objectid) 1249 int level, int slot, u64 objectid)
1268{ 1250{
1269 struct extent_buffer *node; 1251 struct extent_buffer *node;
1270 struct btrfs_disk_key disk_key; 1252 struct btrfs_disk_key disk_key;
@@ -1465,6 +1447,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1465} 1447}
1466 1448
1467/* 1449/*
1450 * helper function for btrfs_search_slot. The goal is to find a block
1451 * in cache without setting the path to blocking. If we find the block
1452 * we return zero and the path is unchanged.
1453 *
1454 * If we can't find the block, we set the path blocking and do some
1455 * reada. -EAGAIN is returned and the search must be repeated.
1456 */
1457static int
1458read_block_for_search(struct btrfs_trans_handle *trans,
1459 struct btrfs_root *root, struct btrfs_path *p,
1460 struct extent_buffer **eb_ret, int level, int slot,
1461 struct btrfs_key *key)
1462{
1463 u64 blocknr;
1464 u64 gen;
1465 u32 blocksize;
1466 struct extent_buffer *b = *eb_ret;
1467 struct extent_buffer *tmp;
1468
1469 blocknr = btrfs_node_blockptr(b, slot);
1470 gen = btrfs_node_ptr_generation(b, slot);
1471 blocksize = btrfs_level_size(root, level - 1);
1472
1473 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1474 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1475 *eb_ret = tmp;
1476 return 0;
1477 }
1478
1479 /*
1480 * reduce lock contention at high levels
1481 * of the btree by dropping locks before
1482 * we read.
1483 */
1484 btrfs_release_path(NULL, p);
1485 if (tmp)
1486 free_extent_buffer(tmp);
1487 if (p->reada)
1488 reada_for_search(root, p, level, slot, key->objectid);
1489
1490 tmp = read_tree_block(root, blocknr, blocksize, gen);
1491 if (tmp)
1492 free_extent_buffer(tmp);
1493 return -EAGAIN;
1494}
1495
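read_block_for_search() trades a dropped path for readahead, so callers
must treat -EAGAIN as "restart the walk". A caller-side sketch, assuming
kernel context and mirroring how btrfs_search_slot() uses it below:

	for (;;) {
		/* ... walk down to 'level' and compute 'slot' ... */
		ret = read_block_for_search(trans, root, p, &b, level,
					    slot, key);
		if (ret != -EAGAIN)
			break;	/* 0: block was cached, path untouched */
		/* path was released and the read issued: restart at the root */
	}
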
1496/*
1497 * helper function for btrfs_search_slot. This does all of the checks
1498 * for node-level blocks and does any balancing required based on
1499 * the ins_len.
1500 *
1501 * If no extra work was required, zero is returned. If we had to
1502 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 1503 * start over.
1504 */
1505static int
1506setup_nodes_for_search(struct btrfs_trans_handle *trans,
1507 struct btrfs_root *root, struct btrfs_path *p,
1508 struct extent_buffer *b, int level, int ins_len)
1509{
1510 int ret;
1511 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1512 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1513 int sret;
1514
1515 sret = reada_for_balance(root, p, level);
1516 if (sret)
1517 goto again;
1518
1519 btrfs_set_path_blocking(p);
1520 sret = split_node(trans, root, p, level);
1521 btrfs_clear_path_blocking(p, NULL);
1522
1523 BUG_ON(sret > 0);
1524 if (sret) {
1525 ret = sret;
1526 goto done;
1527 }
1528 b = p->nodes[level];
1529 } else if (ins_len < 0 && btrfs_header_nritems(b) <
1530 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1531 int sret;
1532
1533 sret = reada_for_balance(root, p, level);
1534 if (sret)
1535 goto again;
1536
1537 btrfs_set_path_blocking(p);
1538 sret = balance_level(trans, root, p, level);
1539 btrfs_clear_path_blocking(p, NULL);
1540
1541 if (sret) {
1542 ret = sret;
1543 goto done;
1544 }
1545 b = p->nodes[level];
1546 if (!b) {
1547 btrfs_release_path(NULL, p);
1548 goto again;
1549 }
1550 BUG_ON(btrfs_header_nritems(b) == 1);
1551 }
1552 return 0;
1553
1554again:
1555 ret = -EAGAIN;
1556done:
1557 return ret;
1558}
1559
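The two balancing triggers are pure threshold checks. A self-contained
userspace model of the decision, with hypothetical names, mirroring the
conditions above:

	#include <stdbool.h>

	enum node_action { ACT_NONE, ACT_SPLIT, ACT_BALANCE };

	/* model of setup_nodes_for_search()'s two triggers */
	static enum node_action pick_action(unsigned nritems,
					    unsigned ptrs_per_block,
					    int ins_len, bool for_split)
	{
		if ((for_split || ins_len > 0) &&
		    nritems >= ptrs_per_block - 3)
			return ACT_SPLIT;   /* nearly full: split first */
		if (ins_len < 0 && nritems < ptrs_per_block / 4)
			return ACT_BALANCE; /* sparse: balance before delete */
		return ACT_NONE;
	}
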
1560/*
1468 * look for key in the tree. path is filled in with nodes along the way 1561 * look for key in the tree. path is filled in with nodes along the way
1469 * if key is found, we return zero and you can find the item in the leaf 1562 * if key is found, we return zero and you can find the item in the leaf
1470 * level of the path (level 0) 1563 * level of the path (level 0)
@@ -1482,17 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1482 ins_len, int cow) 1575 ins_len, int cow)
1483{ 1576{
1484 struct extent_buffer *b; 1577 struct extent_buffer *b;
1485 struct extent_buffer *tmp;
1486 int slot; 1578 int slot;
1487 int ret; 1579 int ret;
1488 int level; 1580 int level;
1489 int should_reada = p->reada;
1490 int lowest_unlock = 1; 1581 int lowest_unlock = 1;
1491 int blocksize;
1492 u8 lowest_level = 0; 1582 u8 lowest_level = 0;
1493 u64 blocknr;
1494 u64 gen;
1495 struct btrfs_key prealloc_block;
1496 1583
1497 lowest_level = p->lowest_level; 1584 lowest_level = p->lowest_level;
1498 WARN_ON(lowest_level && ins_len > 0); 1585 WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1588,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1501 if (ins_len < 0) 1588 if (ins_len < 0)
1502 lowest_unlock = 2; 1589 lowest_unlock = 2;
1503 1590
1504 prealloc_block.objectid = 0;
1505
1506again: 1591again:
1507 if (p->skip_locking) 1592 if (p->skip_locking)
1508 b = btrfs_root_node(root); 1593 b = btrfs_root_node(root);
@@ -1523,50 +1608,21 @@ again:
1523 if (cow) { 1608 if (cow) {
1524 int wret; 1609 int wret;
1525 1610
1526 /* is a cow on this block not required */ 1611 /*
1612 * if we don't really need to cow this block
1613 * then we don't want to set the path blocking,
1614 * so we test it here
1615 */
1527 if (btrfs_header_generation(b) == trans->transid && 1616 if (btrfs_header_generation(b) == trans->transid &&
1528 btrfs_header_owner(b) == root->root_key.objectid && 1617 btrfs_header_owner(b) == root->root_key.objectid &&
1529 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1618 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1530 goto cow_done; 1619 goto cow_done;
1531 } 1620 }
1532
1533 /* ok, we have to cow, is our old prealloc the right
1534 * size?
1535 */
1536 if (prealloc_block.objectid &&
1537 prealloc_block.offset != b->len) {
1538 btrfs_release_path(root, p);
1539 btrfs_free_reserved_extent(root,
1540 prealloc_block.objectid,
1541 prealloc_block.offset);
1542 prealloc_block.objectid = 0;
1543 goto again;
1544 }
1545
1546 /*
1547 * for higher level blocks, try not to allocate blocks
1548 * with the block and the parent locks held.
1549 */
1550 if (level > 0 && !prealloc_block.objectid) {
1551 u32 size = b->len;
1552 u64 hint = b->start;
1553
1554 btrfs_release_path(root, p);
1555 ret = btrfs_reserve_extent(trans, root,
1556 size, size, 0,
1557 hint, (u64)-1,
1558 &prealloc_block, 0);
1559 BUG_ON(ret);
1560 goto again;
1561 }
1562
1563 btrfs_set_path_blocking(p); 1621 btrfs_set_path_blocking(p);
1564 1622
1565 wret = btrfs_cow_block(trans, root, b, 1623 wret = btrfs_cow_block(trans, root, b,
1566 p->nodes[level + 1], 1624 p->nodes[level + 1],
1567 p->slots[level + 1], 1625 p->slots[level + 1], &b);
1568 &b, prealloc_block.objectid);
1569 prealloc_block.objectid = 0;
1570 if (wret) { 1626 if (wret) {
1571 free_extent_buffer(b); 1627 free_extent_buffer(b);
1572 ret = wret; 1628 ret = wret;
@@ -1611,51 +1667,15 @@ cow_done:
1611 if (ret && slot > 0) 1667 if (ret && slot > 0)
1612 slot -= 1; 1668 slot -= 1;
1613 p->slots[level] = slot; 1669 p->slots[level] = slot;
1614 if ((p->search_for_split || ins_len > 0) && 1670 ret = setup_nodes_for_search(trans, root, p, b, level,
1615 btrfs_header_nritems(b) >= 1671 ins_len);
1616 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1672 if (ret == -EAGAIN)
1617 int sret; 1673 goto again;
1618 1674 else if (ret)
1619 sret = reada_for_balance(root, p, level); 1675 goto done;
1620 if (sret) 1676 b = p->nodes[level];
1621 goto again; 1677 slot = p->slots[level];
1622
1623 btrfs_set_path_blocking(p);
1624 sret = split_node(trans, root, p, level);
1625 btrfs_clear_path_blocking(p, NULL);
1626
1627 BUG_ON(sret > 0);
1628 if (sret) {
1629 ret = sret;
1630 goto done;
1631 }
1632 b = p->nodes[level];
1633 slot = p->slots[level];
1634 } else if (ins_len < 0 &&
1635 btrfs_header_nritems(b) <
1636 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1637 int sret;
1638
1639 sret = reada_for_balance(root, p, level);
1640 if (sret)
1641 goto again;
1642
1643 btrfs_set_path_blocking(p);
1644 sret = balance_level(trans, root, p, level);
1645 btrfs_clear_path_blocking(p, NULL);
1646 1678
1647 if (sret) {
1648 ret = sret;
1649 goto done;
1650 }
1651 b = p->nodes[level];
1652 if (!b) {
1653 btrfs_release_path(NULL, p);
1654 goto again;
1655 }
1656 slot = p->slots[level];
1657 BUG_ON(btrfs_header_nritems(b) == 1);
1658 }
1659 unlock_up(p, level, lowest_unlock); 1679 unlock_up(p, level, lowest_unlock);
1660 1680
1661 /* this is only true while dropping a snapshot */ 1681 /* this is only true while dropping a snapshot */
@@ -1664,44 +1684,11 @@ cow_done:
1664 goto done; 1684 goto done;
1665 } 1685 }
1666 1686
1667 blocknr = btrfs_node_blockptr(b, slot); 1687 ret = read_block_for_search(trans, root, p,
1668 gen = btrfs_node_ptr_generation(b, slot); 1688 &b, level, slot, key);
1669 blocksize = btrfs_level_size(root, level - 1); 1689 if (ret == -EAGAIN)
1690 goto again;
1670 1691
1671 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1672 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1673 b = tmp;
1674 } else {
1675 /*
1676 * reduce lock contention at high levels
1677 * of the btree by dropping locks before
1678 * we read.
1679 */
1680 if (level > 0) {
1681 btrfs_release_path(NULL, p);
1682 if (tmp)
1683 free_extent_buffer(tmp);
1684 if (should_reada)
1685 reada_for_search(root, p,
1686 level, slot,
1687 key->objectid);
1688
1689 tmp = read_tree_block(root, blocknr,
1690 blocksize, gen);
1691 if (tmp)
1692 free_extent_buffer(tmp);
1693 goto again;
1694 } else {
1695 btrfs_set_path_blocking(p);
1696 if (tmp)
1697 free_extent_buffer(tmp);
1698 if (should_reada)
1699 reada_for_search(root, p,
1700 level, slot,
1701 key->objectid);
1702 b = read_node_slot(root, b, slot);
1703 }
1704 }
1705 if (!p->skip_locking) { 1692 if (!p->skip_locking) {
1706 int lret; 1693 int lret;
1707 1694
@@ -1742,12 +1729,8 @@ done:
1742 * we don't really know what they plan on doing with the path 1729 * we don't really know what they plan on doing with the path
1743 * from here on, so for now just mark it as blocking 1730 * from here on, so for now just mark it as blocking
1744 */ 1731 */
1745 btrfs_set_path_blocking(p); 1732 if (!p->leave_spinning)
1746 if (prealloc_block.objectid) { 1733 btrfs_set_path_blocking(p);
1747 btrfs_free_reserved_extent(root,
1748 prealloc_block.objectid,
1749 prealloc_block.offset);
1750 }
1751 return ret; 1734 return ret;
1752} 1735}
1753 1736
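With the helpers factored out, btrfs_search_slot() itself reads as the
retry loop. A read-only lookup sketch, assuming kernel context where root
and objectid are caller-supplied:

	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	/* NULL trans and cow=0: a read-only search */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == 0) {
		/* exact match at path->nodes[0], slot path->slots[0] */
	}
	btrfs_free_path(path);
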
@@ -1768,7 +1751,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1768 int ret; 1751 int ret;
1769 1752
1770 eb = btrfs_lock_root_node(root); 1753 eb = btrfs_lock_root_node(root);
1771 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1754 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
1772 BUG_ON(ret); 1755 BUG_ON(ret);
1773 1756
1774 btrfs_set_lock_blocking(eb); 1757 btrfs_set_lock_blocking(eb);
@@ -1826,7 +1809,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1826 } 1809 }
1827 1810
1828 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1811 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1829 &eb, 0); 1812 &eb);
1830 BUG_ON(ret); 1813 BUG_ON(ret);
1831 1814
1832 if (root->root_key.objectid == 1815 if (root->root_key.objectid ==
@@ -2139,7 +2122,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2139 spin_unlock(&root->node_lock); 2122 spin_unlock(&root->node_lock);
2140 2123
2141 ret = btrfs_update_extent_ref(trans, root, lower->start, 2124 ret = btrfs_update_extent_ref(trans, root, lower->start,
2142 lower->start, c->start, 2125 lower->len, lower->start, c->start,
2143 root->root_key.objectid, 2126 root->root_key.objectid,
2144 trans->transid, level - 1); 2127 trans->transid, level - 1);
2145 BUG_ON(ret); 2128 BUG_ON(ret);
@@ -2174,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2174 BUG_ON(!path->nodes[level]); 2157 BUG_ON(!path->nodes[level]);
2175 lower = path->nodes[level]; 2158 lower = path->nodes[level];
2176 nritems = btrfs_header_nritems(lower); 2159 nritems = btrfs_header_nritems(lower);
2177 if (slot > nritems) 2160 BUG_ON(slot > nritems);
2178 BUG();
2179 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) 2161 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2180 BUG(); 2162 BUG();
2181 if (slot != nritems) { 2163 if (slot != nritems) {
@@ -2221,7 +2203,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2221 ret = insert_new_root(trans, root, path, level + 1); 2203 ret = insert_new_root(trans, root, path, level + 1);
2222 if (ret) 2204 if (ret)
2223 return ret; 2205 return ret;
2224 } else { 2206 } else if (!trans->transaction->delayed_refs.flushing) {
2225 ret = push_nodes_for_insert(trans, root, path, level); 2207 ret = push_nodes_for_insert(trans, root, path, level);
2226 c = path->nodes[level]; 2208 c = path->nodes[level];
2227 if (!ret && btrfs_header_nritems(c) < 2209 if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2311,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2329 return ret; 2311 return ret;
2330} 2312}
2331 2313
2332/* 2314static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2333 * push some data in the path leaf to the right, trying to free up at 2315 struct btrfs_root *root,
2334 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2316 struct btrfs_path *path,
2335 * 2317 int data_size, int empty,
2336 * returns 1 if the push failed because the other node didn't have enough 2318 struct extent_buffer *right,
2337 * room, 0 if everything worked out and < 0 if there were major errors. 2319 int free_space, u32 left_nritems)
2338 */
2339static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2340 *root, struct btrfs_path *path, int data_size,
2341 int empty)
2342{ 2320{
2343 struct extent_buffer *left = path->nodes[0]; 2321 struct extent_buffer *left = path->nodes[0];
2344 struct extent_buffer *right; 2322 struct extent_buffer *upper = path->nodes[1];
2345 struct extent_buffer *upper;
2346 struct btrfs_disk_key disk_key; 2323 struct btrfs_disk_key disk_key;
2347 int slot; 2324 int slot;
2348 u32 i; 2325 u32 i;
2349 int free_space;
2350 int push_space = 0; 2326 int push_space = 0;
2351 int push_items = 0; 2327 int push_items = 0;
2352 struct btrfs_item *item; 2328 struct btrfs_item *item;
2353 u32 left_nritems;
2354 u32 nr; 2329 u32 nr;
2355 u32 right_nritems; 2330 u32 right_nritems;
2356 u32 data_end; 2331 u32 data_end;
2357 u32 this_item_size; 2332 u32 this_item_size;
2358 int ret; 2333 int ret;
2359 2334
2360 slot = path->slots[1];
2361 if (!path->nodes[1])
2362 return 1;
2363
2364 upper = path->nodes[1];
2365 if (slot >= btrfs_header_nritems(upper) - 1)
2366 return 1;
2367
2368 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2369
2370 right = read_node_slot(root, upper, slot + 1);
2371 btrfs_tree_lock(right);
2372 btrfs_set_lock_blocking(right);
2373
2374 free_space = btrfs_leaf_free_space(root, right);
2375 if (free_space < data_size)
2376 goto out_unlock;
2377
2378 /* cow and double check */
2379 ret = btrfs_cow_block(trans, root, right, upper,
2380 slot + 1, &right, 0);
2381 if (ret)
2382 goto out_unlock;
2383
2384 free_space = btrfs_leaf_free_space(root, right);
2385 if (free_space < data_size)
2386 goto out_unlock;
2387
2388 left_nritems = btrfs_header_nritems(left);
2389 if (left_nritems == 0)
2390 goto out_unlock;
2391
2392 if (empty) 2335 if (empty)
2393 nr = 0; 2336 nr = 0;
2394 else 2337 else
@@ -2397,6 +2340,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2397 if (path->slots[0] >= left_nritems) 2340 if (path->slots[0] >= left_nritems)
2398 push_space += data_size; 2341 push_space += data_size;
2399 2342
2343 slot = path->slots[1];
2400 i = left_nritems - 1; 2344 i = left_nritems - 1;
2401 while (i >= nr) { 2345 while (i >= nr) {
2402 item = btrfs_item_nr(left, i); 2346 item = btrfs_item_nr(left, i);
@@ -2528,24 +2472,82 @@ out_unlock:
2528} 2472}
2529 2473
2530/* 2474/*
2475 * push some data in the path leaf to the right, trying to free up at
2476 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2477 *
2478 * returns 1 if the push failed because the other node didn't have enough
2479 * room, 0 if everything worked out and < 0 if there were major errors.
2480 */
2481static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2482 *root, struct btrfs_path *path, int data_size,
2483 int empty)
2484{
2485 struct extent_buffer *left = path->nodes[0];
2486 struct extent_buffer *right;
2487 struct extent_buffer *upper;
2488 int slot;
2489 int free_space;
2490 u32 left_nritems;
2491 int ret;
2492
2493 if (!path->nodes[1])
2494 return 1;
2495
2496 slot = path->slots[1];
2497 upper = path->nodes[1];
2498 if (slot >= btrfs_header_nritems(upper) - 1)
2499 return 1;
2500
2501 btrfs_assert_tree_locked(path->nodes[1]);
2502
2503 right = read_node_slot(root, upper, slot + 1);
2504 btrfs_tree_lock(right);
2505 btrfs_set_lock_blocking(right);
2506
2507 free_space = btrfs_leaf_free_space(root, right);
2508 if (free_space < data_size)
2509 goto out_unlock;
2510
2511 /* cow and double check */
2512 ret = btrfs_cow_block(trans, root, right, upper,
2513 slot + 1, &right);
2514 if (ret)
2515 goto out_unlock;
2516
2517 free_space = btrfs_leaf_free_space(root, right);
2518 if (free_space < data_size)
2519 goto out_unlock;
2520
2521 left_nritems = btrfs_header_nritems(left);
2522 if (left_nritems == 0)
2523 goto out_unlock;
2524
2525 return __push_leaf_right(trans, root, path, data_size, empty,
2526 right, free_space, left_nritems);
2527out_unlock:
2528 btrfs_tree_unlock(right);
2529 free_extent_buffer(right);
2530 return 1;
2531}
2532
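The wrapper keeps the locking and free-space checks while the bulk of the
item copying moves into the noinline helper, keeping the deep copy work out
of the caller's frame in the common "no room" case. The return-code
convention, sketched the way split_leaf() consumes it below (kernel
context):

	wret = push_leaf_right(trans, root, path, data_size, 0);
	if (wret < 0)
		return wret;	/* hard error */
	if (wret) {
		/* right neighbor was full: try pushing left instead */
		wret = push_leaf_left(trans, root, path, data_size, 0);
		if (wret < 0)
			return wret;
	}
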
2533/*
2531 * push some data in the path leaf to the left, trying to free up at 2534 * push some data in the path leaf to the left, trying to free up at
2532 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2535 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2533 */ 2536 */
2534static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2537static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2535 *root, struct btrfs_path *path, int data_size, 2538 struct btrfs_root *root,
2536 int empty) 2539 struct btrfs_path *path, int data_size,
2540 int empty, struct extent_buffer *left,
2541 int free_space, int right_nritems)
2537{ 2542{
2538 struct btrfs_disk_key disk_key; 2543 struct btrfs_disk_key disk_key;
2539 struct extent_buffer *right = path->nodes[0]; 2544 struct extent_buffer *right = path->nodes[0];
2540 struct extent_buffer *left;
2541 int slot; 2545 int slot;
2542 int i; 2546 int i;
2543 int free_space;
2544 int push_space = 0; 2547 int push_space = 0;
2545 int push_items = 0; 2548 int push_items = 0;
2546 struct btrfs_item *item; 2549 struct btrfs_item *item;
2547 u32 old_left_nritems; 2550 u32 old_left_nritems;
2548 u32 right_nritems;
2549 u32 nr; 2551 u32 nr;
2550 int ret = 0; 2552 int ret = 0;
2551 int wret; 2553 int wret;
@@ -2553,41 +2555,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2553 u32 old_left_item_size; 2555 u32 old_left_item_size;
2554 2556
2555 slot = path->slots[1]; 2557 slot = path->slots[1];
2556 if (slot == 0)
2557 return 1;
2558 if (!path->nodes[1])
2559 return 1;
2560
2561 right_nritems = btrfs_header_nritems(right);
2562 if (right_nritems == 0)
2563 return 1;
2564
2565 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2566
2567 left = read_node_slot(root, path->nodes[1], slot - 1);
2568 btrfs_tree_lock(left);
2569 btrfs_set_lock_blocking(left);
2570
2571 free_space = btrfs_leaf_free_space(root, left);
2572 if (free_space < data_size) {
2573 ret = 1;
2574 goto out;
2575 }
2576
2577 /* cow and double check */
2578 ret = btrfs_cow_block(trans, root, left,
2579 path->nodes[1], slot - 1, &left, 0);
2580 if (ret) {
2581 /* we hit -ENOSPC, but it isn't fatal here */
2582 ret = 1;
2583 goto out;
2584 }
2585
2586 free_space = btrfs_leaf_free_space(root, left);
2587 if (free_space < data_size) {
2588 ret = 1;
2589 goto out;
2590 }
2591 2558
2592 if (empty) 2559 if (empty)
2593 nr = right_nritems; 2560 nr = right_nritems;
@@ -2755,6 +2722,154 @@ out:
2755} 2722}
2756 2723
2757/* 2724/*
2725 * push some data in the path leaf to the left, trying to free up at
2726 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2727 */
2728static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2729 *root, struct btrfs_path *path, int data_size,
2730 int empty)
2731{
2732 struct extent_buffer *right = path->nodes[0];
2733 struct extent_buffer *left;
2734 int slot;
2735 int free_space;
2736 u32 right_nritems;
2737 int ret = 0;
2738
2739 slot = path->slots[1];
2740 if (slot == 0)
2741 return 1;
2742 if (!path->nodes[1])
2743 return 1;
2744
2745 right_nritems = btrfs_header_nritems(right);
2746 if (right_nritems == 0)
2747 return 1;
2748
2749 btrfs_assert_tree_locked(path->nodes[1]);
2750
2751 left = read_node_slot(root, path->nodes[1], slot - 1);
2752 btrfs_tree_lock(left);
2753 btrfs_set_lock_blocking(left);
2754
2755 free_space = btrfs_leaf_free_space(root, left);
2756 if (free_space < data_size) {
2757 ret = 1;
2758 goto out;
2759 }
2760
2761 /* cow and double check */
2762 ret = btrfs_cow_block(trans, root, left,
2763 path->nodes[1], slot - 1, &left);
2764 if (ret) {
2765 /* we hit -ENOSPC, but it isn't fatal here */
2766 ret = 1;
2767 goto out;
2768 }
2769
2770 free_space = btrfs_leaf_free_space(root, left);
2771 if (free_space < data_size) {
2772 ret = 1;
2773 goto out;
2774 }
2775
2776 return __push_leaf_left(trans, root, path, data_size,
2777 empty, left, free_space, right_nritems);
2778out:
2779 btrfs_tree_unlock(left);
2780 free_extent_buffer(left);
2781 return ret;
2782}
2783
2784/*
2785 * split the path's leaf in two, making sure there is at least data_size
2786 * available for the resulting leaf level of the path.
2787 *
2788 * returns 0 if all went well and < 0 on failure.
2789 */
2790static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2791 struct btrfs_root *root,
2792 struct btrfs_path *path,
2793 struct extent_buffer *l,
2794 struct extent_buffer *right,
2795 int slot, int mid, int nritems)
2796{
2797 int data_copy_size;
2798 int rt_data_off;
2799 int i;
2800 int ret = 0;
2801 int wret;
2802 struct btrfs_disk_key disk_key;
2803
2804 nritems = nritems - mid;
2805 btrfs_set_header_nritems(right, nritems);
2806 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2807
2808 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2809 btrfs_item_nr_offset(mid),
2810 nritems * sizeof(struct btrfs_item));
2811
2812 copy_extent_buffer(right, l,
2813 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2814 data_copy_size, btrfs_leaf_data(l) +
2815 leaf_data_end(root, l), data_copy_size);
2816
2817 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2818 btrfs_item_end_nr(l, mid);
2819
2820 for (i = 0; i < nritems; i++) {
2821 struct btrfs_item *item = btrfs_item_nr(right, i);
2822 u32 ioff;
2823
2824 if (!right->map_token) {
2825 map_extent_buffer(right, (unsigned long)item,
2826 sizeof(struct btrfs_item),
2827 &right->map_token, &right->kaddr,
2828 &right->map_start, &right->map_len,
2829 KM_USER1);
2830 }
2831
2832 ioff = btrfs_item_offset(right, item);
2833 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2834 }
2835
2836 if (right->map_token) {
2837 unmap_extent_buffer(right, right->map_token, KM_USER1);
2838 right->map_token = NULL;
2839 }
2840
2841 btrfs_set_header_nritems(l, mid);
2842 ret = 0;
2843 btrfs_item_key(right, &disk_key, 0);
2844 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2845 path->slots[1] + 1, 1);
2846 if (wret)
2847 ret = wret;
2848
2849 btrfs_mark_buffer_dirty(right);
2850 btrfs_mark_buffer_dirty(l);
2851 BUG_ON(path->slots[0] != slot);
2852
2853 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2854 BUG_ON(ret);
2855
2856 if (mid <= slot) {
2857 btrfs_tree_unlock(path->nodes[0]);
2858 free_extent_buffer(path->nodes[0]);
2859 path->nodes[0] = right;
2860 path->slots[0] -= mid;
2861 path->slots[1] += 1;
2862 } else {
2863 btrfs_tree_unlock(right);
2864 free_extent_buffer(right);
2865 }
2866
2867 BUG_ON(path->slots[0] < 0);
2868
2869 return ret;
2870}
2871
2872/*
2758 * split the path's leaf in two, making sure there is at least data_size 2873 * split the path's leaf in two, making sure there is at least data_size
2759 * available for the resulting leaf level of the path. 2874 * available for the resulting leaf level of the path.
2760 * 2875 *
@@ -2771,17 +2886,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2771 int mid; 2886 int mid;
2772 int slot; 2887 int slot;
2773 struct extent_buffer *right; 2888 struct extent_buffer *right;
2774 int data_copy_size;
2775 int rt_data_off;
2776 int i;
2777 int ret = 0; 2889 int ret = 0;
2778 int wret; 2890 int wret;
2779 int double_split; 2891 int double_split;
2780 int num_doubles = 0; 2892 int num_doubles = 0;
2781 struct btrfs_disk_key disk_key;
2782 2893
2783 /* first try to make some room by pushing left and right */ 2894 /* first try to make some room by pushing left and right */
2784 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2895 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
2896 !trans->transaction->delayed_refs.flushing) {
2785 wret = push_leaf_right(trans, root, path, data_size, 0); 2897 wret = push_leaf_right(trans, root, path, data_size, 0);
2786 if (wret < 0) 2898 if (wret < 0)
2787 return wret; 2899 return wret;
@@ -2830,11 +2942,14 @@ again:
2830 write_extent_buffer(right, root->fs_info->chunk_tree_uuid, 2942 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2831 (unsigned long)btrfs_header_chunk_tree_uuid(right), 2943 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2832 BTRFS_UUID_SIZE); 2944 BTRFS_UUID_SIZE);
2945
2833 if (mid <= slot) { 2946 if (mid <= slot) {
2834 if (nritems == 1 || 2947 if (nritems == 1 ||
2835 leaf_space_used(l, mid, nritems - mid) + data_size > 2948 leaf_space_used(l, mid, nritems - mid) + data_size >
2836 BTRFS_LEAF_DATA_SIZE(root)) { 2949 BTRFS_LEAF_DATA_SIZE(root)) {
2837 if (slot >= nritems) { 2950 if (slot >= nritems) {
2951 struct btrfs_disk_key disk_key;
2952
2838 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2953 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2839 btrfs_set_header_nritems(right, 0); 2954 btrfs_set_header_nritems(right, 0);
2840 wret = insert_ptr(trans, root, path, 2955 wret = insert_ptr(trans, root, path,
@@ -2862,6 +2977,8 @@ again:
2862 if (leaf_space_used(l, 0, mid) + data_size > 2977 if (leaf_space_used(l, 0, mid) + data_size >
2863 BTRFS_LEAF_DATA_SIZE(root)) { 2978 BTRFS_LEAF_DATA_SIZE(root)) {
2864 if (!extend && data_size && slot == 0) { 2979 if (!extend && data_size && slot == 0) {
2980 struct btrfs_disk_key disk_key;
2981
2865 btrfs_cpu_key_to_disk(&disk_key, ins_key); 2982 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2866 btrfs_set_header_nritems(right, 0); 2983 btrfs_set_header_nritems(right, 0);
2867 wret = insert_ptr(trans, root, path, 2984 wret = insert_ptr(trans, root, path,
@@ -2894,76 +3011,16 @@ again:
2894 } 3011 }
2895 } 3012 }
2896 } 3013 }
2897 nritems = nritems - mid;
2898 btrfs_set_header_nritems(right, nritems);
2899 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2900
2901 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2902 btrfs_item_nr_offset(mid),
2903 nritems * sizeof(struct btrfs_item));
2904
2905 copy_extent_buffer(right, l,
2906 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2907 data_copy_size, btrfs_leaf_data(l) +
2908 leaf_data_end(root, l), data_copy_size);
2909
2910 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2911 btrfs_item_end_nr(l, mid);
2912
2913 for (i = 0; i < nritems; i++) {
2914 struct btrfs_item *item = btrfs_item_nr(right, i);
2915 u32 ioff;
2916
2917 if (!right->map_token) {
2918 map_extent_buffer(right, (unsigned long)item,
2919 sizeof(struct btrfs_item),
2920 &right->map_token, &right->kaddr,
2921 &right->map_start, &right->map_len,
2922 KM_USER1);
2923 }
2924
2925 ioff = btrfs_item_offset(right, item);
2926 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2927 }
2928
2929 if (right->map_token) {
2930 unmap_extent_buffer(right, right->map_token, KM_USER1);
2931 right->map_token = NULL;
2932 }
2933 3014
2934 btrfs_set_header_nritems(l, mid); 3015 ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
2935 ret = 0;
2936 btrfs_item_key(right, &disk_key, 0);
2937 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2938 path->slots[1] + 1, 1);
2939 if (wret)
2940 ret = wret;
2941
2942 btrfs_mark_buffer_dirty(right);
2943 btrfs_mark_buffer_dirty(l);
2944 BUG_ON(path->slots[0] != slot);
2945
2946 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2947 BUG_ON(ret); 3016 BUG_ON(ret);
2948 3017
2949 if (mid <= slot) {
2950 btrfs_tree_unlock(path->nodes[0]);
2951 free_extent_buffer(path->nodes[0]);
2952 path->nodes[0] = right;
2953 path->slots[0] -= mid;
2954 path->slots[1] += 1;
2955 } else {
2956 btrfs_tree_unlock(right);
2957 free_extent_buffer(right);
2958 }
2959
2960 BUG_ON(path->slots[0] < 0);
2961
2962 if (double_split) { 3018 if (double_split) {
2963 BUG_ON(num_doubles != 0); 3019 BUG_ON(num_doubles != 0);
2964 num_doubles++; 3020 num_doubles++;
2965 goto again; 3021 goto again;
2966 } 3022 }
3023
2967 return ret; 3024 return ret;
2968} 3025}
2969 3026
@@ -3021,26 +3078,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
3021 return -EAGAIN; 3078 return -EAGAIN;
3022 } 3079 }
3023 3080
3081 btrfs_set_path_blocking(path);
3024 ret = split_leaf(trans, root, &orig_key, path, 3082 ret = split_leaf(trans, root, &orig_key, path,
3025 sizeof(struct btrfs_item), 1); 3083 sizeof(struct btrfs_item), 1);
3026 path->keep_locks = 0; 3084 path->keep_locks = 0;
3027 BUG_ON(ret); 3085 BUG_ON(ret);
3028 3086
3087 btrfs_unlock_up_safe(path, 1);
3088 leaf = path->nodes[0];
3089 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3090
3091split:
3029 /* 3092 /*
3030 * make sure any changes to the path from split_leaf leave it 3093 * make sure any changes to the path from split_leaf leave it
3031 * in a blocking state 3094 * in a blocking state
3032 */ 3095 */
3033 btrfs_set_path_blocking(path); 3096 btrfs_set_path_blocking(path);
3034 3097
3035 leaf = path->nodes[0];
3036 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3037
3038split:
3039 item = btrfs_item_nr(leaf, path->slots[0]); 3098 item = btrfs_item_nr(leaf, path->slots[0]);
3040 orig_offset = btrfs_item_offset(leaf, item); 3099 orig_offset = btrfs_item_offset(leaf, item);
3041 item_size = btrfs_item_size(leaf, item); 3100 item_size = btrfs_item_size(leaf, item);
3042 3101
3043
3044 buf = kmalloc(item_size, GFP_NOFS); 3102 buf = kmalloc(item_size, GFP_NOFS);
3045 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3103 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3046 path->slots[0]), item_size); 3104 path->slots[0]), item_size);
@@ -3445,39 +3503,27 @@ out:
 }
 
 /*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
+ * this is a helper for btrfs_insert_empty_items, the main goal here is
+ * to save stack depth by doing the bulk of the work in a function
+ * that doesn't call btrfs_search_slot
  */
-int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root,
-                             struct btrfs_path *path,
-                             struct btrfs_key *cpu_key, u32 *data_size,
-                             int nr)
+static noinline_for_stack int
+setup_items_for_insert(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, struct btrfs_path *path,
+                       struct btrfs_key *cpu_key, u32 *data_size,
+                       u32 total_data, u32 total_size, int nr)
 {
-        struct extent_buffer *leaf;
         struct btrfs_item *item;
-        int ret = 0;
-        int slot;
-        int slot_orig;
         int i;
         u32 nritems;
-        u32 total_size = 0;
-        u32 total_data = 0;
         unsigned int data_end;
         struct btrfs_disk_key disk_key;
+        int ret;
+        struct extent_buffer *leaf;
+        int slot;
 
-        for (i = 0; i < nr; i++)
-                total_data += data_size[i];
-
-        total_size = total_data + (nr * sizeof(struct btrfs_item));
-        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
-        if (ret == 0)
-                return -EEXIST;
-        if (ret < 0)
-                goto out;
-
-        slot_orig = path->slots[0];
         leaf = path->nodes[0];
+        slot = path->slots[0];
 
         nritems = btrfs_header_nritems(leaf);
         data_end = leaf_data_end(root, leaf);
@@ -3489,9 +3535,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                 BUG();
         }
 
-        slot = path->slots[0];
-        BUG_ON(slot < 0);
-
         if (slot != nritems) {
                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
 
@@ -3547,21 +3590,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                 data_end -= data_size[i];
                 btrfs_set_item_size(leaf, item, data_size[i]);
         }
+
         btrfs_set_header_nritems(leaf, nritems + nr);
-        btrfs_mark_buffer_dirty(leaf);
 
         ret = 0;
         if (slot == 0) {
+                struct btrfs_disk_key disk_key;
                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
                 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
         }
+        btrfs_unlock_up_safe(path, 1);
+        btrfs_mark_buffer_dirty(leaf);
 
         if (btrfs_leaf_free_space(root, leaf) < 0) {
                 btrfs_print_leaf(root, leaf);
                 BUG();
         }
+        return ret;
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct btrfs_path *path,
+                             struct btrfs_key *cpu_key, u32 *data_size,
+                             int nr)
+{
+        struct extent_buffer *leaf;
+        int ret = 0;
+        int slot;
+        int i;
+        u32 total_size = 0;
+        u32 total_data = 0;
+
+        for (i = 0; i < nr; i++)
+                total_data += data_size[i];
+
+        total_size = total_data + (nr * sizeof(struct btrfs_item));
+        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+        if (ret == 0)
+                return -EEXIST;
+        if (ret < 0)
+                goto out;
+
+        leaf = path->nodes[0];
+        slot = path->slots[0];
+        BUG_ON(slot < 0);
+
+        ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+                                     total_data, total_size, nr);
+
 out:
-        btrfs_unlock_up_safe(path, 1);
         return ret;
 }
 
@@ -3749,7 +3831,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
         }
 
         /* delete the leaf if it is mostly empty */
-        if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+        if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
+            !trans->transaction->delayed_refs.flushing) {
                 /* push_leaf_left fixes the path.
                  * make sure the path still points to our leaf
                  * for possible call to del_ptr below
@@ -3757,6 +3840,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                 slot = path->slots[1];
                 extent_buffer_get(leaf);
 
+                btrfs_set_path_blocking(path);
                 wret = push_leaf_left(trans, root, path, 1, 1);
                 if (wret < 0 && wret != -ENOSPC)
                         ret = wret;
@@ -4042,28 +4126,44 @@ next:
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
         int slot;
-        int level = 1;
+        int level;
         struct extent_buffer *c;
-        struct extent_buffer *next = NULL;
+        struct extent_buffer *next;
         struct btrfs_key key;
         u32 nritems;
         int ret;
+        int old_spinning = path->leave_spinning;
+        int force_blocking = 0;
 
         nritems = btrfs_header_nritems(path->nodes[0]);
         if (nritems == 0)
                 return 1;
 
-        btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+        /*
+         * we take the blocks in an order that upsets lockdep.  Using
+         * blocking mode is the only way around it.
+         */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        force_blocking = 1;
+#endif
 
+        btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+again:
+        level = 1;
+        next = NULL;
         btrfs_release_path(root, path);
+
         path->keep_locks = 1;
+
+        if (!force_blocking)
+                path->leave_spinning = 1;
+
         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
         path->keep_locks = 0;
 
         if (ret < 0)
                 return ret;
 
-        btrfs_set_path_blocking(path);
         nritems = btrfs_header_nritems(path->nodes[0]);
         /*
          * by releasing the path above we dropped all our locks.  A balance
@@ -4073,19 +4173,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
          */
         if (nritems > 0 && path->slots[0] < nritems - 1) {
                 path->slots[0]++;
+                ret = 0;
                 goto done;
         }
 
         while (level < BTRFS_MAX_LEVEL) {
-                if (!path->nodes[level])
-                        return 1;
+                if (!path->nodes[level]) {
+                        ret = 1;
+                        goto done;
+                }
 
                 slot = path->slots[level] + 1;
                 c = path->nodes[level];
                 if (slot >= btrfs_header_nritems(c)) {
                         level++;
-                        if (level == BTRFS_MAX_LEVEL)
-                                return 1;
+                        if (level == BTRFS_MAX_LEVEL) {
+                                ret = 1;
+                                goto done;
+                        }
                         continue;
                 }
 
@@ -4094,16 +4199,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
                         free_extent_buffer(next);
                 }
 
-                /* the path was set to blocking above */
-                if (level == 1 && (path->locks[1] || path->skip_locking) &&
-                    path->reada)
-                        reada_for_search(root, path, level, slot, 0);
+                next = c;
+                ret = read_block_for_search(NULL, root, path, &next, level,
+                                            slot, &key);
+                if (ret == -EAGAIN)
+                        goto again;
 
-                next = read_node_slot(root, c, slot);
                 if (!path->skip_locking) {
-                        WARN_ON(!btrfs_tree_locked(c));
-                        btrfs_tree_lock(next);
-                        btrfs_set_lock_blocking(next);
+                        ret = btrfs_try_spin_lock(next);
+                        if (!ret) {
+                                btrfs_set_path_blocking(path);
+                                btrfs_tree_lock(next);
+                                if (!force_blocking)
+                                        btrfs_clear_path_blocking(path, next);
+                        }
+                        if (force_blocking)
+                                btrfs_set_lock_blocking(next);
                 }
                 break;
         }
@@ -4113,27 +4224,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
                 c = path->nodes[level];
                 if (path->locks[level])
                         btrfs_tree_unlock(c);
+
                 free_extent_buffer(c);
                 path->nodes[level] = next;
                 path->slots[level] = 0;
                 if (!path->skip_locking)
                         path->locks[level] = 1;
+
                 if (!level)
                         break;
 
-                btrfs_set_path_blocking(path);
-                if (level == 1 && path->locks[1] && path->reada)
-                        reada_for_search(root, path, level, slot, 0);
-                next = read_node_slot(root, next, 0);
+                ret = read_block_for_search(NULL, root, path, &next, level,
+                                            0, &key);
+                if (ret == -EAGAIN)
+                        goto again;
+
                 if (!path->skip_locking) {
-                        WARN_ON(!btrfs_tree_locked(path->nodes[level]));
-                        btrfs_tree_lock(next);
-                        btrfs_set_lock_blocking(next);
+                        btrfs_assert_tree_locked(path->nodes[level]);
+                        ret = btrfs_try_spin_lock(next);
+                        if (!ret) {
+                                btrfs_set_path_blocking(path);
+                                btrfs_tree_lock(next);
+                                if (!force_blocking)
+                                        btrfs_clear_path_blocking(path, next);
+                        }
+                        if (force_blocking)
+                                btrfs_set_lock_blocking(next);
                 }
         }
+        ret = 0;
 done:
         unlock_up(path, 0, 1);
-        return 0;
+        path->leave_spinning = old_spinning;
+        if (!old_spinning)
+                btrfs_set_path_blocking(path);
+
+        return ret;
 }
 
 /*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 82491ba8fa40..ad96495dedc5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAX_LEVEL 8
 
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list.  That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
+
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
 
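For illustration only, a minimal sketch (not part of the patch; the helper name is hypothetical) of how a writer path could consult this limit before deciding to pre-flush an inode that is about to join the ordered operations list:

/* sketch only: hypothetical helper, not in the patch */
static inline int btrfs_should_preflush(u64 inode_size)
{
        /* big files get pre-flushed so the commit itself stays cheap */
        return inode_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT;
}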
@@ -136,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 #define BTRFS_FT_MAX		9
 
 /*
- * the key defines the order in the tree, and so it also defines (optimal)
- * block layout.  objectid corresonds to the inode number.  The flags
- * tells us things about the object, and is a kind of stream selector.
- * so for a given inode, keys with flags of 1 might refer to the inode
- * data, flags of 2 may point to file data in the btree and flags == 3
- * may point to extents.
+ * The key defines the order in the tree, and so it also defines (optimal)
+ * block layout.
+ *
+ * objectid corresponds to the inode number.
+ *
+ * type tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with type of 1 might refer to the inode data,
+ * type of 2 may point to file data in the btree and type == 3 may point to
+ * extents.
  *
  * offset is the starting byte offset for this key in the stream.
 *
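To make the (objectid, type, offset) ordering concrete, a hedged sketch of filling in a key before a tree search; the constant is real btrfs, the values are hypothetical:

/* sketch only: keys sort by objectid, then type, then offset */
struct btrfs_key key;

key.objectid = 257;                     /* inode number groups items together */
key.type = BTRFS_EXTENT_DATA_KEY;       /* which stream of that inode */
key.offset = 4096;                      /* byte offset within the stream */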
@@ -193,7 +203,7 @@ struct btrfs_dev_item {
 
         /*
          * starting byte of this partition on the device,
-         * to allowr for stripe alignment in the future
+         * to allow for stripe alignment in the future
          */
         __le64 start_offset;
 
@@ -401,15 +411,16 @@ struct btrfs_path {
         int locks[BTRFS_MAX_LEVEL];
         int reada;
         /* keep some upper locks as we walk down */
-        int keep_locks;
-        int skip_locking;
         int lowest_level;
 
         /*
          * set by btrfs_split_item, tells search_slot to keep all locks
          * and to force calls to keep space in the nodes
          */
-        int search_for_split;
+        unsigned int search_for_split:1;
+        unsigned int keep_locks:1;
+        unsigned int skip_locking:1;
+        unsigned int leave_spinning:1;
 };
 
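The four flags are now single-bit fields packed into one word. A hedged usage sketch of the new leave_spinning bit, mirroring the dir-item.c change later in this diff (surrounding context hypothetical):

/* sketch only */
struct btrfs_path *path;

path = btrfs_alloc_path();      /* returns a zeroed path */
path->leave_spinning = 1;       /* keep spinning locks across the search */
path->keep_locks = 1;           /* hold upper-level locks as we walk down */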
 /*
@@ -625,18 +636,35 @@ struct btrfs_space_info {
         struct rw_semaphore groups_sem;
 };
 
-struct btrfs_free_space {
-        struct rb_node bytes_index;
-        struct rb_node offset_index;
-        u64 offset;
-        u64 bytes;
+/*
+ * free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes.  They are used for all metadata
+ * allocations and data allocations in ssd mode.
+ */
+struct btrfs_free_cluster {
+        spinlock_t lock;
+        spinlock_t refill_lock;
+        struct rb_root root;
+
+        /* largest extent in this cluster */
+        u64 max_size;
+
+        /* first extent starting offset */
+        u64 window_start;
+
+        struct btrfs_block_group_cache *block_group;
+        /*
+         * when a cluster is allocated from a block group, we put the
+         * cluster onto a list in the block group so that it can
+         * be freed before the block group is freed.
+         */
+        struct list_head block_group_list;
 };
 
 struct btrfs_block_group_cache {
         struct btrfs_key key;
         struct btrfs_block_group_item item;
         spinlock_t lock;
-        struct mutex alloc_mutex;
         struct mutex cache_mutex;
         u64 pinned;
         u64 reserved;
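A hedged sketch of initializing the btrfs_free_cluster introduced above; the helper name is hypothetical, but every field comes straight from the struct definition:

/* sketch only: hypothetical init helper */
static void init_free_cluster(struct btrfs_free_cluster *cluster)
{
        spin_lock_init(&cluster->lock);
        spin_lock_init(&cluster->refill_lock);
        cluster->root.rb_node = NULL;   /* empty rbtree of free extents */
        cluster->max_size = 0;
        cluster->window_start = 0;
        cluster->block_group = NULL;
        INIT_LIST_HEAD(&cluster->block_group_list);
}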
@@ -648,6 +676,7 @@ struct btrfs_block_group_cache {
         struct btrfs_space_info *space_info;
 
         /* free space cache stuff */
+        spinlock_t tree_lock;
         struct rb_root free_space_bytes;
         struct rb_root free_space_offset;
 
@@ -659,6 +688,11 @@ struct btrfs_block_group_cache {
 
         /* usage count */
         atomic_t count;
+
+        /* List of struct btrfs_free_clusters for this block group.
+         * Today it will only have one thing on it, but that may change
+         */
+        struct list_head cluster_list;
 };
 
 struct btrfs_leaf_ref_tree {
@@ -688,15 +722,18 @@ struct btrfs_fs_info {
         struct rb_root block_group_cache_tree;
 
         struct extent_io_tree pinned_extents;
-        struct extent_io_tree pending_del;
-        struct extent_io_tree extent_ins;
 
         /* logical->physical extent mapping */
         struct btrfs_mapping_tree mapping_tree;
 
         u64 generation;
         u64 last_trans_committed;
-        u64 last_trans_new_blockgroup;
+
+        /*
+         * this is updated to the current trans every time a full commit
+         * is required instead of the faster short fsync log commits
+         */
+        u64 last_trans_log_full_commit;
         u64 open_ioctl_trans;
         unsigned long mount_opt;
         u64 max_extent;
@@ -717,12 +754,20 @@ struct btrfs_fs_info {
         struct mutex tree_log_mutex;
         struct mutex transaction_kthread_mutex;
         struct mutex cleaner_mutex;
-        struct mutex extent_ins_mutex;
-        struct mutex pinned_mutex;
         struct mutex chunk_mutex;
         struct mutex drop_mutex;
         struct mutex volume_mutex;
         struct mutex tree_reloc_mutex;
+
+        /*
+         * this protects the ordered operations list only while we are
+         * processing all of the entries on it.  This way we make
+         * sure the commit code doesn't find the list temporarily empty
+         * because another function happens to be doing non-waiting preflush
+         * before jumping into the main commit.
+         */
+        struct mutex ordered_operations_mutex;
+
         struct list_head trans_list;
         struct list_head hashers;
         struct list_head dead_roots;
@@ -737,10 +782,29 @@ struct btrfs_fs_info {
          * ordered extents
          */
         spinlock_t ordered_extent_lock;
+
+        /*
+         * all of the data=ordered extents pending writeback
+         * these can span multiple transactions and basically include
+         * every dirty data page that isn't from nodatacow
+         */
         struct list_head ordered_extents;
+
+        /*
+         * all of the inodes that have delalloc bytes.  It is possible for
+         * this list to be empty even when there is still dirty data=ordered
+         * extents waiting to finish IO.
+         */
         struct list_head delalloc_inodes;
 
         /*
+         * special rename and truncate targets that must be on disk before
+         * we're allowed to commit.  This is basically the ext3 style
+         * data=ordered list.
+         */
+        struct list_head ordered_operations;
+
+        /*
          * there is a pool of worker threads for checksumming during writes
          * and a pool for checksumming after reads.  This is because readers
          * can run with FS locks held, and the writers may be waiting for
@@ -781,15 +845,31 @@ struct btrfs_fs_info {
         atomic_t throttle_gen;
 
         u64 total_pinned;
+
+        /* protected by the delalloc lock, used to keep from writing
+         * metadata until there is a nice batch
+         */
+        u64 dirty_metadata_bytes;
         struct list_head dirty_cowonly_roots;
 
         struct btrfs_fs_devices *fs_devices;
+
+        /*
+         * the space_info list is almost entirely read only.  It only changes
+         * when we add a new raid type to the FS, and that happens
+         * very rarely.  RCU is used to protect it.
+         */
         struct list_head space_info;
+
         spinlock_t delalloc_lock;
         spinlock_t new_trans_lock;
         u64 delalloc_bytes;
-        u64 last_alloc;
-        u64 last_data_alloc;
+
+        /* data_alloc_cluster is only used in ssd mode */
+        struct btrfs_free_cluster data_alloc_cluster;
+
+        /* all metadata allocations go through this cluster */
+        struct btrfs_free_cluster meta_alloc_cluster;
 
         spinlock_t ref_cache_lock;
         u64 total_ref_cache_size;
@@ -881,7 +961,6 @@ struct btrfs_root {
 };
 
 /*
-
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
  * the FS
@@ -912,7 +991,7 @@ struct btrfs_root {
 #define BTRFS_EXTENT_CSUM_KEY	128
 
 /*
- * root items point to tree roots.  There are typically in the root
+ * root items point to tree roots.  They are typically in the root
  * tree used by the super block to find all the other trees
  */
 #define BTRFS_ROOT_ITEM_KEY	132
@@ -959,6 +1038,8 @@ struct btrfs_root {
 #define BTRFS_MOUNT_SSD			(1 << 3)
 #define BTRFS_MOUNT_DEGRADED		(1 << 4)
 #define BTRFS_MOUNT_COMPRESS		(1 << 5)
+#define BTRFS_MOUNT_NOTREELOG           (1 << 6)
+#define BTRFS_MOUNT_FLUSHONCOMMIT       (1 << 7)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
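The two helpers above paste the option name onto the BTRFS_MOUNT_ prefix, so callers pass just the suffix. A small illustration (the variable is hypothetical):

/* sketch only */
unsigned long opts = 0;

btrfs_set_opt(opts, SSD);               /* opts |= BTRFS_MOUNT_SSD */
btrfs_set_opt(opts, FLUSHONCOMMIT);     /* one of the two new bits */
btrfs_clear_opt(opts, SSD);             /* opts &= ~BTRFS_MOUNT_SSD */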
@@ -1697,18 +1778,16 @@ static inline struct dentry *fdentry(struct file *file)
 }
 
 /* extent-tree.c */
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, u64 bytenr,
-                            u64 num_bytes, u32 *refs);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
                                 u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, u64 objectid, u64 bytenr);
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                  struct btrfs_fs_info *info,
@@ -1770,7 +1849,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                          u64 root_objectid, u64 ref_generation,
                          u64 owner_objectid);
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, u64 bytenr,
+                            struct btrfs_root *root, u64 bytenr, u64 num_bytes,
                             u64 orig_parent, u64 parent,
                             u64 root_objectid, u64 ref_generation,
                             u64 owner_objectid);
@@ -1797,6 +1876,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
 int btrfs_check_metadata_free_space(struct btrfs_root *root);
 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
                                 u64 bytes);
@@ -1829,7 +1910,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root, struct extent_buffer *buf,
                     struct extent_buffer *parent, int parent_slot,
-                    struct extent_buffer **cow_ret, u64 prealloc_dest);
+                    struct extent_buffer **cow_ret);
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct extent_buffer *buf,
@@ -2051,7 +2132,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 unsigned long btrfs_force_ra(struct address_space *mapping,
                               struct file_ra_state *ra, struct file *file,
                               pgoff_t offset, pgoff_t last_index);
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
@@ -2124,21 +2205,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 
-/* free-space-cache.c */
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-                         u64 bytenr, u64 size);
-int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
-                              u64 offset, u64 bytes);
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-                            u64 bytenr, u64 size);
-int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
-                                 u64 offset, u64 bytes);
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
-                                   *block_group);
-struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
-                                               *block_group, u64 offset,
-                                               u64 bytes);
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
-                           u64 bytes);
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
 #endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000000..d6c01c096a40
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,668 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/sort.h>
21#include "ctree.h"
22#include "delayed-ref.h"
23#include "transaction.h"
24
25/*
26 * delayed back reference update tracking. For subvolume trees
27 * we queue up extent allocations and backref maintenance for
28 * delayed processing. This avoids deep call chains where we
29 * add extents in the middle of btrfs_search_slot, and it allows
30 * us to buffer up frequently modified backrefs in an rb tree instead
31 * of hammering updates on the extent allocation tree.
32 *
33 * Right now this code is only used for reference counted trees, but
34 * the long term goal is to get rid of the similar code for delayed
35 * extent tree modifications.
36 */
37
38/*
39 * entries in the rb tree are ordered by the byte number of the extent
40 * and by the byte number of the parent block.
41 */
42static int comp_entry(struct btrfs_delayed_ref_node *ref,
43 u64 bytenr, u64 parent)
44{
45 if (bytenr < ref->bytenr)
46 return -1;
47 if (bytenr > ref->bytenr)
48 return 1;
49 if (parent < ref->parent)
50 return -1;
51 if (parent > ref->parent)
52 return 1;
53 return 0;
54}
55
56/*
57 * insert a new ref into the rbtree. This returns any existing refs
58 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
59 * inserted.
60 */
61static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
62 u64 bytenr, u64 parent,
63 struct rb_node *node)
64{
65 struct rb_node **p = &root->rb_node;
66 struct rb_node *parent_node = NULL;
67 struct btrfs_delayed_ref_node *entry;
68 int cmp;
69
70 while (*p) {
71 parent_node = *p;
72 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
73 rb_node);
74
75 cmp = comp_entry(entry, bytenr, parent);
76 if (cmp < 0)
77 p = &(*p)->rb_left;
78 else if (cmp > 0)
79 p = &(*p)->rb_right;
80 else
81 return entry;
82 }
83
84 entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
85 rb_link_node(node, parent_node, p);
86 rb_insert_color(node, root);
87 return NULL;
88}
89
90/*
91 * find an entry based on (bytenr,parent). This returns the delayed
92 * ref if it was able to find one, or NULL if nothing was in that spot
93 */
94static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
95 u64 bytenr, u64 parent,
96 struct btrfs_delayed_ref_node **last)
97{
98 struct rb_node *n = root->rb_node;
99 struct btrfs_delayed_ref_node *entry;
100 int cmp;
101
102 while (n) {
103 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
104 WARN_ON(!entry->in_tree);
105 if (last)
106 *last = entry;
107
108 cmp = comp_entry(entry, bytenr, parent);
109 if (cmp < 0)
110 n = n->rb_left;
111 else if (cmp > 0)
112 n = n->rb_right;
113 else
114 return entry;
115 }
116 return NULL;
117}
118
119int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
120 struct btrfs_delayed_ref_head *head)
121{
122 struct btrfs_delayed_ref_root *delayed_refs;
123
124 delayed_refs = &trans->transaction->delayed_refs;
125 assert_spin_locked(&delayed_refs->lock);
126 if (mutex_trylock(&head->mutex))
127 return 0;
128
129 atomic_inc(&head->node.refs);
130 spin_unlock(&delayed_refs->lock);
131
132 mutex_lock(&head->mutex);
133 spin_lock(&delayed_refs->lock);
134 if (!head->node.in_tree) {
135 mutex_unlock(&head->mutex);
136 btrfs_put_delayed_ref(&head->node);
137 return -EAGAIN;
138 }
139 btrfs_put_delayed_ref(&head->node);
140 return 0;
141}
142
143int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
144 struct list_head *cluster, u64 start)
145{
146 int count = 0;
147 struct btrfs_delayed_ref_root *delayed_refs;
148 struct rb_node *node;
149 struct btrfs_delayed_ref_node *ref;
150 struct btrfs_delayed_ref_head *head;
151
152 delayed_refs = &trans->transaction->delayed_refs;
153 if (start == 0) {
154 node = rb_first(&delayed_refs->root);
155 } else {
156 ref = NULL;
157 tree_search(&delayed_refs->root, start, (u64)-1, &ref);
158 if (ref) {
159 struct btrfs_delayed_ref_node *tmp;
160
161 node = rb_prev(&ref->rb_node);
162 while (node) {
163 tmp = rb_entry(node,
164 struct btrfs_delayed_ref_node,
165 rb_node);
166 if (tmp->bytenr < start)
167 break;
168 ref = tmp;
169 node = rb_prev(&ref->rb_node);
170 }
171 node = &ref->rb_node;
172 } else
173 node = rb_first(&delayed_refs->root);
174 }
175again:
176 while (node && count < 32) {
177 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
178 if (btrfs_delayed_ref_is_head(ref)) {
179 head = btrfs_delayed_node_to_head(ref);
180 if (list_empty(&head->cluster)) {
181 list_add_tail(&head->cluster, cluster);
182 delayed_refs->run_delayed_start =
183 head->node.bytenr;
184 count++;
185
186 WARN_ON(delayed_refs->num_heads_ready == 0);
187 delayed_refs->num_heads_ready--;
188 } else if (count) {
189 /* the goal of the clustering is to find extents
190 * that are likely to end up in the same extent
191 * leaf on disk. So, we don't want them spread
192 * all over the tree. Stop now if we've hit
193 * a head that was already in use
194 */
195 break;
196 }
197 }
198 node = rb_next(node);
199 }
200 if (count) {
201 return 0;
202 } else if (start) {
203 /*
204 * we've gone to the end of the rbtree without finding any
205 * clusters. start from the beginning and try again
206 */
207 start = 0;
208 node = rb_first(&delayed_refs->root);
209 goto again;
210 }
211 return 1;
212}
213
214/*
215 * This checks to see if there are any delayed refs in the
216 * btree for a given bytenr. It returns one if it finds any
217 * and zero otherwise.
218 *
219 * If it only finds a head node, it returns 0.
220 *
221 * The idea is to use this when deciding if you can safely delete an
222 * extent from the extent allocation tree. There may be a pending
223 * ref in the rbtree that adds or removes references, so as long as this
224 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
225 * allocation tree.
226 */
227int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
228{
229 struct btrfs_delayed_ref_node *ref;
230 struct btrfs_delayed_ref_root *delayed_refs;
231 struct rb_node *prev_node;
232 int ret = 0;
233
234 delayed_refs = &trans->transaction->delayed_refs;
235 spin_lock(&delayed_refs->lock);
236
237 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
238 if (ref) {
239 prev_node = rb_prev(&ref->rb_node);
240 if (!prev_node)
241 goto out;
242 ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
243 rb_node);
244 if (ref->bytenr == bytenr)
245 ret = 1;
246 }
247out:
248 spin_unlock(&delayed_refs->lock);
249 return ret;
250}
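Following the comment above the function, a hedged sketch of how a caller in the extent allocation code might use this check before removing an extent item (the helper is hypothetical):

/* sketch only: hypothetical caller */
static int can_drop_extent_item(struct btrfs_trans_handle *trans, u64 bytenr)
{
        /* a pending ref may still add references, so the
         * BTRFS_EXTENT_ITEM must stay for now */
        return !btrfs_delayed_ref_pending(trans, bytenr);
}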
251
252/*
253 * helper function to lookup reference count
254 *
255 * the head node for delayed ref is used to store the sum of all the
256 * reference count modifications queued up in the rbtree. This way you
257 * can check to see what the reference count would be if all of the
258 * delayed refs are processed.
259 */
260int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
261 struct btrfs_root *root, u64 bytenr,
262 u64 num_bytes, u32 *refs)
263{
264 struct btrfs_delayed_ref_node *ref;
265 struct btrfs_delayed_ref_head *head;
266 struct btrfs_delayed_ref_root *delayed_refs;
267 struct btrfs_path *path;
268 struct extent_buffer *leaf;
269 struct btrfs_extent_item *ei;
270 struct btrfs_key key;
271 u32 num_refs;
272 int ret;
273
274 path = btrfs_alloc_path();
275 if (!path)
276 return -ENOMEM;
277
278 key.objectid = bytenr;
279 key.type = BTRFS_EXTENT_ITEM_KEY;
280 key.offset = num_bytes;
281 delayed_refs = &trans->transaction->delayed_refs;
282again:
283 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
284 &key, path, 0, 0);
285 if (ret < 0)
286 goto out;
287
288 if (ret == 0) {
289 leaf = path->nodes[0];
290 ei = btrfs_item_ptr(leaf, path->slots[0],
291 struct btrfs_extent_item);
292 num_refs = btrfs_extent_refs(leaf, ei);
293 } else {
294 num_refs = 0;
295 ret = 0;
296 }
297
298 spin_lock(&delayed_refs->lock);
299 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
300 if (ref) {
301 head = btrfs_delayed_node_to_head(ref);
302 if (mutex_trylock(&head->mutex)) {
303 num_refs += ref->ref_mod;
304 mutex_unlock(&head->mutex);
305 *refs = num_refs;
306 goto out;
307 }
308
309 atomic_inc(&ref->refs);
310 spin_unlock(&delayed_refs->lock);
311
312 btrfs_release_path(root->fs_info->extent_root, path);
313
314 mutex_lock(&head->mutex);
315 mutex_unlock(&head->mutex);
316 btrfs_put_delayed_ref(ref);
317 goto again;
318 } else {
319 *refs = num_refs;
320 }
321out:
322 spin_unlock(&delayed_refs->lock);
323 btrfs_free_path(path);
324 return ret;
325}
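A hedged usage sketch: the count returned through *refs folds in the queued head modifications, so a caller can test what the reference count will be once the delayed refs run (the wrapper below is hypothetical):

/* sketch only: hypothetical check of the projected reference count */
static int extent_will_survive(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               u64 bytenr, u64 num_bytes)
{
        u32 refs = 0;
        int ret;

        ret = btrfs_lookup_extent_ref(trans, root, bytenr, num_bytes, &refs);
        if (ret < 0)
                return ret;
        /* refs already includes the queued head mods */
        return refs > 0;
}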
326
327/*
328 * helper function to update an extent delayed ref in the
329 * rbtree. existing and update must both have the same
330 * bytenr and parent
331 *
332 * This may free existing if the update cancels out whatever
333 * operation it was doing.
334 */
335static noinline void
336update_existing_ref(struct btrfs_trans_handle *trans,
337 struct btrfs_delayed_ref_root *delayed_refs,
338 struct btrfs_delayed_ref_node *existing,
339 struct btrfs_delayed_ref_node *update)
340{
341 struct btrfs_delayed_ref *existing_ref;
342 struct btrfs_delayed_ref *ref;
343
344 existing_ref = btrfs_delayed_node_to_ref(existing);
345 ref = btrfs_delayed_node_to_ref(update);
346
347 if (ref->pin)
348 existing_ref->pin = 1;
349
350 if (ref->action != existing_ref->action) {
351 /*
352 * this is effectively undoing either an add or a
353 * drop. We decrement the ref_mod, and if it goes
354 * down to zero we just delete the entry without
 355		 * down to zero we just delete the entry without
 356		 * ever changing the extent allocation tree.
356 */
357 existing->ref_mod--;
358 if (existing->ref_mod == 0) {
359 rb_erase(&existing->rb_node,
360 &delayed_refs->root);
361 existing->in_tree = 0;
362 btrfs_put_delayed_ref(existing);
363 delayed_refs->num_entries--;
364 if (trans->delayed_ref_updates)
365 trans->delayed_ref_updates--;
366 }
367 } else {
368 if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
369 /* if we're adding refs, make sure all the
370 * details match up. The extent could
371 * have been totally freed and reallocated
372 * by a different owner before the delayed
373 * ref entries were removed.
374 */
375 existing_ref->owner_objectid = ref->owner_objectid;
376 existing_ref->generation = ref->generation;
377 existing_ref->root = ref->root;
378 existing->num_bytes = update->num_bytes;
379 }
380 /*
381 * the action on the existing ref matches
382 * the action on the ref we're trying to add.
383 * Bump the ref_mod by one so the backref that
384 * is eventually added/removed has the correct
385 * reference count
386 */
387 existing->ref_mod += update->ref_mod;
388 }
389}
390
391/*
392 * helper function to update the accounting in the head ref
393 * existing and update must have the same bytenr
394 */
395static noinline void
396update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
397 struct btrfs_delayed_ref_node *update)
398{
399 struct btrfs_delayed_ref_head *existing_ref;
400 struct btrfs_delayed_ref_head *ref;
401
402 existing_ref = btrfs_delayed_node_to_head(existing);
403 ref = btrfs_delayed_node_to_head(update);
404
405 if (ref->must_insert_reserved) {
406 /* if the extent was freed and then
407 * reallocated before the delayed ref
408 * entries were processed, we can end up
409 * with an existing head ref without
410 * the must_insert_reserved flag set.
411 * Set it again here
412 */
413 existing_ref->must_insert_reserved = ref->must_insert_reserved;
414
415 /*
416 * update the num_bytes so we make sure the accounting
417 * is done correctly
418 */
419 existing->num_bytes = update->num_bytes;
420
421 }
422
423 /*
424 * update the reference mod on the head to reflect this new operation
425 */
426 existing->ref_mod += update->ref_mod;
427}
428
429/*
430 * helper function to actually insert a delayed ref into the rbtree.
431 * this does all the dirty work in terms of maintaining the correct
432 * overall modification count in the head node and properly dealing
433 * with updating existing nodes as new modifications are queued.
434 */
435static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
436 struct btrfs_delayed_ref_node *ref,
437 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
438 u64 ref_generation, u64 owner_objectid, int action,
439 int pin)
440{
441 struct btrfs_delayed_ref_node *existing;
442 struct btrfs_delayed_ref *full_ref;
443 struct btrfs_delayed_ref_head *head_ref = NULL;
444 struct btrfs_delayed_ref_root *delayed_refs;
445 int count_mod = 1;
446 int must_insert_reserved = 0;
447
448 /*
449 * the head node stores the sum of all the mods, so dropping a ref
450 * should drop the sum in the head node by one.
451 */
452 if (parent == (u64)-1) {
453 if (action == BTRFS_DROP_DELAYED_REF)
454 count_mod = -1;
455 else if (action == BTRFS_UPDATE_DELAYED_HEAD)
456 count_mod = 0;
457 }
458
459 /*
460 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
461 * the reserved accounting when the extent is finally added, or
462 * if a later modification deletes the delayed ref without ever
463 * inserting the extent into the extent allocation tree.
464 * ref->must_insert_reserved is the flag used to record
465 * that accounting mods are required.
466 *
467 * Once we record must_insert_reserved, switch the action to
468 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
469 */
470 if (action == BTRFS_ADD_DELAYED_EXTENT) {
471 must_insert_reserved = 1;
472 action = BTRFS_ADD_DELAYED_REF;
473 } else {
474 must_insert_reserved = 0;
475 }
476
477
478 delayed_refs = &trans->transaction->delayed_refs;
479
480 /* first set the basic ref node struct up */
481 atomic_set(&ref->refs, 1);
482 ref->bytenr = bytenr;
483 ref->parent = parent;
484 ref->ref_mod = count_mod;
485 ref->in_tree = 1;
486 ref->num_bytes = num_bytes;
487
488 if (btrfs_delayed_ref_is_head(ref)) {
489 head_ref = btrfs_delayed_node_to_head(ref);
490 head_ref->must_insert_reserved = must_insert_reserved;
491 INIT_LIST_HEAD(&head_ref->cluster);
492 mutex_init(&head_ref->mutex);
493 } else {
494 full_ref = btrfs_delayed_node_to_ref(ref);
495 full_ref->root = ref_root;
496 full_ref->generation = ref_generation;
497 full_ref->owner_objectid = owner_objectid;
498 full_ref->pin = pin;
499 full_ref->action = action;
500 }
501
502 existing = tree_insert(&delayed_refs->root, bytenr,
503 parent, &ref->rb_node);
504
505 if (existing) {
506 if (btrfs_delayed_ref_is_head(ref))
507 update_existing_head_ref(existing, ref);
508 else
509 update_existing_ref(trans, delayed_refs, existing, ref);
510
511 /*
512 * we've updated the existing ref, free the newly
513 * allocated ref
514 */
515 kfree(ref);
516 } else {
517 if (btrfs_delayed_ref_is_head(ref)) {
518 delayed_refs->num_heads++;
519 delayed_refs->num_heads_ready++;
520 }
521 delayed_refs->num_entries++;
522 trans->delayed_ref_updates++;
523 }
524 return 0;
525}
526
527/*
528 * add a delayed ref to the tree. This does all of the accounting required
529 * to make sure the delayed ref is eventually processed before this
530 * transaction commits.
531 */
532int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
533 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
534 u64 ref_generation, u64 owner_objectid, int action,
535 int pin)
536{
537 struct btrfs_delayed_ref *ref;
538 struct btrfs_delayed_ref_head *head_ref;
539 struct btrfs_delayed_ref_root *delayed_refs;
540 int ret;
541
542 ref = kmalloc(sizeof(*ref), GFP_NOFS);
543 if (!ref)
544 return -ENOMEM;
545
546 /*
547 * the parent = 0 case comes from cases where we don't actually
 548	 * know the parent yet.  It will get updated later via an add/drop
549 * pair.
550 */
551 if (parent == 0)
552 parent = bytenr;
553
554 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
555 if (!head_ref) {
556 kfree(ref);
557 return -ENOMEM;
558 }
559 delayed_refs = &trans->transaction->delayed_refs;
560 spin_lock(&delayed_refs->lock);
561
562 /*
563 * insert both the head node and the new ref without dropping
564 * the spin lock
565 */
566 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
567 (u64)-1, 0, 0, 0, action, pin);
568 BUG_ON(ret);
569
570 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
571 parent, ref_root, ref_generation,
572 owner_objectid, action, pin);
573 BUG_ON(ret);
574 spin_unlock(&delayed_refs->lock);
575 return 0;
576}
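A hedged call sketch: recording a fresh allocation with BTRFS_ADD_DELAYED_EXTENT, which the insert path above rewrites to BTRFS_ADD_DELAYED_REF once must_insert_reserved has been latched on the head (the wrapper and its arguments are hypothetical):

/* sketch only: hypothetical helper recording a new extent */
static int record_new_extent(struct btrfs_trans_handle *trans,
                             u64 bytenr, u64 num_bytes, u64 parent,
                             u64 root_objectid, u64 owner_objectid)
{
        return btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
                                     root_objectid, trans->transid,
                                     owner_objectid,
                                     BTRFS_ADD_DELAYED_EXTENT, 0);
}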
577
578/*
579 * this does a simple search for the head node for a given extent.
580 * It must be called with the delayed ref spinlock held, and it returns
 581 * the head node if one was found, or NULL if not.
582 */
583struct btrfs_delayed_ref_head *
584btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
585{
586 struct btrfs_delayed_ref_node *ref;
587 struct btrfs_delayed_ref_root *delayed_refs;
588
589 delayed_refs = &trans->transaction->delayed_refs;
590 ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
591 if (ref)
592 return btrfs_delayed_node_to_head(ref);
593 return NULL;
594}
595
596/*
597 * add a delayed ref to the tree. This does all of the accounting required
598 * to make sure the delayed ref is eventually processed before this
599 * transaction commits.
600 *
601 * The main point of this call is to add and remove a backreference in a single
602 * shot, taking the lock only once, and only searching for the head node once.
603 *
604 * It is the same as doing a ref add and delete in two separate calls.
605 */
606int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
607 u64 bytenr, u64 num_bytes, u64 orig_parent,
608 u64 parent, u64 orig_ref_root, u64 ref_root,
609 u64 orig_ref_generation, u64 ref_generation,
610 u64 owner_objectid, int pin)
611{
612 struct btrfs_delayed_ref *ref;
613 struct btrfs_delayed_ref *old_ref;
614 struct btrfs_delayed_ref_head *head_ref;
615 struct btrfs_delayed_ref_root *delayed_refs;
616 int ret;
617
618 ref = kmalloc(sizeof(*ref), GFP_NOFS);
619 if (!ref)
620 return -ENOMEM;
621
622 old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
623 if (!old_ref) {
624 kfree(ref);
625 return -ENOMEM;
626 }
627
628 /*
629 * the parent = 0 case comes from cases where we don't actually
 630	 * know the parent yet.  It will get updated later via an add/drop
631 * pair.
632 */
633 if (parent == 0)
634 parent = bytenr;
635 if (orig_parent == 0)
636 orig_parent = bytenr;
637
638 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
639 if (!head_ref) {
640 kfree(ref);
641 kfree(old_ref);
642 return -ENOMEM;
643 }
644 delayed_refs = &trans->transaction->delayed_refs;
645 spin_lock(&delayed_refs->lock);
646
647 /*
648 * insert both the head node and the new ref without dropping
649 * the spin lock
650 */
651 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
652 (u64)-1, 0, 0, 0,
653 BTRFS_UPDATE_DELAYED_HEAD, 0);
654 BUG_ON(ret);
655
656 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
657 parent, ref_root, ref_generation,
658 owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
659 BUG_ON(ret);
660
661 ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
662 orig_parent, orig_ref_root,
663 orig_ref_generation, owner_objectid,
664 BTRFS_DROP_DELAYED_REF, pin);
665 BUG_ON(ret);
666 spin_unlock(&delayed_refs->lock);
667 return 0;
668}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000000..3bec2ff0b15c
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __DELAYED_REF__
19#define __DELAYED_REF__
20
21/* these are the possible values of struct btrfs_delayed_ref->action */
22#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
23#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
24#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
25#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
26
27struct btrfs_delayed_ref_node {
28 struct rb_node rb_node;
29
30 /* the starting bytenr of the extent */
31 u64 bytenr;
32
33 /* the parent our backref will point to */
34 u64 parent;
35
36 /* the size of the extent */
37 u64 num_bytes;
38
39 /* ref count on this data structure */
40 atomic_t refs;
41
42 /*
43 * how many refs is this entry adding or deleting. For
44 * head refs, this may be a negative number because it is keeping
45 * track of the total mods done to the reference count.
46 * For individual refs, this will always be a positive number
47 *
48 * It may be more than one, since it is possible for a single
49 * parent to have more than one ref on an extent
50 */
51 int ref_mod;
52
53 /* is this node still in the rbtree? */
54 unsigned int in_tree:1;
55};
56
57/*
58 * the head refs are used to hold a lock on a given extent, which allows us
59 * to make sure that only one process is running the delayed refs
60 * at a time for a single extent. They also store the sum of all the
61 * reference count modifications we've queued up.
62 */
63struct btrfs_delayed_ref_head {
64 struct btrfs_delayed_ref_node node;
65
66 /*
67 * the mutex is held while running the refs, and it is also
68 * held when checking the sum of reference modifications.
69 */
70 struct mutex mutex;
71
72 struct list_head cluster;
73
74 /*
75 * when a new extent is allocated, it is just reserved in memory
76 * The actual extent isn't inserted into the extent allocation tree
77 * until the delayed ref is processed. must_insert_reserved is
78 * used to flag a delayed ref so the accounting can be updated
79 * when a full insert is done.
80 *
81 * It is possible the extent will be freed before it is ever
82 * inserted into the extent allocation tree. In this case
83 * we need to update the in ram accounting to properly reflect
84 * the free has happened.
85 */
86 unsigned int must_insert_reserved:1;
87};
88
89struct btrfs_delayed_ref {
90 struct btrfs_delayed_ref_node node;
91
92 /* the root objectid our ref will point to */
93 u64 root;
94
95 /* the generation for the backref */
96 u64 generation;
97
98 /* owner_objectid of the backref */
99 u64 owner_objectid;
100
101 /* operation done by this entry in the rbtree */
102 u8 action;
103
104 /* if pin == 1, when the extent is freed it will be pinned until
105 * transaction commit
106 */
107 unsigned int pin:1;
108};
109
110struct btrfs_delayed_ref_root {
111 struct rb_root root;
112
113 /* this spin lock protects the rbtree and the entries inside */
114 spinlock_t lock;
115
116 /* how many delayed ref updates we've queued, used by the
117 * throttling code
118 */
119 unsigned long num_entries;
120
121 /* total number of head nodes in tree */
122 unsigned long num_heads;
123
124 /* total number of head nodes ready for processing */
125 unsigned long num_heads_ready;
126
127 /*
128 * set when the tree is flushing before a transaction commit,
129 * used by the throttling code to decide if new updates need
130 * to be run right away
131 */
132 int flushing;
133
134 u64 run_delayed_start;
135};
136
137static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
138{
139 WARN_ON(atomic_read(&ref->refs) == 0);
140 if (atomic_dec_and_test(&ref->refs)) {
141 WARN_ON(ref->in_tree);
142 kfree(ref);
143 }
144}
145
146int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
147 u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
148 u64 ref_generation, u64 owner_objectid, int action,
149 int pin);
150
151struct btrfs_delayed_ref_head *
152btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
153int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
154int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
155 struct btrfs_root *root, u64 bytenr,
156 u64 num_bytes, u32 *refs);
157int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
158 u64 bytenr, u64 num_bytes, u64 orig_parent,
159 u64 parent, u64 orig_ref_root, u64 ref_root,
160 u64 orig_ref_generation, u64 ref_generation,
161 u64 owner_objectid, int pin);
162int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
163 struct btrfs_delayed_ref_head *head);
164int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
165 struct list_head *cluster, u64 search_start);
166/*
167 * a node might live in a head or a regular ref, this lets you
168 * test for the proper type to use.
169 */
 170static inline int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
171{
172 return node->parent == (u64)-1;
173}
174
175/*
176 * helper functions to cast a node into its container
177 */
178static inline struct btrfs_delayed_ref *
179btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
180{
181 WARN_ON(btrfs_delayed_ref_is_head(node));
182 return container_of(node, struct btrfs_delayed_ref, node);
183
184}
185
186static inline struct btrfs_delayed_ref_head *
187btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
188{
189 WARN_ON(!btrfs_delayed_ref_is_head(node));
190 return container_of(node, struct btrfs_delayed_ref_head, node);
191
192}
193#endif
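To round off the header, a hedged sketch of the dispatch idiom the helpers above enable, mirroring how the insert path in delayed-ref.c uses them (the function itself is hypothetical):

/* sketch only: dispatch on the node type before casting */
static void handle_node(struct btrfs_delayed_ref_node *node)
{
        if (btrfs_delayed_ref_is_head(node)) {
                struct btrfs_delayed_ref_head *head;

                head = btrfs_delayed_node_to_head(node);
                /* heads carry the mutex and the summed ref_mod */
                mutex_lock(&head->mutex);
                mutex_unlock(&head->mutex);
        } else {
                struct btrfs_delayed_ref *ref;

                ref = btrfs_delayed_node_to_ref(node);
                /* regular refs carry root, generation and owner */
                WARN_ON(ref->node.ref_mod < 0);
        }
}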
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7d..1d70236ba00c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
         key.objectid = dir;
         btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
         key.offset = btrfs_name_hash(name, name_len);
+
         path = btrfs_alloc_path();
+        path->leave_spinning = 1;
+
         data_size = sizeof(*dir_item) + name_len;
         dir_item = insert_with_overflow(trans, root, path, &key, data_size,
                                         name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index adda739a0215..92caa8035f36 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
 #include "locking.h"
 #include "ref-cache.h"
 #include "tree-log.h"
+#include "free-space-cache.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -668,14 +669,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
         struct extent_io_tree *tree;
+        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+        struct extent_buffer *eb;
+        int was_dirty;
+
         tree = &BTRFS_I(page->mapping->host)->io_tree;
+        if (!(current->flags & PF_MEMALLOC)) {
+                return extent_write_full_page(tree, page,
+                                              btree_get_extent, wbc);
+        }
 
-        if (current->flags & PF_MEMALLOC) {
-                redirty_page_for_writepage(wbc, page);
-                unlock_page(page);
-                return 0;
+        redirty_page_for_writepage(wbc, page);
+        eb = btrfs_find_tree_block(root, page_offset(page),
+                                   PAGE_CACHE_SIZE);
+        WARN_ON(!eb);
+
+        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+        if (!was_dirty) {
+                spin_lock(&root->fs_info->delalloc_lock);
+                root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
+                spin_unlock(&root->fs_info->delalloc_lock);
         }
-        return extent_write_full_page(tree, page, btree_get_extent, wbc);
+        free_extent_buffer(eb);
+
+        unlock_page(page);
+        return 0;
 }
 
 static int btree_writepages(struct address_space *mapping,
@@ -684,15 +702,15 @@ static int btree_writepages(struct address_space *mapping,
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(mapping->host)->io_tree;
 	if (wbc->sync_mode == WB_SYNC_NONE) {
+		struct btrfs_root *root = BTRFS_I(mapping->host)->root;
 		u64 num_dirty;
-		u64 start = 0;
 		unsigned long thresh = 32 * 1024 * 1024;

 		if (wbc->for_kupdate)
 			return 0;

-		num_dirty = count_range_bits(tree, &start, (u64)-1,
-					     thresh, EXTENT_DIRTY);
+		/* this is a bit racy, but that's ok */
+		num_dirty = root->fs_info->dirty_metadata_bytes;
 		if (num_dirty < thresh)
 			return 0;
 	}
@@ -857,11 +875,19 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	if (btrfs_header_generation(buf) ==
 	    root->fs_info->running_transaction->transid) {
-		WARN_ON(!btrfs_tree_locked(buf));
+		btrfs_assert_tree_locked(buf);
+
+		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+			spin_lock(&root->fs_info->delalloc_lock);
+			if (root->fs_info->dirty_metadata_bytes >= buf->len)
+				root->fs_info->dirty_metadata_bytes -= buf->len;
+			else
+				WARN_ON(1);
+			spin_unlock(&root->fs_info->delalloc_lock);
+		}

-		/* ugh, clear_extent_buffer_dirty can be expensive */
+		/* ugh, clear_extent_buffer_dirty needs to lock the page */
 		btrfs_set_lock_blocking(buf);
-
 		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
 	}
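
The disk-io.c hunks above and below replace the old per-range EXTENT_DIRTY scan with a running fs_info->dirty_metadata_bytes counter: a buffer is counted only on its clean-to-dirty transition (test_and_set_bit) and uncounted only on dirty-to-clean (test_and_clear_bit), with an explicit underflow guard. A userspace sketch of that invariant, substituting C11 atomics for the kernel's bitops and the delalloc_lock spinlock:

	#include <stdatomic.h>
	#include <stdio.h>

	#define PAGE_CACHE_SIZE 4096ULL

	static atomic_int dirty_bit;                    /* models EXTENT_BUFFER_DIRTY */
	static unsigned long long dirty_metadata_bytes; /* lock-protected in the patch */

	/* count the buffer only on its clean->dirty transition */
	static void buffer_mark_dirty(void)
	{
		if (!atomic_exchange(&dirty_bit, 1))
			dirty_metadata_bytes += PAGE_CACHE_SIZE;
	}

	/* uncount only on dirty->clean, and never let the counter underflow */
	static void buffer_mark_clean(void)
	{
		if (atomic_exchange(&dirty_bit, 0)) {
			if (dirty_metadata_bytes >= PAGE_CACHE_SIZE)
				dirty_metadata_bytes -= PAGE_CACHE_SIZE;
			else
				fprintf(stderr, "counter underflow\n");
		}
	}

	int main(void)
	{
		buffer_mark_dirty();
		buffer_mark_dirty();  /* second call must not double-count */
		buffer_mark_clean();
		printf("%llu\n", dirty_metadata_bytes); /* prints 0 */
		return 0;
	}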
@@ -1387,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio)

 	ret = extent_range_uptodate(io_tree, start + length,
 				    start + buf_len - 1);
-	if (ret == 1)
-		return ret;
 	return ret;
 }

@@ -1471,12 +1495,6 @@ static int transaction_kthread(void *arg)
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);

-		if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
-			printk(KERN_INFO "btrfs: total reference cache "
-			       "size %llu\n",
-			       root->fs_info->total_ref_cache_size);
-		}
-
 		mutex_lock(&root->fs_info->trans_mutex);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
@@ -1493,6 +1511,7 @@ static int transaction_kthread(void *arg)
 		mutex_unlock(&root->fs_info->trans_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		ret = btrfs_commit_transaction(trans, root);
+
 sleep:
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1552,6 +1571,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+	INIT_LIST_HEAD(&fs_info->ordered_operations);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 	spin_lock_init(&fs_info->ref_cache_lock);
@@ -1611,10 +1631,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,

 	extent_io_tree_init(&fs_info->pinned_extents,
 			    fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&fs_info->pending_del,
-			    fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&fs_info->extent_ins,
-			    fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;

 	INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1627,15 +1643,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	insert_inode_hash(fs_info->btree_inode);

 	mutex_init(&fs_info->trans_mutex);
+	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
-	mutex_init(&fs_info->extent_ins_mutex);
-	mutex_init(&fs_info->pinned_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
 	mutex_init(&fs_info->tree_reloc_mutex);
+
+	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
@@ -2358,10 +2377,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
+	int was_dirty;

-	btrfs_set_lock_blocking(buf);
-
-	WARN_ON(!btrfs_tree_locked(buf));
+	btrfs_assert_tree_locked(buf);
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
 		       "found %llu running %llu\n",
@@ -2370,7 +2388,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 		       (unsigned long long)root->fs_info->generation);
 		WARN_ON(1);
 	}
-	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+	was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+					    buf);
+	if (!was_dirty) {
+		spin_lock(&root->fs_info->delalloc_lock);
+		root->fs_info->dirty_metadata_bytes += buf->len;
+		spin_unlock(&root->fs_info->delalloc_lock);
+	}
 }

 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2385,7 +2409,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	unsigned long thresh = 32 * 1024 * 1024;
 	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;

-	if (current_is_pdflush() || current->flags & PF_MEMALLOC)
+	if (current->flags & PF_MEMALLOC)
 		return;

 	num_dirty = count_range_bits(tree, &start, (u64)-1,
@@ -2410,6 +2434,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 int btree_lock_page_hook(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_buffer *eb;
 	unsigned long len;
@@ -2425,6 +2450,16 @@ int btree_lock_page_hook(struct page *page)

 	btrfs_tree_lock(eb);
 	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+
+	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+		spin_lock(&root->fs_info->delalloc_lock);
+		if (root->fs_info->dirty_metadata_bytes >= eb->len)
+			root->fs_info->dirty_metadata_bytes -= eb->len;
+		else
+			WARN_ON(1);
+		spin_unlock(&root->fs_info->delalloc_lock);
+	}
+
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95029db227be..c958ecbc1916 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6b5966aacf44..178df4c67de4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -20,6 +20,7 @@
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/sort.h>
+#include <linux/rcupdate.h>
 #include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
@@ -30,6 +31,7 @@
30#include "volumes.h" 31#include "volumes.h"
31#include "locking.h" 32#include "locking.h"
32#include "ref-cache.h" 33#include "ref-cache.h"
34#include "free-space-cache.h"
33 35
34#define PENDING_EXTENT_INSERT 0 36#define PENDING_EXTENT_INSERT 0
35#define PENDING_EXTENT_DELETE 1 37#define PENDING_EXTENT_DELETE 1
@@ -48,17 +50,23 @@ struct pending_extent_op {
 	int del;
 };

-static int finish_current_insert(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *extent_root, int all);
-static int del_pending_extents(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *extent_root, int all);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  u64 bytenr, u64 num_bytes, int is_data);
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root, u64 parent,
+					 u64 root_objectid, u64 ref_generation,
+					 u64 owner, struct btrfs_key *ins,
+					 int ref_mod);
+static int update_reserved_extents(struct btrfs_root *root,
+				   u64 bytenr, u64 num, int reserve);
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
 			      int mark_free);
+static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					u64 bytenr, u64 num_bytes, u64 parent,
+					u64 root_objectid, u64 ref_generation,
+					u64 owner_objectid, int pin,
+					int ref_to_drop);

 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -159,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 	u64 extent_start, extent_end, size;
 	int ret;

-	mutex_lock(&info->pinned_mutex);
 	while (start < end) {
 		ret = find_first_extent_bit(&info->pinned_extents, start,
 					    &extent_start, &extent_end,
@@ -185,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 		ret = btrfs_add_free_space(block_group, start, size);
 		BUG_ON(ret);
 	}
-	mutex_unlock(&info->pinned_mutex);

 	return 0;
 }
@@ -284,8 +290,8 @@ next:
 			   block_group->key.objectid +
 			   block_group->key.offset);

-	remove_sb_from_cache(root, block_group);
 	block_group->cached = 1;
+	remove_sb_from_cache(root, block_group);
 	ret = 0;
 err:
 	btrfs_free_path(path);
@@ -319,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
 	return cache;
 }

-static inline void put_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
 	if (atomic_dec_and_test(&cache->count))
 		kfree(cache);
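
Renaming put_block_group() to btrfs_put_block_group() exports the block-group reference drop so code outside extent-tree.c (the new free-space-cache code in this series) can use it. The semantics stay the classic dec-and-test pattern, sketched here in portable C11 rather than the kernel's atomic_t API:

	#include <stdatomic.h>
	#include <stdlib.h>

	struct block_group_cache {
		atomic_int count;
		/* ... cached free-space state ... */
	};

	/* drop one reference; whoever drops the last one frees the object */
	static void put_block_group(struct block_group_cache *cache)
	{
		if (atomic_fetch_sub(&cache->count, 1) == 1)
			free(cache);
	}

	int main(void)
	{
		struct block_group_cache *bg = malloc(sizeof(*bg));
		atomic_init(&bg->count, 2);
		put_block_group(bg); /* still referenced elsewhere */
		put_block_group(bg); /* last ref: freed here */
		return 0;
	}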
@@ -330,13 +336,33 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 {
 	struct list_head *head = &info->space_info;
 	struct btrfs_space_info *found;
-	list_for_each_entry(found, head, list) {
-		if (found->flags == flags)
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags == flags) {
+			rcu_read_unlock();
 			return found;
+		}
 	}
+	rcu_read_unlock();
 	return NULL;
 }

+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list)
+		found->full = 0;
+	rcu_read_unlock();
+}
+
 static u64 div_factor(u64 num, int factor)
 {
 	if (factor == 10)
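
__find_space_info() now walks the space_info list under rcu_read_lock(), so lookups never block writers; the companion change further down makes update_space_info() publish a new entry with list_add_rcu() only after every field is initialized. A rough single-linked userspace analogue of publish-after-initialize plus lock-free readers, using C11 release/acquire in place of the RCU list primitives (RCU's deferred reclamation is deliberately omitted):

	#include <stdatomic.h>
	#include <stdio.h>

	struct space_info {
		unsigned long long flags;
		int full;
		struct space_info *next;
	};

	static _Atomic(struct space_info *) space_list;

	/* publish only after the entry is fully initialized (list_add_rcu's rule) */
	static void add_space_info(struct space_info *si)
	{
		si->next = atomic_load_explicit(&space_list, memory_order_relaxed);
		atomic_store_explicit(&space_list, si, memory_order_release);
	}

	/* lock-free lookup, the moral equivalent of list_for_each_entry_rcu() */
	static struct space_info *find_space_info(unsigned long long flags)
	{
		struct space_info *si =
			atomic_load_explicit(&space_list, memory_order_acquire);
		for (; si; si = si->next)
			if (si->flags == flags)
				return si;
		return NULL;
	}

	int main(void)
	{
		static struct space_info metadata = { .flags = 4 };
		add_space_info(&metadata);
		printf("found: %d\n", find_space_info(4) != NULL); /* prints 1 */
		return 0;
	}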
@@ -372,12 +398,12 @@ again:
 		    div_factor(cache->key.offset, factor)) {
 				group_start = cache->key.objectid;
 				spin_unlock(&cache->lock);
-				put_block_group(cache);
+				btrfs_put_block_group(cache);
 				goto found;
 			}
 		}
 		spin_unlock(&cache->lock);
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		cond_resched();
 	}
 	if (!wrapped) {
@@ -533,262 +559,13 @@ out:
 	return ret;
 }

-/*
- * updates all the backrefs that are pending on update_list for the
- * extent_root
- */
-static noinline int update_backrefs(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *extent_root,
-				    struct btrfs_path *path,
-				    struct list_head *update_list)
-{
-	struct btrfs_key key;
-	struct btrfs_extent_ref *ref;
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct pending_extent_op *op;
-	struct extent_buffer *leaf;
-	int ret = 0;
-	struct list_head *cur = update_list->next;
-	u64 ref_objectid;
-	u64 ref_root = extent_root->root_key.objectid;
-
-	op = list_entry(cur, struct pending_extent_op, list);
-
-search:
-	key.objectid = op->bytenr;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	key.offset = op->orig_parent;
-
-	ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
-	BUG_ON(ret);
-
-	leaf = path->nodes[0];
-
-loop:
-	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-
-	ref_objectid = btrfs_ref_objectid(leaf, ref);
-
-	if (btrfs_ref_root(leaf, ref) != ref_root ||
-	    btrfs_ref_generation(leaf, ref) != op->orig_generation ||
-	    (ref_objectid != op->level &&
-	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
-		printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
-		       "root %llu, owner %u\n",
-		       (unsigned long long)op->bytenr,
-		       (unsigned long long)op->orig_parent,
-		       (unsigned long long)ref_root, op->level);
-		btrfs_print_leaf(extent_root, leaf);
-		BUG();
-	}
-
-	key.objectid = op->bytenr;
-	key.offset = op->parent;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
-	BUG_ON(ret);
-	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-	btrfs_set_ref_generation(leaf, ref, op->generation);
-
-	cur = cur->next;
-
-	list_del_init(&op->list);
-	unlock_extent(&info->extent_ins, op->bytenr,
-		      op->bytenr + op->num_bytes - 1, GFP_NOFS);
-	kfree(op);
-
-	if (cur == update_list) {
-		btrfs_mark_buffer_dirty(path->nodes[0]);
-		btrfs_release_path(extent_root, path);
-		goto out;
-	}
-
-	op = list_entry(cur, struct pending_extent_op, list);
-
-	path->slots[0]++;
-	while (path->slots[0] < btrfs_header_nritems(leaf)) {
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid == op->bytenr &&
-		    key.type == BTRFS_EXTENT_REF_KEY)
-			goto loop;
-		path->slots[0]++;
-	}
-
-	btrfs_mark_buffer_dirty(path->nodes[0]);
-	btrfs_release_path(extent_root, path);
-	goto search;
-
-out:
-	return 0;
-}
-
-static noinline int insert_extents(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *extent_root,
-				   struct btrfs_path *path,
-				   struct list_head *insert_list, int nr)
-{
-	struct btrfs_key *keys;
-	u32 *data_size;
-	struct pending_extent_op *op;
-	struct extent_buffer *leaf;
-	struct list_head *cur = insert_list->next;
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	u64 ref_root = extent_root->root_key.objectid;
-	int i = 0, last = 0, ret;
-	int total = nr * 2;
-
-	if (!nr)
-		return 0;
-
-	keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
-	if (!keys)
-		return -ENOMEM;
-
-	data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
-	if (!data_size) {
-		kfree(keys);
-		return -ENOMEM;
-	}
-
-	list_for_each_entry(op, insert_list, list) {
-		keys[i].objectid = op->bytenr;
-		keys[i].offset = op->num_bytes;
-		keys[i].type = BTRFS_EXTENT_ITEM_KEY;
-		data_size[i] = sizeof(struct btrfs_extent_item);
-		i++;
-
-		keys[i].objectid = op->bytenr;
-		keys[i].offset = op->parent;
-		keys[i].type = BTRFS_EXTENT_REF_KEY;
-		data_size[i] = sizeof(struct btrfs_extent_ref);
-		i++;
-	}
-
-	op = list_entry(cur, struct pending_extent_op, list);
-	i = 0;
-	while (i < total) {
-		int c;
-		ret = btrfs_insert_some_items(trans, extent_root, path,
-					      keys+i, data_size+i, total-i);
-		BUG_ON(ret < 0);
-
-		if (last && ret > 1)
-			BUG();
-
-		leaf = path->nodes[0];
-		for (c = 0; c < ret; c++) {
-			int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
-
-			/*
-			 * if the first item we inserted was a backref, then
-			 * the EXTENT_ITEM will be the odd c's, else it will
-			 * be the even c's
-			 */
-			if ((ref_first && (c % 2)) ||
-			    (!ref_first && !(c % 2))) {
-				struct btrfs_extent_item *itm;
-
-				itm = btrfs_item_ptr(leaf, path->slots[0] + c,
-						     struct btrfs_extent_item);
-				btrfs_set_extent_refs(path->nodes[0], itm, 1);
-				op->del++;
-			} else {
-				struct btrfs_extent_ref *ref;
-
-				ref = btrfs_item_ptr(leaf, path->slots[0] + c,
-						     struct btrfs_extent_ref);
-				btrfs_set_ref_root(leaf, ref, ref_root);
-				btrfs_set_ref_generation(leaf, ref,
-							 op->generation);
-				btrfs_set_ref_objectid(leaf, ref, op->level);
-				btrfs_set_ref_num_refs(leaf, ref, 1);
-				op->del++;
-			}
-
-			/*
-			 * using del to see when its ok to free up the
-			 * pending_extent_op.  In the case where we insert the
-			 * last item on the list in order to help do batching
-			 * we need to not free the extent op until we actually
-			 * insert the extent_item
-			 */
-			if (op->del == 2) {
-				unlock_extent(&info->extent_ins, op->bytenr,
-					      op->bytenr + op->num_bytes - 1,
-					      GFP_NOFS);
-				cur = cur->next;
-				list_del_init(&op->list);
-				kfree(op);
-				if (cur != insert_list)
-					op = list_entry(cur,
-						struct pending_extent_op,
-						list);
-			}
-		}
-		btrfs_mark_buffer_dirty(leaf);
-		btrfs_release_path(extent_root, path);
-
-		/*
-		 * Ok backref's and items usually go right next to eachother,
-		 * but if we could only insert 1 item that means that we
-		 * inserted on the end of a leaf, and we have no idea what may
-		 * be on the next leaf so we just play it safe.  In order to
-		 * try and help this case we insert the last thing on our
-		 * insert list so hopefully it will end up being the last
-		 * thing on the leaf and everything else will be before it,
-		 * which will let us insert a whole bunch of items at the same
-		 * time.
-		 */
-		if (ret == 1 && !last && (i + ret < total)) {
-			/*
-			 * last: where we will pick up the next time around
-			 * i: our current key to insert, will be total - 1
-			 * cur: the current op we are screwing with
-			 * op: duh
-			 */
-			last = i + ret;
-			i = total - 1;
-			cur = insert_list->prev;
-			op = list_entry(cur, struct pending_extent_op, list);
-		} else if (last) {
-			/*
-			 * ok we successfully inserted the last item on the
-			 * list, lets reset everything
-			 *
-			 * i: our current key to insert, so where we left off
-			 *    last time
-			 * last: done with this
-			 * cur: the op we are messing with
-			 * op: duh
-			 * total: since we inserted the last key, we need to
-			 *        decrement total so we dont overflow
-			 */
-			i = last;
-			last = 0;
-			total--;
-			if (i < total) {
-				cur = insert_list->next;
-				op = list_entry(cur, struct pending_extent_op,
-						list);
-			}
-		} else {
-			i += ret;
-		}
-
-		cond_resched();
-	}
-	ret = 0;
-	kfree(keys);
-	kfree(data_size);
-	return ret;
-}
-
 static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
 					  u64 bytenr, u64 parent,
 					  u64 ref_root, u64 ref_generation,
-					  u64 owner_objectid)
+					  u64 owner_objectid,
+					  int refs_to_add)
 {
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
@@ -808,9 +585,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 		btrfs_set_ref_root(leaf, ref, ref_root);
 		btrfs_set_ref_generation(leaf, ref, ref_generation);
 		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
-		btrfs_set_ref_num_refs(leaf, ref, 1);
+		btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
 	} else if (ret == -EEXIST) {
 		u64 existing_owner;
+
 		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
 		leaf = path->nodes[0];
 		ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -824,7 +602,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,

 	num_refs = btrfs_ref_num_refs(leaf, ref);
 	BUG_ON(num_refs == 0);
-	btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+	btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);

 	existing_owner = btrfs_ref_objectid(leaf, ref);
 	if (existing_owner != owner_objectid &&
@@ -836,6 +614,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 	} else {
 		goto out;
 	}
+	btrfs_unlock_up_safe(path, 1);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
 	btrfs_release_path(root, path);
@@ -844,7 +623,8 @@ out:

 static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
-					  struct btrfs_path *path)
+					  struct btrfs_path *path,
+					  int refs_to_drop)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_extent_ref *ref;
@@ -854,8 +634,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
 	num_refs = btrfs_ref_num_refs(leaf, ref);
-	BUG_ON(num_refs == 0);
-	num_refs -= 1;
+	BUG_ON(num_refs < refs_to_drop);
+	num_refs -= refs_to_drop;
 	if (num_refs == 0) {
 		ret = btrfs_del_item(trans, root, path);
 	} else {
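
Threading refs_to_add/refs_to_drop through insert_extent_backref() and remove_extent_backref() lets one leaf update apply a whole batch of identical reference changes, which is what makes the delayed-ref merging later in this patch pay off. In sketch form (a toy item, not the on-disk format):

	/* one leaf update applies a merged batch of identical ref changes */
	struct backref_item { unsigned int num_refs; };

	static int remove_refs(struct backref_item *item, unsigned int refs_to_drop)
	{
		if (item->num_refs < refs_to_drop)
			return -1;               /* the BUG_ON() in the hunk above */
		item->num_refs -= refs_to_drop;
		return item->num_refs == 0;      /* caller deletes the item on 1 */
	}

	int main(void)
	{
		struct backref_item item = { 3 };
		return remove_refs(&item, 3);    /* 1: item would be deleted */
	}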
@@ -906,332 +686,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 #endif
 }

-static noinline int free_extents(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *extent_root,
-				 struct list_head *del_list)
-{
-	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct btrfs_path *path;
-	struct btrfs_key key, found_key;
-	struct extent_buffer *leaf;
-	struct list_head *cur;
-	struct pending_extent_op *op;
-	struct btrfs_extent_item *ei;
-	int ret, num_to_del, extent_slot = 0, found_extent = 0;
-	u32 refs;
-	u64 bytes_freed = 0;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->reada = 1;
-
-search:
-	/* search for the backref for the current ref we want to delete */
-	cur = del_list->next;
-	op = list_entry(cur, struct pending_extent_op, list);
-	ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
-				    op->orig_parent,
-				    extent_root->root_key.objectid,
-				    op->orig_generation, op->level, 1);
-	if (ret) {
-		printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
-		       "root %llu gen %llu owner %u\n",
-		       (unsigned long long)op->bytenr,
-		       (unsigned long long)extent_root->root_key.objectid,
-		       (unsigned long long)op->orig_generation, op->level);
-		btrfs_print_leaf(extent_root, path->nodes[0]);
-		WARN_ON(1);
-		goto out;
-	}
-
-	extent_slot = path->slots[0];
-	num_to_del = 1;
-	found_extent = 0;
-
-	/*
-	 * if we aren't the first item on the leaf we can move back one and see
-	 * if our ref is right next to our extent item
-	 */
-	if (likely(extent_slot)) {
-		extent_slot--;
-		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-				      extent_slot);
-		if (found_key.objectid == op->bytenr &&
-		    found_key.type == BTRFS_EXTENT_ITEM_KEY &&
-		    found_key.offset == op->num_bytes) {
-			num_to_del++;
-			found_extent = 1;
-		}
-	}
-
-	/*
-	 * if we didn't find the extent we need to delete the backref and then
-	 * search for the extent item key so we can update its ref count
-	 */
-	if (!found_extent) {
-		key.objectid = op->bytenr;
-		key.type = BTRFS_EXTENT_ITEM_KEY;
-		key.offset = op->num_bytes;
-
-		ret = remove_extent_backref(trans, extent_root, path);
-		BUG_ON(ret);
-		btrfs_release_path(extent_root, path);
-		ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
-		BUG_ON(ret);
-		extent_slot = path->slots[0];
-	}
-
-	/* this is where we update the ref count for the extent */
-	leaf = path->nodes[0];
-	ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
-	refs = btrfs_extent_refs(leaf, ei);
-	BUG_ON(refs == 0);
-	refs--;
-	btrfs_set_extent_refs(leaf, ei, refs);
-
-	btrfs_mark_buffer_dirty(leaf);
-
-	/*
-	 * This extent needs deleting.  The reason cur_slot is extent_slot +
-	 * num_to_del is because extent_slot points to the slot where the extent
-	 * is, and if the backref was not right next to the extent we will be
-	 * deleting at least 1 item, and will want to start searching at the
-	 * slot directly next to extent_slot.  However if we did find the
-	 * backref next to the extent item them we will be deleting at least 2
-	 * items and will want to start searching directly after the ref slot
-	 */
-	if (!refs) {
-		struct list_head *pos, *n, *end;
-		int cur_slot = extent_slot+num_to_del;
-		u64 super_used;
-		u64 root_used;
-
-		path->slots[0] = extent_slot;
-		bytes_freed = op->num_bytes;
-
-		mutex_lock(&info->pinned_mutex);
-		ret = pin_down_bytes(trans, extent_root, op->bytenr,
-				     op->num_bytes, op->level >=
-				     BTRFS_FIRST_FREE_OBJECTID);
-		mutex_unlock(&info->pinned_mutex);
-		BUG_ON(ret < 0);
-		op->del = ret;
-
-		/*
-		 * we need to see if we can delete multiple things at once, so
-		 * start looping through the list of extents we are wanting to
-		 * delete and see if their extent/backref's are right next to
-		 * eachother and the extents only have 1 ref
-		 */
-		for (pos = cur->next; pos != del_list; pos = pos->next) {
-			struct pending_extent_op *tmp;
-
-			tmp = list_entry(pos, struct pending_extent_op, list);
-
-			/* we only want to delete extent+ref at this stage */
-			if (cur_slot >= btrfs_header_nritems(leaf) - 1)
-				break;
-
-			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
-			if (found_key.objectid != tmp->bytenr ||
-			    found_key.type != BTRFS_EXTENT_ITEM_KEY ||
-			    found_key.offset != tmp->num_bytes)
-				break;
-
-			/* check to make sure this extent only has one ref */
-			ei = btrfs_item_ptr(leaf, cur_slot,
-					    struct btrfs_extent_item);
-			if (btrfs_extent_refs(leaf, ei) != 1)
-				break;
-
-			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
-			if (found_key.objectid != tmp->bytenr ||
-			    found_key.type != BTRFS_EXTENT_REF_KEY ||
-			    found_key.offset != tmp->orig_parent)
-				break;
-
-			/*
-			 * the ref is right next to the extent, we can set the
-			 * ref count to 0 since we will delete them both now
-			 */
-			btrfs_set_extent_refs(leaf, ei, 0);
-
-			/* pin down the bytes for this extent */
-			mutex_lock(&info->pinned_mutex);
-			ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
-					     tmp->num_bytes, tmp->level >=
-					     BTRFS_FIRST_FREE_OBJECTID);
-			mutex_unlock(&info->pinned_mutex);
-			BUG_ON(ret < 0);
-
-			/*
-			 * use the del field to tell if we need to go ahead and
-			 * free up the extent when we delete the item or not.
-			 */
-			tmp->del = ret;
-			bytes_freed += tmp->num_bytes;
-
-			num_to_del += 2;
-			cur_slot += 2;
-		}
-		end = pos;
-
-		/* update the free space counters */
-		spin_lock(&info->delalloc_lock);
-		super_used = btrfs_super_bytes_used(&info->super_copy);
-		btrfs_set_super_bytes_used(&info->super_copy,
-					   super_used - bytes_freed);
-
-		root_used = btrfs_root_used(&extent_root->root_item);
-		btrfs_set_root_used(&extent_root->root_item,
-				    root_used - bytes_freed);
-		spin_unlock(&info->delalloc_lock);
-
-		/* delete the items */
-		ret = btrfs_del_items(trans, extent_root, path,
-				      path->slots[0], num_to_del);
-		BUG_ON(ret);
-
-		/*
-		 * loop through the extents we deleted and do the cleanup work
-		 * on them
-		 */
-		for (pos = cur, n = pos->next; pos != end;
-		     pos = n, n = pos->next) {
-			struct pending_extent_op *tmp;
-			tmp = list_entry(pos, struct pending_extent_op, list);
-
-			/*
-			 * remember tmp->del tells us wether or not we pinned
-			 * down the extent
-			 */
-			ret = update_block_group(trans, extent_root,
-						 tmp->bytenr, tmp->num_bytes, 0,
-						 tmp->del);
-			BUG_ON(ret);
-
-			list_del_init(&tmp->list);
-			unlock_extent(&info->extent_ins, tmp->bytenr,
-				      tmp->bytenr + tmp->num_bytes - 1,
-				      GFP_NOFS);
-			kfree(tmp);
-		}
-	} else if (refs && found_extent) {
-		/*
-		 * the ref and extent were right next to eachother, but the
-		 * extent still has a ref, so just free the backref and keep
-		 * going
-		 */
-		ret = remove_extent_backref(trans, extent_root, path);
-		BUG_ON(ret);
-
-		list_del_init(&op->list);
-		unlock_extent(&info->extent_ins, op->bytenr,
-			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
-		kfree(op);
-	} else {
-		/*
-		 * the extent has multiple refs and the backref we were looking
-		 * for was not right next to it, so just unlock and go next,
-		 * we're good to go
-		 */
-		list_del_init(&op->list);
-		unlock_extent(&info->extent_ins, op->bytenr,
-			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
-		kfree(op);
-	}
-
-	btrfs_release_path(extent_root, path);
-	if (!list_empty(del_list))
-		goto search;
-
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, u64 bytenr,
+				     u64 num_bytes,
 				     u64 orig_parent, u64 parent,
 				     u64 orig_root, u64 ref_root,
 				     u64 orig_generation, u64 ref_generation,
 				     u64 owner_objectid)
 {
 	int ret;
-	struct btrfs_root *extent_root = root->fs_info->extent_root;
-	struct btrfs_path *path;
+	int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;

-	if (root == root->fs_info->extent_root) {
-		struct pending_extent_op *extent_op;
-		u64 num_bytes;
-
-		BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
-		num_bytes = btrfs_level_size(root, (int)owner_objectid);
-		mutex_lock(&root->fs_info->extent_ins_mutex);
-		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-				   bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-			u64 priv;
-			ret = get_state_private(&root->fs_info->extent_ins,
-						bytenr, &priv);
-			BUG_ON(ret);
-			extent_op = (struct pending_extent_op *)
-				    (unsigned long)priv;
-			BUG_ON(extent_op->parent != orig_parent);
-			BUG_ON(extent_op->generation != orig_generation);
-
-			extent_op->parent = parent;
-			extent_op->generation = ref_generation;
-		} else {
-			extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-			BUG_ON(!extent_op);
-
-			extent_op->type = PENDING_BACKREF_UPDATE;
-			extent_op->bytenr = bytenr;
-			extent_op->num_bytes = num_bytes;
-			extent_op->parent = parent;
-			extent_op->orig_parent = orig_parent;
-			extent_op->generation = ref_generation;
-			extent_op->orig_generation = orig_generation;
-			extent_op->level = (int)owner_objectid;
-			INIT_LIST_HEAD(&extent_op->list);
-			extent_op->del = 0;
-
-			set_extent_bits(&root->fs_info->extent_ins,
-					bytenr, bytenr + num_bytes - 1,
-					EXTENT_WRITEBACK, GFP_NOFS);
-			set_state_private(&root->fs_info->extent_ins,
-					  bytenr, (unsigned long)extent_op);
-		}
-		mutex_unlock(&root->fs_info->extent_ins_mutex);
-		return 0;
-	}
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	ret = lookup_extent_backref(trans, extent_root, path,
-				    bytenr, orig_parent, orig_root,
-				    orig_generation, owner_objectid, 1);
-	if (ret)
-		goto out;
-	ret = remove_extent_backref(trans, extent_root, path);
-	if (ret)
-		goto out;
-	ret = insert_extent_backref(trans, extent_root, path, bytenr,
-				    parent, ref_root, ref_generation,
-				    owner_objectid);
+	ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
+				       orig_parent, parent, orig_root,
+				       ref_root, orig_generation,
+				       ref_generation, owner_objectid, pin);
 	BUG_ON(ret);
-	finish_current_insert(trans, extent_root, 0);
-	del_pending_extents(trans, extent_root, 0);
-out:
-	btrfs_free_path(path);
 	return ret;
 }

 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 bytenr,
-			    u64 orig_parent, u64 parent,
+			    u64 num_bytes, u64 orig_parent, u64 parent,
 			    u64 ref_root, u64 ref_generation,
 			    u64 owner_objectid)
 {
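
After this hunk, reference-count updates from __btrfs_update_extent_ref() no longer edit the extent tree inline; they are recorded as delayed refs and applied later in batches by btrfs_run_delayed_refs(). A toy model of what the deferral buys, merging per-extent deltas so one tree update replaces many (all names invented; the real queue is an rbtree keyed by bytenr, under a spinlock):

	#include <stdio.h>

	#define MAX_EXTENTS 16

	struct delayed_ref { unsigned long long bytenr; int ref_mod; };

	static struct delayed_ref pending[MAX_EXTENTS];
	static int npending;

	/* queue a +1/-1 change; duplicates for one extent merge into ref_mod */
	static void queue_ref(unsigned long long bytenr, int delta)
	{
		for (int i = 0; i < npending; i++) {
			if (pending[i].bytenr == bytenr) {
				pending[i].ref_mod += delta;
				return;
			}
		}
		pending[npending++] = (struct delayed_ref){ bytenr, delta };
	}

	/* "run" the delayed refs: one tree update per extent, not per change */
	static void run_refs(void)
	{
		for (int i = 0; i < npending; i++)
			printf("extent %llu: apply %+d refs\n",
			       pending[i].bytenr, pending[i].ref_mod);
		npending = 0;
	}

	int main(void)
	{
		queue_ref(4096, +1);
		queue_ref(4096, +1);
		queue_ref(8192, -1);
		run_refs(); /* extent 4096: +2, extent 8192: -1 */
		return 0;
	}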
@@ -1239,20 +715,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		return 0;
-	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
-					parent, ref_root, ref_root,
-					ref_generation, ref_generation,
-					owner_objectid);
+
+	ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
+					orig_parent, parent, ref_root,
+					ref_root, ref_generation,
+					ref_generation, owner_objectid);
 	return ret;
 }

 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root, u64 bytenr,
+				  u64 num_bytes,
 				  u64 orig_parent, u64 parent,
 				  u64 orig_root, u64 ref_root,
 				  u64 orig_generation, u64 ref_generation,
 				  u64 owner_objectid)
 {
+	int ret;
+
+	ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
+				    ref_generation, owner_objectid,
+				    BTRFS_ADD_DELAYED_REF, 0);
+	BUG_ON(ret);
+	return ret;
+}
+
+static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 bytenr,
+			  u64 num_bytes, u64 parent, u64 ref_root,
+			  u64 ref_generation, u64 owner_objectid,
+			  int refs_to_add)
+{
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
@@ -1265,17 +757,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 		return -ENOMEM;

 	path->reada = 1;
+	path->leave_spinning = 1;
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = (u64)-1;
+	key.offset = num_bytes;

-	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
-				0, 1);
-	if (ret < 0)
+	/* first find the extent item and update its reference count */
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+				path, 0, 1);
+	if (ret < 0) {
+		btrfs_set_path_blocking(path);
 		return ret;
-	BUG_ON(ret == 0 || path->slots[0] == 0);
+	}

-	path->slots[0]--;
+	if (ret > 0) {
+		WARN_ON(1);
+		btrfs_free_path(path);
+		return -EIO;
+	}
 	l = path->nodes[0];

 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1289,21 +788,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);

 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+
 	refs = btrfs_extent_refs(l, item);
-	btrfs_set_extent_refs(l, item, refs + 1);
+	btrfs_set_extent_refs(l, item, refs + refs_to_add);
+	btrfs_unlock_up_safe(path, 1);
+
 	btrfs_mark_buffer_dirty(path->nodes[0]);

 	btrfs_release_path(root->fs_info->extent_root, path);

 	path->reada = 1;
+	path->leave_spinning = 1;
+
+	/* now insert the actual backref */
 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
 				    path, bytenr, parent,
 				    ref_root, ref_generation,
-				    owner_objectid);
+				    owner_objectid, refs_to_add);
 	BUG_ON(ret);
-	finish_current_insert(trans, root->fs_info->extent_root, 0);
-	del_pending_extents(trans, root->fs_info->extent_root, 0);
-
 	btrfs_free_path(path);
 	return 0;
 }
@@ -1318,68 +820,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		return 0;
-	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
+
+	ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
 				     0, ref_root, 0, ref_generation,
 				     owner_objectid);
 	return ret;
 }

-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root)
+static int drop_delayed_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_delayed_ref_node *node)
+{
+	int ret = 0;
+	struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
+
+	BUG_ON(node->ref_mod == 0);
+	ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
+				  node->parent, ref->root, ref->generation,
+				  ref->owner_objectid, ref->pin, node->ref_mod);
+
+	return ret;
+}
+
+/* helper function to actually process a single delayed ref entry */
+static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_delayed_ref_node *node,
+					int insert_reserved)
 {
-	u64 start;
-	u64 end;
 	int ret;
+	struct btrfs_delayed_ref *ref;

-	while(1) {
-		finish_current_insert(trans, root->fs_info->extent_root, 1);
-		del_pending_extents(trans, root->fs_info->extent_root, 1);
-
-		/* is there more work to do? */
-		ret = find_first_extent_bit(&root->fs_info->pending_del,
-					    0, &start, &end, EXTENT_WRITEBACK);
-		if (!ret)
-			continue;
-		ret = find_first_extent_bit(&root->fs_info->extent_ins,
-					    0, &start, &end, EXTENT_WRITEBACK);
-		if (!ret)
-			continue;
-		break;
+	if (node->parent == (u64)-1) {
+		struct btrfs_delayed_ref_head *head;
+		/*
+		 * we've hit the end of the chain and we were supposed
+		 * to insert this extent into the tree.  But, it got
+		 * deleted before we ever needed to insert it, so all
+		 * we have to do is clean up the accounting
+		 */
+		if (insert_reserved) {
+			update_reserved_extents(root, node->bytenr,
+						node->num_bytes, 0);
+		}
+		head = btrfs_delayed_node_to_head(node);
+		mutex_unlock(&head->mutex);
+		return 0;
+	}
+
+	ref = btrfs_delayed_node_to_ref(node);
+	if (ref->action == BTRFS_ADD_DELAYED_REF) {
+		if (insert_reserved) {
+			struct btrfs_key ins;
+
+			ins.objectid = node->bytenr;
+			ins.offset = node->num_bytes;
+			ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+			/* record the full extent allocation */
+			ret = __btrfs_alloc_reserved_extent(trans, root,
+					node->parent, ref->root,
+					ref->generation, ref->owner_objectid,
+					&ins, node->ref_mod);
+			update_reserved_extents(root, node->bytenr,
+						node->num_bytes, 0);
+		} else {
+			/* just add one backref */
+			ret = add_extent_ref(trans, root, node->bytenr,
+					node->num_bytes,
+					node->parent, ref->root, ref->generation,
+					ref->owner_objectid, node->ref_mod);
+		}
+		BUG_ON(ret);
+	} else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+		WARN_ON(insert_reserved);
+		ret = drop_delayed_ref(trans, root, node);
 	}
 	return 0;
 }

-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u32 *refs)
+static noinline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
-	struct btrfs_path *path;
-	int ret;
-	struct btrfs_key key;
-	struct extent_buffer *l;
-	struct btrfs_extent_item *item;
-
-	WARN_ON(num_bytes < root->sectorsize);
-	path = btrfs_alloc_path();
-	path->reada = 1;
-	key.objectid = bytenr;
-	key.offset = num_bytes;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
-				0, 0);
-	if (ret < 0)
-		goto out;
-	if (ret != 0) {
-		btrfs_print_leaf(root, path->nodes[0]);
-		printk(KERN_INFO "btrfs failed to find block number %llu\n",
-		       (unsigned long long)bytenr);
-		BUG();
+	struct rb_node *node;
+	struct btrfs_delayed_ref_node *ref;
+	int action = BTRFS_ADD_DELAYED_REF;
+again:
+	/*
+	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+	 * this prevents ref count from going down to zero when
+	 * there still are pending delayed ref.
+	 */
+	node = rb_prev(&head->node.rb_node);
+	while (1) {
+		if (!node)
+			break;
+		ref = rb_entry(node, struct btrfs_delayed_ref_node,
+			       rb_node);
+		if (ref->bytenr != head->node.bytenr)
+			break;
+		if (btrfs_delayed_node_to_ref(ref)->action == action)
+			return ref;
+		node = rb_prev(node);
+	}
+	if (action == BTRFS_ADD_DELAYED_REF) {
+		action = BTRFS_DROP_DELAYED_REF;
+		goto again;
+	}
+	return NULL;
+}
+
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct list_head *cluster)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_ref_head *locked_ref = NULL;
+	int ret;
+	int count = 0;
+	int must_insert_reserved = 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	while (1) {
+		if (!locked_ref) {
+			/* pick a new head ref from the cluster list */
+			if (list_empty(cluster))
+				break;
+
+			locked_ref = list_entry(cluster->next,
+				     struct btrfs_delayed_ref_head, cluster);
+
+			/* grab the lock that says we are going to process
+			 * all the refs for this head */
+			ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+			/*
+			 * we may have dropped the spin lock to get the head
+			 * mutex lock, and that might have given someone else
+			 * time to free the head.  If that's true, it has been
+			 * removed from our list and we can move on.
+			 */
+			if (ret == -EAGAIN) {
+				locked_ref = NULL;
+				count++;
+				continue;
+			}
+		}
+
+		/*
+		 * record the must insert reserved flag before we
+		 * drop the spin lock.
+		 */
+		must_insert_reserved = locked_ref->must_insert_reserved;
+		locked_ref->must_insert_reserved = 0;
+
+		/*
+		 * locked_ref is the head node, so we have to go one
+		 * node back for any delayed ref updates
+		 */
+		ref = select_delayed_ref(locked_ref);
+		if (!ref) {
+			/* All delayed refs have been processed, Go ahead
+			 * and send the head node to run_one_delayed_ref,
+			 * so that any accounting fixes can happen
+			 */
+			ref = &locked_ref->node;
+			list_del_init(&locked_ref->cluster);
+			locked_ref = NULL;
+		}
+
+		ref->in_tree = 0;
+		rb_erase(&ref->rb_node, &delayed_refs->root);
+		delayed_refs->num_entries--;
+		spin_unlock(&delayed_refs->lock);
+
+		ret = run_one_delayed_ref(trans, root, ref,
+					  must_insert_reserved);
+		BUG_ON(ret);
+		btrfs_put_delayed_ref(ref);
+
+		count++;
+		cond_resched();
+		spin_lock(&delayed_refs->lock);
+	}
+	return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far.  count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, unsigned long count)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	struct list_head cluster;
+	int ret;
+	int run_all = count == (unsigned long)-1;
+	int run_most = 0;
+
+	if (root == root->fs_info->extent_root)
+		root = root->fs_info->tree_root;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	INIT_LIST_HEAD(&cluster);
+again:
+	spin_lock(&delayed_refs->lock);
+	if (count == 0) {
+		count = delayed_refs->num_entries * 2;
+		run_most = 1;
+	}
+	while (1) {
+		if (!(run_all || run_most) &&
+		    delayed_refs->num_heads_ready < 64)
+			break;
+
+		/*
+		 * go find something we can process in the rbtree.  We start at
+		 * the beginning of the tree, and then build a cluster
+		 * of refs to process starting at the first one we are able to
+		 * lock
+		 */
+		ret = btrfs_find_ref_cluster(trans, &cluster,
+					     delayed_refs->run_delayed_start);
+		if (ret)
+			break;
+
+		ret = run_clustered_refs(trans, root, &cluster);
+		BUG_ON(ret < 0);
+
+		count -= min_t(unsigned long, ret, count);
+
+		if (count == 0)
+			break;
+	}
+
+	if (run_all) {
+		node = rb_first(&delayed_refs->root);
+		if (!node)
+			goto out;
+		count = (unsigned long)-1;
+
+		while (node) {
+			ref = rb_entry(node, struct btrfs_delayed_ref_node,
+				       rb_node);
+			if (btrfs_delayed_ref_is_head(ref)) {
+				struct btrfs_delayed_ref_head *head;
+
+				head = btrfs_delayed_node_to_head(ref);
+				atomic_inc(&ref->refs);
+
+				spin_unlock(&delayed_refs->lock);
+				mutex_lock(&head->mutex);
+				mutex_unlock(&head->mutex);
+
+				btrfs_put_delayed_ref(ref);
+				cond_resched();
+				goto again;
+			}
+			node = rb_next(node);
+		}
+		spin_unlock(&delayed_refs->lock);
+		schedule_timeout(1);
+		goto again;
 	}
-	l = path->nodes[0];
-	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-	*refs = btrfs_extent_refs(l, item);
 out:
-	btrfs_free_path(path);
+	spin_unlock(&delayed_refs->lock);
 	return 0;
 }

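select_delayed_ref() above runs all BTRFS_ADD_DELAYED_REF entries for a head before any BTRFS_DROP_DELAYED_REF entries, so an extent's reference count can reach zero only once no adds remain pending. A compact illustration of why that ordering matters, under the simplifying assumption that one head's entries are processed together:

	#include <assert.h>
	#include <stdio.h>

	/* apply adds first (pass 0), then drops (pass 1) */
	static int apply_refs(const int *ops, int n, int count)
	{
		for (int pass = 0; pass < 2; pass++) {
			for (int i = 0; i < n; i++) {
				if ((pass == 0) != (ops[i] > 0))
					continue;
				count += ops[i];
				assert(count >= 0); /* never goes negative */
				if (count == 0)
					printf("extent freed\n");
			}
		}
		return count;
	}

	int main(void)
	{
		int ops[] = { -1, +1, -1, +1 }; /* interleaved adds and drops */
		/* in queue order the first -1 would hit -1; reordered it ends at 0 */
		return apply_refs(ops, 4, 0);
	}
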
@@ -1603,7 +1315,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
 	int refi = 0;
 	int slot;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-			    u64, u64, u64, u64, u64, u64, u64, u64);
+			    u64, u64, u64, u64, u64, u64, u64, u64, u64);

 	ref_root = btrfs_header_owner(buf);
 	ref_generation = btrfs_header_generation(buf);
@@ -1675,12 +1387,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,

 		if (level == 0) {
 			btrfs_item_key_to_cpu(buf, &key, slot);
+			fi = btrfs_item_ptr(buf, slot,
+					    struct btrfs_file_extent_item);
+
+			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (bytenr == 0)
+				continue;

 			ret = process_func(trans, root, bytenr,
-					   orig_buf->start, buf->start,
-					   orig_root, ref_root,
-					   orig_generation, ref_generation,
-					   key.objectid);
+					   btrfs_file_extent_disk_num_bytes(buf, fi),
+					   orig_buf->start, buf->start,
+					   orig_root, ref_root,
+					   orig_generation, ref_generation,
+					   key.objectid);

 			if (ret) {
 				faili = slot;
@@ -1688,7 +1407,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
 				goto fail;
 			}
 		} else {
-			ret = process_func(trans, root, bytenr,
+			ret = process_func(trans, root, bytenr, buf->len,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
@@ -1765,17 +1484,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 			if (bytenr == 0)
 				continue;
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
-					    orig_buf->start, buf->start,
-					    orig_root, ref_root,
-					    orig_generation, ref_generation,
-					    key.objectid);
+					    btrfs_file_extent_disk_num_bytes(buf, fi),
+					    orig_buf->start, buf->start,
+					    orig_root, ref_root, orig_generation,
+					    ref_generation, key.objectid);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, slot);
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
-					    orig_buf->start, buf->start,
-					    orig_root, ref_root,
+					    buf->len, orig_buf->start,
+					    buf->start, orig_root, ref_root,
 					    orig_generation, ref_generation,
 					    level - 1);
 			if (ret)
@@ -1794,7 +1513,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1794 struct btrfs_block_group_cache *cache) 1513 struct btrfs_block_group_cache *cache)
1795{ 1514{
1796 int ret; 1515 int ret;
1797 int pending_ret;
1798 struct btrfs_root *extent_root = root->fs_info->extent_root; 1516 struct btrfs_root *extent_root = root->fs_info->extent_root;
1799 unsigned long bi; 1517 unsigned long bi;
1800 struct extent_buffer *leaf; 1518 struct extent_buffer *leaf;
@@ -1810,12 +1528,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
1810 btrfs_mark_buffer_dirty(leaf); 1528 btrfs_mark_buffer_dirty(leaf);
1811 btrfs_release_path(extent_root, path); 1529 btrfs_release_path(extent_root, path);
1812fail: 1530fail:
1813 finish_current_insert(trans, extent_root, 0);
1814 pending_ret = del_pending_extents(trans, extent_root, 0);
1815 if (ret) 1531 if (ret)
1816 return ret; 1532 return ret;
1817 if (pending_ret)
1818 return pending_ret;
1819 return 0; 1533 return 0;
1820 1534
1821} 1535}
@@ -1879,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1879 if (!block_group || block_group->ro) 1593 if (!block_group || block_group->ro)
1880 readonly = 1; 1594 readonly = 1;
1881 if (block_group) 1595 if (block_group)
1882 put_block_group(block_group); 1596 btrfs_put_block_group(block_group);
1883 return readonly; 1597 return readonly;
1884} 1598}
1885 1599
@@ -1903,7 +1617,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1903 if (!found) 1617 if (!found)
1904 return -ENOMEM; 1618 return -ENOMEM;
1905 1619
1906 list_add(&found->list, &info->space_info);
1907 INIT_LIST_HEAD(&found->block_groups); 1620 INIT_LIST_HEAD(&found->block_groups);
1908 init_rwsem(&found->groups_sem); 1621 init_rwsem(&found->groups_sem);
1909 spin_lock_init(&found->lock); 1622 spin_lock_init(&found->lock);
@@ -1917,6 +1630,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1917 found->full = 0; 1630 found->full = 0;
1918 found->force_alloc = 0; 1631 found->force_alloc = 0;
1919 *space_info = found; 1632 *space_info = found;
1633 list_add_rcu(&found->list, &info->space_info);
1920 return 0; 1634 return 0;
1921} 1635}
1922 1636
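
Two things moved in this hunk: the list insertion now happens after every field of the new space_info is initialized, and it uses list_add_rcu(). That ordering is what lets lockless readers walk the list safely. A hedged userspace sketch of the publish-after-init rule (C11 atomics standing in for the RCU list API; a single writer is assumed, as the kernel side is serialized by its caller):

#include <stdatomic.h>
#include <stdlib.h>

struct space_info {
	unsigned long long flags;
	int full;
	struct space_info *next;
};

static _Atomic(struct space_info *) space_list;

static void publish_space_info(unsigned long long flags)
{
	struct space_info *s = calloc(1, sizeof(*s));

	if (!s)
		return;
	s->flags = flags;                   /* 1. fully initialize ... */
	s->full = 0;
	s->next = atomic_load(&space_list);
	/* 2. ... then publish with release ordering, so a reader that
	 * sees the new node also sees its initialized fields */
	atomic_store_explicit(&space_list, s, memory_order_release);
}
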
@@ -2303,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
2303 WARN_ON(ret); 2017 WARN_ON(ret);
2304 } 2018 }
2305 } 2019 }
2306 put_block_group(cache); 2020 btrfs_put_block_group(cache);
2307 total -= num_bytes; 2021 total -= num_bytes;
2308 bytenr += num_bytes; 2022 bytenr += num_bytes;
2309 } 2023 }
@@ -2320,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2320 return 0; 2034 return 0;
2321 2035
2322 bytenr = cache->key.objectid; 2036 bytenr = cache->key.objectid;
2323 put_block_group(cache); 2037 btrfs_put_block_group(cache);
2324 2038
2325 return bytenr; 2039 return bytenr;
2326} 2040}
@@ -2332,7 +2046,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2332 struct btrfs_block_group_cache *cache; 2046 struct btrfs_block_group_cache *cache;
2333 struct btrfs_fs_info *fs_info = root->fs_info; 2047 struct btrfs_fs_info *fs_info = root->fs_info;
2334 2048
2335 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2336 if (pin) { 2049 if (pin) {
2337 set_extent_dirty(&fs_info->pinned_extents, 2050 set_extent_dirty(&fs_info->pinned_extents,
2338 bytenr, bytenr + num - 1, GFP_NOFS); 2051 bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2340,6 +2053,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2340 clear_extent_dirty(&fs_info->pinned_extents, 2053 clear_extent_dirty(&fs_info->pinned_extents,
2341 bytenr, bytenr + num - 1, GFP_NOFS); 2054 bytenr, bytenr + num - 1, GFP_NOFS);
2342 } 2055 }
2056
2343 while (num > 0) { 2057 while (num > 0) {
2344 cache = btrfs_lookup_block_group(fs_info, bytenr); 2058 cache = btrfs_lookup_block_group(fs_info, bytenr);
2345 BUG_ON(!cache); 2059 BUG_ON(!cache);
@@ -2364,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2364 if (cache->cached) 2078 if (cache->cached)
2365 btrfs_add_free_space(cache, bytenr, len); 2079 btrfs_add_free_space(cache, bytenr, len);
2366 } 2080 }
2367 put_block_group(cache); 2081 btrfs_put_block_group(cache);
2368 bytenr += len; 2082 bytenr += len;
2369 num -= len; 2083 num -= len;
2370 } 2084 }
@@ -2395,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
2395 } 2109 }
2396 spin_unlock(&cache->lock); 2110 spin_unlock(&cache->lock);
2397 spin_unlock(&cache->space_info->lock); 2111 spin_unlock(&cache->space_info->lock);
2398 put_block_group(cache); 2112 btrfs_put_block_group(cache);
2399 bytenr += len; 2113 bytenr += len;
2400 num -= len; 2114 num -= len;
2401 } 2115 }
@@ -2410,7 +2124,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2410 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; 2124 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2411 int ret; 2125 int ret;
2412 2126
2413 mutex_lock(&root->fs_info->pinned_mutex);
2414 while (1) { 2127 while (1) {
2415 ret = find_first_extent_bit(pinned_extents, last, 2128 ret = find_first_extent_bit(pinned_extents, last,
2416 &start, &end, EXTENT_DIRTY); 2129 &start, &end, EXTENT_DIRTY);
@@ -2419,7 +2132,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2419 set_extent_dirty(copy, start, end, GFP_NOFS); 2132 set_extent_dirty(copy, start, end, GFP_NOFS);
2420 last = end + 1; 2133 last = end + 1;
2421 } 2134 }
2422 mutex_unlock(&root->fs_info->pinned_mutex);
2423 return 0; 2135 return 0;
2424} 2136}
2425 2137
@@ -2431,7 +2143,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2431 u64 end; 2143 u64 end;
2432 int ret; 2144 int ret;
2433 2145
2434 mutex_lock(&root->fs_info->pinned_mutex);
2435 while (1) { 2146 while (1) {
2436 ret = find_first_extent_bit(unpin, 0, &start, &end, 2147 ret = find_first_extent_bit(unpin, 0, &start, &end,
2437 EXTENT_DIRTY); 2148 EXTENT_DIRTY);
@@ -2440,209 +2151,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2440 2151
2441 ret = btrfs_discard_extent(root, start, end + 1 - start); 2152 ret = btrfs_discard_extent(root, start, end + 1 - start);
2442 2153
2154 /* unlocks the pinned mutex */
2443 btrfs_update_pinned_extents(root, start, end + 1 - start, 0); 2155 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2444 clear_extent_dirty(unpin, start, end, GFP_NOFS); 2156 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2445 2157
2446 if (need_resched()) { 2158 cond_resched();
2447 mutex_unlock(&root->fs_info->pinned_mutex);
2448 cond_resched();
2449 mutex_lock(&root->fs_info->pinned_mutex);
2450 }
2451 } 2159 }
2452 mutex_unlock(&root->fs_info->pinned_mutex);
2453 return ret; 2160 return ret;
2454} 2161}
2455 2162
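
The loop above used to drop pinned_mutex around every voluntary reschedule; with the mutex gone it can call cond_resched() unconditionally. The general rule being applied, as a kernel-style fragment (illustrative, not compilable on its own):

/* before: a held mutex must not be kept across a reschedule point
 * that its waiters might need */
if (need_resched()) {
	mutex_unlock(&lock);
	cond_resched();
	mutex_lock(&lock);
}

/* after: nothing is held across the loop body, so simply */
cond_resched();
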
2456static int finish_current_insert(struct btrfs_trans_handle *trans,
2457 struct btrfs_root *extent_root, int all)
2458{
2459 u64 start;
2460 u64 end;
2461 u64 priv;
2462 u64 search = 0;
2463 struct btrfs_fs_info *info = extent_root->fs_info;
2464 struct btrfs_path *path;
2465 struct pending_extent_op *extent_op, *tmp;
2466 struct list_head insert_list, update_list;
2467 int ret;
2468 int num_inserts = 0, max_inserts, restart = 0;
2469
2470 path = btrfs_alloc_path();
2471 INIT_LIST_HEAD(&insert_list);
2472 INIT_LIST_HEAD(&update_list);
2473
2474 max_inserts = extent_root->leafsize /
2475 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2476 sizeof(struct btrfs_extent_ref) +
2477 sizeof(struct btrfs_extent_item));
2478again:
2479 mutex_lock(&info->extent_ins_mutex);
2480 while (1) {
2481 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2482 &end, EXTENT_WRITEBACK);
2483 if (ret) {
2484 if (restart && !num_inserts &&
2485 list_empty(&update_list)) {
2486 restart = 0;
2487 search = 0;
2488 continue;
2489 }
2490 break;
2491 }
2492
2493 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2494 if (!ret) {
2495 if (all)
2496 restart = 1;
2497 search = end + 1;
2498 if (need_resched()) {
2499 mutex_unlock(&info->extent_ins_mutex);
2500 cond_resched();
2501 mutex_lock(&info->extent_ins_mutex);
2502 }
2503 continue;
2504 }
2505
2506 ret = get_state_private(&info->extent_ins, start, &priv);
2507 BUG_ON(ret);
2508 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2509
2510 if (extent_op->type == PENDING_EXTENT_INSERT) {
2511 num_inserts++;
2512 list_add_tail(&extent_op->list, &insert_list);
2513 search = end + 1;
2514 if (num_inserts == max_inserts) {
2515 restart = 1;
2516 break;
2517 }
2518 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2519 list_add_tail(&extent_op->list, &update_list);
2520 search = end + 1;
2521 } else {
2522 BUG();
2523 }
2524 }
2525
2526 /*
2527 * process the update list, clear the writeback bit for it, and if
2528 * somebody marked this thing for deletion then just unlock it and be
2529 * done, the free_extents will handle it
2530 */
2531 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2532 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2533 extent_op->bytenr + extent_op->num_bytes - 1,
2534 EXTENT_WRITEBACK, GFP_NOFS);
2535 if (extent_op->del) {
2536 list_del_init(&extent_op->list);
2537 unlock_extent(&info->extent_ins, extent_op->bytenr,
2538 extent_op->bytenr + extent_op->num_bytes
2539 - 1, GFP_NOFS);
2540 kfree(extent_op);
2541 }
2542 }
2543 mutex_unlock(&info->extent_ins_mutex);
2544
2545 /*
2546	 * still have things left on the update list, go ahead and update

2547 * everything
2548 */
2549 if (!list_empty(&update_list)) {
2550 ret = update_backrefs(trans, extent_root, path, &update_list);
2551 BUG_ON(ret);
2552
2553	 /* we may have COW'ed new blocks, so let's start over */
2554 if (all)
2555 restart = 1;
2556 }
2557
2558 /*
2559 * if no inserts need to be done, but we skipped some extents and we
2560 * need to make sure everything is cleaned then reset everything and
2561 * go back to the beginning
2562 */
2563 if (!num_inserts && restart) {
2564 search = 0;
2565 restart = 0;
2566 INIT_LIST_HEAD(&update_list);
2567 INIT_LIST_HEAD(&insert_list);
2568 goto again;
2569 } else if (!num_inserts) {
2570 goto out;
2571 }
2572
2573 /*
2574 * process the insert extents list. Again if we are deleting this
2575 * extent, then just unlock it, pin down the bytes if need be, and be
2576 * done with it. Saves us from having to actually insert the extent
2577 * into the tree and then subsequently come along and delete it
2578 */
2579 mutex_lock(&info->extent_ins_mutex);
2580 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2581 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2582 extent_op->bytenr + extent_op->num_bytes - 1,
2583 EXTENT_WRITEBACK, GFP_NOFS);
2584 if (extent_op->del) {
2585 u64 used;
2586 list_del_init(&extent_op->list);
2587 unlock_extent(&info->extent_ins, extent_op->bytenr,
2588 extent_op->bytenr + extent_op->num_bytes
2589 - 1, GFP_NOFS);
2590
2591 mutex_lock(&extent_root->fs_info->pinned_mutex);
2592 ret = pin_down_bytes(trans, extent_root,
2593 extent_op->bytenr,
2594 extent_op->num_bytes, 0);
2595 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2596
2597 spin_lock(&info->delalloc_lock);
2598 used = btrfs_super_bytes_used(&info->super_copy);
2599 btrfs_set_super_bytes_used(&info->super_copy,
2600 used - extent_op->num_bytes);
2601 used = btrfs_root_used(&extent_root->root_item);
2602 btrfs_set_root_used(&extent_root->root_item,
2603 used - extent_op->num_bytes);
2604 spin_unlock(&info->delalloc_lock);
2605
2606 ret = update_block_group(trans, extent_root,
2607 extent_op->bytenr,
2608 extent_op->num_bytes,
2609 0, ret > 0);
2610 BUG_ON(ret);
2611 kfree(extent_op);
2612 num_inserts--;
2613 }
2614 }
2615 mutex_unlock(&info->extent_ins_mutex);
2616
2617 ret = insert_extents(trans, extent_root, path, &insert_list,
2618 num_inserts);
2619 BUG_ON(ret);
2620
2621 /*
2622 * if restart is set for whatever reason we need to go back and start
2623 * searching through the pending list again.
2624 *
2625 * We just inserted some extents, which could have resulted in new
2626 * blocks being allocated, which would result in new blocks needing
2627 * updates, so if all is set we _must_ restart to get the updated
2628 * blocks.
2629 */
2630 if (restart || all) {
2631 INIT_LIST_HEAD(&insert_list);
2632 INIT_LIST_HEAD(&update_list);
2633 search = 0;
2634 restart = 0;
2635 num_inserts = 0;
2636 goto again;
2637 }
2638out:
2639 btrfs_free_path(path);
2640 return 0;
2641}
2642
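
The deleted finish_current_insert() batched pending inserts so one pass never outgrew a single leaf: max_inserts divides the leaf size by the per-extent cost of two keys, two item headers, an extent item and a backref. A standalone sketch of that arithmetic (the byte sizes below are illustrative stand-ins, not the real on-disk struct sizes):

#include <stdio.h>

enum {
	KEY_SIZE         = 17,  /* stand-in for sizeof(struct btrfs_key)         */
	ITEM_HEADER_SIZE = 25,  /* stand-in for sizeof(struct btrfs_item)        */
	EXTENT_ITEM_SIZE = 4,   /* stand-in for sizeof(struct btrfs_extent_item) */
	EXTENT_REF_SIZE  = 28,  /* stand-in for sizeof(struct btrfs_extent_ref)  */
};

int main(void)
{
	int leafsize = 4096;
	/* each pending insert adds an extent item and a backref item,
	 * each preceded by a key and an item header */
	int per_insert = 2 * KEY_SIZE + 2 * ITEM_HEADER_SIZE +
			 EXTENT_ITEM_SIZE + EXTENT_REF_SIZE;

	printf("max_inserts for a %d byte leaf: %d\n",
	       leafsize, leafsize / per_insert);
	return 0;
}
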
2643static int pin_down_bytes(struct btrfs_trans_handle *trans, 2163static int pin_down_bytes(struct btrfs_trans_handle *trans,
2644 struct btrfs_root *root, 2164 struct btrfs_root *root,
2645 u64 bytenr, u64 num_bytes, int is_data) 2165 struct btrfs_path *path,
2166 u64 bytenr, u64 num_bytes, int is_data,
2167 struct extent_buffer **must_clean)
2646{ 2168{
2647 int err = 0; 2169 int err = 0;
2648 struct extent_buffer *buf; 2170 struct extent_buffer *buf;
@@ -2665,17 +2187,18 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
2665 u64 header_transid = btrfs_header_generation(buf); 2187 u64 header_transid = btrfs_header_generation(buf);
2666 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 2188 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2667 header_owner != BTRFS_TREE_RELOC_OBJECTID && 2189 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2190 header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2668 header_transid == trans->transid && 2191 header_transid == trans->transid &&
2669 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 2192 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2670 clean_tree_block(NULL, root, buf); 2193 *must_clean = buf;
2671 btrfs_tree_unlock(buf);
2672 free_extent_buffer(buf);
2673 return 1; 2194 return 1;
2674 } 2195 }
2675 btrfs_tree_unlock(buf); 2196 btrfs_tree_unlock(buf);
2676 } 2197 }
2677 free_extent_buffer(buf); 2198 free_extent_buffer(buf);
2678pinit: 2199pinit:
2200 btrfs_set_path_blocking(path);
2201 /* unlocks the pinned mutex */
2679 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2202 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2680 2203
2681 BUG_ON(err < 0); 2204 BUG_ON(err < 0);
@@ -2689,7 +2212,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2689 struct btrfs_root *root, 2212 struct btrfs_root *root,
2690 u64 bytenr, u64 num_bytes, u64 parent, 2213 u64 bytenr, u64 num_bytes, u64 parent,
2691 u64 root_objectid, u64 ref_generation, 2214 u64 root_objectid, u64 ref_generation,
2692 u64 owner_objectid, int pin, int mark_free) 2215 u64 owner_objectid, int pin, int mark_free,
2216 int refs_to_drop)
2693{ 2217{
2694 struct btrfs_path *path; 2218 struct btrfs_path *path;
2695 struct btrfs_key key; 2219 struct btrfs_key key;
@@ -2711,6 +2235,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2711 return -ENOMEM; 2235 return -ENOMEM;
2712 2236
2713 path->reada = 1; 2237 path->reada = 1;
2238 path->leave_spinning = 1;
2714 ret = lookup_extent_backref(trans, extent_root, path, 2239 ret = lookup_extent_backref(trans, extent_root, path,
2715 bytenr, parent, root_objectid, 2240 bytenr, parent, root_objectid,
2716 ref_generation, owner_objectid, 1); 2241 ref_generation, owner_objectid, 1);
@@ -2732,9 +2257,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2732 break; 2257 break;
2733 } 2258 }
2734 if (!found_extent) { 2259 if (!found_extent) {
2735 ret = remove_extent_backref(trans, extent_root, path); 2260 ret = remove_extent_backref(trans, extent_root, path,
2261 refs_to_drop);
2736 BUG_ON(ret); 2262 BUG_ON(ret);
2737 btrfs_release_path(extent_root, path); 2263 btrfs_release_path(extent_root, path);
2264 path->leave_spinning = 1;
2738 ret = btrfs_search_slot(trans, extent_root, 2265 ret = btrfs_search_slot(trans, extent_root,
2739 &key, path, -1, 1); 2266 &key, path, -1, 1);
2740 if (ret) { 2267 if (ret) {
@@ -2750,8 +2277,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2750 btrfs_print_leaf(extent_root, path->nodes[0]); 2277 btrfs_print_leaf(extent_root, path->nodes[0]);
2751 WARN_ON(1); 2278 WARN_ON(1);
2752 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 2279 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2753 "root %llu gen %llu owner %llu\n", 2280 "parent %llu root %llu gen %llu owner %llu\n",
2754 (unsigned long long)bytenr, 2281 (unsigned long long)bytenr,
2282 (unsigned long long)parent,
2755 (unsigned long long)root_objectid, 2283 (unsigned long long)root_objectid,
2756 (unsigned long long)ref_generation, 2284 (unsigned long long)ref_generation,
2757 (unsigned long long)owner_objectid); 2285 (unsigned long long)owner_objectid);
@@ -2761,17 +2289,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2761 ei = btrfs_item_ptr(leaf, extent_slot, 2289 ei = btrfs_item_ptr(leaf, extent_slot,
2762 struct btrfs_extent_item); 2290 struct btrfs_extent_item);
2763 refs = btrfs_extent_refs(leaf, ei); 2291 refs = btrfs_extent_refs(leaf, ei);
2764 BUG_ON(refs == 0);
2765 refs -= 1;
2766 btrfs_set_extent_refs(leaf, ei, refs);
2767 2292
2293 /*
2294 * we're not allowed to delete the extent item if there
2295 * are other delayed ref updates pending
2296 */
2297
2298 BUG_ON(refs < refs_to_drop);
2299 refs -= refs_to_drop;
2300 btrfs_set_extent_refs(leaf, ei, refs);
2768 btrfs_mark_buffer_dirty(leaf); 2301 btrfs_mark_buffer_dirty(leaf);
2769 2302
2770 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { 2303 if (refs == 0 && found_extent &&
2304 path->slots[0] == extent_slot + 1) {
2771 struct btrfs_extent_ref *ref; 2305 struct btrfs_extent_ref *ref;
2772 ref = btrfs_item_ptr(leaf, path->slots[0], 2306 ref = btrfs_item_ptr(leaf, path->slots[0],
2773 struct btrfs_extent_ref); 2307 struct btrfs_extent_ref);
2774 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); 2308 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
2775 /* if the back ref and the extent are next to each other 2309 /* if the back ref and the extent are next to each other
2776 * they get deleted below in one shot 2310 * they get deleted below in one shot
2777 */ 2311 */
@@ -2779,11 +2313,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2779 num_to_del = 2; 2313 num_to_del = 2;
2780 } else if (found_extent) { 2314 } else if (found_extent) {
2781 /* otherwise delete the extent back ref */ 2315 /* otherwise delete the extent back ref */
2782 ret = remove_extent_backref(trans, extent_root, path); 2316 ret = remove_extent_backref(trans, extent_root, path,
2317 refs_to_drop);
2783 BUG_ON(ret); 2318 BUG_ON(ret);
2784 /* if refs are 0, we need to setup the path for deletion */ 2319 /* if refs are 0, we need to setup the path for deletion */
2785 if (refs == 0) { 2320 if (refs == 0) {
2786 btrfs_release_path(extent_root, path); 2321 btrfs_release_path(extent_root, path);
2322 path->leave_spinning = 1;
2787 ret = btrfs_search_slot(trans, extent_root, &key, path, 2323 ret = btrfs_search_slot(trans, extent_root, &key, path,
2788 -1, 1); 2324 -1, 1);
2789 BUG_ON(ret); 2325 BUG_ON(ret);
@@ -2793,16 +2329,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2793 if (refs == 0) { 2329 if (refs == 0) {
2794 u64 super_used; 2330 u64 super_used;
2795 u64 root_used; 2331 u64 root_used;
2332 struct extent_buffer *must_clean = NULL;
2796 2333
2797 if (pin) { 2334 if (pin) {
2798 mutex_lock(&root->fs_info->pinned_mutex); 2335 ret = pin_down_bytes(trans, root, path,
2799 ret = pin_down_bytes(trans, root, bytenr, num_bytes, 2336 bytenr, num_bytes,
2800 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); 2337 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
2801 mutex_unlock(&root->fs_info->pinned_mutex); 2338 &must_clean);
2802 if (ret > 0) 2339 if (ret > 0)
2803 mark_free = 1; 2340 mark_free = 1;
2804 BUG_ON(ret < 0); 2341 BUG_ON(ret < 0);
2805 } 2342 }
2343
2806 /* block accounting for super block */ 2344 /* block accounting for super block */
2807 spin_lock(&info->delalloc_lock); 2345 spin_lock(&info->delalloc_lock);
2808 super_used = btrfs_super_bytes_used(&info->super_copy); 2346 super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2814,14 +2352,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2814 btrfs_set_root_used(&root->root_item, 2352 btrfs_set_root_used(&root->root_item,
2815 root_used - num_bytes); 2353 root_used - num_bytes);
2816 spin_unlock(&info->delalloc_lock); 2354 spin_unlock(&info->delalloc_lock);
2355
2356 /*
2357 * it is going to be very rare for someone to be waiting
2358 * on the block we're freeing. del_items might need to
2359 * schedule, so rather than get fancy, just force it
2360 * to blocking here
2361 */
2362 if (must_clean)
2363 btrfs_set_lock_blocking(must_clean);
2364
2817 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 2365 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2818 num_to_del); 2366 num_to_del);
2819 BUG_ON(ret); 2367 BUG_ON(ret);
2820 btrfs_release_path(extent_root, path); 2368 btrfs_release_path(extent_root, path);
2821 2369
2370 if (must_clean) {
2371 clean_tree_block(NULL, root, must_clean);
2372 btrfs_tree_unlock(must_clean);
2373 free_extent_buffer(must_clean);
2374 }
2375
2822 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 2376 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2823 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 2377 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2824 BUG_ON(ret); 2378 BUG_ON(ret);
2379 } else {
2380 invalidate_mapping_pages(info->btree_inode->i_mapping,
2381 bytenr >> PAGE_CACHE_SHIFT,
2382 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
2825 } 2383 }
2826 2384
2827 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 2385 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2829,218 +2387,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
2829 BUG_ON(ret); 2387 BUG_ON(ret);
2830 } 2388 }
2831 btrfs_free_path(path); 2389 btrfs_free_path(path);
2832 finish_current_insert(trans, extent_root, 0);
2833 return ret; 2390 return ret;
2834} 2391}
2835 2392
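
pin_down_bytes() now reports a reusable block through the **must_clean out-parameter instead of cleaning it inline, so __free_extent() can switch the buffer lock to blocking and do the sleep-prone cleanup only after the spinning-lock section is finished. A compilable sketch of that hand-off shape (no-op stubs; none of these helpers are the real btrfs functions):

struct buf { int dummy; };              /* stand-in for extent_buffer */

static void set_lock_blocking(struct buf *b) { (void)b; }
static void clean_block(struct buf *b)       { (void)b; }
static void unlock_and_free(struct buf *b)   { (void)b; }

/* callee: decide the block can be reused, but defer the work */
static int pin_or_reuse(struct buf *candidate, struct buf **must_clean)
{
	*must_clean = candidate;        /* caller cleans it later */
	return 1;                       /* 1 == "caller may mark it free" */
}

/* caller: do the expensive part after the spinning section */
void free_one_extent(struct buf *candidate)
{
	struct buf *must_clean = NULL;

	if (pin_or_reuse(candidate, &must_clean) > 0 && must_clean) {
		set_lock_blocking(must_clean);  /* deletion may schedule */
		/* ... tree items are deleted here ... */
		clean_block(must_clean);
		unlock_and_free(must_clean);
	}
}
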
2836/* 2393/*
2837 * find all the blocks marked as pending in the radix tree and remove 2394 * remove an extent from the root, returns 0 on success
2838 * them from the extent map
2839 */ 2395 */
2840static int del_pending_extents(struct btrfs_trans_handle *trans, 2396static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2841 struct btrfs_root *extent_root, int all) 2397 struct btrfs_root *root,
2398 u64 bytenr, u64 num_bytes, u64 parent,
2399 u64 root_objectid, u64 ref_generation,
2400 u64 owner_objectid, int pin,
2401 int refs_to_drop)
2842{ 2402{
2843 int ret; 2403 WARN_ON(num_bytes < root->sectorsize);
2844 int err = 0;
2845 u64 start;
2846 u64 end;
2847 u64 priv;
2848 u64 search = 0;
2849 int nr = 0, skipped = 0;
2850 struct extent_io_tree *pending_del;
2851 struct extent_io_tree *extent_ins;
2852 struct pending_extent_op *extent_op;
2853 struct btrfs_fs_info *info = extent_root->fs_info;
2854 struct list_head delete_list;
2855
2856 INIT_LIST_HEAD(&delete_list);
2857 extent_ins = &extent_root->fs_info->extent_ins;
2858 pending_del = &extent_root->fs_info->pending_del;
2859
2860again:
2861 mutex_lock(&info->extent_ins_mutex);
2862 while (1) {
2863 ret = find_first_extent_bit(pending_del, search, &start, &end,
2864 EXTENT_WRITEBACK);
2865 if (ret) {
2866 if (all && skipped && !nr) {
2867 search = 0;
2868 skipped = 0;
2869 continue;
2870 }
2871 mutex_unlock(&info->extent_ins_mutex);
2872 break;
2873 }
2874
2875 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2876 if (!ret) {
2877 search = end+1;
2878 skipped = 1;
2879
2880 if (need_resched()) {
2881 mutex_unlock(&info->extent_ins_mutex);
2882 cond_resched();
2883 mutex_lock(&info->extent_ins_mutex);
2884 }
2885
2886 continue;
2887 }
2888 BUG_ON(ret < 0);
2889
2890 ret = get_state_private(pending_del, start, &priv);
2891 BUG_ON(ret);
2892 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2893
2894 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2895 GFP_NOFS);
2896 if (!test_range_bit(extent_ins, start, end,
2897 EXTENT_WRITEBACK, 0)) {
2898 list_add_tail(&extent_op->list, &delete_list);
2899 nr++;
2900 } else {
2901 kfree(extent_op);
2902
2903 ret = get_state_private(&info->extent_ins, start,
2904 &priv);
2905 BUG_ON(ret);
2906 extent_op = (struct pending_extent_op *)
2907 (unsigned long)priv;
2908
2909 clear_extent_bits(&info->extent_ins, start, end,
2910 EXTENT_WRITEBACK, GFP_NOFS);
2911
2912 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2913 list_add_tail(&extent_op->list, &delete_list);
2914 search = end + 1;
2915 nr++;
2916 continue;
2917 }
2918
2919 mutex_lock(&extent_root->fs_info->pinned_mutex);
2920 ret = pin_down_bytes(trans, extent_root, start,
2921 end + 1 - start, 0);
2922 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2923
2924 ret = update_block_group(trans, extent_root, start,
2925 end + 1 - start, 0, ret > 0);
2926
2927 unlock_extent(extent_ins, start, end, GFP_NOFS);
2928 BUG_ON(ret);
2929 kfree(extent_op);
2930 }
2931 if (ret)
2932 err = ret;
2933
2934 search = end + 1;
2935
2936 if (need_resched()) {
2937 mutex_unlock(&info->extent_ins_mutex);
2938 cond_resched();
2939 mutex_lock(&info->extent_ins_mutex);
2940 }
2941 }
2942 2404
2943 if (nr) { 2405 /*
2944 ret = free_extents(trans, extent_root, &delete_list); 2406 * if metadata always pin
2945 BUG_ON(ret); 2407 * if data pin when any transaction has committed this
2946 } 2408 */
2409 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2410 ref_generation != trans->transid)
2411 pin = 1;
2947 2412
2948 if (all && skipped) { 2413 if (ref_generation != trans->transid)
2949 INIT_LIST_HEAD(&delete_list); 2414 pin = 1;
2950 search = 0;
2951 nr = 0;
2952 goto again;
2953 }
2954 2415
2955 if (!err) 2416 return __free_extent(trans, root, bytenr, num_bytes, parent,
2956 finish_current_insert(trans, extent_root, 0); 2417 root_objectid, ref_generation,
2957 return err; 2418 owner_objectid, pin, pin == 0, refs_to_drop);
2958} 2419}
2959 2420
2960/* 2421/*
2961 * remove an extent from the root, returns 0 on success 2422 * when we free an extent, it is possible (and likely) that we free the last
2423 * delayed ref for that extent as well. This searches the delayed ref tree for
2424 * a given extent, and if there are no other delayed refs to be processed, it
2425 * removes it from the tree.
2962 */ 2426 */
2963static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 2427static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2964 struct btrfs_root *root, 2428 struct btrfs_root *root, u64 bytenr)
2965 u64 bytenr, u64 num_bytes, u64 parent,
2966 u64 root_objectid, u64 ref_generation,
2967 u64 owner_objectid, int pin)
2968{ 2429{
2969 struct btrfs_root *extent_root = root->fs_info->extent_root; 2430 struct btrfs_delayed_ref_head *head;
2970 int pending_ret; 2431 struct btrfs_delayed_ref_root *delayed_refs;
2432 struct btrfs_delayed_ref_node *ref;
2433 struct rb_node *node;
2971 int ret; 2434 int ret;
2972 2435
2973 WARN_ON(num_bytes < root->sectorsize); 2436 delayed_refs = &trans->transaction->delayed_refs;
2974 if (root == extent_root) { 2437 spin_lock(&delayed_refs->lock);
2975 struct pending_extent_op *extent_op = NULL; 2438 head = btrfs_find_delayed_ref_head(trans, bytenr);
2976 2439 if (!head)
2977 mutex_lock(&root->fs_info->extent_ins_mutex); 2440 goto out;
2978 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2979 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2980 u64 priv;
2981 ret = get_state_private(&root->fs_info->extent_ins,
2982 bytenr, &priv);
2983 BUG_ON(ret);
2984 extent_op = (struct pending_extent_op *)
2985 (unsigned long)priv;
2986 2441
2987 extent_op->del = 1; 2442 node = rb_prev(&head->node.rb_node);
2988 if (extent_op->type == PENDING_EXTENT_INSERT) { 2443 if (!node)
2989 mutex_unlock(&root->fs_info->extent_ins_mutex); 2444 goto out;
2990 return 0;
2991 }
2992 }
2993 2445
2994 if (extent_op) { 2446 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2995 ref_generation = extent_op->orig_generation;
2996 parent = extent_op->orig_parent;
2997 }
2998 2447
2999 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2448 /* there are still entries for this ref, we can't drop it */
3000 BUG_ON(!extent_op); 2449 if (ref->bytenr == bytenr)
3001 2450 goto out;
3002 extent_op->type = PENDING_EXTENT_DELETE;
3003 extent_op->bytenr = bytenr;
3004 extent_op->num_bytes = num_bytes;
3005 extent_op->parent = parent;
3006 extent_op->orig_parent = parent;
3007 extent_op->generation = ref_generation;
3008 extent_op->orig_generation = ref_generation;
3009 extent_op->level = (int)owner_objectid;
3010 INIT_LIST_HEAD(&extent_op->list);
3011 extent_op->del = 0;
3012
3013 set_extent_bits(&root->fs_info->pending_del,
3014 bytenr, bytenr + num_bytes - 1,
3015 EXTENT_WRITEBACK, GFP_NOFS);
3016 set_state_private(&root->fs_info->pending_del,
3017 bytenr, (unsigned long)extent_op);
3018 mutex_unlock(&root->fs_info->extent_ins_mutex);
3019 return 0;
3020 }
3021 /* if metadata always pin */
3022 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3023 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3024 mutex_lock(&root->fs_info->pinned_mutex);
3025 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3026 mutex_unlock(&root->fs_info->pinned_mutex);
3027 update_reserved_extents(root, bytenr, num_bytes, 0);
3028 return 0;
3029 }
3030 pin = 1;
3031 }
3032 2451
3033 /* if data pin when any transaction has committed this */ 2452 /*
3034 if (ref_generation != trans->transid) 2453 * waiting for the lock here would deadlock. If someone else has it
3035 pin = 1; 2454 * locked they are already in the process of dropping it anyway
2455 */
2456 if (!mutex_trylock(&head->mutex))
2457 goto out;
3036 2458
3037 ret = __free_extent(trans, root, bytenr, num_bytes, parent, 2459 /*
3038 root_objectid, ref_generation, 2460 * at this point we have a head with no other entries. Go
3039 owner_objectid, pin, pin == 0); 2461 * ahead and process it.
2462 */
2463 head->node.in_tree = 0;
2464 rb_erase(&head->node.rb_node, &delayed_refs->root);
2465
2466 delayed_refs->num_entries--;
3040 2467
3041 finish_current_insert(trans, root->fs_info->extent_root, 0); 2468 /*
3042 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); 2469 * we don't take a ref on the node because we're removing it from the
3043 return ret ? ret : pending_ret; 2470 * tree, so we just steal the ref the tree was holding.
2471 */
2472 delayed_refs->num_heads--;
2473 if (list_empty(&head->cluster))
2474 delayed_refs->num_heads_ready--;
2475
2476 list_del_init(&head->cluster);
2477 spin_unlock(&delayed_refs->lock);
2478
2479 ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
2480 &head->node, head->must_insert_reserved);
2481 BUG_ON(ret);
2482 btrfs_put_delayed_ref(&head->node);
2483 return 0;
2484out:
2485 spin_unlock(&delayed_refs->lock);
2486 return 0;
3044} 2487}
3045 2488
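
check_ref_cleanup() uses mutex_trylock() rather than mutex_lock() on purpose: if the head's mutex is taken, someone is already running that head, and blocking on it from here could deadlock, so the function simply backs off. A standalone pthreads sketch of the trylock-or-bail shape (simplified types, not btrfs code):

#include <pthread.h>
#include <stdbool.h>

struct head {
	pthread_mutex_t mutex;
	bool in_tree;
};

static bool try_cleanup(pthread_mutex_t *tree_lock, struct head *h)
{
	pthread_mutex_lock(tree_lock);
	if (pthread_mutex_trylock(&h->mutex) != 0) {
		/* busy: the current holder will finish the job */
		pthread_mutex_unlock(tree_lock);
		return false;
	}
	h->in_tree = false;             /* steal the tree's reference */
	pthread_mutex_unlock(tree_lock);

	/* ... run the final ref and release h here ... */
	pthread_mutex_unlock(&h->mutex);
	return true;
}
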
3046int btrfs_free_extent(struct btrfs_trans_handle *trans, 2489int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -3051,9 +2494,28 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3051{ 2494{
3052 int ret; 2495 int ret;
3053 2496
3054 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, 2497 /*
3055 root_objectid, ref_generation, 2498 * tree log blocks never actually go into the extent allocation
3056 owner_objectid, pin); 2499 * tree, just update pinning info and exit early.
2500 *
2501 * data extents referenced by the tree log do need to have
2502 * their reference counts bumped.
2503 */
2504 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
2505 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2506 /* unlocks the pinned mutex */
2507 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2508 update_reserved_extents(root, bytenr, num_bytes, 0);
2509 ret = 0;
2510 } else {
2511 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
2512 root_objectid, ref_generation,
2513 owner_objectid,
2514 BTRFS_DROP_DELAYED_REF, 1);
2515 BUG_ON(ret);
2516 ret = check_ref_cleanup(trans, root, bytenr);
2517 BUG_ON(ret);
2518 }
3057 return ret; 2519 return ret;
3058} 2520}
3059 2521
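
Worth noting why check_ref_cleanup() runs immediately after the drop is queued: roughly, an extent allocated and freed inside the same transaction leaves an add/drop pair of delayed refs that cancels out, so the head can be discarded without ever touching the extent tree. A toy illustration of that accounting:

#include <stdio.h>

int main(void)
{
	/* delayed refs for one extent: allocation then immediate free */
	int pending[] = { +1, -1 };
	int total = 0;

	for (unsigned int i = 0; i < sizeof(pending) / sizeof(*pending); i++)
		total += pending[i];

	if (total == 0)
		printf("net zero: drop the head without an extent-tree update\n");
	else
		printf("must apply %d ref update(s) to the extent tree\n", total);
	return 0;
}
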
@@ -3082,228 +2544,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3082{ 2544{
3083 int ret = 0; 2545 int ret = 0;
3084 struct btrfs_root *root = orig_root->fs_info->extent_root; 2546 struct btrfs_root *root = orig_root->fs_info->extent_root;
3085 u64 total_needed = num_bytes; 2547 struct btrfs_free_cluster *last_ptr = NULL;
3086 u64 *last_ptr = NULL;
3087 u64 last_wanted = 0;
3088 struct btrfs_block_group_cache *block_group = NULL; 2548 struct btrfs_block_group_cache *block_group = NULL;
3089 int chunk_alloc_done = 0;
3090 int empty_cluster = 2 * 1024 * 1024; 2549 int empty_cluster = 2 * 1024 * 1024;
3091 int allowed_chunk_alloc = 0; 2550 int allowed_chunk_alloc = 0;
3092 struct list_head *head = NULL, *cur = NULL;
3093 int loop = 0;
3094 int extra_loop = 0;
3095 struct btrfs_space_info *space_info; 2551 struct btrfs_space_info *space_info;
2552 int last_ptr_loop = 0;
2553 int loop = 0;
3096 2554
3097 WARN_ON(num_bytes < root->sectorsize); 2555 WARN_ON(num_bytes < root->sectorsize);
3098 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 2556 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
3099 ins->objectid = 0; 2557 ins->objectid = 0;
3100 ins->offset = 0; 2558 ins->offset = 0;
3101 2559
2560 space_info = __find_space_info(root->fs_info, data);
2561
3102 if (orig_root->ref_cows || empty_size) 2562 if (orig_root->ref_cows || empty_size)
3103 allowed_chunk_alloc = 1; 2563 allowed_chunk_alloc = 1;
3104 2564
3105 if (data & BTRFS_BLOCK_GROUP_METADATA) { 2565 if (data & BTRFS_BLOCK_GROUP_METADATA) {
3106 last_ptr = &root->fs_info->last_alloc; 2566 last_ptr = &root->fs_info->meta_alloc_cluster;
3107 if (!btrfs_test_opt(root, SSD)) 2567 if (!btrfs_test_opt(root, SSD))
3108 empty_cluster = 64 * 1024; 2568 empty_cluster = 64 * 1024;
3109 } 2569 }
3110 2570
3111 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 2571 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
3112 last_ptr = &root->fs_info->last_data_alloc; 2572 last_ptr = &root->fs_info->data_alloc_cluster;
2573 }
3113 2574
3114 if (last_ptr) { 2575 if (last_ptr) {
3115 if (*last_ptr) { 2576 spin_lock(&last_ptr->lock);
3116 hint_byte = *last_ptr; 2577 if (last_ptr->block_group)
3117 last_wanted = *last_ptr; 2578 hint_byte = last_ptr->window_start;
3118 } else 2579 spin_unlock(&last_ptr->lock);
3119 empty_size += empty_cluster;
3120 } else {
3121 empty_cluster = 0;
3122 } 2580 }
2581
3123 search_start = max(search_start, first_logical_byte(root, 0)); 2582 search_start = max(search_start, first_logical_byte(root, 0));
3124 search_start = max(search_start, hint_byte); 2583 search_start = max(search_start, hint_byte);
3125 2584
3126 if (last_wanted && search_start != last_wanted) { 2585 if (!last_ptr) {
3127 last_wanted = 0; 2586 empty_cluster = 0;
3128 empty_size += empty_cluster; 2587 loop = 1;
3129 } 2588 }
3130 2589
3131 total_needed += empty_size; 2590 if (search_start == hint_byte) {
3132 block_group = btrfs_lookup_block_group(root->fs_info, search_start); 2591 block_group = btrfs_lookup_block_group(root->fs_info,
3133 if (!block_group) 2592 search_start);
3134 block_group = btrfs_lookup_first_block_group(root->fs_info, 2593 if (block_group && block_group_bits(block_group, data)) {
3135 search_start); 2594 down_read(&space_info->groups_sem);
3136 space_info = __find_space_info(root->fs_info, data); 2595 goto have_block_group;
2596 } else if (block_group) {
2597 btrfs_put_block_group(block_group);
2598 }
2599 }
3137 2600
2601search:
3138 down_read(&space_info->groups_sem); 2602 down_read(&space_info->groups_sem);
3139 while (1) { 2603 list_for_each_entry(block_group, &space_info->block_groups, list) {
3140 struct btrfs_free_space *free_space; 2604 u64 offset;
3141 /*
3142 * the only way this happens if our hint points to a block
3143 * group thats not of the proper type, while looping this
3144 * should never happen
3145 */
3146 if (empty_size)
3147 extra_loop = 1;
3148 2605
3149 if (!block_group) 2606 atomic_inc(&block_group->count);
3150 goto new_group_no_lock; 2607 search_start = block_group->key.objectid;
3151 2608
2609have_block_group:
3152 if (unlikely(!block_group->cached)) { 2610 if (unlikely(!block_group->cached)) {
3153 mutex_lock(&block_group->cache_mutex); 2611 mutex_lock(&block_group->cache_mutex);
3154 ret = cache_block_group(root, block_group); 2612 ret = cache_block_group(root, block_group);
3155 mutex_unlock(&block_group->cache_mutex); 2613 mutex_unlock(&block_group->cache_mutex);
3156 if (ret) 2614 if (ret) {
2615 btrfs_put_block_group(block_group);
3157 break; 2616 break;
2617 }
3158 } 2618 }
3159 2619
3160 mutex_lock(&block_group->alloc_mutex);
3161 if (unlikely(!block_group_bits(block_group, data)))
3162 goto new_group;
3163
3164 if (unlikely(block_group->ro)) 2620 if (unlikely(block_group->ro))
3165 goto new_group; 2621 goto loop;
3166 2622
3167 free_space = btrfs_find_free_space(block_group, search_start, 2623 if (last_ptr) {
3168 total_needed); 2624 /*
3169 if (free_space) { 2625 * the refill lock keeps out other
3170 u64 start = block_group->key.objectid; 2626 * people trying to start a new cluster
3171 u64 end = block_group->key.objectid + 2627 */
3172 block_group->key.offset; 2628 spin_lock(&last_ptr->refill_lock);
2629 offset = btrfs_alloc_from_cluster(block_group, last_ptr,
2630 num_bytes, search_start);
2631 if (offset) {
2632 /* we have a block, we're done */
2633 spin_unlock(&last_ptr->refill_lock);
2634 goto checks;
2635 }
3173 2636
3174 search_start = stripe_align(root, free_space->offset); 2637 spin_lock(&last_ptr->lock);
2638 /*
2639 * whoops, this cluster doesn't actually point to
2640 * this block group. Get a ref on the block
2641	 * group it does point to and try again
2642 */
2643 if (!last_ptr_loop && last_ptr->block_group &&
2644 last_ptr->block_group != block_group) {
2645
2646 btrfs_put_block_group(block_group);
2647 block_group = last_ptr->block_group;
2648 atomic_inc(&block_group->count);
2649 spin_unlock(&last_ptr->lock);
2650 spin_unlock(&last_ptr->refill_lock);
2651
2652 last_ptr_loop = 1;
2653 search_start = block_group->key.objectid;
2654 goto have_block_group;
2655 }
2656 spin_unlock(&last_ptr->lock);
3175 2657
3176 /* move on to the next group */ 2658 /*
3177 if (search_start + num_bytes >= search_end) 2659 * this cluster didn't work out, free it and
3178 goto new_group; 2660 * start over
2661 */
2662 btrfs_return_cluster_to_free_space(NULL, last_ptr);
3179 2663
3180 /* move on to the next group */ 2664 last_ptr_loop = 0;
3181 if (search_start + num_bytes > end)
3182 goto new_group;
3183 2665
3184 if (last_wanted && search_start != last_wanted) { 2666 /* allocate a cluster in this block group */
3185 total_needed += empty_cluster; 2667 ret = btrfs_find_space_cluster(trans,
3186 empty_size += empty_cluster; 2668 block_group, last_ptr,
3187 last_wanted = 0; 2669 offset, num_bytes,
2670 empty_cluster + empty_size);
2671 if (ret == 0) {
3188 /* 2672 /*
3189 * if search_start is still in this block group 2673 * now pull our allocation out of this
3190 * then we just re-search this block group 2674 * cluster
3191 */ 2675 */
3192 if (search_start >= start && 2676 offset = btrfs_alloc_from_cluster(block_group,
3193 search_start < end) { 2677 last_ptr, num_bytes,
3194 mutex_unlock(&block_group->alloc_mutex); 2678 search_start);
3195 continue; 2679 if (offset) {
2680 /* we found one, proceed */
2681 spin_unlock(&last_ptr->refill_lock);
2682 goto checks;
3196 } 2683 }
3197
3198 /* else we go to the next block group */
3199 goto new_group;
3200 } 2684 }
3201 2685 /*
3202 if (exclude_nr > 0 && 2686 * at this point we either didn't find a cluster
3203 (search_start + num_bytes > exclude_start && 2687 * or we weren't able to allocate a block from our
3204 search_start < exclude_start + exclude_nr)) { 2688 * cluster. Free the cluster we've been trying
3205 search_start = exclude_start + exclude_nr; 2689 * to use, and go to the next block group
3206 /* 2690 */
3207 * if search_start is still in this block group 2691 if (loop < 2) {
3208 * then we just re-search this block group 2692 btrfs_return_cluster_to_free_space(NULL,
3209 */ 2693 last_ptr);
3210 if (search_start >= start && 2694 spin_unlock(&last_ptr->refill_lock);
3211 search_start < end) { 2695 goto loop;
3212 mutex_unlock(&block_group->alloc_mutex);
3213 last_wanted = 0;
3214 continue;
3215 }
3216
3217 /* else we go to the next block group */
3218 goto new_group;
3219 } 2696 }
2697 spin_unlock(&last_ptr->refill_lock);
2698 }
3220 2699
3221 ins->objectid = search_start; 2700 offset = btrfs_find_space_for_alloc(block_group, search_start,
3222 ins->offset = num_bytes; 2701 num_bytes, empty_size);
2702 if (!offset)
2703 goto loop;
2704checks:
2705 search_start = stripe_align(root, offset);
3223 2706
3224 btrfs_remove_free_space_lock(block_group, search_start, 2707 /* move on to the next group */
3225 num_bytes); 2708 if (search_start + num_bytes >= search_end) {
3226 /* we are all good, lets return */ 2709 btrfs_add_free_space(block_group, offset, num_bytes);
3227 mutex_unlock(&block_group->alloc_mutex); 2710 goto loop;
3228 break;
3229 } 2711 }
3230new_group:
3231 mutex_unlock(&block_group->alloc_mutex);
3232 put_block_group(block_group);
3233 block_group = NULL;
3234new_group_no_lock:
3235 /* don't try to compare new allocations against the
3236 * last allocation any more
3237 */
3238 last_wanted = 0;
3239 2712
3240 /* 2713 /* move on to the next group */
3241 * Here's how this works. 2714 if (search_start + num_bytes >
3242 * loop == 0: we were searching a block group via a hint 2715 block_group->key.objectid + block_group->key.offset) {
3243 * and didn't find anything, so we start at 2716 btrfs_add_free_space(block_group, offset, num_bytes);
3244 * the head of the block groups and keep searching 2717 goto loop;
3245 * loop == 1: we're searching through all of the block groups 2718 }
3246 * if we hit the head again we have searched 2719
3247 * all of the block groups for this space and we 2720 if (exclude_nr > 0 &&
3248	 * need to try and allocate; if we can't, error out. 2721	 (search_start + num_bytes > exclude_start &&
3249 * loop == 2: we allocated more space and are looping through 2722 search_start < exclude_start + exclude_nr)) {
3250 * all of the block groups again. 2723 search_start = exclude_start + exclude_nr;
3251 */ 2724
3252 if (loop == 0) { 2725 btrfs_add_free_space(block_group, offset, num_bytes);
3253 head = &space_info->block_groups; 2726 /*
3254 cur = head->next; 2727 * if search_start is still in this block group
3255 loop++; 2728 * then we just re-search this block group
3256 } else if (loop == 1 && cur == head) {
3257 int keep_going;
3258
3259 /* at this point we give up on the empty_size
3260 * allocations and just try to allocate the min
3261 * space.
3262 *
3263 * The extra_loop field was set if an empty_size
3264 * allocation was attempted above, and if this
3265	 * is set we need to try the loop again without
3266 * the additional empty_size.
3267 */ 2729 */
3268 total_needed -= empty_size; 2730 if (search_start >= block_group->key.objectid &&
3269 empty_size = 0; 2731 search_start < (block_group->key.objectid +
3270 keep_going = extra_loop; 2732 block_group->key.offset))
3271 loop++; 2733 goto have_block_group;
2734 goto loop;
2735 }
3272 2736
3273 if (allowed_chunk_alloc && !chunk_alloc_done) { 2737 ins->objectid = search_start;
3274 up_read(&space_info->groups_sem); 2738 ins->offset = num_bytes;
3275 ret = do_chunk_alloc(trans, root, num_bytes + 2739
3276 2 * 1024 * 1024, data, 1); 2740 if (offset < search_start)
3277 down_read(&space_info->groups_sem); 2741 btrfs_add_free_space(block_group, offset,
3278 if (ret < 0) 2742 search_start - offset);
3279 goto loop_check; 2743 BUG_ON(offset > search_start);
3280 head = &space_info->block_groups; 2744
3281 /* 2745 /* we are all good, lets return */
3282 * we've allocated a new chunk, keep 2746 break;
3283 * trying 2747loop:
3284 */ 2748 btrfs_put_block_group(block_group);
3285 keep_going = 1; 2749 }
3286 chunk_alloc_done = 1; 2750 up_read(&space_info->groups_sem);
3287 } else if (!allowed_chunk_alloc) { 2751
3288 space_info->force_alloc = 1; 2752 /* loop == 0, try to find a clustered alloc in every block group
3289 } 2753 * loop == 1, try again after forcing a chunk allocation
3290loop_check: 2754 * loop == 2, set empty_size and empty_cluster to 0 and try again
3291 if (keep_going) { 2755 */
3292 cur = head->next; 2756 if (!ins->objectid && loop < 3 &&
3293 extra_loop = 0; 2757 (empty_size || empty_cluster || allowed_chunk_alloc)) {
3294 } else { 2758 if (loop >= 2) {
3295 break; 2759 empty_size = 0;
3296 } 2760 empty_cluster = 0;
3297 } else if (cur == head) {
3298 break;
3299 } 2761 }
3300 2762
3301 block_group = list_entry(cur, struct btrfs_block_group_cache, 2763 if (allowed_chunk_alloc) {
3302 list); 2764 ret = do_chunk_alloc(trans, root, num_bytes +
3303 atomic_inc(&block_group->count); 2765 2 * 1024 * 1024, data, 1);
2766 allowed_chunk_alloc = 0;
2767 } else {
2768 space_info->force_alloc = 1;
2769 }
3304 2770
3305 search_start = block_group->key.objectid; 2771 if (loop < 3) {
3306 cur = cur->next; 2772 loop++;
2773 goto search;
2774 }
2775 ret = -ENOSPC;
2776 } else if (!ins->objectid) {
2777 ret = -ENOSPC;
3307 } 2778 }
3308 2779
3309 /* we found what we needed */ 2780 /* we found what we needed */
@@ -3311,21 +2782,10 @@ loop_check:
3311 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 2782 if (!(data & BTRFS_BLOCK_GROUP_DATA))
3312 trans->block_group = block_group->key.objectid; 2783 trans->block_group = block_group->key.objectid;
3313 2784
3314 if (last_ptr) 2785 btrfs_put_block_group(block_group);
3315 *last_ptr = ins->objectid + ins->offset;
3316 ret = 0; 2786 ret = 0;
3317 } else if (!ret) {
3318 printk(KERN_ERR "btrfs searching for %llu bytes, "
3319 "num_bytes %llu, loop %d, allowed_alloc %d\n",
3320 (unsigned long long)total_needed,
3321 (unsigned long long)num_bytes,
3322 loop, allowed_chunk_alloc);
3323 ret = -ENOSPC;
3324 } 2787 }
3325 if (block_group)
3326 put_block_group(block_group);
3327 2788
3328 up_read(&space_info->groups_sem);
3329 return ret; 2789 return ret;
3330} 2790}
3331 2791
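
The rewritten allocator escalates instead of juggling cursors: pass 0 tries a clustered allocation in each block group, pass 1 retries after forcing a chunk allocation (when allowed), and pass 2 drops the empty_size/empty_cluster padding before returning -ENOSPC. A compilable sketch of just that retry policy (try_all_groups() is a hypothetical stand-in for one sweep over the block groups):

#include <errno.h>
#include <stdbool.h>

/* hypothetical: one sweep over every block group */
static bool try_all_groups(unsigned long long size, unsigned long long pad,
			   bool force_chunk_alloc)
{
	(void)size; (void)pad; (void)force_chunk_alloc;
	return false;   /* stub so the sketch compiles */
}

static int alloc_policy(unsigned long long size, unsigned long long pad,
			bool may_alloc_chunk)
{
	for (int loop = 0; loop < 3; loop++) {
		if (loop >= 2)
			pad = 0;        /* last chance: no padding */
		if (try_all_groups(size, pad,
				   loop == 1 && may_alloc_chunk))
			return 0;
	}
	return -ENOSPC;
}
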
@@ -3430,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3430 ret = btrfs_discard_extent(root, start, len); 2890 ret = btrfs_discard_extent(root, start, len);
3431 2891
3432 btrfs_add_free_space(cache, start, len); 2892 btrfs_add_free_space(cache, start, len);
3433 put_block_group(cache); 2893 btrfs_put_block_group(cache);
3434 update_reserved_extents(root, start, len, 0); 2894 update_reserved_extents(root, start, len, 0);
3435 2895
3436 return ret; 2896 return ret;
@@ -3454,10 +2914,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3454static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 2914static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3455 struct btrfs_root *root, u64 parent, 2915 struct btrfs_root *root, u64 parent,
3456 u64 root_objectid, u64 ref_generation, 2916 u64 root_objectid, u64 ref_generation,
3457 u64 owner, struct btrfs_key *ins) 2917 u64 owner, struct btrfs_key *ins,
2918 int ref_mod)
3458{ 2919{
3459 int ret; 2920 int ret;
3460 int pending_ret;
3461 u64 super_used; 2921 u64 super_used;
3462 u64 root_used; 2922 u64 root_used;
3463 u64 num_bytes = ins->offset; 2923 u64 num_bytes = ins->offset;
@@ -3482,33 +2942,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3482 btrfs_set_root_used(&root->root_item, root_used + num_bytes); 2942 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3483 spin_unlock(&info->delalloc_lock); 2943 spin_unlock(&info->delalloc_lock);
3484 2944
3485 if (root == extent_root) {
3486 struct pending_extent_op *extent_op;
3487
3488 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3489 BUG_ON(!extent_op);
3490
3491 extent_op->type = PENDING_EXTENT_INSERT;
3492 extent_op->bytenr = ins->objectid;
3493 extent_op->num_bytes = ins->offset;
3494 extent_op->parent = parent;
3495 extent_op->orig_parent = 0;
3496 extent_op->generation = ref_generation;
3497 extent_op->orig_generation = 0;
3498 extent_op->level = (int)owner;
3499 INIT_LIST_HEAD(&extent_op->list);
3500 extent_op->del = 0;
3501
3502 mutex_lock(&root->fs_info->extent_ins_mutex);
3503 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3504 ins->objectid + ins->offset - 1,
3505 EXTENT_WRITEBACK, GFP_NOFS);
3506 set_state_private(&root->fs_info->extent_ins,
3507 ins->objectid, (unsigned long)extent_op);
3508 mutex_unlock(&root->fs_info->extent_ins_mutex);
3509 goto update_block;
3510 }
3511
3512 memcpy(&keys[0], ins, sizeof(*ins)); 2945 memcpy(&keys[0], ins, sizeof(*ins));
3513 keys[1].objectid = ins->objectid; 2946 keys[1].objectid = ins->objectid;
3514 keys[1].type = BTRFS_EXTENT_REF_KEY; 2947 keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3519,37 +2952,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3519 path = btrfs_alloc_path(); 2952 path = btrfs_alloc_path();
3520 BUG_ON(!path); 2953 BUG_ON(!path);
3521 2954
2955 path->leave_spinning = 1;
3522 ret = btrfs_insert_empty_items(trans, extent_root, path, keys, 2956 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3523 sizes, 2); 2957 sizes, 2);
3524 BUG_ON(ret); 2958 BUG_ON(ret);
3525 2959
3526 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2960 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3527 struct btrfs_extent_item); 2961 struct btrfs_extent_item);
3528 btrfs_set_extent_refs(path->nodes[0], extent_item, 1); 2962 btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
3529 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 2963 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3530 struct btrfs_extent_ref); 2964 struct btrfs_extent_ref);
3531 2965
3532 btrfs_set_ref_root(path->nodes[0], ref, root_objectid); 2966 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3533 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); 2967 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3534 btrfs_set_ref_objectid(path->nodes[0], ref, owner); 2968 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3535 btrfs_set_ref_num_refs(path->nodes[0], ref, 1); 2969 btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
3536 2970
3537 btrfs_mark_buffer_dirty(path->nodes[0]); 2971 btrfs_mark_buffer_dirty(path->nodes[0]);
3538 2972
3539 trans->alloc_exclude_start = 0; 2973 trans->alloc_exclude_start = 0;
3540 trans->alloc_exclude_nr = 0; 2974 trans->alloc_exclude_nr = 0;
3541 btrfs_free_path(path); 2975 btrfs_free_path(path);
3542 finish_current_insert(trans, extent_root, 0);
3543 pending_ret = del_pending_extents(trans, extent_root, 0);
3544 2976
3545 if (ret) 2977 if (ret)
3546 goto out; 2978 goto out;
3547 if (pending_ret) {
3548 ret = pending_ret;
3549 goto out;
3550 }
3551 2979
3552update_block:
3553 ret = update_block_group(trans, root, ins->objectid, 2980 ret = update_block_group(trans, root, ins->objectid,
3554 ins->offset, 1, 0); 2981 ins->offset, 1, 0);
3555 if (ret) { 2982 if (ret) {
@@ -3571,9 +2998,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3571 2998
3572 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) 2999 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3573 return 0; 3000 return 0;
3574 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3001
3575 ref_generation, owner, ins); 3002 ret = btrfs_add_delayed_ref(trans, ins->objectid,
3576 update_reserved_extents(root, ins->objectid, ins->offset, 0); 3003 ins->offset, parent, root_objectid,
3004 ref_generation, owner,
3005 BTRFS_ADD_DELAYED_EXTENT, 0);
3006 BUG_ON(ret);
3577 return ret; 3007 return ret;
3578} 3008}
3579 3009
@@ -3598,9 +3028,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3598 ret = btrfs_remove_free_space(block_group, ins->objectid, 3028 ret = btrfs_remove_free_space(block_group, ins->objectid,
3599 ins->offset); 3029 ins->offset);
3600 BUG_ON(ret); 3030 BUG_ON(ret);
3601 put_block_group(block_group); 3031 btrfs_put_block_group(block_group);
3602 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3032 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3603 ref_generation, owner, ins); 3033 ref_generation, owner, ins, 1);
3604 return ret; 3034 return ret;
3605} 3035}
3606 3036
@@ -3619,20 +3049,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3619 u64 search_end, struct btrfs_key *ins, u64 data) 3049 u64 search_end, struct btrfs_key *ins, u64 data)
3620{ 3050{
3621 int ret; 3051 int ret;
3622
3623 ret = __btrfs_reserve_extent(trans, root, num_bytes, 3052 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3624 min_alloc_size, empty_size, hint_byte, 3053 min_alloc_size, empty_size, hint_byte,
3625 search_end, ins, data); 3054 search_end, ins, data);
3626 BUG_ON(ret); 3055 BUG_ON(ret);
3627 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 3056 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3628 ret = __btrfs_alloc_reserved_extent(trans, root, parent, 3057 ret = btrfs_add_delayed_ref(trans, ins->objectid,
3629 root_objectid, ref_generation, 3058 ins->offset, parent, root_objectid,
3630 owner_objectid, ins); 3059 ref_generation, owner_objectid,
3060 BTRFS_ADD_DELAYED_EXTENT, 0);
3631 BUG_ON(ret); 3061 BUG_ON(ret);
3632
3633 } else {
3634 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3635 } 3062 }
3063 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3636 return ret; 3064 return ret;
3637} 3065}
3638 3066
@@ -3768,7 +3196,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3768 3196
3769 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 3197 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3770 3198
3771 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3199 ret = btrfs_free_extent(trans, root, disk_bytenr,
3772 btrfs_file_extent_disk_num_bytes(leaf, fi), 3200 btrfs_file_extent_disk_num_bytes(leaf, fi),
3773 leaf->start, leaf_owner, leaf_generation, 3201 leaf->start, leaf_owner, leaf_generation,
3774 key.objectid, 0); 3202 key.objectid, 0);
@@ -3808,7 +3236,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3808 */ 3236 */
3809 for (i = 0; i < ref->nritems; i++) { 3237 for (i = 0; i < ref->nritems; i++) {
3810 info = ref->extents + sorted[i].slot; 3238 info = ref->extents + sorted[i].slot;
3811 ret = __btrfs_free_extent(trans, root, info->bytenr, 3239 ret = btrfs_free_extent(trans, root, info->bytenr,
3812 info->num_bytes, ref->bytenr, 3240 info->num_bytes, ref->bytenr,
3813 ref->owner, ref->generation, 3241 ref->owner, ref->generation,
3814 info->objectid, 0); 3242 info->objectid, 0);
@@ -3825,12 +3253,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3825 return 0; 3253 return 0;
3826} 3254}
3827 3255
3828static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, 3256static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3257 struct btrfs_root *root, u64 start,
3829 u64 len, u32 *refs) 3258 u64 len, u32 *refs)
3830{ 3259{
3831 int ret; 3260 int ret;
3832 3261
3833 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); 3262 ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
3834 BUG_ON(ret); 3263 BUG_ON(ret);
3835 3264
3836#if 0 /* some debugging code in case we see problems here */ 3265#if 0 /* some debugging code in case we see problems here */
@@ -3938,7 +3367,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3938 * we just decrement it below and don't update any 3367 * we just decrement it below and don't update any
3939 * of the refs the leaf points to. 3368 * of the refs the leaf points to.
3940 */ 3369 */
3941 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3370 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3371 blocksize, &refs);
3942 BUG_ON(ret); 3372 BUG_ON(ret);
3943 if (refs != 1) 3373 if (refs != 1)
3944 continue; 3374 continue;
@@ -3989,7 +3419,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3989 */ 3419 */
3990 for (i = 0; i < refi; i++) { 3420 for (i = 0; i < refi; i++) {
3991 bytenr = sorted[i].bytenr; 3421 bytenr = sorted[i].bytenr;
3992 ret = __btrfs_free_extent(trans, root, bytenr, 3422 ret = btrfs_free_extent(trans, root, bytenr,
3993 blocksize, eb->start, 3423 blocksize, eb->start,
3994 root_owner, root_gen, 0, 1); 3424 root_owner, root_gen, 0, 1);
3995 BUG_ON(ret); 3425 BUG_ON(ret);
@@ -4032,7 +3462,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4032 3462
4033 WARN_ON(*level < 0); 3463 WARN_ON(*level < 0);
4034 WARN_ON(*level >= BTRFS_MAX_LEVEL); 3464 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4035 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, 3465 ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
4036 path->nodes[*level]->len, &refs); 3466 path->nodes[*level]->len, &refs);
4037 BUG_ON(ret); 3467 BUG_ON(ret);
4038 if (refs > 1) 3468 if (refs > 1)
@@ -4083,7 +3513,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4083 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3513 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4084 blocksize = btrfs_level_size(root, *level - 1); 3514 blocksize = btrfs_level_size(root, *level - 1);
4085 3515
4086 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 3516 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3517 blocksize, &refs);
4087 BUG_ON(ret); 3518 BUG_ON(ret);
4088 3519
4089 /* 3520 /*
@@ -4098,7 +3529,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4098 root_gen = btrfs_header_generation(parent); 3529 root_gen = btrfs_header_generation(parent);
4099 path->slots[*level]++; 3530 path->slots[*level]++;
4100 3531
4101 ret = __btrfs_free_extent(trans, root, bytenr, 3532 ret = btrfs_free_extent(trans, root, bytenr,
4102 blocksize, parent->start, 3533 blocksize, parent->start,
4103 root_owner, root_gen, 3534 root_owner, root_gen,
4104 *level - 1, 1); 3535 *level - 1, 1);
@@ -4144,7 +3575,7 @@ out:
4144 * cleanup and free the reference on the last node 3575 * cleanup and free the reference on the last node
4145 * we processed 3576 * we processed
4146 */ 3577 */
4147 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 3578 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4148 parent->start, root_owner, root_gen, 3579 parent->start, root_owner, root_gen,
4149 *level, 1); 3580 *level, 1);
4150 free_extent_buffer(path->nodes[*level]); 3581 free_extent_buffer(path->nodes[*level]);
@@ -4333,6 +3764,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4333 struct btrfs_path *path; 3764 struct btrfs_path *path;
4334 int i; 3765 int i;
4335 int orig_level; 3766 int orig_level;
3767 int update_count;
4336 struct btrfs_root_item *root_item = &root->root_item; 3768 struct btrfs_root_item *root_item = &root->root_item;
4337 3769
4338 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); 3770 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4374,6 +3806,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4374 } 3806 }
4375 } 3807 }
4376 while (1) { 3808 while (1) {
3809 unsigned long update;
4377 wret = walk_down_tree(trans, root, path, &level); 3810 wret = walk_down_tree(trans, root, path, &level);
4378 if (wret > 0) 3811 if (wret > 0)
4379 break; 3812 break;
@@ -4386,12 +3819,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4386 break; 3819 break;
4387 if (wret < 0) 3820 if (wret < 0)
4388 ret = wret; 3821 ret = wret;
4389 if (trans->transaction->in_commit) { 3822 if (trans->transaction->in_commit ||
3823 trans->transaction->delayed_refs.flushing) {
4390 ret = -EAGAIN; 3824 ret = -EAGAIN;
4391 break; 3825 break;
4392 } 3826 }
4393 atomic_inc(&root->fs_info->throttle_gen); 3827 atomic_inc(&root->fs_info->throttle_gen);
4394 wake_up(&root->fs_info->transaction_throttle); 3828 wake_up(&root->fs_info->transaction_throttle);
3829 for (update_count = 0; update_count < 16; update_count++) {
3830 update = trans->delayed_ref_updates;
3831 trans->delayed_ref_updates = 0;
3832 if (update)
3833 btrfs_run_delayed_refs(trans, root, update);
3834 else
3835 break;
3836 }
4395 } 3837 }
4396 for (i = 0; i <= orig_level; i++) { 3838 for (i = 0; i <= orig_level; i++) {
4397 if (path->nodes[i]) { 3839 if (path->nodes[i]) {
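The loop added at the end of the previous hunk keeps btrfs_drop_snapshot() from accumulating an unbounded backlog of delayed reference updates: each pass of the outer walk drains whatever has queued up, in at most 16 rounds, stopping early once the count hits zero. A minimal user-space sketch of that pattern (pending_updates and run_updates() are illustrative stand-ins, not btrfs symbols):

	#include <stdio.h>

	static unsigned long pending_updates = 42;

	static void run_updates(unsigned long count)
	{
		/* models btrfs_run_delayed_refs(trans, root, count) */
		printf("ran %lu queued updates\n", count);
	}

	static void drain_rounds(void)
	{
		int round;

		for (round = 0; round < 16; round++) {
			unsigned long update = pending_updates;

			pending_updates = 0;
			if (!update)
				break;		/* backlog empty, stop early */
			run_updates(update);	/* may queue more work */
		}
	}

	int main(void)
	{
		drain_rounds();
		return 0;
	}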
@@ -4418,13 +3860,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
4418 path = btrfs_alloc_path(); 3860 path = btrfs_alloc_path();
4419 BUG_ON(!path); 3861 BUG_ON(!path);
4420 3862
4421 BUG_ON(!btrfs_tree_locked(parent)); 3863 btrfs_assert_tree_locked(parent);
4422 parent_level = btrfs_header_level(parent); 3864 parent_level = btrfs_header_level(parent);
4423 extent_buffer_get(parent); 3865 extent_buffer_get(parent);
4424 path->nodes[parent_level] = parent; 3866 path->nodes[parent_level] = parent;
4425 path->slots[parent_level] = btrfs_header_nritems(parent); 3867 path->slots[parent_level] = btrfs_header_nritems(parent);
4426 3868
4427 BUG_ON(!btrfs_tree_locked(node)); 3869 btrfs_assert_tree_locked(node);
4428 level = btrfs_header_level(node); 3870 level = btrfs_header_level(node);
4429 extent_buffer_get(node); 3871 extent_buffer_get(node);
4430 path->nodes[level] = node; 3872 path->nodes[level] = node;
@@ -5436,6 +4878,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
5436 root->root_key.objectid, 4878 root->root_key.objectid,
5437 trans->transid, key.objectid); 4879 trans->transid, key.objectid);
5438 BUG_ON(ret); 4880 BUG_ON(ret);
4881
5439 ret = btrfs_free_extent(trans, root, 4882 ret = btrfs_free_extent(trans, root,
5440 bytenr, num_bytes, leaf->start, 4883 bytenr, num_bytes, leaf->start,
5441 btrfs_header_owner(leaf), 4884 btrfs_header_owner(leaf),
@@ -5747,9 +5190,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5747 ref_path, NULL, NULL); 5190 ref_path, NULL, NULL);
5748 BUG_ON(ret); 5191 BUG_ON(ret);
5749 5192
5750 if (root == root->fs_info->extent_root)
5751 btrfs_extent_post_op(trans, root);
5752
5753 return 0; 5193 return 0;
5754} 5194}
5755 5195
@@ -6017,6 +5457,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
6017 if (!path) 5457 if (!path)
6018 return -ENOMEM; 5458 return -ENOMEM;
6019 5459
5460 path->leave_spinning = 1;
6020 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 5461 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6021 if (ret) 5462 if (ret)
6022 goto out; 5463 goto out;
@@ -6187,6 +5628,9 @@ again:
6187 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); 5628 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
6188 mutex_unlock(&root->fs_info->cleaner_mutex); 5629 mutex_unlock(&root->fs_info->cleaner_mutex);
6189 5630
5631 trans = btrfs_start_transaction(info->tree_root, 1);
5632 btrfs_commit_transaction(trans, info->tree_root);
5633
6190 while (1) { 5634 while (1) {
6191 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5635 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6192 if (ret < 0) 5636 if (ret < 0)
@@ -6273,7 +5717,7 @@ next:
6273 WARN_ON(block_group->reserved > 0); 5717 WARN_ON(block_group->reserved > 0);
6274 WARN_ON(btrfs_block_group_used(&block_group->item) > 0); 5718 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
6275 spin_unlock(&block_group->lock); 5719 spin_unlock(&block_group->lock);
6276 put_block_group(block_group); 5720 btrfs_put_block_group(block_group);
6277 ret = 0; 5721 ret = 0;
6278out: 5722out:
6279 btrfs_free_path(path); 5723 btrfs_free_path(path);
@@ -6320,6 +5764,7 @@ out:
6320int btrfs_free_block_groups(struct btrfs_fs_info *info) 5764int btrfs_free_block_groups(struct btrfs_fs_info *info)
6321{ 5765{
6322 struct btrfs_block_group_cache *block_group; 5766 struct btrfs_block_group_cache *block_group;
5767 struct btrfs_space_info *space_info;
6323 struct rb_node *n; 5768 struct rb_node *n;
6324 5769
6325 spin_lock(&info->block_group_cache_lock); 5770 spin_lock(&info->block_group_cache_lock);
@@ -6341,6 +5786,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6341 spin_lock(&info->block_group_cache_lock); 5786 spin_lock(&info->block_group_cache_lock);
6342 } 5787 }
6343 spin_unlock(&info->block_group_cache_lock); 5788 spin_unlock(&info->block_group_cache_lock);
5789
5790 /* now that all the block groups are freed, go through and
5791 * free all the space_info structs. This is only called during
5792 * the final stages of unmount, and so we know nobody is
5793 * using them. We call synchronize_rcu() once before we start,
5794 * just to be on the safe side.
5795 */
5796 synchronize_rcu();
5797
5798 while(!list_empty(&info->space_info)) {
5799 space_info = list_entry(info->space_info.next,
5800 struct btrfs_space_info,
5801 list);
5802
5803 list_del(&space_info->list);
5804 kfree(space_info);
5805 }
6344 return 0; 5806 return 0;
6345} 5807}
6346 5808
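The unmount-time teardown above is safe only because it runs after every block group has been freed; the single synchronize_rcu() lets any lingering RCU readers of the space_info list finish before the structs go away. The drain itself is a plain pop-and-free, modeled here in user-space C (struct node and the list head are assumptions standing in for the kernel list helpers):

	#include <stdlib.h>

	struct node {
		struct node *next;
	};

	static struct node *space_infos;	/* stand-in for info->space_info */

	/* pop-and-free until empty, mirroring the while(!list_empty()) loop */
	static void drain_space_infos(void)
	{
		while (space_infos) {
			struct node *n = space_infos;

			space_infos = n->next;	/* models list_del() */
			free(n);
		}
	}

	int main(void)
	{
		space_infos = calloc(1, sizeof(*space_infos));
		drain_space_infos();
		return 0;
	}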
@@ -6382,9 +5844,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
6382 5844
6383 atomic_set(&cache->count, 1); 5845 atomic_set(&cache->count, 1);
6384 spin_lock_init(&cache->lock); 5846 spin_lock_init(&cache->lock);
6385 mutex_init(&cache->alloc_mutex); 5847 spin_lock_init(&cache->tree_lock);
6386 mutex_init(&cache->cache_mutex); 5848 mutex_init(&cache->cache_mutex);
6387 INIT_LIST_HEAD(&cache->list); 5849 INIT_LIST_HEAD(&cache->list);
5850 INIT_LIST_HEAD(&cache->cluster_list);
6388 read_extent_buffer(leaf, &cache->item, 5851 read_extent_buffer(leaf, &cache->item,
6389 btrfs_item_ptr_offset(leaf, path->slots[0]), 5852 btrfs_item_ptr_offset(leaf, path->slots[0]),
6390 sizeof(cache->item)); 5853 sizeof(cache->item));
@@ -6427,7 +5890,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6427 5890
6428 extent_root = root->fs_info->extent_root; 5891 extent_root = root->fs_info->extent_root;
6429 5892
6430 root->fs_info->last_trans_new_blockgroup = trans->transid; 5893 root->fs_info->last_trans_log_full_commit = trans->transid;
6431 5894
6432 cache = kzalloc(sizeof(*cache), GFP_NOFS); 5895 cache = kzalloc(sizeof(*cache), GFP_NOFS);
6433 if (!cache) 5896 if (!cache)
@@ -6438,9 +5901,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6438 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 5901 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
6439 atomic_set(&cache->count, 1); 5902 atomic_set(&cache->count, 1);
6440 spin_lock_init(&cache->lock); 5903 spin_lock_init(&cache->lock);
6441 mutex_init(&cache->alloc_mutex); 5904 spin_lock_init(&cache->tree_lock);
6442 mutex_init(&cache->cache_mutex); 5905 mutex_init(&cache->cache_mutex);
6443 INIT_LIST_HEAD(&cache->list); 5906 INIT_LIST_HEAD(&cache->list);
5907 INIT_LIST_HEAD(&cache->cluster_list);
6444 5908
6445 btrfs_set_block_group_used(&cache->item, bytes_used); 5909 btrfs_set_block_group_used(&cache->item, bytes_used);
6446 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 5910 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -6461,9 +5925,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6461 sizeof(cache->item)); 5925 sizeof(cache->item));
6462 BUG_ON(ret); 5926 BUG_ON(ret);
6463 5927
6464 finish_current_insert(trans, extent_root, 0);
6465 ret = del_pending_extents(trans, extent_root, 0);
6466 BUG_ON(ret);
6467 set_avail_alloc_bits(extent_root->fs_info, type); 5928 set_avail_alloc_bits(extent_root->fs_info, type);
6468 5929
6469 return 0; 5930 return 0;
@@ -6503,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6503 spin_unlock(&block_group->space_info->lock); 5964 spin_unlock(&block_group->space_info->lock);
6504 block_group->space_info->full = 0; 5965 block_group->space_info->full = 0;
6505 5966
6506 put_block_group(block_group); 5967 btrfs_put_block_group(block_group);
6507 put_block_group(block_group); 5968 btrfs_put_block_group(block_group);
6508 5969
6509 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 5970 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
6510 if (ret > 0) 5971 if (ret > 0)
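Throughout this file the open-coded put_block_group() becomes the exported btrfs_put_block_group(), and the removal path above drops two references: the one taken by the lookup and the one held by the block-group tree itself. A user-space sketch of what such a refcount helper plausibly looks like (the struct layout is an assumption, not code from this patch):

	#include <stdatomic.h>
	#include <stdlib.h>

	struct block_group {
		atomic_int count;	/* one reference held by the tree */
		/* ... cached free-space state ... */
	};

	/* drop one reference; the last put frees the structure */
	static void put_block_group(struct block_group *bg)
	{
		if (atomic_fetch_sub(&bg->count, 1) == 1)
			free(bg);
	}

	int main(void)
	{
		struct block_group *bg = malloc(sizeof(*bg));

		atomic_init(&bg->count, 2);	/* tree ref + lookup ref */
		put_block_group(bg);		/* drops the lookup ref */
		put_block_group(bg);		/* drops the tree ref, frees */
		return 0;
	}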
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29e6069..eb2bee8b7fbf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2884,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2884 disko = 0; 2884 disko = 0;
2885 flags = 0; 2885 flags = 0;
2886 2886
2887 switch (em->block_start) { 2887 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2888 case EXTENT_MAP_LAST_BYTE:
2889 end = 1; 2888 end = 1;
2890 flags |= FIEMAP_EXTENT_LAST; 2889 flags |= FIEMAP_EXTENT_LAST;
2891 break; 2890 } else if (em->block_start == EXTENT_MAP_HOLE) {
2892 case EXTENT_MAP_HOLE:
2893 flags |= FIEMAP_EXTENT_UNWRITTEN; 2891 flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 break; 2892 } else if (em->block_start == EXTENT_MAP_INLINE) {
2895 case EXTENT_MAP_INLINE:
2896 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2893 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 FIEMAP_EXTENT_NOT_ALIGNED); 2894 FIEMAP_EXTENT_NOT_ALIGNED);
2898 break; 2895 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2899 case EXTENT_MAP_DELALLOC:
2900 flags |= (FIEMAP_EXTENT_DELALLOC | 2896 flags |= (FIEMAP_EXTENT_DELALLOC |
2901 FIEMAP_EXTENT_UNKNOWN); 2897 FIEMAP_EXTENT_UNKNOWN);
2902 break; 2898 } else {
2903 default:
2904 disko = em->block_start; 2899 disko = em->block_start;
2905 break;
2906 } 2900 }
2907 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2901 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 flags |= FIEMAP_EXTENT_ENCODED; 2902 flags |= FIEMAP_EXTENT_ENCODED;
@@ -3124,20 +3118,15 @@ void free_extent_buffer(struct extent_buffer *eb)
3124int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3118int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3125 struct extent_buffer *eb) 3119 struct extent_buffer *eb)
3126{ 3120{
3127 int set;
3128 unsigned long i; 3121 unsigned long i;
3129 unsigned long num_pages; 3122 unsigned long num_pages;
3130 struct page *page; 3123 struct page *page;
3131 3124
3132 u64 start = eb->start;
3133 u64 end = start + eb->len - 1;
3134
3135 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3136 num_pages = num_extent_pages(eb->start, eb->len); 3125 num_pages = num_extent_pages(eb->start, eb->len);
3137 3126
3138 for (i = 0; i < num_pages; i++) { 3127 for (i = 0; i < num_pages; i++) {
3139 page = extent_buffer_page(eb, i); 3128 page = extent_buffer_page(eb, i);
3140 if (!set && !PageDirty(page)) 3129 if (!PageDirty(page))
3141 continue; 3130 continue;
3142 3131
3143 lock_page(page); 3132 lock_page(page);
@@ -3146,22 +3135,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3146 else 3135 else
3147 set_page_private(page, EXTENT_PAGE_PRIVATE); 3136 set_page_private(page, EXTENT_PAGE_PRIVATE);
3148 3137
3149 /*
3150 * if we're on the last page or the first page and the
3151 * block isn't aligned on a page boundary, do extra checks
3152 * to make sure we don't clean page that is partially dirty
3153 */
3154 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3155 ((i == num_pages - 1) &&
3156 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3157 start = (u64)page->index << PAGE_CACHE_SHIFT;
3158 end = start + PAGE_CACHE_SIZE - 1;
3159 if (test_range_bit(tree, start, end,
3160 EXTENT_DIRTY, 0)) {
3161 unlock_page(page);
3162 continue;
3163 }
3164 }
3165 clear_page_dirty_for_io(page); 3138 clear_page_dirty_for_io(page);
3166 spin_lock_irq(&page->mapping->tree_lock); 3139 spin_lock_irq(&page->mapping->tree_lock);
3167 if (!PageDirty(page)) { 3140 if (!PageDirty(page)) {
@@ -3187,29 +3160,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3187{ 3160{
3188 unsigned long i; 3161 unsigned long i;
3189 unsigned long num_pages; 3162 unsigned long num_pages;
3163 int was_dirty = 0;
3190 3164
3165 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3191 num_pages = num_extent_pages(eb->start, eb->len); 3166 num_pages = num_extent_pages(eb->start, eb->len);
3192 for (i = 0; i < num_pages; i++) { 3167 for (i = 0; i < num_pages; i++)
3193 struct page *page = extent_buffer_page(eb, i);
3194 /* writepage may need to do something special for the
3195 * first page, we have to make sure page->private is
3196 * properly set. releasepage may drop page->private
3197 * on us if the page isn't already dirty.
3198 */
3199 lock_page(page);
3200 if (i == 0) {
3201 set_page_extent_head(page, eb->len);
3202 } else if (PagePrivate(page) &&
3203 page->private != EXTENT_PAGE_PRIVATE) {
3204 set_page_extent_mapped(page);
3205 }
3206 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3168 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3207 set_extent_dirty(tree, page_offset(page), 3169 return was_dirty;
3208 page_offset(page) + PAGE_CACHE_SIZE - 1,
3209 GFP_NOFS);
3210 unlock_page(page);
3211 }
3212 return 0;
3213} 3170}
3214 3171
3215int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3172int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3746,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3789 ret = 0; 3746 ret = 0;
3790 goto out; 3747 goto out;
3791 } 3748 }
3749 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3750 ret = 0;
3751 goto out;
3752 }
3792 /* at this point we can safely release the extent buffer */ 3753 /* at this point we can safely release the extent buffer */
3793 num_pages = num_extent_pages(eb->start, eb->len); 3754 num_pages = num_extent_pages(eb->start, eb->len);
3794 for (i = 0; i < num_pages; i++) 3755 for (i = 0; i < num_pages; i++)
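Buffer dirtiness now lives in one EXTENT_BUFFER_DIRTY flag instead of per-range bits in the io tree, so set_extent_buffer_dirty() can report the old state with a single test_and_set_bit() and try_release_extent_buffer() refuses to tear down a dirty buffer. A self-contained model of that protocol (the names and the non-atomic flag word are illustrative only):

	#include <stdbool.h>
	#include <stdio.h>

	#define EB_DIRTY (1u << 2)	/* models the EXTENT_BUFFER_DIRTY bit */

	struct ebuf {
		unsigned int bflags;
	};

	/* returns the previous dirty state, like test_and_set_bit() */
	static bool eb_set_dirty(struct ebuf *eb)
	{
		bool was_dirty = eb->bflags & EB_DIRTY;

		eb->bflags |= EB_DIRTY;
		return was_dirty;
	}

	/* releasing the buffer is only legal while it is clean */
	static bool eb_try_release(struct ebuf *eb)
	{
		return !(eb->bflags & EB_DIRTY);
	}

	int main(void)
	{
		struct ebuf eb = { 0 };

		printf("was dirty: %d\n", eb_set_dirty(&eb));	  /* 0 */
		printf("can release: %d\n", eb_try_release(&eb)); /* 0 */
		return 0;
	}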
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf6..5bc20abf3f3d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
25/* these are bit numbers for test/set bit */ 25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0 26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1 27#define EXTENT_BUFFER_BLOCKING 1
28#define EXTENT_BUFFER_DIRTY 2
28 29
29/* 30/*
30 * page->private values. Every page that is controlled by the extent 31 * page->private values. Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
254 struct extent_buffer *eb); 255 struct extent_buffer *eb);
255int set_extent_buffer_dirty(struct extent_io_tree *tree, 256int set_extent_buffer_dirty(struct extent_io_tree *tree,
256 struct extent_buffer *eb); 257 struct extent_buffer *eb);
258int test_extent_buffer_dirty(struct extent_io_tree *tree,
259 struct extent_buffer *eb);
257int set_extent_buffer_uptodate(struct extent_io_tree *tree, 260int set_extent_buffer_uptodate(struct extent_io_tree *tree,
258 struct extent_buffer *eb); 261 struct extent_buffer *eb);
259int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 262int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 50da69da20ce..b187917b36fa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
234 rb = tree_insert(&tree->map, em->start, &em->rb_node); 234 rb = tree_insert(&tree->map, em->start, &em->rb_node);
235 if (rb) { 235 if (rb) {
236 ret = -EEXIST; 236 ret = -EEXIST;
237 free_extent_map(merge);
238 goto out; 237 goto out;
239 } 238 }
240 atomic_inc(&em->refs); 239 atomic_inc(&em->refs);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd1..9b99886562d0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
52 file_key.offset = pos; 52 file_key.offset = pos;
53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
54 54
55 path->leave_spinning = 1;
55 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 56 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
56 sizeof(*item)); 57 sizeof(*item));
57 if (ret < 0) 58 if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
523 key.offset = end_byte - 1; 524 key.offset = end_byte - 1;
524 key.type = BTRFS_EXTENT_CSUM_KEY; 525 key.type = BTRFS_EXTENT_CSUM_KEY;
525 526
527 path->leave_spinning = 1;
526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 528 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
527 if (ret > 0) { 529 if (ret > 0) {
528 if (path->slots[0] == 0) 530 if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
757 } else { 759 } else {
758 ins_size = csum_size; 760 ins_size = csum_size;
759 } 761 }
762 path->leave_spinning = 1;
760 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 763 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
761 ins_size); 764 ins_size);
765 path->leave_spinning = 0;
762 if (ret < 0) 766 if (ret < 0)
763 goto fail_unlock; 767 goto fail_unlock;
764 if (ret != 0) { 768 if (ret != 0) {
@@ -776,7 +780,6 @@ found:
776 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 780 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
777 btrfs_item_size_nr(leaf, path->slots[0])); 781 btrfs_item_size_nr(leaf, path->slots[0]));
778 eb_token = NULL; 782 eb_token = NULL;
779 cond_resched();
780next_sector: 783next_sector:
781 784
782 if (!eb_token || 785 if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
817 eb_token = NULL; 820 eb_token = NULL;
818 } 821 }
819 btrfs_mark_buffer_dirty(path->nodes[0]); 822 btrfs_mark_buffer_dirty(path->nodes[0]);
820 cond_resched();
821 if (total_bytes < sums->len) { 823 if (total_bytes < sums->len) {
822 btrfs_release_path(root, path); 824 btrfs_release_path(root, path);
825 cond_resched();
823 goto again; 826 goto again;
824 } 827 }
825out: 828out:
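The path->leave_spinning = 1 assignments appearing in this and the following files opt short item updates into the new locking fast path: the btree search returns with the leaf still spin-locked, the caller performs its quick edit, and the lock is dropped before anything that can sleep. A loose user-space analogy with a POSIX spinlock (update_item() and leaf_item are invented for the example):

	#include <pthread.h>

	static pthread_spinlock_t leaf_lock;
	static int leaf_item;

	/*
	 * the quick edit happens under the cheap spinlock; anything that
	 * might block is deferred until after the unlock, which is the
	 * discipline leave_spinning asks of btrfs callers
	 */
	static void update_item(int v)
	{
		pthread_spin_lock(&leaf_lock);
		leaf_item = v;
		pthread_spin_unlock(&leaf_lock);
	}

	int main(void)
	{
		pthread_spin_init(&leaf_lock, PTHREAD_PROCESS_PRIVATE);
		update_item(42);
		pthread_spin_destroy(&leaf_lock);
		return 0;
	}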
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc78954861b3..9c9fb46ccd08 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -606,6 +606,7 @@ next_slot:
606 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); 606 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
607 607
608 btrfs_release_path(root, path); 608 btrfs_release_path(root, path);
609 path->leave_spinning = 1;
609 ret = btrfs_insert_empty_item(trans, root, path, &ins, 610 ret = btrfs_insert_empty_item(trans, root, path, &ins,
610 sizeof(*extent)); 611 sizeof(*extent));
611 BUG_ON(ret); 612 BUG_ON(ret);
@@ -639,17 +640,22 @@ next_slot:
639 ram_bytes); 640 ram_bytes);
640 btrfs_set_file_extent_type(leaf, extent, found_type); 641 btrfs_set_file_extent_type(leaf, extent, found_type);
641 642
643 btrfs_unlock_up_safe(path, 1);
642 btrfs_mark_buffer_dirty(path->nodes[0]); 644 btrfs_mark_buffer_dirty(path->nodes[0]);
645 btrfs_set_lock_blocking(path->nodes[0]);
643 646
644 if (disk_bytenr != 0) { 647 if (disk_bytenr != 0) {
645 ret = btrfs_update_extent_ref(trans, root, 648 ret = btrfs_update_extent_ref(trans, root,
646 disk_bytenr, orig_parent, 649 disk_bytenr,
650 le64_to_cpu(old.disk_num_bytes),
651 orig_parent,
647 leaf->start, 652 leaf->start,
648 root->root_key.objectid, 653 root->root_key.objectid,
649 trans->transid, ins.objectid); 654 trans->transid, ins.objectid);
650 655
651 BUG_ON(ret); 656 BUG_ON(ret);
652 } 657 }
658 path->leave_spinning = 0;
653 btrfs_release_path(root, path); 659 btrfs_release_path(root, path);
654 if (disk_bytenr != 0) 660 if (disk_bytenr != 0)
655 inode_add_bytes(inode, extent_end - end); 661 inode_add_bytes(inode, extent_end - end);
@@ -912,7 +918,7 @@ again:
912 btrfs_set_file_extent_other_encoding(leaf, fi, 0); 918 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
913 919
914 if (orig_parent != leaf->start) { 920 if (orig_parent != leaf->start) {
915 ret = btrfs_update_extent_ref(trans, root, bytenr, 921 ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
916 orig_parent, leaf->start, 922 orig_parent, leaf->start,
917 root->root_key.objectid, 923 root->root_key.objectid,
918 trans->transid, inode->i_ino); 924 trans->transid, inode->i_ino);
@@ -1155,6 +1161,20 @@ out_nolock:
1155 page_cache_release(pinned[1]); 1161 page_cache_release(pinned[1]);
1156 *ppos = pos; 1162 *ppos = pos;
1157 1163
1164 /*
1165 * we want to make sure fsync finds this change
1166 * but we haven't joined a transaction running right now.
1167 *
1168 * Later on, someone is sure to update the inode and get the
1169 * real transid recorded.
1170 *
1171 * We set last_trans now to the fs_info generation + 1,
1172 * this will either be one more than the running transaction
1173 * or the generation used for the next transaction if there isn't
1174 * one running right now.
1175 */
1176 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1177
1158 if (num_written > 0 && will_write) { 1178 if (num_written > 0 && will_write) {
1159 struct btrfs_trans_handle *trans; 1179 struct btrfs_trans_handle *trans;
1160 1180
@@ -1167,8 +1187,11 @@ out_nolock:
1167 ret = btrfs_log_dentry_safe(trans, root, 1187 ret = btrfs_log_dentry_safe(trans, root,
1168 file->f_dentry); 1188 file->f_dentry);
1169 if (ret == 0) { 1189 if (ret == 0) {
1170 btrfs_sync_log(trans, root); 1190 ret = btrfs_sync_log(trans, root);
1171 btrfs_end_transaction(trans, root); 1191 if (ret == 0)
1192 btrfs_end_transaction(trans, root);
1193 else
1194 btrfs_commit_transaction(trans, root);
1172 } else { 1195 } else {
1173 btrfs_commit_transaction(trans, root); 1196 btrfs_commit_transaction(trans, root);
1174 } 1197 }
@@ -1185,6 +1208,18 @@ out_nolock:
1185 1208
1186int btrfs_release_file(struct inode *inode, struct file *filp) 1209int btrfs_release_file(struct inode *inode, struct file *filp)
1187{ 1210{
1211 /*
 1212	 * ordered_data_close is set by setattr when we are about to truncate
1213 * a file from a non-zero size to a zero size. This tries to
1214 * flush down new bytes that may have been written if the
1215 * application were using truncate to replace a file in place.
1216 */
1217 if (BTRFS_I(inode)->ordered_data_close) {
1218 BTRFS_I(inode)->ordered_data_close = 0;
1219 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1220 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1221 filemap_flush(inode->i_mapping);
1222 }
1188 if (filp->private_data) 1223 if (filp->private_data)
1189 btrfs_ioctl_trans_end(filp); 1224 btrfs_ioctl_trans_end(filp);
1190 return 0; 1225 return 0;
@@ -1260,8 +1295,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1260 if (ret > 0) { 1295 if (ret > 0) {
1261 ret = btrfs_commit_transaction(trans, root); 1296 ret = btrfs_commit_transaction(trans, root);
1262 } else { 1297 } else {
1263 btrfs_sync_log(trans, root); 1298 ret = btrfs_sync_log(trans, root);
1264 ret = btrfs_end_transaction(trans, root); 1299 if (ret == 0)
1300 ret = btrfs_end_transaction(trans, root);
1301 else
1302 ret = btrfs_commit_transaction(trans, root);
1265 } 1303 }
1266 mutex_lock(&dentry->d_inode->i_mutex); 1304 mutex_lock(&dentry->d_inode->i_mutex);
1267out: 1305out:
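btrfs_sync_log() now returns a status, and both callers above adopt the same fallback: a clean log sync lets the transaction simply end, while a failed sync escalates to a full commit so the durability guarantee still holds. Reduced to a sketch (sync_log(), end_txn() and commit_txn() are stand-ins for the btrfs calls):

	#include <stdio.h>

	static int sync_log(void)   { return 0; }	/* 0 on success */
	static int end_txn(void)    { puts("end transaction");    return 0; }
	static int commit_txn(void) { puts("commit transaction"); return 0; }

	/* a failed log sync falls back to the heavyweight full commit */
	static int finish_fsync(void)
	{
		if (sync_log() == 0)
			return end_txn();
		return commit_txn();
	}

	int main(void)
	{
		return finish_fsync();
	}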
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d1e5f0e84c58..768b9523662d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include "ctree.h" 20#include "ctree.h"
21#include "free-space-cache.h"
22#include "transaction.h"
23
24struct btrfs_free_space {
25 struct rb_node bytes_index;
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
21 30
22static int tree_insert_offset(struct rb_root *root, u64 offset, 31static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node) 32 struct rb_node *node)
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
68} 77}
69 78
70/* 79/*
71 * searches the tree for the given offset. If contains is set we will return 80 * searches the tree for the given offset.
72 * the free space that contains the given offset. If contains is not set we 81 *
73 * will return the free space that starts at or after the given offset and is 82 * fuzzy == 1: this is used for allocations where we are given a hint of where
74 * at least bytes long. 83 * to look for free space. Because the hint may not be completely on an offset
84 * mark, or the hint may no longer point to free space we need to fudge our
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset which is at least bytes size, and if its not there
93 * return NULL.
75 */ 94 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 95static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes, 96 u64 offset, u64 bytes,
78 int contains) 97 int fuzzy)
79{ 98{
80 struct rb_node *n = root->rb_node; 99 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL; 100 struct btrfs_free_space *entry, *ret = NULL;
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
84 entry = rb_entry(n, struct btrfs_free_space, offset_index); 103 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85 104
86 if (offset < entry->offset) { 105 if (offset < entry->offset) {
87 if (!contains && 106 if (fuzzy &&
88 (!ret || entry->offset < ret->offset) && 107 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes)) 108 (bytes <= entry->bytes))
90 ret = entry; 109 ret = entry;
91 n = n->rb_left; 110 n = n->rb_left;
92 } else if (offset > entry->offset) { 111 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset && 112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) { 114 bytes <= entry->bytes) {
95 ret = entry; 115 ret = entry;
96 break; 116 break;
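The reworked tree_search_offset() separates an exact lookup (fuzzy == 0) from a hint-based one (fuzzy == 1) that may legitimately return an entry starting before the requested offset, as the new comment above it explains. The same semantics over a sorted array instead of an rbtree, as a self-contained model (all names here are illustrative):

	#include <stddef.h>
	#include <stdio.h>

	struct span {
		unsigned long long offset;
		unsigned long long bytes;
	};

	/*
	 * find a span of at least 'bytes' for 'offset'; with fuzzy set,
	 * also accept a span containing the hint (which may start before
	 * it) or the closest large-enough span after it
	 */
	static struct span *search(struct span *s, size_t n,
				   unsigned long long offset,
				   unsigned long long bytes, int fuzzy)
	{
		size_t i;

		for (i = 0; i < n; i++) {	/* array is sorted by offset */
			if (s[i].offset == offset && s[i].bytes >= bytes)
				return &s[i];
			if (!fuzzy)
				continue;
			if (s[i].offset <= offset &&
			    s[i].offset + s[i].bytes > offset &&
			    s[i].bytes >= bytes)
				return &s[i];	/* contains the hint */
			if (s[i].offset > offset && s[i].bytes >= bytes)
				return &s[i];	/* first usable span after it */
		}
		return NULL;
	}

	int main(void)
	{
		struct span spans[] = { { 0, 4096 }, { 8192, 4096 } };
		struct span *hit = search(spans, 2, 100, 1024, 1);

		if (hit)
			printf("found span at %llu\n", hit->offset);
		return 0;
	}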
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
171 int ret = 0; 191 int ret = 0;
172 192
173 193
194 BUG_ON(!info->bytes);
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index); 196 &info->offset_index);
176 if (ret) 197 if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
184 return ret; 205 return ret;
185} 206}
186 207
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes) 209 u64 offset, u64 bytes)
189{ 210{
190 struct btrfs_free_space *right_info; 211 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info; 212 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL; 213 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0; 214 int ret = 0;
195 215
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 216 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info) 217 if (!info)
198 return -ENOMEM; 218 return -ENOMEM;
199 219
220 info->offset = offset;
221 info->bytes = bytes;
222
223 spin_lock(&block_group->tree_lock);
224
200 /* 225 /*
201 * first we want to see if there is free space adjacent to the range we 226 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to 227 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range 228 * cover the entire range
204 */ 229 */
205 right_info = tree_search_offset(&block_group->free_space_offset, 230 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1); 231 offset+bytes, 0, 0);
207 left_info = tree_search_offset(&block_group->free_space_offset, 232 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1); 233 offset-1, 0, 1);
209 234
210 if (right_info && right_info->offset == offset+bytes) { 235 if (right_info) {
211 unlink_free_space(block_group, right_info); 236 unlink_free_space(block_group, right_info);
212 info = right_info; 237 info->bytes += right_info->bytes;
213 info->offset = offset; 238 kfree(right_info);
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 } 239 }
225 240
226 if (left_info) { 241 if (left_info && left_info->offset + left_info->bytes == offset) {
227 unlink_free_space(block_group, left_info); 242 unlink_free_space(block_group, left_info);
228 243 info->offset = left_info->offset;
229 if (unlikely((left_info->offset + left_info->bytes) != 244 info->bytes += left_info->bytes;
230 offset)) { 245 kfree(left_info);
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 } 246 }
251 247
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info); 248 ret = link_free_space(block_group, info);
265 if (ret) 249 if (ret)
266 kfree(info); 250 kfree(info);
267out: 251
252 spin_unlock(&block_group->tree_lock);
253
268 if (ret) { 254 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST) 256 BUG_ON(ret == -EEXIST);
271 BUG();
272 } 257 }
273 258
274 kfree(alloc_info);
275
276 return ret; 259 return ret;
277} 260}
278 261
279static int 262int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 263 u64 offset, u64 bytes)
281 u64 offset, u64 bytes)
282{ 264{
283 struct btrfs_free_space *info; 265 struct btrfs_free_space *info;
284 int ret = 0; 266 int ret = 0;
285 267
268 spin_lock(&block_group->tree_lock);
269
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 270 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1); 271 1);
288
289 if (info && info->offset == offset) { 272 if (info && info->offset == offset) {
290 if (info->bytes < bytes) { 273 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu," 274 printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
295 (unsigned long long)bytes); 278 (unsigned long long)bytes);
296 WARN_ON(1); 279 WARN_ON(1);
297 ret = -EINVAL; 280 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock);
298 goto out; 282 goto out;
299 } 283 }
300 unlink_free_space(block_group, info); 284 unlink_free_space(block_group, info);
301 285
302 if (info->bytes == bytes) { 286 if (info->bytes == bytes) {
303 kfree(info); 287 kfree(info);
288 spin_unlock(&block_group->tree_lock);
304 goto out; 289 goto out;
305 } 290 }
306 291
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
308 info->bytes -= bytes; 293 info->bytes -= bytes;
309 294
310 ret = link_free_space(block_group, info); 295 ret = link_free_space(block_group, info);
296 spin_unlock(&block_group->tree_lock);
311 BUG_ON(ret); 297 BUG_ON(ret);
312 } else if (info && info->offset < offset && 298 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) { 299 info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,33 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
333 */ 319 */
334 kfree(info); 320 kfree(info);
335 } 321 }
336 322 spin_unlock(&block_group->tree_lock);
337 /* step two, insert a new info struct to cover anything 323 /* step two, insert a new info struct to cover anything
338 * before the hole 324 * before the hole
339 */ 325 */
340 ret = __btrfs_add_free_space(block_group, old_start, 326 ret = btrfs_add_free_space(block_group, old_start,
341 offset - old_start); 327 offset - old_start);
342 BUG_ON(ret); 328 BUG_ON(ret);
343 } else { 329 } else {
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached, block_group->key.objectid,
336 block_group->key.offset);
337 btrfs_dump_free_space(block_group, bytes);
338 } else if (info) {
339 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
340 "but wanted offset=%llu bytes=%llu\n",
341 info->offset, info->bytes, offset, bytes);
342 }
344 WARN_ON(1); 343 WARN_ON(1);
345 } 344 }
346out: 345out:
347 return ret; 346 return ret;
348} 347}
349 348
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 349void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes) 350 u64 bytes)
402{ 351{
@@ -408,6 +357,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
408 info = rb_entry(n, struct btrfs_free_space, offset_index); 357 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes) 358 if (info->bytes >= bytes)
410 count++; 359 count++;
360 printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset,
361 info->bytes);
411 } 362 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 363 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count); 364 "\n", count);
@@ -428,68 +379,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
428 return ret; 379 return ret;
429} 380}
430 381
382/*
383 * for a given cluster, put all of its extents back into the free
384 * space cache. If the block group passed doesn't match the block group
385 * pointed to by the cluster, someone else raced in and freed the
386 * cluster already. In that case, we just return without changing anything
387 */
388static int
389__btrfs_return_cluster_to_free_space(
390 struct btrfs_block_group_cache *block_group,
391 struct btrfs_free_cluster *cluster)
392{
393 struct btrfs_free_space *entry;
394 struct rb_node *node;
395
396 spin_lock(&cluster->lock);
397 if (cluster->block_group != block_group)
398 goto out;
399
400 cluster->window_start = 0;
401 node = rb_first(&cluster->root);
402 while(node) {
403 entry = rb_entry(node, struct btrfs_free_space, offset_index);
404 node = rb_next(&entry->offset_index);
405 rb_erase(&entry->offset_index, &cluster->root);
406 link_free_space(block_group, entry);
407 }
408 list_del_init(&cluster->block_group_list);
409
410 btrfs_put_block_group(cluster->block_group);
411 cluster->block_group = NULL;
412 cluster->root.rb_node = NULL;
413out:
414 spin_unlock(&cluster->lock);
415 return 0;
416}
417
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 418void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{ 419{
433 struct btrfs_free_space *info; 420 struct btrfs_free_space *info;
434 struct rb_node *node; 421 struct rb_node *node;
422 struct btrfs_free_cluster *cluster;
423 struct btrfs_free_cluster *safe;
424
425 spin_lock(&block_group->tree_lock);
426
427 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
428 block_group_list) {
429
430 WARN_ON(cluster->block_group != block_group);
431 __btrfs_return_cluster_to_free_space(block_group, cluster);
432 }
435 433
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 434 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index); 435 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info); 436 unlink_free_space(block_group, info);
440 kfree(info); 437 kfree(info);
441 if (need_resched()) { 438 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex); 439 spin_unlock(&block_group->tree_lock);
443 cond_resched(); 440 cond_resched();
444 mutex_lock(&block_group->alloc_mutex); 441 spin_lock(&block_group->tree_lock);
445 } 442 }
446 } 443 }
447 mutex_unlock(&block_group->alloc_mutex); 444 spin_unlock(&block_group->tree_lock);
448} 445}
449 446
450#if 0 447u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct 448 u64 offset, u64 bytes, u64 empty_size)
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{ 449{
456 struct btrfs_free_space *ret; 450 struct btrfs_free_space *entry = NULL;
451 u64 ret = 0;
457 452
458 mutex_lock(&block_group->alloc_mutex); 453 spin_lock(&block_group->tree_lock);
459 ret = tree_search_offset(&block_group->free_space_offset, offset, 454 entry = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0); 455 bytes + empty_size, 1);
461 mutex_unlock(&block_group->alloc_mutex); 456 if (!entry)
457 entry = tree_search_bytes(&block_group->free_space_bytes,
458 offset, bytes + empty_size);
459 if (entry) {
460 unlink_free_space(block_group, entry);
461 ret = entry->offset;
462 entry->offset += bytes;
463 entry->bytes -= bytes;
464
465 if (!entry->bytes)
466 kfree(entry);
467 else
468 link_free_space(block_group, entry);
469 }
470 spin_unlock(&block_group->tree_lock);
462 471
463 return ret; 472 return ret;
464} 473}
465 474
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct 475/*
467 btrfs_block_group_cache 476 * given a cluster, put all of its extents back into the free space
468 *block_group, u64 offset, 477 * cache. If a block group is passed, this function will only free
469 u64 bytes) 478 * a cluster that belongs to the passed block group.
479 *
480 * Otherwise, it'll get a reference on the block group pointed to by the
481 * cluster and remove the cluster from it.
482 */
483int btrfs_return_cluster_to_free_space(
484 struct btrfs_block_group_cache *block_group,
485 struct btrfs_free_cluster *cluster)
470{ 486{
471 struct btrfs_free_space *ret; 487 int ret;
472 488
473 mutex_lock(&block_group->alloc_mutex); 489 /* first, get a safe pointer to the block group */
490 spin_lock(&cluster->lock);
491 if (!block_group) {
492 block_group = cluster->block_group;
493 if (!block_group) {
494 spin_unlock(&cluster->lock);
495 return 0;
496 }
497 } else if (cluster->block_group != block_group) {
 498		/* someone else has already freed it, don't redo their work */
499 spin_unlock(&cluster->lock);
500 return 0;
501 }
502 atomic_inc(&block_group->count);
503 spin_unlock(&cluster->lock);
474 504
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); 505 /* now return any extents the cluster had on it */
476 mutex_unlock(&block_group->alloc_mutex); 506 spin_lock(&block_group->tree_lock);
507 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
508 spin_unlock(&block_group->tree_lock);
477 509
510 /* finally drop our ref */
511 btrfs_put_block_group(block_group);
478 return ret; 512 return ret;
479} 513}
480#endif
481 514
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache 515/*
483 *block_group, u64 offset, 516 * given a cluster, try to allocate 'bytes' from it, returns 0
484 u64 bytes) 517 * if it couldn't find anything suitably large, or a logical disk offset
518 * if things worked out
519 */
520u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
521 struct btrfs_free_cluster *cluster, u64 bytes,
522 u64 min_start)
523{
524 struct btrfs_free_space *entry = NULL;
525 struct rb_node *node;
526 u64 ret = 0;
527
528 spin_lock(&cluster->lock);
529 if (bytes > cluster->max_size)
530 goto out;
531
532 if (cluster->block_group != block_group)
533 goto out;
534
535 node = rb_first(&cluster->root);
536 if (!node)
537 goto out;
538
539 entry = rb_entry(node, struct btrfs_free_space, offset_index);
540
541 while(1) {
542 if (entry->bytes < bytes || entry->offset < min_start) {
543 struct rb_node *node;
544
545 node = rb_next(&entry->offset_index);
546 if (!node)
547 break;
548 entry = rb_entry(node, struct btrfs_free_space,
549 offset_index);
550 continue;
551 }
552 ret = entry->offset;
553
554 entry->offset += bytes;
555 entry->bytes -= bytes;
556
557 if (entry->bytes == 0) {
558 rb_erase(&entry->offset_index, &cluster->root);
559 kfree(entry);
560 }
561 break;
562 }
563out:
564 spin_unlock(&cluster->lock);
565 return ret;
566}
567
568/*
569 * here we try to find a cluster of blocks in a block group. The goal
570 * is to find at least bytes free and up to empty_size + bytes free.
571 * We might not find them all in one contiguous area.
572 *
573 * returns zero and sets up cluster if things worked out, otherwise
 574 * it returns -ENOSPC
575 */
576int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
577 struct btrfs_block_group_cache *block_group,
578 struct btrfs_free_cluster *cluster,
579 u64 offset, u64 bytes, u64 empty_size)
485{ 580{
486 struct btrfs_free_space *ret = NULL; 581 struct btrfs_free_space *entry = NULL;
582 struct rb_node *node;
583 struct btrfs_free_space *next;
584 struct btrfs_free_space *last;
585 u64 min_bytes;
586 u64 window_start;
587 u64 window_free;
588 u64 max_extent = 0;
589 int total_retries = 0;
590 int ret;
591
 592	/* for metadata, allow allocations with more holes */
593 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
594 /*
595 * we want to do larger allocations when we are
596 * flushing out the delayed refs, it helps prevent
597 * making more work as we go along.
598 */
599 if (trans->transaction->delayed_refs.flushing)
600 min_bytes = max(bytes, (bytes + empty_size) >> 1);
601 else
602 min_bytes = max(bytes, (bytes + empty_size) >> 4);
603 } else
604 min_bytes = max(bytes, (bytes + empty_size) >> 2);
605
606 spin_lock(&block_group->tree_lock);
607 spin_lock(&cluster->lock);
608
609 /* someone already found a cluster, hooray */
610 if (cluster->block_group) {
611 ret = 0;
612 goto out;
613 }
614again:
615 min_bytes = min(min_bytes, bytes + empty_size);
616 entry = tree_search_bytes(&block_group->free_space_bytes,
617 offset, min_bytes);
618 if (!entry) {
619 ret = -ENOSPC;
620 goto out;
621 }
622 window_start = entry->offset;
623 window_free = entry->bytes;
624 last = entry;
625 max_extent = entry->bytes;
626
627 while(1) {
 628		/* our window is just right, let's fill it */
629 if (window_free >= bytes + empty_size)
630 break;
487 631
488 ret = tree_search_offset(&block_group->free_space_offset, offset, 632 node = rb_next(&last->offset_index);
489 bytes, 0); 633 if (!node) {
490 if (!ret) 634 ret = -ENOSPC;
491 ret = tree_search_bytes(&block_group->free_space_bytes, 635 goto out;
492 offset, bytes); 636 }
637 next = rb_entry(node, struct btrfs_free_space, offset_index);
638
639 /*
640 * we haven't filled the empty size and the window is
641 * very large. reset and try again
642 */
643 if (next->offset - window_start > (bytes + empty_size) * 2) {
644 entry = next;
645 window_start = entry->offset;
646 window_free = entry->bytes;
647 last = entry;
648 max_extent = 0;
649 total_retries++;
650 if (total_retries % 256 == 0) {
651 if (min_bytes >= (bytes + empty_size)) {
652 ret = -ENOSPC;
653 goto out;
654 }
655 /*
656 * grow our allocation a bit, we're not having
657 * much luck
658 */
659 min_bytes *= 2;
660 goto again;
661 }
662 } else {
663 last = next;
664 window_free += next->bytes;
665 if (entry->bytes > max_extent)
666 max_extent = entry->bytes;
667 }
668 }
669
670 cluster->window_start = entry->offset;
671
672 /*
673 * now we've found our entries, pull them out of the free space
674 * cache and put them into the cluster rbtree
675 *
676 * The cluster includes an rbtree, but only uses the offset index
677 * of each free space cache entry.
678 */
679 while(1) {
680 node = rb_next(&entry->offset_index);
681 unlink_free_space(block_group, entry);
682 ret = tree_insert_offset(&cluster->root, entry->offset,
683 &entry->offset_index);
684 BUG_ON(ret);
685
686 if (!node || entry == last)
687 break;
688
689 entry = rb_entry(node, struct btrfs_free_space, offset_index);
690 }
691 ret = 0;
692 cluster->max_size = max_extent;
693 atomic_inc(&block_group->count);
694 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
695 cluster->block_group = block_group;
696out:
697 spin_unlock(&cluster->lock);
698 spin_unlock(&block_group->tree_lock);
493 699
494 return ret; 700 return ret;
495} 701}
702
703/*
704 * simple code to zero out a cluster
705 */
706void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
707{
708 spin_lock_init(&cluster->lock);
709 spin_lock_init(&cluster->refill_lock);
710 cluster->root.rb_node = NULL;
711 cluster->max_size = 0;
712 INIT_LIST_HEAD(&cluster->block_group_list);
713 cluster->block_group = NULL;
714}
715
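Taken together, the new entry points form a small lifecycle, which free-space-cache.h below exports: initialize a cluster once, fill it from one block group, allocate out of the window, and hand any leftover back. In outline, with error handling elided (kernel context assumed; trans, block_group, search_start, bytes, empty_size and min_start are caller-supplied):

	struct btrfs_free_cluster cluster;
	u64 start;

	btrfs_init_free_cluster(&cluster);

	/* carve a window of at least 'bytes' (ideally bytes + empty_size)
	 * out of a single block group */
	if (btrfs_find_space_cluster(trans, block_group, &cluster,
				     search_start, bytes, empty_size) == 0) {
		/* allocations inside the window are cheap; a return of 0
		 * means the cluster could not satisfy this request */
		start = btrfs_alloc_from_cluster(block_group, &cluster,
						 bytes, min_start);
	}

	/* hand any leftover window back to the free space cache */
	btrfs_return_cluster_to_free_space(block_group, &cluster);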
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE
21
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
25 u64 bytenr, u64 size);
26void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
27 *block_group);
28u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
29 u64 offset, u64 bytes, u64 empty_size);
30void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
34 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size);
37void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
38u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
39 struct btrfs_free_cluster *cluster, u64 bytes,
40 u64 min_start);
41int btrfs_return_cluster_to_free_space(
42 struct btrfs_block_group_cache *block_group,
43 struct btrfs_free_cluster *cluster);
44#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a4..6b627c611808 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
73 if (!path) 73 if (!path)
74 return -ENOMEM; 74 return -ENOMEM;
75 75
76 path->leave_spinning = 1;
77
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 78 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) { 79 if (ret > 0) {
78 ret = -ENOENT; 80 ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
127 if (!path) 129 if (!path)
128 return -ENOMEM; 130 return -ENOMEM;
129 131
132 path->leave_spinning = 1;
130 ret = btrfs_insert_empty_item(trans, root, path, &key, 133 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len); 134 ins_len);
132 if (ret == -EEXIST) { 135 if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d4f948bc22a..a0d1dd492a58 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
134 if (!path) 134 if (!path)
135 return -ENOMEM; 135 return -ENOMEM;
136 136
137 path->leave_spinning = 1;
137 btrfs_set_trans_block_group(trans, inode); 138 btrfs_set_trans_block_group(trans, inode);
138 139
139 key.objectid = inode->i_ino; 140 key.objectid = inode->i_ino;
@@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
167 cur_size = min_t(unsigned long, compressed_size, 168 cur_size = min_t(unsigned long, compressed_size,
168 PAGE_CACHE_SIZE); 169 PAGE_CACHE_SIZE);
169 170
170 kaddr = kmap(cpage); 171 kaddr = kmap_atomic(cpage, KM_USER0);
171 write_extent_buffer(leaf, kaddr, ptr, cur_size); 172 write_extent_buffer(leaf, kaddr, ptr, cur_size);
172 kunmap(cpage); 173 kunmap_atomic(kaddr, KM_USER0);
173 174
174 i++; 175 i++;
175 ptr += cur_size; 176 ptr += cur_size;
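The kmap() to kmap_atomic() conversion above swaps the global, sleepable kmap pool for a cheap per-CPU slot; the price is that nothing between the map and the unmap may sleep. The same three lines with that rule spelled out (KM_USER0 is the slot argument this kernel generation required; later kernels dropped it):

	char *kaddr;

	/* per-CPU atomic mapping: cheap, but sleeping is forbidden until
	 * the matching kunmap_atomic(); plain kmap() may sleep instead */
	kaddr = kmap_atomic(cpage, KM_USER0);
	write_extent_buffer(leaf, kaddr, ptr, cur_size);
	kunmap_atomic(kaddr, KM_USER0);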
@@ -204,7 +205,7 @@ fail:
204 * does the checks required to make sure the data is small enough 205 * does the checks required to make sure the data is small enough
205 * to fit as an inline extent. 206 * to fit as an inline extent.
206 */ 207 */
207static int cow_file_range_inline(struct btrfs_trans_handle *trans, 208static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
208 struct btrfs_root *root, 209 struct btrfs_root *root,
209 struct inode *inode, u64 start, u64 end, 210 struct inode *inode, u64 start, u64 end,
210 size_t compressed_size, 211 size_t compressed_size,
@@ -854,11 +855,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
854 u64 cur_end; 855 u64 cur_end;
855 int limit = 10 * 1024 * 1024; 856 int limit = 10 * 1024 * 1024;
856 857
857 if (!btrfs_test_opt(root, COMPRESS)) {
858 return cow_file_range(inode, locked_page, start, end,
859 page_started, nr_written, 1);
860 }
861
862 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 858 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
863 EXTENT_DELALLOC, 1, 0, GFP_NOFS); 859 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
864 while (start < end) { 860 while (start < end) {
@@ -935,7 +931,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
935 * If no cow copies or snapshots exist, we write directly to the existing 931 * If no cow copies or snapshots exist, we write directly to the existing
936 * blocks on disk 932 * blocks on disk
937 */ 933 */
938static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, 934static noinline int run_delalloc_nocow(struct inode *inode,
935 struct page *locked_page,
939 u64 start, u64 end, int *page_started, int force, 936 u64 start, u64 end, int *page_started, int force,
940 unsigned long *nr_written) 937 unsigned long *nr_written)
941{ 938{
@@ -1133,6 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1133 unsigned long *nr_written) 1130 unsigned long *nr_written)
1134{ 1131{
1135 int ret; 1132 int ret;
1133 struct btrfs_root *root = BTRFS_I(inode)->root;
1136 1134
1137 if (btrfs_test_flag(inode, NODATACOW)) 1135 if (btrfs_test_flag(inode, NODATACOW))
1138 ret = run_delalloc_nocow(inode, locked_page, start, end, 1136 ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1138,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1140 else if (btrfs_test_flag(inode, PREALLOC)) 1138 else if (btrfs_test_flag(inode, PREALLOC))
1141 ret = run_delalloc_nocow(inode, locked_page, start, end, 1139 ret = run_delalloc_nocow(inode, locked_page, start, end,
1142 page_started, 0, nr_written); 1140 page_started, 0, nr_written);
1141 else if (!btrfs_test_opt(root, COMPRESS))
1142 ret = cow_file_range(inode, locked_page, start, end,
1143 page_started, nr_written, 1);
1143 else 1144 else
1144 ret = cow_file_range_async(inode, locked_page, start, end, 1145 ret = cow_file_range_async(inode, locked_page, start, end,
1145 page_started, nr_written); 1146 page_started, nr_written);
1146
1147 return ret; 1147 return ret;
1148} 1148}
1149 1149
@@ -1453,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1453 path = btrfs_alloc_path(); 1453 path = btrfs_alloc_path();
1454 BUG_ON(!path); 1454 BUG_ON(!path);
1455 1455
1456 path->leave_spinning = 1;
1456 ret = btrfs_drop_extents(trans, root, inode, file_pos, 1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1457 file_pos + num_bytes, file_pos, &hint); 1458 file_pos + num_bytes, file_pos, &hint);
1458 BUG_ON(ret); 1459 BUG_ON(ret);
@@ -1475,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1475 btrfs_set_file_extent_compression(leaf, fi, compression); 1476 btrfs_set_file_extent_compression(leaf, fi, compression);
1476 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1477 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1477 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1478 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1479
1480 btrfs_unlock_up_safe(path, 1);
1481 btrfs_set_lock_blocking(leaf);
1482
1478 btrfs_mark_buffer_dirty(leaf); 1483 btrfs_mark_buffer_dirty(leaf);
1479 1484
1480 inode_add_bytes(inode, num_bytes); 1485 inode_add_bytes(inode, num_bytes);
@@ -1487,11 +1492,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1487 root->root_key.objectid, 1492 root->root_key.objectid,
1488 trans->transid, inode->i_ino, &ins); 1493 trans->transid, inode->i_ino, &ins);
1489 BUG_ON(ret); 1494 BUG_ON(ret);
1490
1491 btrfs_free_path(path); 1495 btrfs_free_path(path);
1496
1492 return 0; 1497 return 0;
1493} 1498}
1494 1499
1500/*
1501 * helper function for btrfs_finish_ordered_io, this
1502 * just reads in some of the csum leaves to prime them into ram
1503 * before we start the transaction. It limits the amount of btree
1504 * reads required while inside the transaction.
1505 */
1506static noinline void reada_csum(struct btrfs_root *root,
1507 struct btrfs_path *path,
1508 struct btrfs_ordered_extent *ordered_extent)
1509{
1510 struct btrfs_ordered_sum *sum;
1511 u64 bytenr;
1512
1513 sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
1514 list);
1515 bytenr = sum->sums[0].bytenr;
1516
1517 /*
 1518 * we don't care about the results; the point of this search is
 1519 * just to get the btree leaves into RAM
1520 */
1521 btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
1522}
1523
1495/* as ordered data IO finishes, this gets called so we can finish 1524/* as ordered data IO finishes, this gets called so we can finish
1496 * an ordered extent if the range of bytes in the file it covers are 1525 * an ordered extent if the range of bytes in the file it covers are
1497 * fully written. 1526 * fully written.
@@ -1500,8 +1529,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1500{ 1529{
1501 struct btrfs_root *root = BTRFS_I(inode)->root; 1530 struct btrfs_root *root = BTRFS_I(inode)->root;
1502 struct btrfs_trans_handle *trans; 1531 struct btrfs_trans_handle *trans;
1503 struct btrfs_ordered_extent *ordered_extent; 1532 struct btrfs_ordered_extent *ordered_extent = NULL;
1504 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1533 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1534 struct btrfs_path *path;
1505 int compressed = 0; 1535 int compressed = 0;
1506 int ret; 1536 int ret;
1507 1537
@@ -1509,9 +1539,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1509 if (!ret) 1539 if (!ret)
1510 return 0; 1540 return 0;
1511 1541
1542 /*
1543 * before we join the transaction, try to do some of our IO.
1544 * This will limit the amount of IO that we have to do with
1545 * the transaction running. We're unlikely to need to do any
 1546 * IO if the file extents are new; the disk_i_size check
 1547 * covers the most common case.
1548 */
1549 if (start < BTRFS_I(inode)->disk_i_size) {
1550 path = btrfs_alloc_path();
1551 if (path) {
1552 ret = btrfs_lookup_file_extent(NULL, root, path,
1553 inode->i_ino,
1554 start, 0);
1555 ordered_extent = btrfs_lookup_ordered_extent(inode,
1556 start);
1557 if (!list_empty(&ordered_extent->list)) {
1558 btrfs_release_path(root, path);
1559 reada_csum(root, path, ordered_extent);
1560 }
1561 btrfs_free_path(path);
1562 }
1563 }
1564
1512 trans = btrfs_join_transaction(root, 1); 1565 trans = btrfs_join_transaction(root, 1);
1513 1566
1514 ordered_extent = btrfs_lookup_ordered_extent(inode, start); 1567 if (!ordered_extent)
1568 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1515 BUG_ON(!ordered_extent); 1569 BUG_ON(!ordered_extent);
1516 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) 1570 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1517 goto nocow; 1571 goto nocow;
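The block added above is purely a read-ahead optimization: both searches run with a NULL transaction handle and their results are thrown away; the point is to fault the file-extent and csum leaves into cache before btrfs_join_transaction, so the real work inside the transaction is mostly cache hits. Condensed into a fragment (same calls as the hunk; the list_empty guard and declarations are as in the code above):

    struct btrfs_path *path = btrfs_alloc_path();

    if (path) {
            /* NULL trans == read-only lookup, results ignored */
            btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
                                     start, 0);
            btrfs_release_path(root, path);
            reada_csum(root, path, ordered_extent);
            btrfs_free_path(path);
    }
    trans = btrfs_join_transaction(root, 1);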
@@ -2101,6 +2155,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2101 2155
2102 path = btrfs_alloc_path(); 2156 path = btrfs_alloc_path();
2103 BUG_ON(!path); 2157 BUG_ON(!path);
2158 path->leave_spinning = 1;
2104 ret = btrfs_lookup_inode(trans, root, path, 2159 ret = btrfs_lookup_inode(trans, root, path,
2105 &BTRFS_I(inode)->location, 1); 2160 &BTRFS_I(inode)->location, 1);
2106 if (ret) { 2161 if (ret) {
@@ -2147,6 +2202,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2147 goto err; 2202 goto err;
2148 } 2203 }
2149 2204
2205 path->leave_spinning = 1;
2150 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2206 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2151 name, name_len, -1); 2207 name, name_len, -1);
2152 if (IS_ERR(di)) { 2208 if (IS_ERR(di)) {
@@ -2190,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2190 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2246 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2191 inode, dir->i_ino); 2247 inode, dir->i_ino);
2192 BUG_ON(ret != 0 && ret != -ENOENT); 2248 BUG_ON(ret != 0 && ret != -ENOENT);
2193 if (ret != -ENOENT)
2194 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2195 2249
2196 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2250 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2197 dir, index); 2251 dir, index);
@@ -2224,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2224 trans = btrfs_start_transaction(root, 1); 2278 trans = btrfs_start_transaction(root, 1);
2225 2279
2226 btrfs_set_trans_block_group(trans, dir); 2280 btrfs_set_trans_block_group(trans, dir);
2281
2282 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2283
2227 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2284 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2228 dentry->d_name.name, dentry->d_name.len); 2285 dentry->d_name.name, dentry->d_name.len);
2229 2286
@@ -2498,6 +2555,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2498 key.type = (u8)-1; 2555 key.type = (u8)-1;
2499 2556
2500search_again: 2557search_again:
2558 path->leave_spinning = 1;
2501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2559 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2502 if (ret < 0) 2560 if (ret < 0)
2503 goto error; 2561 goto error;
@@ -2644,6 +2702,7 @@ delete:
2644 break; 2702 break;
2645 } 2703 }
2646 if (found_extent) { 2704 if (found_extent) {
2705 btrfs_set_path_blocking(path);
2647 ret = btrfs_free_extent(trans, root, extent_start, 2706 ret = btrfs_free_extent(trans, root, extent_start,
2648 extent_num_bytes, 2707 extent_num_bytes,
2649 leaf->start, root_owner, 2708 leaf->start, root_owner,
@@ -2848,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2848 if (err) 2907 if (err)
2849 return err; 2908 return err;
2850 2909
2851 if (S_ISREG(inode->i_mode) && 2910 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
2852 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { 2911 if (attr->ia_size > inode->i_size) {
2853 err = btrfs_cont_expand(inode, attr->ia_size); 2912 err = btrfs_cont_expand(inode, attr->ia_size);
2854 if (err) 2913 if (err)
2855 return err; 2914 return err;
2915 } else if (inode->i_size > 0 &&
2916 attr->ia_size == 0) {
2917
2918 /* we're truncating a file that used to have good
2919 * data down to zero. Make sure it gets into
2920 * the ordered flush list so that any new writes
2921 * get down to disk quickly.
2922 */
2923 BTRFS_I(inode)->ordered_data_close = 1;
2924 }
2856 } 2925 }
2857 2926
2858 err = inode_setattr(inode, attr); 2927 err = inode_setattr(inode, attr);
@@ -2984,13 +3053,14 @@ static noinline void init_btrfs_i(struct inode *inode)
2984 bi->disk_i_size = 0; 3053 bi->disk_i_size = 0;
2985 bi->flags = 0; 3054 bi->flags = 0;
2986 bi->index_cnt = (u64)-1; 3055 bi->index_cnt = (u64)-1;
2987 bi->log_dirty_trans = 0; 3056 bi->last_unlink_trans = 0;
2988 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3057 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2989 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3058 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2990 inode->i_mapping, GFP_NOFS); 3059 inode->i_mapping, GFP_NOFS);
2991 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, 3060 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2992 inode->i_mapping, GFP_NOFS); 3061 inode->i_mapping, GFP_NOFS);
2993 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); 3062 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3063 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
2994 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3064 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2995 mutex_init(&BTRFS_I(inode)->extent_mutex); 3065 mutex_init(&BTRFS_I(inode)->extent_mutex);
2996 mutex_init(&BTRFS_I(inode)->log_mutex); 3066 mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3411,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3411 3481
3412 if (dir) { 3482 if (dir) {
3413 ret = btrfs_set_inode_index(dir, index); 3483 ret = btrfs_set_inode_index(dir, index);
3414 if (ret) 3484 if (ret) {
3485 iput(inode);
3415 return ERR_PTR(ret); 3486 return ERR_PTR(ret);
3487 }
3416 } 3488 }
3417 /* 3489 /*
3418 * index_cnt is ignored for everything but a dir, 3490 * index_cnt is ignored for everything but a dir,
@@ -3449,6 +3521,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3449 sizes[0] = sizeof(struct btrfs_inode_item); 3521 sizes[0] = sizeof(struct btrfs_inode_item);
3450 sizes[1] = name_len + sizeof(*ref); 3522 sizes[1] = name_len + sizeof(*ref);
3451 3523
3524 path->leave_spinning = 1;
3452 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 3525 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3453 if (ret != 0) 3526 if (ret != 0)
3454 goto fail; 3527 goto fail;
@@ -3494,6 +3567,7 @@ fail:
3494 if (dir) 3567 if (dir)
3495 BTRFS_I(dir)->index_cnt--; 3568 BTRFS_I(dir)->index_cnt--;
3496 btrfs_free_path(path); 3569 btrfs_free_path(path);
3570 iput(inode);
3497 return ERR_PTR(ret); 3571 return ERR_PTR(ret);
3498} 3572}
3499 3573
@@ -3727,6 +3801,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3727 drop_inode = 1; 3801 drop_inode = 1;
3728 3802
3729 nr = trans->blocks_used; 3803 nr = trans->blocks_used;
3804
3805 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
3730 btrfs_end_transaction_throttle(trans, root); 3806 btrfs_end_transaction_throttle(trans, root);
3731fail: 3807fail:
3732 if (drop_inode) { 3808 if (drop_inode) {
@@ -4292,8 +4368,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4292 * beyond EOF, then the page is guaranteed safe against truncation until we 4368 * beyond EOF, then the page is guaranteed safe against truncation until we
4293 * unlock the page. 4369 * unlock the page.
4294 */ 4370 */
4295int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) 4371int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4296{ 4372{
4373 struct page *page = vmf->page;
4297 struct inode *inode = fdentry(vma->vm_file)->d_inode; 4374 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4298 struct btrfs_root *root = BTRFS_I(inode)->root; 4375 struct btrfs_root *root = BTRFS_I(inode)->root;
4299 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4376 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -4306,10 +4383,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4306 u64 page_end; 4383 u64 page_end;
4307 4384
4308 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 4385 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
4309 if (ret) 4386 if (ret) {
4387 if (ret == -ENOMEM)
4388 ret = VM_FAULT_OOM;
4389 else /* -ENOSPC, -EIO, etc */
4390 ret = VM_FAULT_SIGBUS;
4310 goto out; 4391 goto out;
4392 }
4311 4393
4312 ret = -EINVAL; 4394 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4313again: 4395again:
4314 lock_page(page); 4396 lock_page(page);
4315 size = i_size_read(inode); 4397 size = i_size_read(inode);
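With the vm_fault-style prototype, page_mkwrite must hand the VM a VM_FAULT_* code instead of a raw errno, which is what the branch above does inline. The same mapping as a tiny helper (a sketch, not part of the patch; the name is illustrative):

    /* sketch: errno from the space reservation -> fault code */
    static int reserve_err_to_fault(int err)
    {
            if (err == -ENOMEM)
                    return VM_FAULT_OOM;
            if (err)                        /* -ENOSPC, -EIO, ... */
                    return VM_FAULT_SIGBUS;
            return VM_FAULT_NOPAGE;         /* make the VM retry the fault */
    }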
@@ -4357,6 +4439,8 @@ again:
4357 } 4439 }
4358 ClearPageChecked(page); 4440 ClearPageChecked(page);
4359 set_page_dirty(page); 4441 set_page_dirty(page);
4442
4443 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
4360 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4444 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4361 4445
4362out_unlock: 4446out_unlock:
@@ -4382,6 +4466,27 @@ static void btrfs_truncate(struct inode *inode)
4382 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 4466 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4383 4467
4384 trans = btrfs_start_transaction(root, 1); 4468 trans = btrfs_start_transaction(root, 1);
4469
4470 /*
4471 * setattr is responsible for setting the ordered_data_close flag,
4472 * but that is only tested during the last file release. That
4473 * could happen well after the next commit, leaving a great big
4474 * window where new writes may get lost if someone chooses to write
4475 * to this file after truncating to zero
4476 *
4477 * The inode doesn't have any dirty data here, and so if we commit
4478 * this is a noop. If someone immediately starts writing to the inode
4479 * it is very likely we'll catch some of their writes in this
4480 * transaction, and the commit will find this file on the ordered
4481 * data list with good things to send down.
4482 *
4483 * This is a best effort solution, there is still a window where
4484 * using truncate to replace the contents of the file will
4485 * end up with a zero length file after a crash.
4486 */
4487 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
4488 btrfs_add_ordered_operation(trans, root, inode);
4489
4385 btrfs_set_trans_block_group(trans, inode); 4490 btrfs_set_trans_block_group(trans, inode);
4386 btrfs_i_size_write(inode, inode->i_size); 4491 btrfs_i_size_write(inode, inode->i_size);
4387 4492
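Read together with the setattr hunk above, ordered_data_close has a two-step life cycle: setattr raises it when a file that had data is truncated to zero, and truncate consumes it by parking the inode on the ordered-operations list so the next commit flushes whatever gets written next. The pairing, restated from the two hunks (no new logic):

    /* btrfs_setattr: file had data, new size is zero */
    if (inode->i_size > 0 && attr->ia_size == 0)
            BTRFS_I(inode)->ordered_data_close = 1;

    /* btrfs_truncate: arrange a best-effort flush at the next commit */
    if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
            btrfs_add_ordered_operation(trans, root, inode);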
@@ -4458,12 +4563,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4458 ei->i_acl = BTRFS_ACL_NOT_CACHED; 4563 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4459 ei->i_default_acl = BTRFS_ACL_NOT_CACHED; 4564 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4460 INIT_LIST_HEAD(&ei->i_orphan); 4565 INIT_LIST_HEAD(&ei->i_orphan);
4566 INIT_LIST_HEAD(&ei->ordered_operations);
4461 return &ei->vfs_inode; 4567 return &ei->vfs_inode;
4462} 4568}
4463 4569
4464void btrfs_destroy_inode(struct inode *inode) 4570void btrfs_destroy_inode(struct inode *inode)
4465{ 4571{
4466 struct btrfs_ordered_extent *ordered; 4572 struct btrfs_ordered_extent *ordered;
4573 struct btrfs_root *root = BTRFS_I(inode)->root;
4574
4467 WARN_ON(!list_empty(&inode->i_dentry)); 4575 WARN_ON(!list_empty(&inode->i_dentry));
4468 WARN_ON(inode->i_data.nrpages); 4576 WARN_ON(inode->i_data.nrpages);
4469 4577
@@ -4474,13 +4582,24 @@ void btrfs_destroy_inode(struct inode *inode)
4474 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) 4582 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4475 posix_acl_release(BTRFS_I(inode)->i_default_acl); 4583 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4476 4584
4477 spin_lock(&BTRFS_I(inode)->root->list_lock); 4585 /*
4586 * Make sure we're properly removed from the ordered operation
4587 * lists.
4588 */
4589 smp_mb();
4590 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
4591 spin_lock(&root->fs_info->ordered_extent_lock);
4592 list_del_init(&BTRFS_I(inode)->ordered_operations);
4593 spin_unlock(&root->fs_info->ordered_extent_lock);
4594 }
4595
4596 spin_lock(&root->list_lock);
4478 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 4597 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4479 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" 4598 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4480 " list\n", inode->i_ino); 4599 " list\n", inode->i_ino);
4481 dump_stack(); 4600 dump_stack();
4482 } 4601 }
4483 spin_unlock(&BTRFS_I(inode)->root->list_lock); 4602 spin_unlock(&root->list_lock);
4484 4603
4485 while (1) { 4604 while (1) {
4486 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 4605 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4605,8 +4724,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4605 if (ret) 4724 if (ret)
4606 goto out_unlock; 4725 goto out_unlock;
4607 4726
4727 /*
 4728 * we're using rename to replace one file with another,
4729 * and the replacement file is large. Start IO on it now so
4730 * we don't add too much work to the end of the transaction
4731 */
4732 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
4733 new_inode->i_size &&
4734 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4735 filemap_flush(old_inode->i_mapping);
4736
4608 trans = btrfs_start_transaction(root, 1); 4737 trans = btrfs_start_transaction(root, 1);
4609 4738
4739 /*
4740 * make sure the inode gets flushed if it is replacing
4741 * something.
4742 */
4743 if (new_inode && new_inode->i_size &&
4744 old_inode && S_ISREG(old_inode->i_mode)) {
4745 btrfs_add_ordered_operation(trans, root, old_inode);
4746 }
4747
4748 /*
4749 * this is an ugly little race, but the rename is required to make
4750 * sure that if we crash, the inode is either at the old name
4751 * or the new one. pinning the log transaction lets us make sure
4752 * we don't allow a log commit to come in after we unlink the
4753 * name but before we add the new name back in.
4754 */
4755 btrfs_pin_log_trans(root);
4756
4610 btrfs_set_trans_block_group(trans, new_dir); 4757 btrfs_set_trans_block_group(trans, new_dir);
4611 4758
4612 btrfs_inc_nlink(old_dentry->d_inode); 4759 btrfs_inc_nlink(old_dentry->d_inode);
@@ -4614,6 +4761,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4614 new_dir->i_ctime = new_dir->i_mtime = ctime; 4761 new_dir->i_ctime = new_dir->i_mtime = ctime;
4615 old_inode->i_ctime = ctime; 4762 old_inode->i_ctime = ctime;
4616 4763
4764 if (old_dentry->d_parent != new_dentry->d_parent)
4765 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4766
4617 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, 4767 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4618 old_dentry->d_name.name, 4768 old_dentry->d_name.name,
4619 old_dentry->d_name.len); 4769 old_dentry->d_name.len);
@@ -4645,7 +4795,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4645 if (ret) 4795 if (ret)
4646 goto out_fail; 4796 goto out_fail;
4647 4797
4798 btrfs_log_new_name(trans, old_inode, old_dir,
4799 new_dentry->d_parent);
4648out_fail: 4800out_fail:
4801
4802 /* this btrfs_end_log_trans just allows the current
 4803 * log sub-transaction to complete
4804 */
4805 btrfs_end_log_trans(root);
4649 btrfs_end_transaction_throttle(trans, root); 4806 btrfs_end_transaction_throttle(trans, root);
4650out_unlock: 4807out_unlock:
4651 return ret; 4808 return ret;
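Rename is the canonical user of the new log pinning: the pin is taken right after the transaction starts, the old name is unlinked, the new name and its log record go in via btrfs_log_new_name, and only then is the log sub-transaction released. Stripped to its skeleton (error paths and block-group bookkeeping elided):

    trans = btrfs_start_transaction(root, 1);
    btrfs_pin_log_trans(root);      /* no log commit can land in between */

    /* unlink old_dentry, link new_dentry ... (hunks above) */

    btrfs_log_new_name(trans, old_inode, old_dir, new_dentry->d_parent);
    btrfs_end_log_trans(root);      /* release the log sub-transaction */
    btrfs_end_transaction_throttle(trans, root);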
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bca729fc80c8..7594bec1be10 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
267 goto out_dput; 267 goto out_dput;
268 268
269 if (!IS_POSIXACL(parent->dentry->d_inode)) 269 if (!IS_POSIXACL(parent->dentry->d_inode))
270 mode &= ~current->fs->umask; 270 mode &= ~current_umask();
271 271
272 error = mnt_want_write(parent->mnt); 272 error = mnt_want_write(parent->mnt);
273 if (error) 273 if (error)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 85506c4a3af7..1c36e5cd8f55 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
60 60
61/* 61/*
62 * unfortunately, many of the places that currently set a lock to blocking 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for every long, and often they don't block 63 * don't end up blocking for very long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin one the blocking bit 64 * at all. For a dbench 50 run, if we don't spin on the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more. 65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 * 66 *
67 * So, we're still stuck with this crummy spin on the blocking bit, 67 * So, we're still stuck with this crummy spin on the blocking bit,
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
71static int btrfs_spin_on_block(struct extent_buffer *eb) 71static int btrfs_spin_on_block(struct extent_buffer *eb)
72{ 72{
73 int i; 73 int i;
74
74 for (i = 0; i < 512; i++) { 75 for (i = 0; i < 512; i++) {
75 cpu_relax();
76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
77 return 1; 77 return 1;
78 if (need_resched()) 78 if (need_resched())
79 break; 79 break;
80 cpu_relax();
80 } 81 }
81 return 0; 82 return 0;
82} 83}
@@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
95{ 96{
96 int i; 97 int i;
97 98
98 spin_nested(eb); 99 if (btrfs_spin_on_block(eb)) {
99 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 100 spin_nested(eb);
100 return 1; 101 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
101 spin_unlock(&eb->lock); 102 return 1;
102 103 spin_unlock(&eb->lock);
104 }
103 /* spin for a bit on the BLOCKING flag */ 105 /* spin for a bit on the BLOCKING flag */
104 for (i = 0; i < 2; i++) { 106 for (i = 0; i < 2; i++) {
107 cpu_relax();
105 if (!btrfs_spin_on_block(eb)) 108 if (!btrfs_spin_on_block(eb))
106 break; 109 break;
107 110
@@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
148 DEFINE_WAIT(wait); 151 DEFINE_WAIT(wait);
149 wait.func = btrfs_wake_function; 152 wait.func = btrfs_wake_function;
150 153
154 if (!btrfs_spin_on_block(eb))
155 goto sleep;
156
151 while(1) { 157 while(1) {
152 spin_nested(eb); 158 spin_nested(eb);
153 159
@@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
165 * spin for a bit, and if the blocking flag goes away, 171 * spin for a bit, and if the blocking flag goes away,
166 * loop around 172 * loop around
167 */ 173 */
174 cpu_relax();
168 if (btrfs_spin_on_block(eb)) 175 if (btrfs_spin_on_block(eb))
169 continue; 176 continue;
170 177sleep:
171 prepare_to_wait_exclusive(&eb->lock_wq, &wait, 178 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
172 TASK_UNINTERRUPTIBLE); 179 TASK_UNINTERRUPTIBLE);
173 180
@@ -220,8 +227,8 @@ int btrfs_tree_unlock(struct extent_buffer *eb)
220 return 0; 227 return 0;
221} 228}
222 229
223int btrfs_tree_locked(struct extent_buffer *eb) 230void btrfs_assert_tree_locked(struct extent_buffer *eb)
224{ 231{
225 return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || 232 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
226 spin_is_locked(&eb->lock); 233 assert_spin_locked(&eb->lock);
227} 234}
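All three reworked lock paths in locking.c share the same adaptive strategy: spin briefly on EXTENT_BUFFER_BLOCKING, because most holders clear it quickly, and fall back to the wait queue only when spinning loses. Compressed into one loop (a sketch; take_spinlock and sleep_on_lock_wq are stand-ins for the spin_nested and prepare_to_wait_exclusive sequences above):

    for (i = 0; i < 512; i++) {
            if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
                    return take_spinlock(eb);       /* fast path */
            if (need_resched())
                    break;                          /* don't hog the CPU */
            cpu_relax();
    }
    return sleep_on_lock_wq(eb);                    /* slow path */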
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6bb0afbff928..6c4ce457168c 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,11 +21,11 @@
21 21
22int btrfs_tree_lock(struct extent_buffer *eb); 22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb);
25 24
26int btrfs_try_tree_lock(struct extent_buffer *eb); 25int btrfs_try_tree_lock(struct extent_buffer *eb);
27int btrfs_try_spin_lock(struct extent_buffer *eb); 26int btrfs_try_spin_lock(struct extent_buffer *eb);
28 27
29void btrfs_set_lock_blocking(struct extent_buffer *eb); 28void btrfs_set_lock_blocking(struct extent_buffer *eb);
30void btrfs_clear_lock_blocking(struct extent_buffer *eb); 29void btrfs_clear_lock_blocking(struct extent_buffer *eb);
30void btrfs_assert_tree_locked(struct extent_buffer *eb);
31#endif 31#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0f..53c87b197d70 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
310 310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list); 312 list_del_init(&entry->root_extent_list);
313
314 /*
315 * we have no more ordered extents for this inode and
316 * no dirty pages. We can safely remove it from the
317 * list of ordered extents
318 */
319 if (RB_EMPTY_ROOT(&tree->tree) &&
320 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
321 list_del_init(&BTRFS_I(inode)->ordered_operations);
322 }
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 323 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314 324
315 mutex_unlock(&tree->mutex); 325 mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
370} 380}
371 381
372/* 382/*
383 * this is used during transaction commit to write all the inodes
384 * added to the ordered operation list. These files must be fully on
385 * disk before the transaction commits.
386 *
 387 * we have two modes here: one is to just start the IO via filemap_flush
 388 * and the other is to wait for all the IO. When we wait, we have an
389 * extra check to make sure the ordered operation list really is empty
390 * before we return
391 */
392int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
393{
394 struct btrfs_inode *btrfs_inode;
395 struct inode *inode;
396 struct list_head splice;
397
398 INIT_LIST_HEAD(&splice);
399
400 mutex_lock(&root->fs_info->ordered_operations_mutex);
401 spin_lock(&root->fs_info->ordered_extent_lock);
402again:
403 list_splice_init(&root->fs_info->ordered_operations, &splice);
404
405 while (!list_empty(&splice)) {
406 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
407 ordered_operations);
408
409 inode = &btrfs_inode->vfs_inode;
410
411 list_del_init(&btrfs_inode->ordered_operations);
412
413 /*
414 * the inode may be getting freed (in sys_unlink path).
415 */
416 inode = igrab(inode);
417
418 if (!wait && inode) {
419 list_add_tail(&BTRFS_I(inode)->ordered_operations,
420 &root->fs_info->ordered_operations);
421 }
422 spin_unlock(&root->fs_info->ordered_extent_lock);
423
424 if (inode) {
425 if (wait)
426 btrfs_wait_ordered_range(inode, 0, (u64)-1);
427 else
428 filemap_flush(inode->i_mapping);
429 iput(inode);
430 }
431
432 cond_resched();
433 spin_lock(&root->fs_info->ordered_extent_lock);
434 }
435 if (wait && !list_empty(&root->fs_info->ordered_operations))
436 goto again;
437
438 spin_unlock(&root->fs_info->ordered_extent_lock);
439 mutex_unlock(&root->fs_info->ordered_operations_mutex);
440
441 return 0;
442}
443
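The transaction.c hunks further down call this twice per commit: once with wait == 0 before the transaction is marked blocked, just to get writeback started early, and once with wait == 1 afterwards, when rename can no longer add entries and the list can be drained for good. The call protocol, in brief:

    /* top of btrfs_commit_transaction: kick off the IO early */
    btrfs_run_ordered_operations(root, 0);

    /* ... cur_trans->blocked = 1, delayed refs flushed ... */

    /* once blocked: wait, re-splicing until the list stays empty */
    btrfs_run_ordered_operations(root, 1);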
444/*
373 * Used to start IO or wait for a given ordered extent to finish. 445 * Used to start IO or wait for a given ordered extent to finish.
374 * 446 *
375 * If wait is one, this effectively waits on page writeback for all the pages 447 * If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
726 798
727 return ret; 799 return ret;
728} 800}
801
802/*
803 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes.
805 *
806 * This basically gives us the ext3 style data=ordered mode, and it is mostly
807 * used to make sure renamed files are fully on disk.
808 *
809 * It is a noop if the inode is already fully on disk.
810 *
811 * If trans is not null, we'll do a friendly check for a transaction that
812 * is already flushing things and force the IO down ourselves.
813 */
814int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
815 struct btrfs_root *root,
816 struct inode *inode)
817{
818 u64 last_mod;
819
820 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
821
822 /*
823 * if this file hasn't been changed since the last transaction
824 * commit, we can safely return without doing anything
825 */
826 if (last_mod < root->fs_info->last_trans_committed)
827 return 0;
828
829 /*
830 * the transaction is already committing. Just start the IO and
831 * don't bother with all of this list nonsense
832 */
833 if (trans && root->fs_info->running_transaction->blocked) {
834 btrfs_wait_ordered_range(inode, 0, (u64)-1);
835 return 0;
836 }
837
838 spin_lock(&root->fs_info->ordered_extent_lock);
839 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
840 list_add_tail(&BTRFS_I(inode)->ordered_operations,
841 &root->fs_info->ordered_operations);
842 }
843 spin_unlock(&root->fs_info->ordered_extent_lock);
844
845 return 0;
846}
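The early return above makes this call effectively free for cold inodes. A worked check with illustrative numbers:

    /*
     * illustrative values:
     *   BTRFS_I(inode)->generation    = 10
     *   BTRFS_I(inode)->last_trans    = 12
     *   last_trans_committed          = 15
     *
     * last_mod = max(10, 12) = 12 < 15: every change to this inode was
     * covered by an earlier commit, so the function returns before
     * taking ordered_extent_lock or touching the list.
     */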
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d6..3d31c8827b01 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, 155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode); 156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
159int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
160 struct btrfs_root *root,
161 struct inode *inode);
158#endif 162#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 19a4daf03ccb..9744af9d71e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h>
27#include <linux/string.h> 28#include <linux/string.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
66enum { 67enum {
67 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
68 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
69 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, 70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
71 Opt_flushoncommit, Opt_err,
70}; 72};
71 73
72static match_table_t tokens = { 74static match_table_t tokens = {
@@ -83,6 +85,8 @@ static match_table_t tokens = {
83 {Opt_compress, "compress"}, 85 {Opt_compress, "compress"},
84 {Opt_ssd, "ssd"}, 86 {Opt_ssd, "ssd"},
85 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"},
86 {Opt_err, NULL}, 90 {Opt_err, NULL},
87}; 91};
88 92
@@ -222,6 +226,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
222 case Opt_noacl: 226 case Opt_noacl:
223 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 227 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
224 break; 228 break;
229 case Opt_notreelog:
230 printk(KERN_INFO "btrfs: disabling tree log\n");
231 btrfs_set_opt(info->mount_opt, NOTREELOG);
232 break;
233 case Opt_flushoncommit:
234 printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
235 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
236 break;
225 default: 237 default:
226 break; 238 break;
227 } 239 }
@@ -363,9 +375,8 @@ fail_close:
363int btrfs_sync_fs(struct super_block *sb, int wait) 375int btrfs_sync_fs(struct super_block *sb, int wait)
364{ 376{
365 struct btrfs_trans_handle *trans; 377 struct btrfs_trans_handle *trans;
366 struct btrfs_root *root; 378 struct btrfs_root *root = btrfs_sb(sb);
367 int ret; 379 int ret;
368 root = btrfs_sb(sb);
369 380
370 if (sb->s_flags & MS_RDONLY) 381 if (sb->s_flags & MS_RDONLY)
371 return 0; 382 return 0;
@@ -385,6 +396,41 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
385 return ret; 396 return ret;
386} 397}
387 398
399static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
400{
401 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
402 struct btrfs_fs_info *info = root->fs_info;
403
404 if (btrfs_test_opt(root, DEGRADED))
405 seq_puts(seq, ",degraded");
406 if (btrfs_test_opt(root, NODATASUM))
407 seq_puts(seq, ",nodatasum");
408 if (btrfs_test_opt(root, NODATACOW))
409 seq_puts(seq, ",nodatacow");
410 if (btrfs_test_opt(root, NOBARRIER))
411 seq_puts(seq, ",nobarrier");
412 if (info->max_extent != (u64)-1)
413 seq_printf(seq, ",max_extent=%llu", info->max_extent);
414 if (info->max_inline != 8192 * 1024)
415 seq_printf(seq, ",max_inline=%llu", info->max_inline);
416 if (info->alloc_start != 0)
417 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
418 if (info->thread_pool_size != min_t(unsigned long,
419 num_online_cpus() + 2, 8))
420 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
421 if (btrfs_test_opt(root, COMPRESS))
422 seq_puts(seq, ",compress");
423 if (btrfs_test_opt(root, SSD))
424 seq_puts(seq, ",ssd");
425 if (btrfs_test_opt(root, NOTREELOG))
426 seq_puts(seq, ",no-treelog");
427 if (btrfs_test_opt(root, FLUSHONCOMMIT))
428 seq_puts(seq, ",flush-on-commit");
429 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
430 seq_puts(seq, ",noacl");
431 return 0;
432}
433
388static void btrfs_write_super(struct super_block *sb) 434static void btrfs_write_super(struct super_block *sb)
389{ 435{
390 sb->s_dirt = 0; 436 sb->s_dirt = 0;
@@ -630,7 +676,7 @@ static struct super_operations btrfs_super_ops = {
630 .put_super = btrfs_put_super, 676 .put_super = btrfs_put_super,
631 .write_super = btrfs_write_super, 677 .write_super = btrfs_write_super,
632 .sync_fs = btrfs_sync_fs, 678 .sync_fs = btrfs_sync_fs,
633 .show_options = generic_show_options, 679 .show_options = btrfs_show_options,
634 .write_inode = btrfs_write_inode, 680 .write_inode = btrfs_write_inode,
635 .dirty_inode = btrfs_dirty_inode, 681 .dirty_inode = btrfs_dirty_inode,
636 .alloc_inode = btrfs_alloc_inode, 682 .alloc_inode = btrfs_alloc_inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4112d53d4f4d..2869b3361eb6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
53 GFP_NOFS); 53 GFP_NOFS);
54 BUG_ON(!cur_trans); 54 BUG_ON(!cur_trans);
55 root->fs_info->generation++; 55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1; 56 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0; 57 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation; 58 cur_trans->transid = root->fs_info->generation;
@@ -65,6 +63,15 @@ static noinline int join_transaction(struct btrfs_root *root)
65 cur_trans->use_count = 1; 63 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0; 64 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds(); 65 cur_trans->start_time = get_seconds();
66
67 cur_trans->delayed_refs.root.rb_node = NULL;
68 cur_trans->delayed_refs.num_entries = 0;
69 cur_trans->delayed_refs.num_heads_ready = 0;
70 cur_trans->delayed_refs.num_heads = 0;
71 cur_trans->delayed_refs.flushing = 0;
72 cur_trans->delayed_refs.run_delayed_start = 0;
73 spin_lock_init(&cur_trans->delayed_refs.lock);
74
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 75 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 76 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages, 77 extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +189,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
182 h->block_group = 0; 189 h->block_group = 0;
183 h->alloc_exclude_nr = 0; 190 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0; 191 h->alloc_exclude_start = 0;
192 h->delayed_ref_updates = 0;
193
185 root->fs_info->running_transaction->use_count++; 194 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex); 195 mutex_unlock(&root->fs_info->trans_mutex);
187 return h; 196 return h;
@@ -271,7 +280,6 @@ void btrfs_throttle(struct btrfs_root *root)
271 if (!root->fs_info->open_ioctl_trans) 280 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root); 281 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex); 282 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root); 283 throttle_on_drops(root);
276} 284}
277 285
@@ -280,6 +288,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
280{ 288{
281 struct btrfs_transaction *cur_trans; 289 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info; 290 struct btrfs_fs_info *info = root->fs_info;
291 int count = 0;
292
293 while (count < 4) {
294 unsigned long cur = trans->delayed_ref_updates;
295 trans->delayed_ref_updates = 0;
296 if (cur &&
297 trans->transaction->delayed_refs.num_heads_ready > 64) {
298 trans->delayed_ref_updates = 0;
299
300 /*
301 * do a full flush if the transaction is trying
302 * to close
303 */
304 if (trans->transaction->delayed_refs.flushing)
305 cur = 0;
306 btrfs_run_delayed_refs(trans, root, cur);
307 } else {
308 break;
309 }
310 count++;
311 }
283 312
284 mutex_lock(&info->trans_mutex); 313 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction; 314 cur_trans = info->running_transaction;
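The new loop makes every transaction-ender retire a share of the delayed refs it generated, capped at four batches so nobody stalls indefinitely; once the commit sets the flushing flag, the count of 0 passed to btrfs_run_delayed_refs means run everything. The throttle, restated with the branches inverted:

    int count = 0;

    while (count++ < 4) {
            unsigned long cur = trans->delayed_ref_updates;

            trans->delayed_ref_updates = 0;
            if (!cur || trans->transaction->delayed_refs.num_heads_ready <= 64)
                    break;
            if (trans->transaction->delayed_refs.flushing)
                    cur = 0;                /* 0 == full flush */
            btrfs_run_delayed_refs(trans, root, cur);
    }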
@@ -424,9 +453,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
424 u64 old_root_bytenr; 453 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root; 454 struct btrfs_root *tree_root = root->fs_info->tree_root;
426 455
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root); 456 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root); 457
458 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
459 BUG_ON(ret);
430 460
431 while (1) { 461 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 462 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +468,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
438 btrfs_header_level(root->node)); 468 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid); 469 btrfs_set_root_generation(&root->root_item, trans->transid);
440 470
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root, 471 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key, 472 &root->root_key,
445 &root->root_item); 473 &root->root_item);
446 BUG_ON(ret); 474 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root); 475 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root); 476
477 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
478 BUG_ON(ret);
449 } 479 }
450 return 0; 480 return 0;
451} 481}
@@ -459,15 +489,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
459 struct btrfs_fs_info *fs_info = root->fs_info; 489 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next; 490 struct list_head *next;
461 struct extent_buffer *eb; 491 struct extent_buffer *eb;
492 int ret;
462 493
463 btrfs_extent_post_op(trans, fs_info->tree_root); 494 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
495 BUG_ON(ret);
464 496
465 eb = btrfs_lock_root_node(fs_info->tree_root); 497 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); 498 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
467 btrfs_tree_unlock(eb); 499 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb); 500 free_extent_buffer(eb);
469 501
470 btrfs_extent_post_op(trans, fs_info->tree_root); 502 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
503 BUG_ON(ret);
471 504
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 505 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next; 506 next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +508,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
475 root = list_entry(next, struct btrfs_root, dirty_list); 508 root = list_entry(next, struct btrfs_root, dirty_list);
476 509
477 update_cowonly_root(trans, root); 510 update_cowonly_root(trans, root);
511
512 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
513 BUG_ON(ret);
478 } 514 }
479 return 0; 515 return 0;
480} 516}
@@ -635,6 +671,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
635} 671}
636 672
637/* 673/*
674 * when dropping snapshots, we generate a ton of delayed refs, and it makes
675 * sense not to join the transaction while it is trying to flush the current
676 * queue of delayed refs out.
677 *
678 * This is used by the drop snapshot code only
679 */
680static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
681{
682 DEFINE_WAIT(wait);
683
684 mutex_lock(&info->trans_mutex);
685 while (info->running_transaction &&
686 info->running_transaction->delayed_refs.flushing) {
687 prepare_to_wait(&info->transaction_wait, &wait,
688 TASK_UNINTERRUPTIBLE);
689 mutex_unlock(&info->trans_mutex);
690 schedule();
691 mutex_lock(&info->trans_mutex);
692 finish_wait(&info->transaction_wait, &wait);
693 }
694 mutex_unlock(&info->trans_mutex);
695 return 0;
696}
697
698/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on 699 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them 700 * all of them
640 */ 701 */
@@ -661,7 +722,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
661 atomic_inc(&root->fs_info->throttles); 722 atomic_inc(&root->fs_info->throttles);
662 723
663 while (1) { 724 while (1) {
725 /*
726 * we don't want to jump in and create a bunch of
727 * delayed refs if the transaction is starting to close
728 */
729 wait_transaction_pre_flush(tree_root->fs_info);
664 trans = btrfs_start_transaction(tree_root, 1); 730 trans = btrfs_start_transaction(tree_root, 1);
731
732 /*
733 * we've joined a transaction, make sure it isn't
734 * closing right now
735 */
736 if (trans->transaction->delayed_refs.flushing) {
737 btrfs_end_transaction(trans, tree_root);
738 continue;
739 }
740
665 mutex_lock(&root->fs_info->drop_mutex); 741 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root); 742 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN) 743 if (ret != -EAGAIN)
@@ -766,7 +842,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
766 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 842 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
767 843
768 old = btrfs_lock_root_node(root); 844 old = btrfs_lock_root_node(root);
769 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); 845 btrfs_cow_block(trans, root, old, NULL, 0, &old);
770 846
771 btrfs_copy_root(trans, root, old, &tmp, objectid); 847 btrfs_copy_root(trans, root, old, &tmp, objectid);
772 btrfs_tree_unlock(old); 848 btrfs_tree_unlock(old);
@@ -894,12 +970,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
894 struct extent_io_tree *pinned_copy; 970 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait); 971 DEFINE_WAIT(wait);
896 int ret; 972 int ret;
973 int should_grow = 0;
974 unsigned long now = get_seconds();
975 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
976
977 btrfs_run_ordered_operations(root, 0);
978
 979 /* make a pass through all the delayed refs we have so far;
 980 * any running procs may add more while we are here
981 */
982 ret = btrfs_run_delayed_refs(trans, root, 0);
983 BUG_ON(ret);
984
985 cur_trans = trans->transaction;
986 /*
987 * set the flushing flag so procs in this transaction have to
988 * start sending their work down.
989 */
990 cur_trans->delayed_refs.flushing = 1;
991
992 ret = btrfs_run_delayed_refs(trans, root, 0);
993 BUG_ON(ret);
897 994
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex); 995 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) { 996 INIT_LIST_HEAD(&dirty_fs_roots);
901 cur_trans = trans->transaction; 997 if (cur_trans->in_commit) {
902 trans->transaction->use_count++; 998 cur_trans->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex); 999 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root); 1000 btrfs_end_transaction(trans, root);
905 1001
@@ -922,7 +1018,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
922 1018
923 trans->transaction->in_commit = 1; 1019 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1; 1020 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1021 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev, 1022 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list); 1023 struct btrfs_transaction, list);
@@ -937,6 +1032,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
937 } 1032 }
938 } 1033 }
939 1034
1035 if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
1036 should_grow = 1;
1037
940 do { 1038 do {
941 int snap_pending = 0; 1039 int snap_pending = 0;
942 joined = cur_trans->num_joined; 1040 joined = cur_trans->num_joined;
@@ -949,26 +1047,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
949 1047
950 if (cur_trans->num_writers > 1) 1048 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT; 1049 timeout = MAX_SCHEDULE_TIMEOUT;
952 else 1050 else if (should_grow)
953 timeout = 1; 1051 timeout = 1;
954 1052
955 mutex_unlock(&root->fs_info->trans_mutex); 1053 mutex_unlock(&root->fs_info->trans_mutex);
956 1054
957 if (snap_pending) { 1055 if (flush_on_commit || snap_pending) {
1056 if (flush_on_commit)
1057 btrfs_start_delalloc_inodes(root);
958 ret = btrfs_wait_ordered_extents(root, 1); 1058 ret = btrfs_wait_ordered_extents(root, 1);
959 BUG_ON(ret); 1059 BUG_ON(ret);
960 } 1060 }
961 1061
962 schedule_timeout(timeout); 1062 /*
 1063 * rename doesn't use btrfs_join_transaction, so, once we
 1064 * set the transaction to blocked above, we aren't going
 1065 * to get any new ordered operations. We can safely run
 1066 * it here and know for sure that nothing new will be added
1067 * to the list
1068 */
1069 btrfs_run_ordered_operations(root, 1);
1070
1071 smp_mb();
1072 if (cur_trans->num_writers > 1 || should_grow)
1073 schedule_timeout(timeout);
963 1074
964 mutex_lock(&root->fs_info->trans_mutex); 1075 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait); 1076 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 || 1077 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined)); 1078 (should_grow && cur_trans->num_joined != joined));
968 1079
969 ret = create_pending_snapshots(trans, root->fs_info); 1080 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret); 1081 BUG_ON(ret);
971 1082
1083 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1084 BUG_ON(ret);
1085
972 WARN_ON(cur_trans != trans->transaction); 1086 WARN_ON(cur_trans != trans->transaction);
973 1087
974 /* btrfs_commit_tree_roots is responsible for getting the 1088 /* btrfs_commit_tree_roots is responsible for getting the
@@ -1032,6 +1146,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1032 btrfs_copy_pinned(root, pinned_copy); 1146 btrfs_copy_pinned(root, pinned_copy);
1033 1147
1034 trans->transaction->blocked = 0; 1148 trans->transaction->blocked = 0;
1149
1035 wake_up(&root->fs_info->transaction_throttle); 1150 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait); 1151 wake_up(&root->fs_info->transaction_wait);
1037 1152
@@ -1058,6 +1173,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1058 mutex_lock(&root->fs_info->trans_mutex); 1173 mutex_lock(&root->fs_info->trans_mutex);
1059 1174
1060 cur_trans->commit_done = 1; 1175 cur_trans->commit_done = 1;
1176
1061 root->fs_info->last_trans_committed = cur_trans->transid; 1177 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait); 1178 wake_up(&cur_trans->commit_wait);
1063 1179
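The commit path above now brackets everything with delayed-ref work, and the flushing flag is the hand-off point: once it is set, __btrfs_end_transaction switches to full flushes and drop_dirty_roots backs off instead of piling on more refs. As a three-step summary of the hunk:

    /* 1) opportunistic pass while other writers are still active */
    btrfs_run_delayed_refs(trans, root, 0);

    /* 2) raise the flag: helpers flush, snapshot-drop waits */
    cur_trans->delayed_refs.flushing = 1;
    btrfs_run_delayed_refs(trans, root, 0);

    /* 3) after the writer count drains: run the queue to empty */
    btrfs_run_delayed_refs(trans, root, (unsigned long)-1);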
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f882..94f5bde2b58d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
19#ifndef __BTRFS_TRANSACTION__ 19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__ 20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h" 21#include "btrfs_inode.h"
22#include "delayed-ref.h"
22 23
23struct btrfs_transaction { 24struct btrfs_transaction {
24 u64 transid; 25 u64 transid;
26 /*
 27 * total writers in this transaction; it must be zero before the
28 * transaction can end
29 */
25 unsigned long num_writers; 30 unsigned long num_writers;
31
26 unsigned long num_joined; 32 unsigned long num_joined;
27 int in_commit; 33 int in_commit;
28 int use_count; 34 int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
34 wait_queue_head_t writer_wait; 40 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait; 41 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots; 42 struct list_head pending_snapshots;
43 struct btrfs_delayed_ref_root delayed_refs;
37}; 44};
38 45
39struct btrfs_trans_handle { 46struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
44 u64 block_group; 51 u64 block_group;
45 u64 alloc_exclude_start; 52 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr; 53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates;
47}; 55};
48 56
49struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570e..b10eacdb1620 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
124 } 124 }
125 125
126 btrfs_release_path(root, path); 126 btrfs_release_path(root, path);
127 if (is_extent)
128 btrfs_extent_post_op(trans, root);
129out: 127out:
130 if (path) 128 if (path)
131 btrfs_free_path(path); 129 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60fa..25f20ea11f27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
35#define LOG_INODE_EXISTS 1 35#define LOG_INODE_EXISTS 1
36 36
37/* 37/*
38 * directory trouble cases
39 *
40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41 * log, we must force a full commit before doing an fsync of the directory
42 * where the unlink was done.
43 * ---> record transid of last unlink/rename per directory
44 *
45 * mkdir foo/some_dir
46 * normal commit
47 * rename foo/some_dir foo2/some_dir
48 * mkdir foo/some_dir
49 * fsync foo/some_dir/some_file
50 *
51 * The fsync above will unlink the original some_dir without recording
52 * it in its new location (foo2). After a crash, some_dir will be gone
53 * unless the fsync of some_file forces a full commit
54 *
55 * 2) we must log any new names for any file or dir that is in the fsync
56 * log. ---> check inode while renaming/linking.
57 *
58 * 2a) we must log any new names for any file or dir during rename
59 * when the directory they are being removed from was logged.
60 * ---> check inode and old parent dir during rename
61 *
 62 * 2a is actually the more important variant. Without the extra logging,
 63 * a crash might unlink the old name without recreating the new one
64 *
65 * 3) after a crash, we must go through any directories with a link count
66 * of zero and redo the rm -rf
67 *
68 * mkdir f1/foo
69 * normal commit
70 * rm -rf f1/foo
71 * fsync(f1)
72 *
73 * The directory f1 was fully removed from the FS, but fsync was never
74 * called on f1, only its parent dir. After a crash the rm -rf must
75 * be replayed. This must be able to recurse down the entire
76 * directory tree. The inode link count fixup code takes care of the
77 * ugly details.
78 */
79
80/*
38 * stages for the tree walking. The first 81 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find 82 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes 83 * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
47#define LOG_WALK_REPLAY_INODES 1 90#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2 91#define LOG_WALK_REPLAY_ALL 2
49 92
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 93static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode, 94 struct btrfs_root *root, struct inode *inode,
52 int inode_only); 95 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 96static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, 97 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid); 98 struct btrfs_path *path, u64 objectid);
99static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root,
101 struct btrfs_root *log,
102 struct btrfs_path *path,
103 u64 dirid, int del_all);
56 104
57/* 105/*
58 * tree logging is a special write ahead log used to make sure that 106 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
133} 181}
134 182
135/* 183/*
184 * This either makes the current running log transaction wait
185 * until you call btrfs_end_log_trans() or it makes any future
186 * log transactions wait until you call btrfs_end_log_trans()
187 */
188int btrfs_pin_log_trans(struct btrfs_root *root)
189{
190 int ret = -ENOENT;
191
192 mutex_lock(&root->log_mutex);
193 atomic_inc(&root->log_writers);
194 mutex_unlock(&root->log_mutex);
195 return ret;
196}
197
198/*
136 * indicate we're done making changes to the log tree 199 * indicate we're done making changes to the log tree
137 * and wake up anyone waiting to do a sync 200 * and wake up anyone waiting to do a sync
138 */ 201 */
139static int end_log_trans(struct btrfs_root *root) 202int btrfs_end_log_trans(struct btrfs_root *root)
140{ 203{
141 if (atomic_dec_and_test(&root->log_writers)) { 204 if (atomic_dec_and_test(&root->log_writers)) {
142 smp_mb(); 205 smp_mb();
@@ -199,12 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
199 struct extent_buffer *eb, 262 struct extent_buffer *eb,
200 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
201{ 264{
202 if (wc->pin) { 265 if (wc->pin)
203 mutex_lock(&log->fs_info->pinned_mutex);
204 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_update_pinned_extents(log->fs_info->extent_root,
205 eb->start, eb->len, 1); 267 eb->start, eb->len, 1);
206 mutex_unlock(&log->fs_info->pinned_mutex);
207 }
208 268
209 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
210 if (wc->write) 270 if (wc->write)
@@ -603,6 +663,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
603 663
604 ret = link_to_fixup_dir(trans, root, path, location.objectid); 664 ret = link_to_fixup_dir(trans, root, path, location.objectid);
605 BUG_ON(ret); 665 BUG_ON(ret);
666
606 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 667 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
607 BUG_ON(ret); 668 BUG_ON(ret);
608 kfree(name); 669 kfree(name);
@@ -804,6 +865,7 @@ conflict_again:
804 victim_name_len)) { 865 victim_name_len)) {
805 btrfs_inc_nlink(inode); 866 btrfs_inc_nlink(inode);
806 btrfs_release_path(root, path); 867 btrfs_release_path(root, path);
868
807 ret = btrfs_unlink_inode(trans, root, dir, 869 ret = btrfs_unlink_inode(trans, root, dir,
808 inode, victim_name, 870 inode, victim_name,
809 victim_name_len); 871 victim_name_len);
@@ -922,13 +984,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
922 key.offset--; 984 key.offset--;
923 btrfs_release_path(root, path); 985 btrfs_release_path(root, path);
924 } 986 }
925 btrfs_free_path(path); 987 btrfs_release_path(root, path);
926 if (nlink != inode->i_nlink) { 988 if (nlink != inode->i_nlink) {
927 inode->i_nlink = nlink; 989 inode->i_nlink = nlink;
928 btrfs_update_inode(trans, root, inode); 990 btrfs_update_inode(trans, root, inode);
929 } 991 }
930 BTRFS_I(inode)->index_cnt = (u64)-1; 992 BTRFS_I(inode)->index_cnt = (u64)-1;
931 993
994 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
995 ret = replay_dir_deletes(trans, root, NULL, path,
996 inode->i_ino, 1);
997 BUG_ON(ret);
998 }
999 btrfs_free_path(path);
1000
932 return 0; 1001 return 0;
933} 1002}
934 1003
@@ -971,9 +1040,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
971 1040
972 iput(inode); 1041 iput(inode);
973 1042
974 if (key.offset == 0) 1043 /*
975 break; 1044 * fixup on a directory may create new entries,
976 key.offset--; 1045 * make sure we always look for the highest possible
1046 * offset
1047 */
1048 key.offset = (u64)-1;
977 } 1049 }
978 btrfs_release_path(root, path); 1050 btrfs_release_path(root, path);
979 return 0; 1051 return 0;
@@ -1150,8 +1222,7 @@ insert:
1150 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1222 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1151 name, name_len, log_type, &log_key); 1223 name, name_len, log_type, &log_key);
1152 1224
1153 if (ret && ret != -ENOENT) 1225 BUG_ON(ret && ret != -ENOENT);
1154 BUG();
1155 goto out; 1226 goto out;
1156} 1227}
1157 1228
@@ -1313,11 +1384,11 @@ again:
1313 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1384 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1314 name_len); 1385 name_len);
1315 log_di = NULL; 1386 log_di = NULL;
1316 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1387 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1317 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1388 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1318 dir_key->objectid, 1389 dir_key->objectid,
1319 name, name_len, 0); 1390 name, name_len, 0);
1320 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1391 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1321 log_di = btrfs_lookup_dir_index_item(trans, log, 1392 log_di = btrfs_lookup_dir_index_item(trans, log,
1322 log_path, 1393 log_path,
1323 dir_key->objectid, 1394 dir_key->objectid,
@@ -1378,7 +1449,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1378 struct btrfs_root *root, 1449 struct btrfs_root *root,
1379 struct btrfs_root *log, 1450 struct btrfs_root *log,
1380 struct btrfs_path *path, 1451 struct btrfs_path *path,
1381 u64 dirid) 1452 u64 dirid, int del_all)
1382{ 1453{
1383 u64 range_start; 1454 u64 range_start;
1384 u64 range_end; 1455 u64 range_end;
@@ -1408,10 +1479,14 @@ again:
1408 range_start = 0; 1479 range_start = 0;
1409 range_end = 0; 1480 range_end = 0;
1410 while (1) { 1481 while (1) {
1411 ret = find_dir_range(log, path, dirid, key_type, 1482 if (del_all)
1412 &range_start, &range_end); 1483 range_end = (u64)-1;
1413 if (ret != 0) 1484 else {
1414 break; 1485 ret = find_dir_range(log, path, dirid, key_type,
1486 &range_start, &range_end);
1487 if (ret != 0)
1488 break;
1489 }
1415 1490
1416 dir_key.offset = range_start; 1491 dir_key.offset = range_start;
1417 while (1) { 1492 while (1) {
@@ -1437,7 +1512,8 @@ again:
1437 break; 1512 break;
1438 1513
1439 ret = check_item_in_log(trans, root, log, path, 1514 ret = check_item_in_log(trans, root, log, path,
1440 log_path, dir, &found_key); 1515 log_path, dir,
1516 &found_key);
1441 BUG_ON(ret); 1517 BUG_ON(ret);
1442 if (found_key.offset == (u64)-1) 1518 if (found_key.offset == (u64)-1)
1443 break; 1519 break;
@@ -1514,7 +1590,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1514 mode = btrfs_inode_mode(eb, inode_item); 1590 mode = btrfs_inode_mode(eb, inode_item);
1515 if (S_ISDIR(mode)) { 1591 if (S_ISDIR(mode)) {
1516 ret = replay_dir_deletes(wc->trans, 1592 ret = replay_dir_deletes(wc->trans,
1517 root, log, path, key.objectid); 1593 root, log, path, key.objectid, 0);
1518 BUG_ON(ret); 1594 BUG_ON(ret);
1519 } 1595 }
1520 ret = overwrite_item(wc->trans, root, path, 1596 ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1609,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1533 root, inode, inode->i_size, 1609 root, inode, inode->i_size,
1534 BTRFS_EXTENT_DATA_KEY); 1610 BTRFS_EXTENT_DATA_KEY);
1535 BUG_ON(ret); 1611 BUG_ON(ret);
1612
1613 /* if the nlink count is zero here, the iput
1614 * will free the inode. We bump it to make
1615 * sure it doesn't get freed until the link
1616 * count fixup is done
1617 */
1618 if (inode->i_nlink == 0) {
1619 btrfs_inc_nlink(inode);
1620 btrfs_update_inode(wc->trans,
1621 root, inode);
1622 }
1536 iput(inode); 1623 iput(inode);
1537 } 1624 }
1538 ret = link_to_fixup_dir(wc->trans, root, 1625 ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1927,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
1840 return ret; 1927 return ret;
1841} 1928}
1842 1929
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid) 1930static int wait_log_commit(struct btrfs_trans_handle *trans,
1931 struct btrfs_root *root, unsigned long transid)
1844{ 1932{
1845 DEFINE_WAIT(wait); 1933 DEFINE_WAIT(wait);
1846 int index = transid % 2; 1934 int index = transid % 2;
@@ -1854,9 +1942,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1854 prepare_to_wait(&root->log_commit_wait[index], 1942 prepare_to_wait(&root->log_commit_wait[index],
1855 &wait, TASK_UNINTERRUPTIBLE); 1943 &wait, TASK_UNINTERRUPTIBLE);
1856 mutex_unlock(&root->log_mutex); 1944 mutex_unlock(&root->log_mutex);
1857 if (root->log_transid < transid + 2 && 1945
1946 if (root->fs_info->last_trans_log_full_commit !=
1947 trans->transid && root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index])) 1948 atomic_read(&root->log_commit[index]))
1859 schedule(); 1949 schedule();
1950
1860 finish_wait(&root->log_commit_wait[index], &wait); 1951 finish_wait(&root->log_commit_wait[index], &wait);
1861 mutex_lock(&root->log_mutex); 1952 mutex_lock(&root->log_mutex);
1862 } while (root->log_transid < transid + 2 && 1953 } while (root->log_transid < transid + 2 &&
@@ -1864,14 +1955,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1864 return 0; 1955 return 0;
1865} 1956}
1866 1957
1867static int wait_for_writer(struct btrfs_root *root) 1958static int wait_for_writer(struct btrfs_trans_handle *trans,
1959 struct btrfs_root *root)
1868{ 1960{
1869 DEFINE_WAIT(wait); 1961 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) { 1962 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait, 1963 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE); 1964 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex); 1965 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers)) 1966 if (root->fs_info->last_trans_log_full_commit !=
1967 trans->transid && atomic_read(&root->log_writers))
1875 schedule(); 1968 schedule();
1876 mutex_lock(&root->log_mutex); 1969 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait); 1970 finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1975,14 @@ static int wait_for_writer(struct btrfs_root *root)
1882/* 1975/*
1883 * btrfs_sync_log sends a given tree log down to the disk and 1976 * btrfs_sync_log sends a given tree log down to the disk and
1884 * updates the super blocks to record it. When this call is done, 1977 * updates the super blocks to record it. When this call is done,
1885 * you know that any inodes previously logged are safely on disk 1978 * you know that any inodes previously logged are safely on disk only
1979 * if it returns 0.
1980 *
1981 * Any other return value means you need to call btrfs_commit_transaction.
1982 * Some of the edge cases for fsyncing directories that have had unlinks
1983 * or renames done in the past mean that sometimes the only safe
1984 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
1985 * that has happened.
1886 */ 1986 */
1887int btrfs_sync_log(struct btrfs_trans_handle *trans, 1987int btrfs_sync_log(struct btrfs_trans_handle *trans,
1888 struct btrfs_root *root) 1988 struct btrfs_root *root)
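A minimal sketch of the caller contract documented above; the helper name is invented and assumes the fsync path already holds a transaction handle:

/* invented helper showing the documented return contract */
static int example_fsync_tail(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
{
	int ret = btrfs_sync_log(trans, root);

	if (ret == 0)
		return btrfs_end_transaction(trans, root);
	/* -EAGAIN: an earlier unlink/rename means only a full
	 * transaction commit is safe */
	return btrfs_commit_transaction(trans, root);
}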
@@ -1896,7 +1996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1896 mutex_lock(&root->log_mutex); 1996 mutex_lock(&root->log_mutex);
1897 index1 = root->log_transid % 2; 1997 index1 = root->log_transid % 2;
1898 if (atomic_read(&root->log_commit[index1])) { 1998 if (atomic_read(&root->log_commit[index1])) {
1899 wait_log_commit(root, root->log_transid); 1999 wait_log_commit(trans, root, root->log_transid);
1900 mutex_unlock(&root->log_mutex); 2000 mutex_unlock(&root->log_mutex);
1901 return 0; 2001 return 0;
1902 } 2002 }
@@ -1904,18 +2004,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1904 2004
1905 /* wait for previous tree log sync to complete */ 2005 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2006 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1); 2007 wait_log_commit(trans, root, root->log_transid - 1);
1908 2008
1909 while (1) { 2009 while (1) {
1910 unsigned long batch = root->log_batch; 2010 unsigned long batch = root->log_batch;
1911 mutex_unlock(&root->log_mutex); 2011 mutex_unlock(&root->log_mutex);
1912 schedule_timeout_uninterruptible(1); 2012 schedule_timeout_uninterruptible(1);
1913 mutex_lock(&root->log_mutex); 2013 mutex_lock(&root->log_mutex);
1914 wait_for_writer(root); 2014
2015 wait_for_writer(trans, root);
1915 if (batch == root->log_batch) 2016 if (batch == root->log_batch)
1916 break; 2017 break;
1917 } 2018 }
1918 2019
2020 /* bail out if we need to do a full commit */
2021 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2022 ret = -EAGAIN;
2023 mutex_unlock(&root->log_mutex);
2024 goto out;
2025 }
2026
1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2027 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1920 BUG_ON(ret); 2028 BUG_ON(ret);
1921 2029
@@ -1951,16 +2059,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1951 2059
1952 index2 = log_root_tree->log_transid % 2; 2060 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) { 2061 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid); 2062 wait_log_commit(trans, log_root_tree,
2063 log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex); 2064 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out; 2065 goto out;
1957 } 2066 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1); 2067 atomic_set(&log_root_tree->log_commit[index2], 1);
1959 2068
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) 2069 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); 2070 wait_log_commit(trans, log_root_tree,
2071 log_root_tree->log_transid - 1);
2072 }
2073
2074 wait_for_writer(trans, log_root_tree);
1962 2075
1963 wait_for_writer(log_root_tree); 2076 /*
2077 * now that we've moved on to the tree of log tree roots,
2078 * check the full commit flag again
2079 */
2080 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2081 mutex_unlock(&log_root_tree->log_mutex);
2082 ret = -EAGAIN;
2083 goto out_wake_log_root;
2084 }
1964 2085
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2086 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages); 2087 &log_root_tree->dirty_log_pages);
@@ -1985,7 +2106,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1985 * in and cause problems either. 2106 * in and cause problems either.
1986 */ 2107 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2); 2108 write_ctree_super(trans, root->fs_info->tree_root, 2);
2109 ret = 0;
1988 2110
2111out_wake_log_root:
1989 atomic_set(&log_root_tree->log_commit[index2], 0); 2112 atomic_set(&log_root_tree->log_commit[index2], 0);
1990 smp_mb(); 2113 smp_mb();
1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2114 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2121,8 @@ out:
1998 return 0; 2121 return 0;
1999} 2122}
2000 2123
2001/* * free all the extents used by the tree log. This should be called 2124/*
2125 * free all the extents used by the tree log. This should be called
2002 * at commit time of the full transaction 2126 * at commit time of the full transaction
2003 */ 2127 */
2004int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2128int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2256,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2132 2256
2133 btrfs_free_path(path); 2257 btrfs_free_path(path);
2134 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2258 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2135 end_log_trans(root); 2259 btrfs_end_log_trans(root);
2136 2260
2137 return 0; 2261 return 0;
2138} 2262}
@@ -2159,7 +2283,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2159 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2283 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2160 dirid, &index); 2284 dirid, &index);
2161 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2285 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2162 end_log_trans(root); 2286 btrfs_end_log_trans(root);
2163 2287
2164 return ret; 2288 return ret;
2165} 2289}
@@ -2559,7 +2683,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2559 * 2683 *
2560 * This handles both files and directories. 2684 * This handles both files and directories.
2561 */ 2685 */
2562static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2686static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root, struct inode *inode, 2687 struct btrfs_root *root, struct inode *inode,
2564 int inode_only) 2688 int inode_only)
2565{ 2689{
@@ -2585,28 +2709,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2585 min_key.offset = 0; 2709 min_key.offset = 0;
2586 2710
2587 max_key.objectid = inode->i_ino; 2711 max_key.objectid = inode->i_ino;
2712
2713 /* today the code can only do partial logging of directories */
2714 if (!S_ISDIR(inode->i_mode))
2715 inode_only = LOG_INODE_ALL;
2716
2588 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2717 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2589 max_key.type = BTRFS_XATTR_ITEM_KEY; 2718 max_key.type = BTRFS_XATTR_ITEM_KEY;
2590 else 2719 else
2591 max_key.type = (u8)-1; 2720 max_key.type = (u8)-1;
2592 max_key.offset = (u64)-1; 2721 max_key.offset = (u64)-1;
2593 2722
2594 /*
2595 * if this inode has already been logged and we're in inode_only
2596 * mode, we don't want to delete the things that have already
2597 * been written to the log.
2598 *
2599 * But, if the inode has been through an inode_only log,
2600 * the logged_trans field is not set. This allows us to catch
2601 * any new names for this inode in the backrefs by logging it
2602 * again
2603 */
2604 if (inode_only == LOG_INODE_EXISTS &&
2605 BTRFS_I(inode)->logged_trans == trans->transid) {
2606 btrfs_free_path(path);
2607 btrfs_free_path(dst_path);
2608 goto out;
2609 }
2610 mutex_lock(&BTRFS_I(inode)->log_mutex); 2723 mutex_lock(&BTRFS_I(inode)->log_mutex);
2611 2724
2612 /* 2725 /*
@@ -2693,7 +2806,6 @@ next_slot:
2693 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2806 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2694 btrfs_release_path(root, path); 2807 btrfs_release_path(root, path);
2695 btrfs_release_path(log, dst_path); 2808 btrfs_release_path(log, dst_path);
2696 BTRFS_I(inode)->log_dirty_trans = 0;
2697 ret = log_directory_changes(trans, root, inode, path, dst_path); 2809 ret = log_directory_changes(trans, root, inode, path, dst_path);
2698 BUG_ON(ret); 2810 BUG_ON(ret);
2699 } 2811 }
@@ -2702,19 +2814,69 @@ next_slot:
2702 2814
2703 btrfs_free_path(path); 2815 btrfs_free_path(path);
2704 btrfs_free_path(dst_path); 2816 btrfs_free_path(dst_path);
2705out:
2706 return 0; 2817 return 0;
2707} 2818}
2708 2819
2709int btrfs_log_inode(struct btrfs_trans_handle *trans, 2820/*
2710 struct btrfs_root *root, struct inode *inode, 2821 * follow the dentry parent pointers up the chain and see if any
2711 int inode_only) 2822 * of the directories in it require a full commit before they can
2823 * be logged. Returns zero if nothing special needs to be done or 1 if
2824 * a full commit is required.
2825 */
2826static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2827 struct inode *inode,
2828 struct dentry *parent,
2829 struct super_block *sb,
2830 u64 last_committed)
2712{ 2831{
2713 int ret; 2832 int ret = 0;
2833 struct btrfs_root *root;
2714 2834
2715 start_log_trans(trans, root); 2835 /*
2716 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2836 * for regular files, if its inode is already on disk, we don't
2717 end_log_trans(root); 2837 * have to worry about the parents at all. This is because
2838 * we can use the last_unlink_trans field to record renames
2839 * and other fun in this file.
2840 */
2841 if (S_ISREG(inode->i_mode) &&
2842 BTRFS_I(inode)->generation <= last_committed &&
2843 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2844 goto out;
2845
2846 if (!S_ISDIR(inode->i_mode)) {
2847 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2848 goto out;
2849 inode = parent->d_inode;
2850 }
2851
2852 while (1) {
2853 BTRFS_I(inode)->logged_trans = trans->transid;
2854 smp_mb();
2855
2856 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
2857 root = BTRFS_I(inode)->root;
2858
2859 /*
2860 * make sure any commits to the log are forced
2861 * to be full commits
2862 */
2863 root->fs_info->last_trans_log_full_commit =
2864 trans->transid;
2865 ret = 1;
2866 break;
2867 }
2868
2869 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2870 break;
2871
2872 if (parent == sb->s_root)
2873 break;
2874
2875 parent = parent->d_parent;
2876 inode = parent->d_inode;
2877
2878 }
2879out:
2718 return ret; 2880 return ret;
2719} 2881}
2720 2882
@@ -2724,31 +2886,70 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 * only logging is done of any parent directories that are older than 2886 * only logging is done of any parent directories that are older than
2725 * the last committed transaction 2887 * the last committed transaction
2726 */ 2888 */
2727int btrfs_log_dentry(struct btrfs_trans_handle *trans, 2889int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2728 struct btrfs_root *root, struct dentry *dentry) 2890 struct btrfs_root *root, struct inode *inode,
2891 struct dentry *parent, int exists_only)
2729{ 2892{
2730 int inode_only = LOG_INODE_ALL; 2893 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2731 struct super_block *sb; 2894 struct super_block *sb;
2732 int ret; 2895 int ret = 0;
2896 u64 last_committed = root->fs_info->last_trans_committed;
2897
2898 sb = inode->i_sb;
2899
2900 if (btrfs_test_opt(root, NOTREELOG)) {
2901 ret = 1;
2902 goto end_no_trans;
2903 }
2904
2905 if (root->fs_info->last_trans_log_full_commit >
2906 root->fs_info->last_trans_committed) {
2907 ret = 1;
2908 goto end_no_trans;
2909 }
2910
2911 ret = check_parent_dirs_for_sync(trans, inode, parent,
2912 sb, last_committed);
2913 if (ret)
2914 goto end_no_trans;
2733 2915
2734 start_log_trans(trans, root); 2916 start_log_trans(trans, root);
2735 sb = dentry->d_inode->i_sb;
2736 while (1) {
2737 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2738 inode_only);
2739 BUG_ON(ret);
2740 inode_only = LOG_INODE_EXISTS;
2741 2917
2742 dentry = dentry->d_parent; 2918 ret = btrfs_log_inode(trans, root, inode, inode_only);
2743 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) 2919 BUG_ON(ret);
2920
2921 /*
2922 * for a regular file, if its inode is already on disk, we don't
2923 * have to worry about the parents at all. This is because
2924 * we can use the last_unlink_trans field to record renames
2925 * and other fun in this file.
2926 */
2927 if (S_ISREG(inode->i_mode) &&
2928 BTRFS_I(inode)->generation <= last_committed &&
2929 BTRFS_I(inode)->last_unlink_trans <= last_committed)
2930 goto no_parent;
2931
2932 inode_only = LOG_INODE_EXISTS;
2933 while (1) {
2934 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
2744 break; 2935 break;
2745 2936
2746 if (BTRFS_I(dentry->d_inode)->generation <= 2937 inode = parent->d_inode;
2747 root->fs_info->last_trans_committed) 2938 if (BTRFS_I(inode)->generation >
2939 root->fs_info->last_trans_committed) {
2940 ret = btrfs_log_inode(trans, root, inode, inode_only);
2941 BUG_ON(ret);
2942 }
2943 if (parent == sb->s_root)
2748 break; 2944 break;
2945
2946 parent = parent->d_parent;
2749 } 2947 }
2750 end_log_trans(root); 2948no_parent:
2751 return 0; 2949 ret = 0;
2950 btrfs_end_log_trans(root);
2951end_no_trans:
2952 return ret;
2752} 2953}
2753 2954
2754/* 2955/*
@@ -2760,12 +2961,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2760int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2961int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2761 struct btrfs_root *root, struct dentry *dentry) 2962 struct btrfs_root *root, struct dentry *dentry)
2762{ 2963{
2763 u64 gen; 2964 return btrfs_log_inode_parent(trans, root, dentry->d_inode,
2764 gen = root->fs_info->last_trans_new_blockgroup; 2965 dentry->d_parent, 0);
2765 if (gen > root->fs_info->last_trans_committed)
2766 return 1;
2767 else
2768 return btrfs_log_dentry(trans, root, dentry);
2769} 2966}
2770 2967
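A sketch of how an fsync implementation consumes the return value; the helper is invented, and flushing the log itself (btrfs_sync_log()) is sketched separately above:

/* invented helper: try the tree log first, fall back to a commit */
static int example_try_tree_log(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct dentry *dentry)
{
	int ret = btrfs_log_dentry_safe(trans, root, dentry);

	if (ret > 0)	/* logging refused; only a full commit is safe */
		return btrfs_commit_transaction(trans, root);
	return ret;	/* 0: logged; btrfs_sync_log() pushes it to disk */
}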
2771/* 2968/*
@@ -2884,3 +3081,94 @@ again:
2884 kfree(log_root_tree); 3081 kfree(log_root_tree);
2885 return 0; 3082 return 0;
2886} 3083}
3084
3085/*
3086 * there are some corner cases where we want to force a full
3087 * commit instead of allowing a directory to be logged.
3088 *
3089 * They revolve around files that were unlinked from the directory, and
3090 * this function updates the parent directory so that a full commit is
3091 * properly done if it is fsync'd later after the unlinks are done.
3092 */
3093void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3094 struct inode *dir, struct inode *inode,
3095 int for_rename)
3096{
3097 /*
3098 * when we're logging a file, if it hasn't been renamed
3099 * or unlinked, and its inode is fully committed on disk,
3100 * we don't have to worry about walking up the directory chain
3101 * to log its parents.
3102 *
3103 * So, we use the last_unlink_trans field to store this transid
3104 * in the inode. When the file is logged we check it and
3105 * don't log the parents if the file is fully on disk.
3106 */
3107 if (S_ISREG(inode->i_mode))
3108 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3109
3110 /*
3111 * if this directory was already logged any new
3112 * names for this file/dir will get recorded
3113 */
3114 smp_mb();
3115 if (BTRFS_I(dir)->logged_trans == trans->transid)
3116 return;
3117
3118 /*
3119 * if the inode we're about to unlink was logged,
3120 * the log will be properly updated for any new names
3121 */
3122 if (BTRFS_I(inode)->logged_trans == trans->transid)
3123 return;
3124
3125 /*
3126 * when renaming files across directories, if the directory
3127 * we're unlinking from gets fsync'd later on, there's
3128 * no way to find the destination directory later and fsync it
3129 * properly. So, we have to be conservative and force commits
3130 * so the new name gets discovered.
3131 */
3132 if (for_rename)
3133 goto record;
3134
3135 /* we can safely do the unlink without any special recording */
3136 return;
3137
3138record:
3139 BTRFS_I(dir)->last_unlink_trans = trans->transid;
3140}
3141
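A sketch of the expected call site; the helper and its exact placement in the unlink path are assumptions, not code from this merge:

/* invented call-site sketch: record before the name goes away */
static void example_unlink_hook(struct btrfs_trans_handle *trans,
				struct inode *dir, struct inode *inode)
{
	/* a later fsync of 'dir' compares last_unlink_trans against
	 * the last committed transaction and falls back to a full
	 * commit when the unlink is too recent */
	btrfs_record_unlink_dir(trans, dir, inode, 0);
	/* ... the actual removal of the name happens after this ... */
}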
3142/*
3143 * Call this after adding a new name for a file and it will properly
3144 * update the log to reflect the new name.
3145 *
3146 * It will return zero if all goes well, and it will return 1 if a
3147 * full transaction commit is required.
3148 */
3149int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3150 struct inode *inode, struct inode *old_dir,
3151 struct dentry *parent)
3152{
3153 struct btrfs_root * root = BTRFS_I(inode)->root;
3154
3155 /*
3156 * this will force the logging code to walk the dentry chain
3157 * up for the file
3158 */
3159 if (S_ISREG(inode->i_mode))
3160 BTRFS_I(inode)->last_unlink_trans = trans->transid;
3161
3162 /*
3163 * if this inode hasn't been logged and the directory we're renaming it
3164 * from hasn't been logged, we don't need to log it
3165 */
3166 if (BTRFS_I(inode)->logged_trans <=
3167 root->fs_info->last_trans_committed &&
3168 (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3169 root->fs_info->last_trans_committed))
3170 return 0;
3171
3172 return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3173}
3174
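A sketch of the rename-side pairing implied by the two hooks; the helper name and shape are invented:

/* invented rename-side sketch pairing the two hooks above */
static int example_rename_hooks(struct btrfs_trans_handle *trans,
				struct inode *old_dir, struct inode *inode,
				struct dentry *new_dentry)
{
	btrfs_record_unlink_dir(trans, old_dir, inode, 1 /* for_rename */);
	/* ... the name moves from old_dir to its new parent here ... */

	/* returns 1 when the next fsync must do a full commit instead */
	return btrfs_log_new_name(trans, inode, old_dir,
				  new_dentry->d_parent);
}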
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed02..d09c7609e16b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
22int btrfs_sync_log(struct btrfs_trans_handle *trans, 22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root); 23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root); 25int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 26int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry); 27 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 28int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, 29 struct btrfs_root *root,
35 const char *name, int name_len, 30 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 33 struct btrfs_root *root,
39 const char *name, int name_len, 34 const char *name, int name_len,
40 struct inode *inode, u64 dirid); 35 struct inode *inode, u64 dirid);
36int btrfs_join_running_log_trans(struct btrfs_root *root);
37int btrfs_end_log_trans(struct btrfs_root *root);
38int btrfs_pin_log_trans(struct btrfs_root *root);
39int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root, struct inode *inode,
41 struct dentry *parent, int exists_only);
42void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
43 struct inode *dir, struct inode *inode,
44 int for_rename);
45int btrfs_log_new_name(struct btrfs_trans_handle *trans,
46 struct inode *inode, struct inode *old_dir,
47 struct dentry *parent);
41#endif 48#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1316139bf9e8..e0913e469728 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,6 +20,7 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/iocontext.h>
23#include <asm/div64.h> 24#include <asm/div64.h>
24#include "compat.h" 25#include "compat.h"
25#include "ctree.h" 26#include "ctree.h"
@@ -145,8 +146,9 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
145 int again = 0; 146 int again = 0;
146 unsigned long num_run = 0; 147 unsigned long num_run = 0;
147 unsigned long limit; 148 unsigned long limit;
149 unsigned long last_waited = 0;
148 150
149 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; 151 bdi = blk_get_backing_dev_info(device->bdev);
150 fs_info = device->dev_root->fs_info; 152 fs_info = device->dev_root->fs_info;
151 limit = btrfs_async_submit_limit(fs_info); 153 limit = btrfs_async_submit_limit(fs_info);
152 limit = limit * 2 / 3; 154 limit = limit * 2 / 3;
@@ -207,7 +209,32 @@ loop_lock:
207 if (pending && bdi_write_congested(bdi) && num_run > 16 && 209 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
208 fs_info->fs_devices->open_devices > 1) { 210 fs_info->fs_devices->open_devices > 1) {
209 struct bio *old_head; 211 struct bio *old_head;
212 struct io_context *ioc;
210 213
214 ioc = current->io_context;
215
216 /*
217 * the main goal here is that we don't want to
218 * block if we're going to be able to submit
219 * more requests without blocking.
220 *
221 * This code does two great things: it pokes into
222 * the elevator code from a filesystem _and_
223 * it makes assumptions about how batching works.
224 */
225 if (ioc && ioc->nr_batch_requests > 0 &&
226 time_before(jiffies, ioc->last_waited + HZ/50UL) &&
227 (last_waited == 0 ||
228 ioc->last_waited == last_waited)) {
229 /*
230 * we want to go through our batch of
231 * requests and stop. So, we copy out
232 * the ioc->last_waited time and test
233 * against it before looping
234 */
235 last_waited = ioc->last_waited;
236 continue;
237 }
211 spin_lock(&device->io_lock); 238 spin_lock(&device->io_lock);
212 239
213 old_head = device->pending_bios; 240 old_head = device->pending_bios;
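For concreteness, the batching window tested above works out as follows (the HZ value is assumed):

/* assuming HZ == 250: HZ/50UL == 5 jiffies == 20ms, so the loop keeps
 * submitting without blocking while it is within ~20ms of
 * ioc->last_waited, i.e. still inside the elevator's batch window */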
@@ -231,6 +258,18 @@ loop_lock:
231 if (device->pending_bios) 258 if (device->pending_bios)
232 goto loop_lock; 259 goto loop_lock;
233 spin_unlock(&device->io_lock); 260 spin_unlock(&device->io_lock);
261
262 /*
263 * IO has already been through a long path to get here. Checksumming,
264 * async helper threads, perhaps compression. We've done a pretty
265 * good job of collecting a batch of IO and should just unplug
266 * the device right away.
267 *
268 * This will help anyone who is waiting on the IO; they might have
269 * already unplugged, but managed to do so before the bio they
270 * cared about found its way down here.
271 */
272 blk_run_backing_dev(bdi, NULL);
234done: 273done:
235 return 0; 274 return 0;
236} 275}
@@ -1374,6 +1413,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1374 ret = btrfs_add_device(trans, root, device); 1413 ret = btrfs_add_device(trans, root, device);
1375 } 1414 }
1376 1415
1416 /*
1417 * we've got more storage, clear any full flags on the space
1418 * infos
1419 */
1420 btrfs_clear_space_info_full(root->fs_info);
1421
1377 unlock_chunks(root); 1422 unlock_chunks(root);
1378 btrfs_commit_transaction(trans, root); 1423 btrfs_commit_transaction(trans, root);
1379 1424
@@ -1459,6 +1504,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1459 device->fs_devices->total_rw_bytes += diff; 1504 device->fs_devices->total_rw_bytes += diff;
1460 1505
1461 device->total_bytes = new_size; 1506 device->total_bytes = new_size;
1507 btrfs_clear_space_info_full(device->dev_root->fs_info);
1508
1462 return btrfs_update_device(trans, device); 1509 return btrfs_update_device(trans, device);
1463} 1510}
1464 1511
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86c44e9ae110..2185de72ff7d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -76,7 +76,7 @@ struct btrfs_device {
76struct btrfs_fs_devices { 76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78 78
79 /* the device with this id has the most recent coyp of the super */ 79 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid; 80 u64 latest_devid;
81 u64 latest_trans; 81 u64 latest_trans;
82 u64 num_devices; 82 u64 num_devices;
diff --git a/fs/buffer.c b/fs/buffer.c
index 9f697419ed8e..5d55a896ff78 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -166,151 +166,6 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
166} 166}
167 167
168/* 168/*
169 * Write out and wait upon all the dirty data associated with a block
170 * device via its mapping. Does not take the superblock lock.
171 */
172int sync_blockdev(struct block_device *bdev)
173{
174 int ret = 0;
175
176 if (bdev)
177 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
178 return ret;
179}
180EXPORT_SYMBOL(sync_blockdev);
181
182/*
183 * Write out and wait upon all dirty data associated with this
184 * device. Filesystem data as well as the underlying block
185 * device. Takes the superblock lock.
186 */
187int fsync_bdev(struct block_device *bdev)
188{
189 struct super_block *sb = get_super(bdev);
190 if (sb) {
191 int res = fsync_super(sb);
192 drop_super(sb);
193 return res;
194 }
195 return sync_blockdev(bdev);
196}
197
198/**
199 * freeze_bdev -- lock a filesystem and force it into a consistent state
200 * @bdev: blockdevice to lock
201 *
202 * This takes the block device bd_mount_sem to make sure no new mounts
203 * happen on bdev until thaw_bdev() is called.
204 * If a superblock is found on this device, we take the s_umount semaphore
205 * on it to make sure nobody unmounts until the snapshot creation is done.
206 * The reference counter (bd_fsfreeze_count) guarantees that only the last
207 * unfreeze process can unfreeze the frozen filesystem actually when multiple
208 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
209 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
210 * actually.
211 */
212struct super_block *freeze_bdev(struct block_device *bdev)
213{
214 struct super_block *sb;
215 int error = 0;
216
217 mutex_lock(&bdev->bd_fsfreeze_mutex);
218 if (bdev->bd_fsfreeze_count > 0) {
219 bdev->bd_fsfreeze_count++;
220 sb = get_super(bdev);
221 mutex_unlock(&bdev->bd_fsfreeze_mutex);
222 return sb;
223 }
224 bdev->bd_fsfreeze_count++;
225
226 down(&bdev->bd_mount_sem);
227 sb = get_super(bdev);
228 if (sb && !(sb->s_flags & MS_RDONLY)) {
229 sb->s_frozen = SB_FREEZE_WRITE;
230 smp_wmb();
231
232 __fsync_super(sb);
233
234 sb->s_frozen = SB_FREEZE_TRANS;
235 smp_wmb();
236
237 sync_blockdev(sb->s_bdev);
238
239 if (sb->s_op->freeze_fs) {
240 error = sb->s_op->freeze_fs(sb);
241 if (error) {
242 printk(KERN_ERR
243 "VFS:Filesystem freeze failed\n");
244 sb->s_frozen = SB_UNFROZEN;
245 drop_super(sb);
246 up(&bdev->bd_mount_sem);
247 bdev->bd_fsfreeze_count--;
248 mutex_unlock(&bdev->bd_fsfreeze_mutex);
249 return ERR_PTR(error);
250 }
251 }
252 }
253
254 sync_blockdev(bdev);
255 mutex_unlock(&bdev->bd_fsfreeze_mutex);
256
257 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
258}
259EXPORT_SYMBOL(freeze_bdev);
260
261/**
262 * thaw_bdev -- unlock filesystem
263 * @bdev: blockdevice to unlock
264 * @sb: associated superblock
265 *
266 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
267 */
268int thaw_bdev(struct block_device *bdev, struct super_block *sb)
269{
270 int error = 0;
271
272 mutex_lock(&bdev->bd_fsfreeze_mutex);
273 if (!bdev->bd_fsfreeze_count) {
274 mutex_unlock(&bdev->bd_fsfreeze_mutex);
275 return -EINVAL;
276 }
277
278 bdev->bd_fsfreeze_count--;
279 if (bdev->bd_fsfreeze_count > 0) {
280 if (sb)
281 drop_super(sb);
282 mutex_unlock(&bdev->bd_fsfreeze_mutex);
283 return 0;
284 }
285
286 if (sb) {
287 BUG_ON(sb->s_bdev != bdev);
288 if (!(sb->s_flags & MS_RDONLY)) {
289 if (sb->s_op->unfreeze_fs) {
290 error = sb->s_op->unfreeze_fs(sb);
291 if (error) {
292 printk(KERN_ERR
293 "VFS:Filesystem thaw failed\n");
294 sb->s_frozen = SB_FREEZE_TRANS;
295 bdev->bd_fsfreeze_count++;
296 mutex_unlock(&bdev->bd_fsfreeze_mutex);
297 return error;
298 }
299 }
300 sb->s_frozen = SB_UNFROZEN;
301 smp_wmb();
302 wake_up(&sb->s_wait_unfrozen);
303 }
304 drop_super(sb);
305 }
306
307 up(&bdev->bd_mount_sem);
308 mutex_unlock(&bdev->bd_fsfreeze_mutex);
309 return 0;
310}
311EXPORT_SYMBOL(thaw_bdev);
312
313/*
314 * Various filesystems appear to want __find_get_block to be non-blocking. 169 * Various filesystems appear to want __find_get_block to be non-blocking.
315 * But it's the page lock which protects the buffers. To get around this, 170 * But it's the page lock which protects the buffers. To get around this,
316 * we get exclusion from try_to_free_buffers with the blockdev mapping's 171 * we get exclusion from try_to_free_buffers with the blockdev mapping's
@@ -344,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
344 head = page_buffers(page); 199 head = page_buffers(page);
345 bh = head; 200 bh = head;
346 do { 201 do {
347 if (bh->b_blocknr == block) { 202 if (!buffer_mapped(bh))
203 all_mapped = 0;
204 else if (bh->b_blocknr == block) {
348 ret = bh; 205 ret = bh;
349 get_bh(bh); 206 get_bh(bh);
350 goto out_unlock; 207 goto out_unlock;
351 } 208 }
352 if (!buffer_mapped(bh))
353 all_mapped = 0;
354 bh = bh->b_this_page; 209 bh = bh->b_this_page;
355 } while (bh != head); 210 } while (bh != head);
356 211
@@ -435,7 +290,7 @@ static void free_more_memory(void)
435 &zone); 290 &zone);
436 if (zone) 291 if (zone)
437 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, 292 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
438 GFP_NOFS); 293 GFP_NOFS, NULL);
439 } 294 }
440} 295}
441 296
@@ -692,6 +547,39 @@ repeat:
692 return err; 547 return err;
693} 548}
694 549
550void do_thaw_all(unsigned long unused)
551{
552 struct super_block *sb;
553 char b[BDEVNAME_SIZE];
554
555 spin_lock(&sb_lock);
556restart:
557 list_for_each_entry(sb, &super_blocks, s_list) {
558 sb->s_count++;
559 spin_unlock(&sb_lock);
560 down_read(&sb->s_umount);
561 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
562 printk(KERN_WARNING "Emergency Thaw on %s\n",
563 bdevname(sb->s_bdev, b));
564 up_read(&sb->s_umount);
565 spin_lock(&sb_lock);
566 if (__put_super_and_need_restart(sb))
567 goto restart;
568 }
569 spin_unlock(&sb_lock);
570 printk(KERN_WARNING "Emergency Thaw complete\n");
571}
572
573/**
574 * emergency_thaw_all -- forcibly thaw every frozen filesystem
575 *
576 * Used for emergency unfreeze of all filesystems via SysRq
577 */
578void emergency_thaw_all(void)
579{
580 pdflush_operation(do_thaw_all, 0);
581}
582
695/** 583/**
696 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 584 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
697 * @mapping: the mapping which wants those buffers written 585 * @mapping: the mapping which wants those buffers written
@@ -760,33 +648,18 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
760 * If warn is true, then emit a warning if the page is not uptodate and has 648 * If warn is true, then emit a warning if the page is not uptodate and has
761 * not been truncated. 649 * not been truncated.
762 */ 650 */
763static int __set_page_dirty(struct page *page, 651static void __set_page_dirty(struct page *page,
764 struct address_space *mapping, int warn) 652 struct address_space *mapping, int warn)
765{ 653{
766 if (unlikely(!mapping))
767 return !TestSetPageDirty(page);
768
769 if (TestSetPageDirty(page))
770 return 0;
771
772 spin_lock_irq(&mapping->tree_lock); 654 spin_lock_irq(&mapping->tree_lock);
773 if (page->mapping) { /* Race with truncate? */ 655 if (page->mapping) { /* Race with truncate? */
774 WARN_ON_ONCE(warn && !PageUptodate(page)); 656 WARN_ON_ONCE(warn && !PageUptodate(page));
775 657 account_page_dirtied(page, mapping);
776 if (mapping_cap_account_dirty(mapping)) {
777 __inc_zone_page_state(page, NR_FILE_DIRTY);
778 __inc_bdi_stat(mapping->backing_dev_info,
779 BDI_RECLAIMABLE);
780 task_dirty_inc(current);
781 task_io_account_write(PAGE_CACHE_SIZE);
782 }
783 radix_tree_tag_set(&mapping->page_tree, 658 radix_tree_tag_set(&mapping->page_tree,
784 page_index(page), PAGECACHE_TAG_DIRTY); 659 page_index(page), PAGECACHE_TAG_DIRTY);
785 } 660 }
786 spin_unlock_irq(&mapping->tree_lock); 661 spin_unlock_irq(&mapping->tree_lock);
787 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 662 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
788
789 return 1;
790} 663}
791 664
792/* 665/*
@@ -816,6 +689,7 @@ static int __set_page_dirty(struct page *page,
816 */ 689 */
817int __set_page_dirty_buffers(struct page *page) 690int __set_page_dirty_buffers(struct page *page)
818{ 691{
692 int newly_dirty;
819 struct address_space *mapping = page_mapping(page); 693 struct address_space *mapping = page_mapping(page);
820 694
821 if (unlikely(!mapping)) 695 if (unlikely(!mapping))
@@ -831,9 +705,12 @@ int __set_page_dirty_buffers(struct page *page)
831 bh = bh->b_this_page; 705 bh = bh->b_this_page;
832 } while (bh != head); 706 } while (bh != head);
833 } 707 }
708 newly_dirty = !TestSetPageDirty(page);
834 spin_unlock(&mapping->private_lock); 709 spin_unlock(&mapping->private_lock);
835 710
836 return __set_page_dirty(page, mapping, 1); 711 if (newly_dirty)
712 __set_page_dirty(page, mapping, 1);
713 return newly_dirty;
837} 714}
838EXPORT_SYMBOL(__set_page_dirty_buffers); 715EXPORT_SYMBOL(__set_page_dirty_buffers);
839 716
@@ -1262,8 +1139,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
1262 return; 1139 return;
1263 } 1140 }
1264 1141
1265 if (!test_set_buffer_dirty(bh)) 1142 if (!test_set_buffer_dirty(bh)) {
1266 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); 1143 struct page *page = bh->b_page;
1144 if (!TestSetPageDirty(page))
1145 __set_page_dirty(page, page_mapping(page), 0);
1146 }
1267} 1147}
1268 1148
1269/* 1149/*
@@ -1715,6 +1595,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1715 struct buffer_head *bh, *head; 1595 struct buffer_head *bh, *head;
1716 const unsigned blocksize = 1 << inode->i_blkbits; 1596 const unsigned blocksize = 1 << inode->i_blkbits;
1717 int nr_underway = 0; 1597 int nr_underway = 0;
1598 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
1718 1599
1719 BUG_ON(!PageLocked(page)); 1600 BUG_ON(!PageLocked(page));
1720 1601
@@ -1806,7 +1687,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1806 do { 1687 do {
1807 struct buffer_head *next = bh->b_this_page; 1688 struct buffer_head *next = bh->b_this_page;
1808 if (buffer_async_write(bh)) { 1689 if (buffer_async_write(bh)) {
1809 submit_bh(WRITE, bh); 1690 submit_bh(write_op, bh);
1810 nr_underway++; 1691 nr_underway++;
1811 } 1692 }
1812 bh = next; 1693 bh = next;
@@ -1860,7 +1741,7 @@ recover:
1860 struct buffer_head *next = bh->b_this_page; 1741 struct buffer_head *next = bh->b_this_page;
1861 if (buffer_async_write(bh)) { 1742 if (buffer_async_write(bh)) {
1862 clear_buffer_dirty(bh); 1743 clear_buffer_dirty(bh);
1863 submit_bh(WRITE, bh); 1744 submit_bh(write_op, bh);
1864 nr_underway++; 1745 nr_underway++;
1865 } 1746 }
1866 bh = next; 1747 bh = next;
@@ -2466,13 +2347,14 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
2466 * unlock the page. 2347 * unlock the page.
2467 */ 2348 */
2468int 2349int
2469block_page_mkwrite(struct vm_area_struct *vma, struct page *page, 2350block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2470 get_block_t get_block) 2351 get_block_t get_block)
2471{ 2352{
2353 struct page *page = vmf->page;
2472 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 2354 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2473 unsigned long end; 2355 unsigned long end;
2474 loff_t size; 2356 loff_t size;
2475 int ret = -EINVAL; 2357 int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
2476 2358
2477 lock_page(page); 2359 lock_page(page);
2478 size = i_size_read(inode); 2360 size = i_size_read(inode);
@@ -2492,6 +2374,13 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2492 if (!ret) 2374 if (!ret)
2493 ret = block_commit_write(page, 0, end); 2375 ret = block_commit_write(page, 0, end);
2494 2376
2377 if (unlikely(ret)) {
2378 if (ret == -ENOMEM)
2379 ret = VM_FAULT_OOM;
2380 else /* -ENOSPC, -EIO, etc */
2381 ret = VM_FAULT_SIGBUS;
2382 }
2383
2495out_unlock: 2384out_unlock:
2496 unlock_page(page); 2385 unlock_page(page);
2497 return ret; 2386 return ret;
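A sketch of how a filesystem wires the new signature into its vm_operations_struct; example_get_block is a stand-in for whatever get_block_t the filesystem already uses:

/* invented wrapper; example_get_block is a stand-in get_block_t */
static int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create);

static int example_page_mkwrite(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	/* returns VM_FAULT_NOPAGE, VM_FAULT_OOM or VM_FAULT_SIGBUS
	 * per the mapping added above */
	return block_page_mkwrite(vma, vmf, example_get_block);
}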
@@ -3427,7 +3316,6 @@ EXPORT_SYMBOL(cont_write_begin);
3427EXPORT_SYMBOL(end_buffer_read_sync); 3316EXPORT_SYMBOL(end_buffer_read_sync);
3428EXPORT_SYMBOL(end_buffer_write_sync); 3317EXPORT_SYMBOL(end_buffer_write_sync);
3429EXPORT_SYMBOL(file_fsync); 3318EXPORT_SYMBOL(file_fsync);
3430EXPORT_SYMBOL(fsync_bdev);
3431EXPORT_SYMBOL(generic_block_bmap); 3319EXPORT_SYMBOL(generic_block_bmap);
3432EXPORT_SYMBOL(generic_cont_expand_simple); 3320EXPORT_SYMBOL(generic_cont_expand_simple);
3433EXPORT_SYMBOL(init_buffer); 3321EXPORT_SYMBOL(init_buffer);
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
new file mode 100644
index 000000000000..80e9c6167f0b
--- /dev/null
+++ b/fs/cachefiles/Kconfig
@@ -0,0 +1,39 @@
1
2config CACHEFILES
3 tristate "Filesystem caching on files"
4 depends on FSCACHE && BLOCK
5 help
6 This permits use of a mounted filesystem as a cache for other
7 filesystems - primarily networking filesystems - thus allowing fast
8 local disk to enhance the speed of slower devices.
9
10 See Documentation/filesystems/caching/cachefiles.txt for more
11 information.
12
13config CACHEFILES_DEBUG
14 bool "Debug CacheFiles"
15 depends on CACHEFILES
16 help
17 This permits debugging to be dynamically enabled in the filesystem
18 caching on files module. If this is set, the debugging output may be
19 enabled by setting bits in /sys/module/cachefiles/parameters/debug or
20 by including a debugging specifier in /etc/cachefilesd.conf.
21
22config CACHEFILES_HISTOGRAM
23 bool "Gather latency information on CacheFiles"
24 depends on CACHEFILES && PROC_FS
25 help
26
27 This option causes latency information to be gathered on CacheFiles
28 operation and exported through the file:
29
30 /proc/fs/cachefiles/histogram
31
32 The generation of this histogram adds a certain amount of overhead to
33 execution as there are a number of points at which data is gathered,
34 and on a multi-CPU system these may be on cachelines that keep
35 bouncing between CPUs. On the other hand, the histogram may be
36 useful for debugging purposes. Saying 'N' here is recommended.
37
38 See Documentation/filesystems/caching/cachefiles.txt for more
39 information.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
new file mode 100644
index 000000000000..32cbab0ffce3
--- /dev/null
+++ b/fs/cachefiles/Makefile
@@ -0,0 +1,18 @@
1#
2# Makefile for caching in a mounted filesystem
3#
4
5cachefiles-y := \
6 bind.o \
7 daemon.o \
8 interface.o \
9 key.o \
10 main.o \
11 namei.o \
12 rdwr.o \
13 security.o \
14 xattr.o
15
16cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o
17
18obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
new file mode 100644
index 000000000000..3797e0077b35
--- /dev/null
+++ b/fs/cachefiles/bind.c
@@ -0,0 +1,286 @@
1/* Bind and unbind a cache from the filesystem backing it
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/mount.h>
21#include <linux/statfs.h>
22#include <linux/ctype.h>
23#include "internal.h"
24
25static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache);
26
27/*
28 * bind a directory as a cache
29 */
30int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
31{
32 _enter("{%u,%u,%u,%u,%u,%u},%s",
33 cache->frun_percent,
34 cache->fcull_percent,
35 cache->fstop_percent,
36 cache->brun_percent,
37 cache->bcull_percent,
38 cache->bstop_percent,
39 args);
40
41 /* start by checking things over */
42 ASSERT(cache->fstop_percent >= 0 &&
43 cache->fstop_percent < cache->fcull_percent &&
44 cache->fcull_percent < cache->frun_percent &&
45 cache->frun_percent < 100);
46
47 ASSERT(cache->bstop_percent >= 0 &&
48 cache->bstop_percent < cache->bcull_percent &&
49 cache->bcull_percent < cache->brun_percent &&
50 cache->brun_percent < 100);
51
52 if (*args) {
53 kerror("'bind' command doesn't take an argument");
54 return -EINVAL;
55 }
56
57 if (!cache->rootdirname) {
58 kerror("No cache directory specified");
59 return -EINVAL;
60 }
61
62 /* don't permit already bound caches to be re-bound */
63 if (test_bit(CACHEFILES_READY, &cache->flags)) {
64 kerror("Cache already bound");
65 return -EBUSY;
66 }
67
68 /* make sure we have copies of the tag and dirname strings */
69 if (!cache->tag) {
70 /* the tag string is released by the fops->release()
71 * function, so we don't release it on error here */
72 cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
73 if (!cache->tag)
74 return -ENOMEM;
75 }
76
77 /* add the cache */
78 return cachefiles_daemon_add_cache(cache);
79}
80
81/*
82 * add a cache
83 */
84static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
85{
86 struct cachefiles_object *fsdef;
87 struct nameidata nd;
88 struct kstatfs stats;
89 struct dentry *graveyard, *cachedir, *root;
90 const struct cred *saved_cred;
91 int ret;
92
93 _enter("");
94
95 /* we want to work under the module's security ID */
96 ret = cachefiles_get_security_ID(cache);
97 if (ret < 0)
98 return ret;
99
100 cachefiles_begin_secure(cache, &saved_cred);
101
102 /* allocate the root index object */
103 ret = -ENOMEM;
104
105 fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
106 if (!fsdef)
107 goto error_root_object;
108
109 ASSERTCMP(fsdef->backer, ==, NULL);
110
111 atomic_set(&fsdef->usage, 1);
112 fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
113
114 _debug("- fsdef %p", fsdef);
115
116 /* look up the directory at the root of the cache */
117 memset(&nd, 0, sizeof(nd));
118
119 ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
120 if (ret < 0)
121 goto error_open_root;
122
123 cache->mnt = mntget(nd.path.mnt);
124 root = dget(nd.path.dentry);
125 path_put(&nd.path);
126
127 /* check parameters */
128 ret = -EOPNOTSUPP;
129 if (!root->d_inode ||
130 !root->d_inode->i_op ||
131 !root->d_inode->i_op->lookup ||
132 !root->d_inode->i_op->mkdir ||
133 !root->d_inode->i_op->setxattr ||
134 !root->d_inode->i_op->getxattr ||
135 !root->d_sb ||
136 !root->d_sb->s_op ||
137 !root->d_sb->s_op->statfs ||
138 !root->d_sb->s_op->sync_fs)
139 goto error_unsupported;
140
141 ret = -EROFS;
142 if (root->d_sb->s_flags & MS_RDONLY)
143 goto error_unsupported;
144
145 /* determine the security of the on-disk cache as this governs
146 * security ID of files we create */
147 ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
148 if (ret < 0)
149 goto error_unsupported;
150
151 /* get the cache size and blocksize */
152 ret = vfs_statfs(root, &stats);
153 if (ret < 0)
154 goto error_unsupported;
155
156 ret = -ERANGE;
157 if (stats.f_bsize <= 0)
158 goto error_unsupported;
159
160 ret = -EOPNOTSUPP;
161 if (stats.f_bsize > PAGE_SIZE)
162 goto error_unsupported;
163
164 cache->bsize = stats.f_bsize;
165 cache->bshift = 0;
166 if (stats.f_bsize < PAGE_SIZE)
167 cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);
168
169 _debug("blksize %u (shift %u)",
170 cache->bsize, cache->bshift);
171
172 _debug("size %llu, avail %llu",
173 (unsigned long long) stats.f_blocks,
174 (unsigned long long) stats.f_bavail);
175
176 /* set up caching limits */
177 do_div(stats.f_files, 100);
178 cache->fstop = stats.f_files * cache->fstop_percent;
179 cache->fcull = stats.f_files * cache->fcull_percent;
180 cache->frun = stats.f_files * cache->frun_percent;
181
182 _debug("limits {%llu,%llu,%llu} files",
183 (unsigned long long) cache->frun,
184 (unsigned long long) cache->fcull,
185 (unsigned long long) cache->fstop);
186
187 stats.f_blocks >>= cache->bshift;
188 do_div(stats.f_blocks, 100);
189 cache->bstop = stats.f_blocks * cache->bstop_percent;
190 cache->bcull = stats.f_blocks * cache->bcull_percent;
191 cache->brun = stats.f_blocks * cache->brun_percent;
192
193 _debug("limits {%llu,%llu,%llu} blocks",
194 (unsigned long long) cache->brun,
195 (unsigned long long) cache->bcull,
196 (unsigned long long) cache->bstop);
197
198 /* get the cache directory and check its type */
199 cachedir = cachefiles_get_directory(cache, root, "cache");
200 if (IS_ERR(cachedir)) {
201 ret = PTR_ERR(cachedir);
202 goto error_unsupported;
203 }
204
205 fsdef->dentry = cachedir;
206 fsdef->fscache.cookie = NULL;
207
208 ret = cachefiles_check_object_type(fsdef);
209 if (ret < 0)
210 goto error_unsupported;
211
212 /* get the graveyard directory */
213 graveyard = cachefiles_get_directory(cache, root, "graveyard");
214 if (IS_ERR(graveyard)) {
215 ret = PTR_ERR(graveyard);
216 goto error_unsupported;
217 }
218
219 cache->graveyard = graveyard;
220
221 /* publish the cache */
222 fscache_init_cache(&cache->cache,
223 &cachefiles_cache_ops,
224 "%s",
225 fsdef->dentry->d_sb->s_id);
226
227 fscache_object_init(&fsdef->fscache, NULL, &cache->cache);
228
229 ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
230 if (ret < 0)
231 goto error_add_cache;
232
233 /* done */
234 set_bit(CACHEFILES_READY, &cache->flags);
235 dput(root);
236
237 printk(KERN_INFO "CacheFiles:"
238 " File cache on %s registered\n",
239 cache->cache.identifier);
240
241 /* check how much space the cache has */
242 cachefiles_has_space(cache, 0, 0);
243 cachefiles_end_secure(cache, saved_cred);
244 return 0;
245
246error_add_cache:
247 dput(cache->graveyard);
248 cache->graveyard = NULL;
249error_unsupported:
250 mntput(cache->mnt);
251 cache->mnt = NULL;
252 dput(fsdef->dentry);
253 fsdef->dentry = NULL;
254 dput(root);
255error_open_root:
256 kmem_cache_free(cachefiles_object_jar, fsdef);
257error_root_object:
258 cachefiles_end_secure(cache, saved_cred);
259 kerror("Failed to register: %d", ret);
260 return ret;
261}
262
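A worked example of the limit arithmetic above, with invented numbers:

/* invented numbers: suppose vfs_statfs() reports 1,000,000 files and
 * the daemon configured fcull_percent = 7:
 *   do_div(stats.f_files, 100)  ->  stats.f_files == 10,000
 *   cache->fcull = 10,000 * 7   ->  70,000 files
 * culling begins once the number of available files drops below fcull */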
263/*
264 * unbind a cache on fd release
265 */
266void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
267{
268 _enter("");
269
270 if (test_bit(CACHEFILES_READY, &cache->flags)) {
271 printk(KERN_INFO "CacheFiles:"
272 " File cache on %s unregistering\n",
273 cache->cache.identifier);
274
275 fscache_withdraw_cache(&cache->cache);
276 }
277
278 dput(cache->graveyard);
279 mntput(cache->mnt);
280
281 kfree(cache->rootdirname);
282 kfree(cache->secctx);
283 kfree(cache->tag);
284
285 _leave("");
286}
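
For reference, the limit computation in cachefiles_daemon_bind() above first divides the statfs totals by 100 (do_div), then multiplies by the configured percentages, having scaled block counts down to pages with bshift. The stand-alone sketch below replays that arithmetic in user space; the sample figures (1,000,000 inodes, 10,000,000 one-KiB blocks, 4 KiB pages) and the program itself are purely illustrative.

/* Worked example of the threshold arithmetic performed by
 * cachefiles_daemon_bind(): percentages are applied to one-hundredth
 * of the statfs totals, and block counts are converted to pages via
 * bshift first.  The sample figures are invented.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t f_files  = 1000000;	/* inodes on the backing fs */
	uint64_t f_blocks = 10000000;	/* 1 KiB blocks on the backing fs */
	unsigned bsize = 1024, page_size = 4096;

	/* bshift = ilog2(PAGE_SIZE / bsize); here 4096/1024 = 4 -> 2 */
	unsigned bshift = 0;
	for (unsigned n = page_size / bsize; n > 1; n >>= 1)
		bshift++;

	/* do_div(stats.f_files, 100), then scale by the percentages
	 * (defaults from cachefiles_daemon_open(): 1/5/7) */
	f_files /= 100;
	uint64_t fstop = f_files * 1;	/* stop allocating below 1% free */
	uint64_t fcull = f_files * 5;	/* start culling below 5% free */
	uint64_t frun  = f_files * 7;	/* stop culling above 7% free */

	/* stats.f_blocks >>= bshift converts blocks to pages first */
	f_blocks >>= bshift;
	f_blocks /= 100;
	uint64_t bstop = f_blocks * 1;
	uint64_t bcull = f_blocks * 5;
	uint64_t brun  = f_blocks * 7;

	printf("files {run=%llu cull=%llu stop=%llu}\n",
	       (unsigned long long)frun, (unsigned long long)fcull,
	       (unsigned long long)fstop);
	printf("pages {run=%llu cull=%llu stop=%llu}\n",
	       (unsigned long long)brun, (unsigned long long)bcull,
	       (unsigned long long)bstop);
	return 0;
}

With these sample figures and the default percentages, the result is 10000/50000/70000 files and 25000/125000/175000 pages, preserving the stop < cull < run ordering that the daemon's range checks enforce.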
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
new file mode 100644
index 000000000000..4618516dd994
--- /dev/null
+++ b/fs/cachefiles/daemon.c
@@ -0,0 +1,755 @@
1/* Daemon interface
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/sched.h>
15#include <linux/completion.h>
16#include <linux/slab.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/namei.h>
20#include <linux/poll.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/ctype.h>
24#include <linux/fs_struct.h>
25#include "internal.h"
26
27static int cachefiles_daemon_open(struct inode *, struct file *);
28static int cachefiles_daemon_release(struct inode *, struct file *);
29static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
30 loff_t *);
31static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
32 size_t, loff_t *);
33static unsigned int cachefiles_daemon_poll(struct file *,
34 struct poll_table_struct *);
35static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
36static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
37static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
38static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
39static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
40static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
41static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
42static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
43static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
44static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
45static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
46static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);
47
48static unsigned long cachefiles_open;
49
50const struct file_operations cachefiles_daemon_fops = {
51 .owner = THIS_MODULE,
52 .open = cachefiles_daemon_open,
53 .release = cachefiles_daemon_release,
54 .read = cachefiles_daemon_read,
55 .write = cachefiles_daemon_write,
56 .poll = cachefiles_daemon_poll,
57};
58
59struct cachefiles_daemon_cmd {
60 char name[8];
61 int (*handler)(struct cachefiles_cache *cache, char *args);
62};
63
64static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
65 { "bind", cachefiles_daemon_bind },
66 { "brun", cachefiles_daemon_brun },
67 { "bcull", cachefiles_daemon_bcull },
68 { "bstop", cachefiles_daemon_bstop },
69 { "cull", cachefiles_daemon_cull },
70 { "debug", cachefiles_daemon_debug },
71 { "dir", cachefiles_daemon_dir },
72 { "frun", cachefiles_daemon_frun },
73 { "fcull", cachefiles_daemon_fcull },
74 { "fstop", cachefiles_daemon_fstop },
75 { "inuse", cachefiles_daemon_inuse },
76 { "secctx", cachefiles_daemon_secctx },
77 { "tag", cachefiles_daemon_tag },
78 { "", NULL }
79};
80
81
82/*
83 * do various checks
84 */
85static int cachefiles_daemon_open(struct inode *inode, struct file *file)
86{
87 struct cachefiles_cache *cache;
88
89 _enter("");
90
91 /* only the superuser may do this */
92 if (!capable(CAP_SYS_ADMIN))
93 return -EPERM;
94
95 /* the cachefiles device may only be open once at a time */
96 if (xchg(&cachefiles_open, 1) == 1)
97 return -EBUSY;
98
99 /* allocate a cache record */
100 cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
101 if (!cache) {
102 cachefiles_open = 0;
103 return -ENOMEM;
104 }
105
106 mutex_init(&cache->daemon_mutex);
107 cache->active_nodes = RB_ROOT;
108 rwlock_init(&cache->active_lock);
109 init_waitqueue_head(&cache->daemon_pollwq);
110
111 /* set default caching limits
112 * - limit at 1% free space and/or free files
113 * - cull below 5% free space and/or free files
114 * - cease culling above 7% free space and/or free files
115 */
116 cache->frun_percent = 7;
117 cache->fcull_percent = 5;
118 cache->fstop_percent = 1;
119 cache->brun_percent = 7;
120 cache->bcull_percent = 5;
121 cache->bstop_percent = 1;
122
123 file->private_data = cache;
124 cache->cachefilesd = file;
125 return 0;
126}
127
128/*
129 * release a cache
130 */
131static int cachefiles_daemon_release(struct inode *inode, struct file *file)
132{
133 struct cachefiles_cache *cache = file->private_data;
134
135 _enter("");
136
137 ASSERT(cache);
138
139 set_bit(CACHEFILES_DEAD, &cache->flags);
140
141 cachefiles_daemon_unbind(cache);
142
143 ASSERT(!cache->active_nodes.rb_node);
144
145 /* clean up the control file interface */
146 cache->cachefilesd = NULL;
147 file->private_data = NULL;
148 cachefiles_open = 0;
149
150 kfree(cache);
151
152 _leave("");
153 return 0;
154}
155
156/*
157 * read the cache state
158 */
159static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
160 size_t buflen, loff_t *pos)
161{
162 struct cachefiles_cache *cache = file->private_data;
163 char buffer[256];
164 int n;
165
166 //_enter(",,%zu,", buflen);
167
168 if (!test_bit(CACHEFILES_READY, &cache->flags))
169 return 0;
170
171 /* check how much space the cache has */
172 cachefiles_has_space(cache, 0, 0);
173
174 /* summarise */
175 clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
176
177 n = snprintf(buffer, sizeof(buffer),
178 "cull=%c"
179 " frun=%llx"
180 " fcull=%llx"
181 " fstop=%llx"
182 " brun=%llx"
183 " bcull=%llx"
184 " bstop=%llx",
185 test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
186 (unsigned long long) cache->frun,
187 (unsigned long long) cache->fcull,
188 (unsigned long long) cache->fstop,
189 (unsigned long long) cache->brun,
190 (unsigned long long) cache->bcull,
191 (unsigned long long) cache->bstop
192 );
193
194 if (n > buflen)
195 return -EMSGSIZE;
196
197 if (copy_to_user(_buffer, buffer, n) != 0)
198 return -EFAULT;
199
200 return n;
201}
202
203/*
204 * command the cache
205 */
206static ssize_t cachefiles_daemon_write(struct file *file,
207 const char __user *_data,
208 size_t datalen,
209 loff_t *pos)
210{
211 const struct cachefiles_daemon_cmd *cmd;
212 struct cachefiles_cache *cache = file->private_data;
213 ssize_t ret;
214 char *data, *args, *cp;
215
216 //_enter(",,%zu,", datalen);
217
218 ASSERT(cache);
219
220 if (test_bit(CACHEFILES_DEAD, &cache->flags))
221 return -EIO;
222
223	if (datalen > PAGE_SIZE - 1)
224 return -EOPNOTSUPP;
225
226 /* drag the command string into the kernel so we can parse it */
227 data = kmalloc(datalen + 1, GFP_KERNEL);
228 if (!data)
229 return -ENOMEM;
230
231 ret = -EFAULT;
232 if (copy_from_user(data, _data, datalen) != 0)
233 goto error;
234
235 data[datalen] = '\0';
236
237 ret = -EINVAL;
238 if (memchr(data, '\0', datalen))
239 goto error;
240
241 /* strip any newline */
242 cp = memchr(data, '\n', datalen);
243 if (cp) {
244 if (cp == data)
245 goto error;
246
247 *cp = '\0';
248 }
249
250 /* parse the command */
251 ret = -EOPNOTSUPP;
252
253 for (args = data; *args; args++)
254 if (isspace(*args))
255 break;
256 if (*args) {
257 if (args == data)
258 goto error;
259 *args = '\0';
260 for (args++; isspace(*args); args++)
261 continue;
262 }
263
264 /* run the appropriate command handler */
265 for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
266 if (strcmp(cmd->name, data) == 0)
267 goto found_command;
268
269error:
270 kfree(data);
271 //_leave(" = %zd", ret);
272 return ret;
273
274found_command:
275 mutex_lock(&cache->daemon_mutex);
276
277 ret = -EIO;
278 if (!test_bit(CACHEFILES_DEAD, &cache->flags))
279 ret = cmd->handler(cache, args);
280
281 mutex_unlock(&cache->daemon_mutex);
282
283 if (ret == 0)
284 ret = datalen;
285 goto error;
286}
287
288/*
289 * poll for culling state
290 * - use POLLOUT to indicate culling state
291 */
292static unsigned int cachefiles_daemon_poll(struct file *file,
293 struct poll_table_struct *poll)
294{
295 struct cachefiles_cache *cache = file->private_data;
296 unsigned int mask;
297
298 poll_wait(file, &cache->daemon_pollwq, poll);
299 mask = 0;
300
301 if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
302 mask |= POLLIN;
303
304 if (test_bit(CACHEFILES_CULLING, &cache->flags))
305 mask |= POLLOUT;
306
307 return mask;
308}
309
310/*
311 * give a range error for cache space constraints
312 * - can be tail-called
313 */
314static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
315 char *args)
316{
317 kerror("Free space limits must be in range"
318 " 0%%<=stop<cull<run<100%%");
319
320 return -EINVAL;
321}
322
323/*
324 * set the percentage of files at which to stop culling
325 * - command: "frun <N>%"
326 */
327static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
328{
329 unsigned long frun;
330
331 _enter(",%s", args);
332
333 if (!*args)
334 return -EINVAL;
335
336 frun = simple_strtoul(args, &args, 10);
337 if (args[0] != '%' || args[1] != '\0')
338 return -EINVAL;
339
340 if (frun <= cache->fcull_percent || frun >= 100)
341 return cachefiles_daemon_range_error(cache, args);
342
343 cache->frun_percent = frun;
344 return 0;
345}
346
347/*
348 * set the percentage of files at which to start culling
349 * - command: "fcull <N>%"
350 */
351static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
352{
353 unsigned long fcull;
354
355 _enter(",%s", args);
356
357 if (!*args)
358 return -EINVAL;
359
360 fcull = simple_strtoul(args, &args, 10);
361 if (args[0] != '%' || args[1] != '\0')
362 return -EINVAL;
363
364 if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
365 return cachefiles_daemon_range_error(cache, args);
366
367 cache->fcull_percent = fcull;
368 return 0;
369}
370
371/*
372 * set the percentage of files at which to stop allocating
373 * - command: "fstop <N>%"
374 */
375static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
376{
377 unsigned long fstop;
378
379 _enter(",%s", args);
380
381 if (!*args)
382 return -EINVAL;
383
384 fstop = simple_strtoul(args, &args, 10);
385 if (args[0] != '%' || args[1] != '\0')
386 return -EINVAL;
387
388	if (fstop >= cache->fcull_percent)
389 return cachefiles_daemon_range_error(cache, args);
390
391 cache->fstop_percent = fstop;
392 return 0;
393}
394
395/*
396 * set the percentage of blocks at which to stop culling
397 * - command: "brun <N>%"
398 */
399static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
400{
401 unsigned long brun;
402
403 _enter(",%s", args);
404
405 if (!*args)
406 return -EINVAL;
407
408 brun = simple_strtoul(args, &args, 10);
409 if (args[0] != '%' || args[1] != '\0')
410 return -EINVAL;
411
412 if (brun <= cache->bcull_percent || brun >= 100)
413 return cachefiles_daemon_range_error(cache, args);
414
415 cache->brun_percent = brun;
416 return 0;
417}
418
419/*
420 * set the percentage of blocks at which to start culling
421 * - command: "bcull <N>%"
422 */
423static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
424{
425 unsigned long bcull;
426
427 _enter(",%s", args);
428
429 if (!*args)
430 return -EINVAL;
431
432 bcull = simple_strtoul(args, &args, 10);
433 if (args[0] != '%' || args[1] != '\0')
434 return -EINVAL;
435
436 if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
437 return cachefiles_daemon_range_error(cache, args);
438
439 cache->bcull_percent = bcull;
440 return 0;
441}
442
443/*
444 * set the percentage of blocks at which to stop allocating
445 * - command: "bstop <N>%"
446 */
447static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
448{
449 unsigned long bstop;
450
451 _enter(",%s", args);
452
453 if (!*args)
454 return -EINVAL;
455
456 bstop = simple_strtoul(args, &args, 10);
457 if (args[0] != '%' || args[1] != '\0')
458 return -EINVAL;
459
460	if (bstop >= cache->bcull_percent)
461 return cachefiles_daemon_range_error(cache, args);
462
463 cache->bstop_percent = bstop;
464 return 0;
465}
466
467/*
468 * set the cache directory
469 * - command: "dir <name>"
470 */
471static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
472{
473 char *dir;
474
475 _enter(",%s", args);
476
477 if (!*args) {
478 kerror("Empty directory specified");
479 return -EINVAL;
480 }
481
482 if (cache->rootdirname) {
483 kerror("Second cache directory specified");
484 return -EEXIST;
485 }
486
487 dir = kstrdup(args, GFP_KERNEL);
488 if (!dir)
489 return -ENOMEM;
490
491 cache->rootdirname = dir;
492 return 0;
493}
494
495/*
496 * set the cache security context
497 * - command: "secctx <ctx>"
498 */
499static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
500{
501 char *secctx;
502
503 _enter(",%s", args);
504
505 if (!*args) {
506 kerror("Empty security context specified");
507 return -EINVAL;
508 }
509
510 if (cache->secctx) {
511 kerror("Second security context specified");
512 return -EINVAL;
513 }
514
515 secctx = kstrdup(args, GFP_KERNEL);
516 if (!secctx)
517 return -ENOMEM;
518
519 cache->secctx = secctx;
520 return 0;
521}
522
523/*
524 * set the cache tag
525 * - command: "tag <name>"
526 */
527static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
528{
529 char *tag;
530
531 _enter(",%s", args);
532
533 if (!*args) {
534 kerror("Empty tag specified");
535 return -EINVAL;
536 }
537
538 if (cache->tag)
539 return -EEXIST;
540
541 tag = kstrdup(args, GFP_KERNEL);
542 if (!tag)
543 return -ENOMEM;
544
545 cache->tag = tag;
546 return 0;
547}
548
549/*
550 * request a node in the cache be culled from the current working directory
551 * - command: "cull <name>"
552 */
553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
554{
555 struct fs_struct *fs;
556 struct dentry *dir;
557 const struct cred *saved_cred;
558 int ret;
559
560 _enter(",%s", args);
561
562 if (strchr(args, '/'))
563 goto inval;
564
565 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
566 kerror("cull applied to unready cache");
567 return -EIO;
568 }
569
570 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
571 kerror("cull applied to dead cache");
572 return -EIO;
573 }
574
575 /* extract the directory dentry from the cwd */
576 fs = current->fs;
577 read_lock(&fs->lock);
578 dir = dget(fs->pwd.dentry);
579 read_unlock(&fs->lock);
580
581 if (!S_ISDIR(dir->d_inode->i_mode))
582 goto notdir;
583
584 cachefiles_begin_secure(cache, &saved_cred);
585 ret = cachefiles_cull(cache, dir, args);
586 cachefiles_end_secure(cache, saved_cred);
587
588 dput(dir);
589 _leave(" = %d", ret);
590 return ret;
591
592notdir:
593 dput(dir);
594 kerror("cull command requires dirfd to be a directory");
595 return -ENOTDIR;
596
597inval:
598 kerror("cull command requires dirfd and filename");
599 return -EINVAL;
600}
601
602/*
603 * set debugging mode
604 * - command: "debug <mask>"
605 */
606static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
607{
608 unsigned long mask;
609
610 _enter(",%s", args);
611
612 mask = simple_strtoul(args, &args, 0);
613 if (args[0] != '\0')
614 goto inval;
615
616 cachefiles_debug = mask;
617 _leave(" = 0");
618 return 0;
619
620inval:
621 kerror("debug command requires mask");
622 return -EINVAL;
623}
624
625/*
626 * find out whether an object in the current working directory is in use or not
627 * - command: "inuse <name>"
628 */
629static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
630{
631 struct fs_struct *fs;
632 struct dentry *dir;
633 const struct cred *saved_cred;
634 int ret;
635
636 //_enter(",%s", args);
637
638 if (strchr(args, '/'))
639 goto inval;
640
641 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
642 kerror("inuse applied to unready cache");
643 return -EIO;
644 }
645
646 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
647 kerror("inuse applied to dead cache");
648 return -EIO;
649 }
650
651 /* extract the directory dentry from the cwd */
652 fs = current->fs;
653 read_lock(&fs->lock);
654 dir = dget(fs->pwd.dentry);
655 read_unlock(&fs->lock);
656
657 if (!S_ISDIR(dir->d_inode->i_mode))
658 goto notdir;
659
660 cachefiles_begin_secure(cache, &saved_cred);
661 ret = cachefiles_check_in_use(cache, dir, args);
662 cachefiles_end_secure(cache, saved_cred);
663
664 dput(dir);
665 //_leave(" = %d", ret);
666 return ret;
667
668notdir:
669 dput(dir);
670 kerror("inuse command requires dirfd to be a directory");
671 return -ENOTDIR;
672
673inval:
674 kerror("inuse command requires dirfd and filename");
675 return -EINVAL;
676}
677
678/*
679 * see if we have space for a number of pages and/or a number of files in the
680 * cache
681 */
682int cachefiles_has_space(struct cachefiles_cache *cache,
683 unsigned fnr, unsigned bnr)
684{
685 struct kstatfs stats;
686 int ret;
687
688 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
689 // (unsigned long long) cache->frun,
690 // (unsigned long long) cache->fcull,
691 // (unsigned long long) cache->fstop,
692 // (unsigned long long) cache->brun,
693 // (unsigned long long) cache->bcull,
694 // (unsigned long long) cache->bstop,
695 // fnr, bnr);
696
697 /* find out how many pages of blockdev are available */
698 memset(&stats, 0, sizeof(stats));
699
700 ret = vfs_statfs(cache->mnt->mnt_root, &stats);
701 if (ret < 0) {
702 if (ret == -EIO)
703 cachefiles_io_error(cache, "statfs failed");
704 _leave(" = %d", ret);
705 return ret;
706 }
707
708 stats.f_bavail >>= cache->bshift;
709
710 //_debug("avail %llu,%llu",
711 // (unsigned long long) stats.f_ffree,
712 // (unsigned long long) stats.f_bavail);
713
714 /* see if there is sufficient space */
715 if (stats.f_ffree > fnr)
716 stats.f_ffree -= fnr;
717 else
718 stats.f_ffree = 0;
719
720 if (stats.f_bavail > bnr)
721 stats.f_bavail -= bnr;
722 else
723 stats.f_bavail = 0;
724
725 ret = -ENOBUFS;
726 if (stats.f_ffree < cache->fstop ||
727 stats.f_bavail < cache->bstop)
728 goto begin_cull;
729
730 ret = 0;
731 if (stats.f_ffree < cache->fcull ||
732 stats.f_bavail < cache->bcull)
733 goto begin_cull;
734
735 if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
736 stats.f_ffree >= cache->frun &&
737 stats.f_bavail >= cache->brun &&
738 test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
739 ) {
740 _debug("cease culling");
741 cachefiles_state_changed(cache);
742 }
743
744 //_leave(" = 0");
745 return 0;
746
747begin_cull:
748 if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
749 _debug("### CULL CACHE ###");
750 cachefiles_state_changed(cache);
751 }
752
753 _leave(" = %d", ret);
754 return ret;
755}
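
The write handler above defines the whole control protocol: one command per write(), an optional trailing newline stripped, the command name split from its arguments at the first whitespace, and the handler's return value mapped to the write length on success. The sketch below drives the interface the way a minimal cachefilesd-style daemon might; it assumes the misc device appears as /dev/cachefiles and that /var/cache/fscache is an existing directory on a suitable filesystem (both assumptions for illustration, not requirements of this patch), and error handling is pared down to bail-outs.

/* Minimal user-space sketch of the /dev/cachefiles command protocol.
 * The device path and cache directory are assumptions; requires
 * CAP_SYS_ADMIN and only one opener at a time.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void command(int fd, const char *cmd)
{
	/* one command per write(); success returns the write length */
	if (write(fd, cmd, strlen(cmd)) < 0) {
		perror(cmd);
		exit(1);
	}
}

int main(void)
{
	char state[256];
	struct pollfd pfd;
	int fd, n;

	fd = open("/dev/cachefiles", O_RDWR);
	if (fd < 0) {
		perror("/dev/cachefiles");
		return 1;
	}

	command(fd, "dir /var/cache/fscache");	/* assumed path */
	command(fd, "tag mycache");
	command(fd, "brun 10%");
	command(fd, "bcull 7%");
	command(fd, "bstop 3%");
	command(fd, "bind");			/* handled in bind.c */

	/* once bound, read() yields the state summary */
	n = read(fd, state, sizeof(state) - 1);
	if (n > 0) {
		state[n] = '\0';
		printf("%s\n", state);		/* "cull=0 frun=..." */
	}

	/* POLLOUT asks us to cull; POLLIN says the summary changed */
	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT;
	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLOUT))
		fprintf(stderr, "kernel wants culling\n");

	close(fd);
	return 0;
}

Note the ordering of the percentage commands: brun must stay above bcull, and bcull above bstop, or the range checks in the handlers return -EINVAL via cachefiles_daemon_range_error().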
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
new file mode 100644
index 000000000000..1e962348d111
--- /dev/null
+++ b/fs/cachefiles/interface.c
@@ -0,0 +1,449 @@
1/* FS-Cache interface to CacheFiles
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/buffer_head.h>
14#include "internal.h"
15
16#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
17
18struct cachefiles_lookup_data {
19 struct cachefiles_xattr *auxdata; /* auxiliary data */
20 char *key; /* key path */
21};
22
23static int cachefiles_attr_changed(struct fscache_object *_object);
24
25/*
26 * allocate an object record for a cookie lookup and prepare the lookup data
27 */
28static struct fscache_object *cachefiles_alloc_object(
29 struct fscache_cache *_cache,
30 struct fscache_cookie *cookie)
31{
32 struct cachefiles_lookup_data *lookup_data;
33 struct cachefiles_object *object;
34 struct cachefiles_cache *cache;
35 struct cachefiles_xattr *auxdata;
36 unsigned keylen, auxlen;
37 void *buffer;
38 char *key;
39
40 cache = container_of(_cache, struct cachefiles_cache, cache);
41
42 _enter("{%s},%p,", cache->cache.identifier, cookie);
43
44 lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
45 if (!lookup_data)
46 goto nomem_lookup_data;
47
48 /* create a new object record and a temporary leaf image */
49 object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
50 if (!object)
51 goto nomem_object;
52
53 ASSERTCMP(object->backer, ==, NULL);
54
55 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
56 atomic_set(&object->usage, 1);
57
58 fscache_object_init(&object->fscache, cookie, &cache->cache);
59
60 object->type = cookie->def->type;
61
62 /* get hold of the raw key
63 * - stick the length on the front and leave space on the back for the
64 * encoder
65 */
66 buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
67 if (!buffer)
68 goto nomem_buffer;
69
70 keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
71 ASSERTCMP(keylen, <, 512);
72
73 *(uint16_t *)buffer = keylen;
74 ((char *)buffer)[keylen + 2] = 0;
75 ((char *)buffer)[keylen + 3] = 0;
76 ((char *)buffer)[keylen + 4] = 0;
77
78	/* turn the raw key into something we can work with as a filename */
79 key = cachefiles_cook_key(buffer, keylen + 2, object->type);
80 if (!key)
81 goto nomem_key;
82
83 /* get hold of the auxiliary data and prepend the object type */
84 auxdata = buffer;
85 auxlen = 0;
86 if (cookie->def->get_aux) {
87 auxlen = cookie->def->get_aux(cookie->netfs_data,
88 auxdata->data, 511);
89 ASSERTCMP(auxlen, <, 511);
90 }
91
92 auxdata->len = auxlen + 1;
93 auxdata->type = cookie->def->type;
94
95 lookup_data->auxdata = auxdata;
96 lookup_data->key = key;
97 object->lookup_data = lookup_data;
98
99 _leave(" = %p [%p]", &object->fscache, lookup_data);
100 return &object->fscache;
101
102nomem_key:
103 kfree(buffer);
104nomem_buffer:
105 BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
106 kmem_cache_free(cachefiles_object_jar, object);
107 fscache_object_destroyed(&cache->cache);
108nomem_object:
109 kfree(lookup_data);
110nomem_lookup_data:
111 _leave(" = -ENOMEM");
112 return ERR_PTR(-ENOMEM);
113}
114
115/*
116 * attempt to look up the nominated node in this cache
117 */
118static void cachefiles_lookup_object(struct fscache_object *_object)
119{
120 struct cachefiles_lookup_data *lookup_data;
121 struct cachefiles_object *parent, *object;
122 struct cachefiles_cache *cache;
123 const struct cred *saved_cred;
124 int ret;
125
126 _enter("{OBJ%x}", _object->debug_id);
127
128 cache = container_of(_object->cache, struct cachefiles_cache, cache);
129 parent = container_of(_object->parent,
130 struct cachefiles_object, fscache);
131 object = container_of(_object, struct cachefiles_object, fscache);
132 lookup_data = object->lookup_data;
133
134 ASSERTCMP(lookup_data, !=, NULL);
135
136 /* look up the key, creating any missing bits */
137 cachefiles_begin_secure(cache, &saved_cred);
138 ret = cachefiles_walk_to_object(parent, object,
139 lookup_data->key,
140 lookup_data->auxdata);
141 cachefiles_end_secure(cache, saved_cred);
142
143 /* polish off by setting the attributes of non-index files */
144 if (ret == 0 &&
145 object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
146 cachefiles_attr_changed(&object->fscache);
147
148 if (ret < 0) {
149 printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
150 ret);
151 fscache_object_lookup_error(&object->fscache);
152 }
153
154 _leave(" [%d]", ret);
155}
156
157/*
158 * indication of lookup completion
159 */
160static void cachefiles_lookup_complete(struct fscache_object *_object)
161{
162 struct cachefiles_object *object;
163
164 object = container_of(_object, struct cachefiles_object, fscache);
165
166 _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data);
167
168 if (object->lookup_data) {
169 kfree(object->lookup_data->key);
170 kfree(object->lookup_data->auxdata);
171 kfree(object->lookup_data);
172 object->lookup_data = NULL;
173 }
174}
175
176/*
177 * increment the usage count on an inode object (may fail if unmounting)
178 */
179static
180struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
181{
182 struct cachefiles_object *object =
183 container_of(_object, struct cachefiles_object, fscache);
184
185 _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
186
187#ifdef CACHEFILES_DEBUG_SLAB
188 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
189#endif
190
191 atomic_inc(&object->usage);
192 return &object->fscache;
193}
194
195/*
196 * update the auxiliary data for an object on disk
197 */
198static void cachefiles_update_object(struct fscache_object *_object)
199{
200 struct cachefiles_object *object;
201 struct cachefiles_xattr *auxdata;
202 struct cachefiles_cache *cache;
203 struct fscache_cookie *cookie;
204 const struct cred *saved_cred;
205 unsigned auxlen;
206
207 _enter("{OBJ%x}", _object->debug_id);
208
209 object = container_of(_object, struct cachefiles_object, fscache);
210 cache = container_of(object->fscache.cache, struct cachefiles_cache,
211 cache);
212 cookie = object->fscache.cookie;
213
214 if (!cookie->def->get_aux) {
215 _leave(" [no aux]");
216 return;
217 }
218
219 auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
220 if (!auxdata) {
221 _leave(" [nomem]");
222 return;
223 }
224
225 auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
226 ASSERTCMP(auxlen, <, 511);
227
228 auxdata->len = auxlen + 1;
229 auxdata->type = cookie->def->type;
230
231 cachefiles_begin_secure(cache, &saved_cred);
232 cachefiles_update_object_xattr(object, auxdata);
233 cachefiles_end_secure(cache, saved_cred);
234 kfree(auxdata);
235 _leave("");
236}
237
238/*
239 * discard the resources pinned by an object and effect retirement if
240 * requested
241 */
242static void cachefiles_drop_object(struct fscache_object *_object)
243{
244 struct cachefiles_object *object;
245 struct cachefiles_cache *cache;
246 const struct cred *saved_cred;
247
248 ASSERT(_object);
249
250 object = container_of(_object, struct cachefiles_object, fscache);
251
252 _enter("{OBJ%x,%d}",
253 object->fscache.debug_id, atomic_read(&object->usage));
254
255 cache = container_of(object->fscache.cache,
256 struct cachefiles_cache, cache);
257
258#ifdef CACHEFILES_DEBUG_SLAB
259 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
260#endif
261
262 /* delete retired objects */
263 if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
264 _object != cache->cache.fsdef
265 ) {
266 _debug("- retire object OBJ%x", object->fscache.debug_id);
267 cachefiles_begin_secure(cache, &saved_cred);
268 cachefiles_delete_object(cache, object);
269 cachefiles_end_secure(cache, saved_cred);
270 }
271
272 /* close the filesystem stuff attached to the object */
273 if (object->backer != object->dentry)
274 dput(object->backer);
275 object->backer = NULL;
276
277 /* note that the object is now inactive */
278 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
279 write_lock(&cache->active_lock);
280 if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
281 &object->flags))
282 BUG();
283 rb_erase(&object->active_node, &cache->active_nodes);
284 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
285 write_unlock(&cache->active_lock);
286 }
287
288 dput(object->dentry);
289 object->dentry = NULL;
290
291 _leave("");
292}
293
294/*
295 * dispose of a reference to an object
296 */
297static void cachefiles_put_object(struct fscache_object *_object)
298{
299 struct cachefiles_object *object;
300 struct fscache_cache *cache;
301
302 ASSERT(_object);
303
304 object = container_of(_object, struct cachefiles_object, fscache);
305
306 _enter("{OBJ%x,%d}",
307 object->fscache.debug_id, atomic_read(&object->usage));
308
309#ifdef CACHEFILES_DEBUG_SLAB
310 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
311#endif
312
313 ASSERTIFCMP(object->fscache.parent,
314 object->fscache.parent->n_children, >, 0);
315
316 if (atomic_dec_and_test(&object->usage)) {
317 _debug("- kill object OBJ%x", object->fscache.debug_id);
318
319 ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
320 ASSERTCMP(object->fscache.parent, ==, NULL);
321 ASSERTCMP(object->backer, ==, NULL);
322 ASSERTCMP(object->dentry, ==, NULL);
323 ASSERTCMP(object->fscache.n_ops, ==, 0);
324 ASSERTCMP(object->fscache.n_children, ==, 0);
325
326 if (object->lookup_data) {
327 kfree(object->lookup_data->key);
328 kfree(object->lookup_data->auxdata);
329 kfree(object->lookup_data);
330 object->lookup_data = NULL;
331 }
332
333 cache = object->fscache.cache;
334 kmem_cache_free(cachefiles_object_jar, object);
335 fscache_object_destroyed(cache);
336 }
337
338 _leave("");
339}
340
341/*
342 * sync a cache
343 */
344static void cachefiles_sync_cache(struct fscache_cache *_cache)
345{
346 struct cachefiles_cache *cache;
347 const struct cred *saved_cred;
348 int ret;
349
350 _enter("%p", _cache);
351
352 cache = container_of(_cache, struct cachefiles_cache, cache);
353
354 /* make sure all pages pinned by operations on behalf of the netfs are
355 * written to disc */
356 cachefiles_begin_secure(cache, &saved_cred);
357 ret = fsync_super(cache->mnt->mnt_sb);
358 cachefiles_end_secure(cache, saved_cred);
359
360 if (ret == -EIO)
361 cachefiles_io_error(cache,
362 "Attempt to sync backing fs superblock"
363 " returned error %d",
364 ret);
365}
366
367/*
368 * notification that the attributes on an object have changed
369 * - called with reads/writes excluded by FS-Cache
370 */
371static int cachefiles_attr_changed(struct fscache_object *_object)
372{
373 struct cachefiles_object *object;
374 struct cachefiles_cache *cache;
375 const struct cred *saved_cred;
376 struct iattr newattrs;
377 uint64_t ni_size;
378 loff_t oi_size;
379 int ret;
380
381 _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
382
383 _enter("{OBJ%x},[%llu]",
384 _object->debug_id, (unsigned long long) ni_size);
385
386 object = container_of(_object, struct cachefiles_object, fscache);
387 cache = container_of(object->fscache.cache,
388 struct cachefiles_cache, cache);
389
390 if (ni_size == object->i_size)
391 return 0;
392
393 if (!object->backer)
394 return -ENOBUFS;
395
396 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
397
398 fscache_set_store_limit(&object->fscache, ni_size);
399
400 oi_size = i_size_read(object->backer->d_inode);
401 if (oi_size == ni_size)
402 return 0;
403
404 newattrs.ia_size = ni_size;
405 newattrs.ia_valid = ATTR_SIZE;
406
407 cachefiles_begin_secure(cache, &saved_cred);
408 mutex_lock(&object->backer->d_inode->i_mutex);
409 ret = notify_change(object->backer, &newattrs);
410 mutex_unlock(&object->backer->d_inode->i_mutex);
411 cachefiles_end_secure(cache, saved_cred);
412
413 if (ret == -EIO) {
414 fscache_set_store_limit(&object->fscache, 0);
415 cachefiles_io_error_obj(object, "Size set failed");
416 ret = -ENOBUFS;
417 }
418
419 _leave(" = %d", ret);
420 return ret;
421}
422
423/*
424 * dissociate a cache from all the pages it was backing
425 */
426static void cachefiles_dissociate_pages(struct fscache_cache *cache)
427{
428 _enter("");
429}
430
431const struct fscache_cache_ops cachefiles_cache_ops = {
432 .name = "cachefiles",
433 .alloc_object = cachefiles_alloc_object,
434 .lookup_object = cachefiles_lookup_object,
435 .lookup_complete = cachefiles_lookup_complete,
436 .grab_object = cachefiles_grab_object,
437 .update_object = cachefiles_update_object,
438 .drop_object = cachefiles_drop_object,
439 .put_object = cachefiles_put_object,
440 .sync_cache = cachefiles_sync_cache,
441 .attr_changed = cachefiles_attr_changed,
442 .read_or_alloc_page = cachefiles_read_or_alloc_page,
443 .read_or_alloc_pages = cachefiles_read_or_alloc_pages,
444 .allocate_page = cachefiles_allocate_page,
445 .allocate_pages = cachefiles_allocate_pages,
446 .write_page = cachefiles_write_page,
447 .uncache_page = cachefiles_uncache_page,
448 .dissociate_pages = cachefiles_dissociate_pages,
449};
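
The buffer juggling in cachefiles_alloc_object() is easier to see in isolation: a 16-bit length word is placed in front of the raw key and NUL padding behind it, so the encoder in key.c can safely read a whole 3-byte group past the end. The user-space sketch below reproduces just that layout; the key string is made up, and the buffer size mirrors the kmalloc in the function above.

/* Illustration of the temporary key buffer assembled in
 * cachefiles_alloc_object() before it is handed to
 * cachefiles_cook_key(): [16-bit length][raw key][NUL padding].
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char buffer[(2 + 512) + 3];	/* same size as the kmalloc */
	const char *raw = "example.org";	/* hypothetical netfs key */
	uint16_t keylen = strlen(raw);

	memcpy(buffer + 2, raw, keylen);	/* what ->get_key() would fill */
	*(uint16_t *)buffer = keylen;		/* length word on the front */
	buffer[keylen + 2] = 0;			/* encoder overrun padding */
	buffer[keylen + 3] = 0;
	buffer[keylen + 4] = 0;

	printf("len=%u key=%.*s\n", (unsigned)*(uint16_t *)buffer,
	       keylen, (char *)buffer + 2);
	return 0;
}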
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
new file mode 100644
index 000000000000..19218e1463d6
--- /dev/null
+++ b/fs/cachefiles/internal.h
@@ -0,0 +1,360 @@
1/* General netfs cache on cache files internal defs
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fscache-cache.h>
13#include <linux/timer.h>
14#include <linux/wait.h>
15#include <linux/workqueue.h>
16#include <linux/security.h>
17
18struct cachefiles_cache;
19struct cachefiles_object;
20
21extern unsigned cachefiles_debug;
22#define CACHEFILES_DEBUG_KENTER 1
23#define CACHEFILES_DEBUG_KLEAVE 2
24#define CACHEFILES_DEBUG_KDEBUG 4
25
26/*
27 * node records
28 */
29struct cachefiles_object {
30 struct fscache_object fscache; /* fscache handle */
31 struct cachefiles_lookup_data *lookup_data; /* cached lookup data */
32 struct dentry *dentry; /* the file/dir representing this object */
33 struct dentry *backer; /* backing file */
34 loff_t i_size; /* object size */
35 unsigned long flags;
36#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
37 atomic_t usage; /* object usage count */
38 uint8_t type; /* object type */
39 uint8_t new; /* T if object new */
40 spinlock_t work_lock;
41 struct rb_node active_node; /* link in active tree (dentry is key) */
42};
43
44extern struct kmem_cache *cachefiles_object_jar;
45
46/*
47 * Cache files cache definition
48 */
49struct cachefiles_cache {
50 struct fscache_cache cache; /* FS-Cache record */
51 struct vfsmount *mnt; /* mountpoint holding the cache */
52 struct dentry *graveyard; /* directory into which dead objects go */
53 struct file *cachefilesd; /* manager daemon handle */
54 const struct cred *cache_cred; /* security override for accessing cache */
55 struct mutex daemon_mutex; /* command serialisation mutex */
56 wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */
57 struct rb_root active_nodes; /* active nodes (can't be culled) */
58 rwlock_t active_lock; /* lock for active_nodes */
59 atomic_t gravecounter; /* graveyard uniquifier */
60 unsigned frun_percent; /* when to stop culling (% files) */
61 unsigned fcull_percent; /* when to start culling (% files) */
62 unsigned fstop_percent; /* when to stop allocating (% files) */
63 unsigned brun_percent; /* when to stop culling (% blocks) */
64 unsigned bcull_percent; /* when to start culling (% blocks) */
65 unsigned bstop_percent; /* when to stop allocating (% blocks) */
66 unsigned bsize; /* cache's block size */
67	unsigned			bshift;		/* max(ilog2(PAGE_SIZE / bsize), 0) */
68 uint64_t frun; /* when to stop culling */
69 uint64_t fcull; /* when to start culling */
70 uint64_t fstop; /* when to stop allocating */
71 sector_t brun; /* when to stop culling */
72 sector_t bcull; /* when to start culling */
73 sector_t bstop; /* when to stop allocating */
74 unsigned long flags;
75#define CACHEFILES_READY 0 /* T if cache prepared */
76#define CACHEFILES_DEAD 1 /* T if cache dead */
77#define CACHEFILES_CULLING 2 /* T if cull engaged */
78#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
79 char *rootdirname; /* name of cache root directory */
80 char *secctx; /* LSM security context */
81 char *tag; /* cache binding tag */
82};
83
84/*
85 * backing file read tracking
86 */
87struct cachefiles_one_read {
88 wait_queue_t monitor; /* link into monitored waitqueue */
89 struct page *back_page; /* backing file page we're waiting for */
90 struct page *netfs_page; /* netfs page we're going to fill */
91 struct fscache_retrieval *op; /* retrieval op covering this */
92 struct list_head op_link; /* link in op's todo list */
93};
94
95/*
96 * backing file write tracking
97 */
98struct cachefiles_one_write {
99 struct page *netfs_page; /* netfs page to copy */
100 struct cachefiles_object *object;
101 struct list_head obj_link; /* link in object's lists */
102 fscache_rw_complete_t end_io_func;
103 void *context;
104};
105
106/*
107 * auxiliary data xattr buffer
108 */
109struct cachefiles_xattr {
110 uint16_t len;
111 uint8_t type;
112 uint8_t data[];
113};
114
115/*
116 * note change of state for daemon
117 */
118static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
119{
120 set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
121 wake_up_all(&cache->daemon_pollwq);
122}
123
124/*
125 * cf-bind.c
126 */
127extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
128extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
129
130/*
131 * cf-daemon.c
132 */
133extern const struct file_operations cachefiles_daemon_fops;
134
135extern int cachefiles_has_space(struct cachefiles_cache *cache,
136 unsigned fnr, unsigned bnr);
137
138/*
139 * cf-interface.c
140 */
141extern const struct fscache_cache_ops cachefiles_cache_ops;
142
143/*
144 * cf-key.c
145 */
146extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
147
148/*
149 * cf-namei.c
150 */
151extern int cachefiles_delete_object(struct cachefiles_cache *cache,
152 struct cachefiles_object *object);
153extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
154 struct cachefiles_object *object,
155 const char *key,
156 struct cachefiles_xattr *auxdata);
157extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
158 struct dentry *dir,
159 const char *name);
160
161extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
162 char *filename);
163
164extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
165 struct dentry *dir, char *filename);
166
167/*
168 * cf-proc.c
169 */
170#ifdef CONFIG_CACHEFILES_HISTOGRAM
171extern atomic_t cachefiles_lookup_histogram[HZ];
172extern atomic_t cachefiles_mkdir_histogram[HZ];
173extern atomic_t cachefiles_create_histogram[HZ];
174
175extern int __init cachefiles_proc_init(void);
176extern void cachefiles_proc_cleanup(void);
177static inline
178void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
179{
180 unsigned long jif = jiffies - start_jif;
181 if (jif >= HZ)
182 jif = HZ - 1;
183 atomic_inc(&histogram[jif]);
184}
185
186#else
187#define cachefiles_proc_init() (0)
188#define cachefiles_proc_cleanup() do {} while (0)
189#define cachefiles_hist(hist, start_jif) do {} while (0)
190#endif
191
192/*
193 * cf-rdwr.c
194 */
195extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
196 struct page *, gfp_t);
197extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
198 struct list_head *, unsigned *,
199 gfp_t);
200extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
201 gfp_t);
202extern int cachefiles_allocate_pages(struct fscache_retrieval *,
203 struct list_head *, unsigned *, gfp_t);
204extern int cachefiles_write_page(struct fscache_storage *, struct page *);
205extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
206
207/*
208 * cf-security.c
209 */
210extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
211extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
212 struct dentry *root,
213 const struct cred **_saved_cred);
214
215static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
216 const struct cred **_saved_cred)
217{
218 *_saved_cred = override_creds(cache->cache_cred);
219}
220
221static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
222 const struct cred *saved_cred)
223{
224 revert_creds(saved_cred);
225}
226
227/*
228 * cf-xattr.c
229 */
230extern int cachefiles_check_object_type(struct cachefiles_object *object);
231extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
232 struct cachefiles_xattr *auxdata);
233extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
234 struct cachefiles_xattr *auxdata);
235extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
236 struct cachefiles_xattr *auxdata);
237extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
238 struct dentry *dentry);
239
240
241/*
242 * error handling
243 */
244#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__)
245
246#define cachefiles_io_error(___cache, FMT, ...) \
247do { \
248 kerror("I/O Error: " FMT, ##__VA_ARGS__); \
249 fscache_io_error(&(___cache)->cache); \
250 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
251} while (0)
252
253#define cachefiles_io_error_obj(object, FMT, ...) \
254do { \
255 struct cachefiles_cache *___cache; \
256 \
257 ___cache = container_of((object)->fscache.cache, \
258 struct cachefiles_cache, cache); \
259 cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \
260} while (0)
261
262
263/*
264 * debug tracing
265 */
266#define dbgprintk(FMT, ...) \
267 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
268
269/* make sure we maintain the format strings, even when debugging is disabled */
270static inline void _dbprintk(const char *fmt, ...)
271 __attribute__((format(printf, 1, 2)));
272static inline void _dbprintk(const char *fmt, ...)
273{
274}
275
276#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
277#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
278#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
279
280
281#if defined(__KDEBUG)
282#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
283#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
284#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
285
286#elif defined(CONFIG_CACHEFILES_DEBUG)
287#define _enter(FMT, ...) \
288do { \
289 if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \
290 kenter(FMT, ##__VA_ARGS__); \
291} while (0)
292
293#define _leave(FMT, ...) \
294do { \
295 if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \
296 kleave(FMT, ##__VA_ARGS__); \
297} while (0)
298
299#define _debug(FMT, ...) \
300do { \
301 if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \
302 kdebug(FMT, ##__VA_ARGS__); \
303} while (0)
304
305#else
306#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
307#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
308#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
309#endif
310
311#if 1 /* defined(__KDEBUGALL) */
312
313#define ASSERT(X) \
314do { \
315 if (unlikely(!(X))) { \
316 printk(KERN_ERR "\n"); \
317 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
318 BUG(); \
319 } \
320} while (0)
321
322#define ASSERTCMP(X, OP, Y) \
323do { \
324 if (unlikely(!((X) OP (Y)))) { \
325 printk(KERN_ERR "\n"); \
326 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
327 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
328 (unsigned long)(X), (unsigned long)(Y)); \
329 BUG(); \
330 } \
331} while (0)
332
333#define ASSERTIF(C, X) \
334do { \
335 if (unlikely((C) && !(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTIFCMP(C, X, OP, Y) \
343do { \
344 if (unlikely((C) && !((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#else
354
355#define ASSERT(X) do {} while (0)
356#define ASSERTCMP(X, OP, Y) do {} while (0)
357#define ASSERTIF(C, X) do {} while (0)
358#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
359
360#endif
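
The cachefiles_xattr structure defined above carries the netfs auxiliary data to and from the backing file's extended attributes; as cachefiles_update_object() in interface.c shows, len is set to the payload length plus one for the type byte. The sketch below fills such a buffer in user space; the payload bytes are invented and the type value is just a placeholder.

/* Sketch of filling a cachefiles_xattr-style blob, mirroring the
 * auxdata->len = auxlen + 1 convention in interface.c.  Payload and
 * type are illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct cachefiles_xattr {
	uint16_t len;
	uint8_t type;
	uint8_t data[];	/* flexible array holds the netfs aux data */
};

int main(void)
{
	const uint8_t aux[] = { 0xde, 0xad, 0xbe, 0xef };	/* made up */
	struct cachefiles_xattr *buf;

	buf = malloc(sizeof(*buf) + sizeof(aux));
	if (!buf)
		return 1;

	memcpy(buf->data, aux, sizeof(aux));
	buf->len = sizeof(aux) + 1;	/* aux bytes plus the type byte */
	buf->type = 0;			/* placeholder cookie type */

	printf("xattr blob: len=%u type=%u payload=%zu bytes\n",
	       (unsigned)buf->len, (unsigned)buf->type, sizeof(aux));
	free(buf);
	return 0;
}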
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
new file mode 100644
index 000000000000..81b8b2b3a674
--- /dev/null
+++ b/fs/cachefiles/key.c
@@ -0,0 +1,159 @@
1/* Key to pathname encoder
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/slab.h>
13#include "internal.h"
14
15static const char cachefiles_charmap[64] =
16 "0123456789" /* 0 - 9 */
17 "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */
18 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */
19 "_-" /* 62 - 63 */
20 ;
21
22static const char cachefiles_filecharmap[256] = {
23 /* we skip space and tab and control chars */
24 [33 ... 46] = 1, /* '!' -> '.' */
25 /* we skip '/' as it's significant to pathwalk */
26 [48 ... 127] = 1, /* '0' -> '~' */
27};
28
29/*
30 * turn the raw key into something cooked
31 * - the raw key should include the length in the two bytes at the front
32 * - the key may be up to 514 bytes in length (including the length word)
33 * - "base64" encode the strange keys, mapping 3 bytes of raw to four of
34 * cooked
35 * - need to cut the cooked key into 252 char lengths (189 raw bytes)
36 */
37char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
38{
39 unsigned char csum, ch;
40 unsigned int acc;
41 char *key;
42 int loop, len, max, seg, mark, print;
43
44 _enter(",%d", keylen);
45
46 BUG_ON(keylen < 2 || keylen > 514);
47
48 csum = raw[0] + raw[1];
49 print = 1;
50 for (loop = 2; loop < keylen; loop++) {
51 ch = raw[loop];
52 csum += ch;
53 print &= cachefiles_filecharmap[ch];
54 }
55
56 if (print) {
57 /* if the path is usable ASCII, then we render it directly */
58 max = keylen - 2;
59 max += 2; /* two base64'd length chars on the front */
60 max += 5; /* @checksum/M */
61 max += 3 * 2; /* maximum number of segment dividers (".../M")
62 * is ((514 + 251) / 252) = 3
63 */
64 max += 1; /* NUL on end */
65 } else {
66 /* calculate the maximum length of the cooked key */
67 keylen = (keylen + 2) / 3;
68
69 max = keylen * 4;
70 max += 5; /* @checksum/M */
71 max += 3 * 2; /* maximum number of segment dividers (".../M")
72 * is ((514 + 188) / 189) = 3
73 */
74 max += 1; /* NUL on end */
75 }
76
77 max += 1; /* 2nd NUL on end */
78
79 _debug("max: %d", max);
80
81 key = kmalloc(max, GFP_KERNEL);
82 if (!key)
83 return NULL;
84
85 len = 0;
86
87 /* build the cooked key */
88 sprintf(key, "@%02x%c+", (unsigned) csum, 0);
89 len = 5;
90 mark = len - 1;
91
92 if (print) {
93 acc = *(uint16_t *) raw;
94 raw += 2;
95
96 key[len + 1] = cachefiles_charmap[acc & 63];
97 acc >>= 6;
98 key[len] = cachefiles_charmap[acc & 63];
99 len += 2;
100
101 seg = 250;
102 for (loop = keylen; loop > 0; loop--) {
103 if (seg <= 0) {
104 key[len++] = '\0';
105 mark = len;
106 key[len++] = '+';
107 seg = 252;
108 }
109
110 key[len++] = *raw++;
111 ASSERT(len < max);
112 }
113
114 switch (type) {
115 case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break;
116 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break;
117 default: type = 'S'; break;
118 }
119 } else {
120 seg = 252;
121 for (loop = keylen; loop > 0; loop--) {
122 if (seg <= 0) {
123 key[len++] = '\0';
124 mark = len;
125 key[len++] = '+';
126 seg = 252;
127 }
128
129 acc = *raw++;
130 acc |= *raw++ << 8;
131 acc |= *raw++ << 16;
132
133 _debug("acc: %06x", acc);
134
135 key[len++] = cachefiles_charmap[acc & 63];
136 acc >>= 6;
137 key[len++] = cachefiles_charmap[acc & 63];
138 acc >>= 6;
139 key[len++] = cachefiles_charmap[acc & 63];
140 acc >>= 6;
141 key[len++] = cachefiles_charmap[acc & 63];
142
143 ASSERT(len < max);
144 }
145
146 switch (type) {
147 case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break;
148 case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break;
149 default: type = 'T'; break;
150 }
151 }
152
153 key[mark] = type;
154 key[len++] = 0;
155 key[len] = 0;
156
157 _leave(" = %p %d", key, len);
158 return key;
159}
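
A worked example of the non-printable branch above: each group of three raw bytes is accumulated little-endian into 'acc' and emitted as four 6-bit indices into cachefiles_charmap, low bits first. Segments of the cooked key are separated by single NULs and the whole key ends in a double NUL, which is what cachefiles_walk_to_object() in namei.c steps over per directory level. The input bytes below are arbitrary; the printed result follows directly from the table.

/* One 3-byte group of the "base64" cooking in cachefiles_cook_key(),
 * reproduced stand-alone with the same character map.
 */
#include <stdio.h>

static const char charmap[64] =
	"0123456789"
	"abcdefghijklmnopqrstuvwxyz"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"_-";

int main(void)
{
	const unsigned char raw[3] = { 0x12, 0x34, 0x56 };
	unsigned int acc = raw[0] | raw[1] << 8 | raw[2] << 16;
	char out[5];
	int i;

	for (i = 0; i < 4; i++) {
		out[i] = charmap[acc & 63];	/* low six bits first */
		acc >>= 6;
	}
	out[4] = '\0';

	printf("cooked: %s\n", out);	/* prints "cooked: igzl" */
	return 0;
}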
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
new file mode 100644
index 000000000000..4bfa8cf43bf5
--- /dev/null
+++ b/fs/cachefiles/main.c
@@ -0,0 +1,106 @@
1/* Network filesystem caching backend to use cache files on a premounted
2 * filesystem
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public Licence
9 * as published by the Free Software Foundation; either version
10 * 2 of the Licence, or (at your option) any later version.
11 */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include <linux/fs.h>
19#include <linux/file.h>
20#include <linux/namei.h>
21#include <linux/mount.h>
22#include <linux/statfs.h>
23#include <linux/sysctl.h>
24#include <linux/miscdevice.h>
25#include "internal.h"
26
27unsigned cachefiles_debug;
28module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
29MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask");
30
31MODULE_DESCRIPTION("Mounted-filesystem based cache");
32MODULE_AUTHOR("Red Hat, Inc.");
33MODULE_LICENSE("GPL");
34
35struct kmem_cache *cachefiles_object_jar;
36
37static struct miscdevice cachefiles_dev = {
38 .minor = MISC_DYNAMIC_MINOR,
39 .name = "cachefiles",
40 .fops = &cachefiles_daemon_fops,
41};
42
43static void cachefiles_object_init_once(void *_object)
44{
45 struct cachefiles_object *object = _object;
46
47 memset(object, 0, sizeof(*object));
48 spin_lock_init(&object->work_lock);
49}
50
51/*
52 * initialise the fs caching module
53 */
54static int __init cachefiles_init(void)
55{
56 int ret;
57
58 ret = misc_register(&cachefiles_dev);
59 if (ret < 0)
60 goto error_dev;
61
62 /* create an object jar */
63 ret = -ENOMEM;
64 cachefiles_object_jar =
65 kmem_cache_create("cachefiles_object_jar",
66 sizeof(struct cachefiles_object),
67 0,
68 SLAB_HWCACHE_ALIGN,
69 cachefiles_object_init_once);
70 if (!cachefiles_object_jar) {
71 printk(KERN_NOTICE
72 "CacheFiles: Failed to allocate an object jar\n");
73 goto error_object_jar;
74 }
75
76 ret = cachefiles_proc_init();
77 if (ret < 0)
78 goto error_proc;
79
80 printk(KERN_INFO "CacheFiles: Loaded\n");
81 return 0;
82
83error_proc:
84 kmem_cache_destroy(cachefiles_object_jar);
85error_object_jar:
86 misc_deregister(&cachefiles_dev);
87error_dev:
88 kerror("failed to register: %d", ret);
89 return ret;
90}
91
92fs_initcall(cachefiles_init);
93
94/*
95 * clean up on module removal
96 */
97static void __exit cachefiles_exit(void)
98{
99 printk(KERN_INFO "CacheFiles: Unloading\n");
100
101 cachefiles_proc_cleanup();
102 kmem_cache_destroy(cachefiles_object_jar);
103 misc_deregister(&cachefiles_dev);
104}
105
106module_exit(cachefiles_exit);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
new file mode 100644
index 000000000000..4ce818ae39ea
--- /dev/null
+++ b/fs/cachefiles/namei.c
@@ -0,0 +1,771 @@
1/* CacheFiles path walking and related routines
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include <linux/mount.h>
20#include <linux/namei.h>
21#include <linux/security.h>
22#include "internal.h"
23
24static int cachefiles_wait_bit(void *flags)
25{
26 schedule();
27 return 0;
28}
29
30/*
31 * record the fact that an object is now active
32 */
33static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
34 struct cachefiles_object *object)
35{
36 struct cachefiles_object *xobject;
37 struct rb_node **_p, *_parent = NULL;
38 struct dentry *dentry;
39
40 _enter(",%p", object);
41
42try_again:
43 write_lock(&cache->active_lock);
44
45 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
46 BUG();
47
48 dentry = object->dentry;
49 _p = &cache->active_nodes.rb_node;
50 while (*_p) {
51 _parent = *_p;
52 xobject = rb_entry(_parent,
53 struct cachefiles_object, active_node);
54
55 ASSERT(xobject != object);
56
57 if (xobject->dentry > dentry)
58 _p = &(*_p)->rb_left;
59 else if (xobject->dentry < dentry)
60 _p = &(*_p)->rb_right;
61 else
62 goto wait_for_old_object;
63 }
64
65 rb_link_node(&object->active_node, _parent, _p);
66 rb_insert_color(&object->active_node, &cache->active_nodes);
67
68 write_unlock(&cache->active_lock);
69 _leave("");
70 return;
71
72 /* an old object from a previous incarnation is hogging the slot - we
73 * need to wait for it to be destroyed */
74wait_for_old_object:
75 if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
76 printk(KERN_ERR "\n");
77 printk(KERN_ERR "CacheFiles: Error:"
78 " Unexpected object collision\n");
79 printk(KERN_ERR "xobject: OBJ%x\n",
80 xobject->fscache.debug_id);
81 printk(KERN_ERR "xobjstate=%s\n",
82 fscache_object_states[xobject->fscache.state]);
83 printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
84 printk(KERN_ERR "xobjevent=%lx [%lx]\n",
85 xobject->fscache.events, xobject->fscache.event_mask);
86 printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
87 xobject->fscache.n_ops, xobject->fscache.n_in_progress,
88 xobject->fscache.n_exclusive);
89 printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
90 xobject->fscache.cookie,
91 xobject->fscache.cookie->parent,
92 xobject->fscache.cookie->netfs_data,
93 xobject->fscache.cookie->flags);
94 printk(KERN_ERR "xparent=%p\n",
95 xobject->fscache.parent);
96 printk(KERN_ERR "object: OBJ%x\n",
97 object->fscache.debug_id);
98 printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
99 object->fscache.cookie,
100 object->fscache.cookie->parent,
101 object->fscache.cookie->netfs_data,
102 object->fscache.cookie->flags);
103 printk(KERN_ERR "parent=%p\n",
104 object->fscache.parent);
105 BUG();
106 }
107 atomic_inc(&xobject->usage);
108 write_unlock(&cache->active_lock);
109
110 _debug(">>> wait");
111 wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
112 cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
113 _debug("<<< waited");
114
115 cache->cache.ops->put_object(&xobject->fscache);
116 goto try_again;
117}
118
119/*
120 * delete an object representation from the cache
121 * - file backed objects are unlinked
122 * - directory backed objects are stuffed into the graveyard for userspace to
123 * delete
124 * - unlocks the directory mutex
125 */
126static int cachefiles_bury_object(struct cachefiles_cache *cache,
127 struct dentry *dir,
128 struct dentry *rep)
129{
130 struct dentry *grave, *trap;
131 char nbuffer[8 + 8 + 1];
132 int ret;
133
134 _enter(",'%*.*s','%*.*s'",
135 dir->d_name.len, dir->d_name.len, dir->d_name.name,
136 rep->d_name.len, rep->d_name.len, rep->d_name.name);
137
138 /* non-directories can just be unlinked */
139 if (!S_ISDIR(rep->d_inode->i_mode)) {
140 _debug("unlink stale object");
141 ret = vfs_unlink(dir->d_inode, rep);
142
143 mutex_unlock(&dir->d_inode->i_mutex);
144
145 if (ret == -EIO)
146 cachefiles_io_error(cache, "Unlink failed");
147
148 _leave(" = %d", ret);
149 return ret;
150 }
151
152 /* directories have to be moved to the graveyard */
153 _debug("move stale object to graveyard");
154 mutex_unlock(&dir->d_inode->i_mutex);
155
156try_again:
157 /* first step is to make up a grave dentry in the graveyard */
158 sprintf(nbuffer, "%08x%08x",
159 (uint32_t) get_seconds(),
160 (uint32_t) atomic_inc_return(&cache->gravecounter));
161
162 /* do the multiway lock magic */
163 trap = lock_rename(cache->graveyard, dir);
164
165 /* do some checks before getting the grave dentry */
166 if (rep->d_parent != dir) {
167 /* the entry was probably culled when we dropped the parent dir
168 * lock */
169 unlock_rename(cache->graveyard, dir);
170 _leave(" = 0 [culled?]");
171 return 0;
172 }
173
174 if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
175 unlock_rename(cache->graveyard, dir);
176 cachefiles_io_error(cache, "Graveyard no longer a directory");
177 return -EIO;
178 }
179
180 if (trap == rep) {
181 unlock_rename(cache->graveyard, dir);
182 cachefiles_io_error(cache, "May not make directory loop");
183 return -EIO;
184 }
185
186 if (d_mountpoint(rep)) {
187 unlock_rename(cache->graveyard, dir);
188 cachefiles_io_error(cache, "Mountpoint in cache");
189 return -EIO;
190 }
191
192 grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
193 if (IS_ERR(grave)) {
194 unlock_rename(cache->graveyard, dir);
195
196 if (PTR_ERR(grave) == -ENOMEM) {
197 _leave(" = -ENOMEM");
198 return -ENOMEM;
199 }
200
201 cachefiles_io_error(cache, "Lookup error %ld",
202 PTR_ERR(grave));
203 return -EIO;
204 }
205
206 if (grave->d_inode) {
207 unlock_rename(cache->graveyard, dir);
208 dput(grave);
209 grave = NULL;
210 cond_resched();
211 goto try_again;
212 }
213
214 if (d_mountpoint(grave)) {
215 unlock_rename(cache->graveyard, dir);
216 dput(grave);
217 cachefiles_io_error(cache, "Mountpoint in graveyard");
218 return -EIO;
219 }
220
221 /* target should not be an ancestor of source */
222 if (trap == grave) {
223 unlock_rename(cache->graveyard, dir);
224 dput(grave);
225 cachefiles_io_error(cache, "May not make directory loop");
226 return -EIO;
227 }
228
229 /* attempt the rename */
230 ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave);
231 if (ret != 0 && ret != -ENOMEM)
232 cachefiles_io_error(cache, "Rename failed with error %d", ret);
233
234 unlock_rename(cache->graveyard, dir);
235 dput(grave);
236 _leave(" = 0");
237 return 0;
238}

239
240/*
241 * delete an object representation from the cache
242 */
243int cachefiles_delete_object(struct cachefiles_cache *cache,
244 struct cachefiles_object *object)
245{
246 struct dentry *dir;
247 int ret;
248
249 _enter(",{%p}", object->dentry);
250
251 ASSERT(object->dentry);
252 ASSERT(object->dentry->d_inode);
253 ASSERT(object->dentry->d_parent);
254
255 dir = dget_parent(object->dentry);
256
257 mutex_lock(&dir->d_inode->i_mutex);
258 ret = cachefiles_bury_object(cache, dir, object->dentry);
259
260 dput(dir);
261 _leave(" = %d", ret);
262 return ret;
263}
264
265/*
266 * walk from the parent object to the child object through the backing
267 * filesystem, creating directories as we go
268 */
269int cachefiles_walk_to_object(struct cachefiles_object *parent,
270 struct cachefiles_object *object,
271 const char *key,
272 struct cachefiles_xattr *auxdata)
273{
274 struct cachefiles_cache *cache;
275 struct dentry *dir, *next = NULL;
276 unsigned long start;
277 const char *name;
278 int ret, nlen;
279
280 _enter("{%p},,%s,", parent->dentry, key);
281
282 cache = container_of(parent->fscache.cache,
283 struct cachefiles_cache, cache);
284
285 ASSERT(parent->dentry);
286 ASSERT(parent->dentry->d_inode);
287
288 if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
289 // TODO: convert file to dir
290		_leave("looking up in non-directory");
291 return -ENOBUFS;
292 }
293
294 dir = dget(parent->dentry);
295
296advance:
297 /* attempt to transit the first directory component */
298 name = key;
299 nlen = strlen(key);
300
301 /* key ends in a double NUL */
302 key = key + nlen + 1;
303 if (!*key)
304 key = NULL;
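	/* (editor's note: illustrative example) for a key laid out as
	 * "parent\0middle\0leaf\0\0" (component names hypothetical), the
	 * first pass sets name/nlen to cover "parent" and advances key past
	 * its NUL; on the final pass the byte after "leaf"'s NUL is itself
	 * NUL, so key is set to NULL to mark the terminal component */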
305
306lookup_again:
307 /* search the current directory for the element name */
308 _debug("lookup '%s'", name);
309
310 mutex_lock(&dir->d_inode->i_mutex);
311
312 start = jiffies;
313 next = lookup_one_len(name, dir, nlen);
314 cachefiles_hist(cachefiles_lookup_histogram, start);
315 if (IS_ERR(next))
316 goto lookup_error;
317
318 _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
319
320 if (!key)
321 object->new = !next->d_inode;
322
323 /* if this element of the path doesn't exist, then the lookup phase
324 * failed, and we can release any readers in the certain knowledge that
325 * there's nothing for them to actually read */
326 if (!next->d_inode)
327 fscache_object_lookup_negative(&object->fscache);
328
329 /* we need to create the object if it's negative */
330 if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
331 /* index objects and intervening tree levels must be subdirs */
332 if (!next->d_inode) {
333 ret = cachefiles_has_space(cache, 1, 0);
334 if (ret < 0)
335 goto create_error;
336
337 start = jiffies;
338 ret = vfs_mkdir(dir->d_inode, next, 0);
339 cachefiles_hist(cachefiles_mkdir_histogram, start);
340 if (ret < 0)
341 goto create_error;
342
343 ASSERT(next->d_inode);
344
345 _debug("mkdir -> %p{%p{ino=%lu}}",
346 next, next->d_inode, next->d_inode->i_ino);
347
348 } else if (!S_ISDIR(next->d_inode->i_mode)) {
349 kerror("inode %lu is not a directory",
350 next->d_inode->i_ino);
351 ret = -ENOBUFS;
352 goto error;
353 }
354
355 } else {
356 /* non-index objects start out life as files */
357 if (!next->d_inode) {
358 ret = cachefiles_has_space(cache, 1, 0);
359 if (ret < 0)
360 goto create_error;
361
362 start = jiffies;
363 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
364 cachefiles_hist(cachefiles_create_histogram, start);
365 if (ret < 0)
366 goto create_error;
367
368 ASSERT(next->d_inode);
369
370 _debug("create -> %p{%p{ino=%lu}}",
371 next, next->d_inode, next->d_inode->i_ino);
372
373 } else if (!S_ISDIR(next->d_inode->i_mode) &&
374 !S_ISREG(next->d_inode->i_mode)
375 ) {
376 kerror("inode %lu is not a file or directory",
377 next->d_inode->i_ino);
378 ret = -ENOBUFS;
379 goto error;
380 }
381 }
382
383 /* process the next component */
384 if (key) {
385 _debug("advance");
386 mutex_unlock(&dir->d_inode->i_mutex);
387 dput(dir);
388 dir = next;
389 next = NULL;
390 goto advance;
391 }
392
393 /* we've found the object we were looking for */
394 object->dentry = next;
395
396 /* if we've found that the terminal object exists, then we need to
397 * check its attributes and delete it if it's out of date */
398 if (!object->new) {
399 _debug("validate '%*.*s'",
400 next->d_name.len, next->d_name.len, next->d_name.name);
401
402 ret = cachefiles_check_object_xattr(object, auxdata);
403 if (ret == -ESTALE) {
404 /* delete the object (the deleter drops the directory
405 * mutex) */
406 object->dentry = NULL;
407
408 ret = cachefiles_bury_object(cache, dir, next);
409 dput(next);
410 next = NULL;
411
412 if (ret < 0)
413 goto delete_error;
414
415 _debug("redo lookup");
416 goto lookup_again;
417 }
418 }
419
420 /* note that we're now using this object */
421 cachefiles_mark_object_active(cache, object);
422
423 mutex_unlock(&dir->d_inode->i_mutex);
424 dput(dir);
425 dir = NULL;
426
427 _debug("=== OBTAINED_OBJECT ===");
428
429 if (object->new) {
430 /* attach data to a newly constructed terminal object */
431 ret = cachefiles_set_object_xattr(object, auxdata);
432 if (ret < 0)
433 goto check_error;
434 } else {
435 /* always update the atime on an object we've just looked up
436 * (this is used to keep track of culling, and atimes are only
437 * updated by read, write and readdir but not lookup or
438 * open) */
439 touch_atime(cache->mnt, next);
440 }
441
442 /* open a file interface onto a data file */
443 if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
444 if (S_ISREG(object->dentry->d_inode->i_mode)) {
445 const struct address_space_operations *aops;
446
447 ret = -EPERM;
448 aops = object->dentry->d_inode->i_mapping->a_ops;
449 if (!aops->bmap)
450 goto check_error;
451
452 object->backer = object->dentry;
453 } else {
454 BUG(); // TODO: open file in data-class subdir
455 }
456 }
457
458 object->new = 0;
459 fscache_obtained_object(&object->fscache);
460
461 _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
462 return 0;
463
464create_error:
465 _debug("create error %d", ret);
466 if (ret == -EIO)
467 cachefiles_io_error(cache, "Create/mkdir failed");
468 goto error;
469
470check_error:
471 _debug("check error %d", ret);
472 write_lock(&cache->active_lock);
473 rb_erase(&object->active_node, &cache->active_nodes);
474 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
475 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
476 write_unlock(&cache->active_lock);
477
478 dput(object->dentry);
479 object->dentry = NULL;
480 goto error_out;
481
482delete_error:
483 _debug("delete error %d", ret);
484 goto error_out2;
485
486lookup_error:
487 _debug("lookup error %ld", PTR_ERR(next));
488 ret = PTR_ERR(next);
489 if (ret == -EIO)
490 cachefiles_io_error(cache, "Lookup failed");
491 next = NULL;
492error:
493 mutex_unlock(&dir->d_inode->i_mutex);
494 dput(next);
495error_out2:
496 dput(dir);
497error_out:
498 if (ret == -ENOSPC)
499 ret = -ENOBUFS;
500
501 _leave(" = error %d", -ret);
502 return ret;
503}
504
505/*
506 * get a subdirectory
507 */
508struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
509 struct dentry *dir,
510 const char *dirname)
511{
512 struct dentry *subdir;
513 unsigned long start;
514 int ret;
515
516 _enter(",,%s", dirname);
517
518 /* search the current directory for the element name */
519 mutex_lock(&dir->d_inode->i_mutex);
520
521 start = jiffies;
522 subdir = lookup_one_len(dirname, dir, strlen(dirname));
523 cachefiles_hist(cachefiles_lookup_histogram, start);
524 if (IS_ERR(subdir)) {
525 if (PTR_ERR(subdir) == -ENOMEM)
526 goto nomem_d_alloc;
527 goto lookup_error;
528 }
529
530 _debug("subdir -> %p %s",
531 subdir, subdir->d_inode ? "positive" : "negative");
532
533 /* we need to create the subdir if it doesn't exist yet */
534 if (!subdir->d_inode) {
535 ret = cachefiles_has_space(cache, 1, 0);
536 if (ret < 0)
537 goto mkdir_error;
538
539 _debug("attempt mkdir");
540
541 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
542 if (ret < 0)
543 goto mkdir_error;
544
545 ASSERT(subdir->d_inode);
546
547 _debug("mkdir -> %p{%p{ino=%lu}}",
548 subdir,
549 subdir->d_inode,
550 subdir->d_inode->i_ino);
551 }
552
553 mutex_unlock(&dir->d_inode->i_mutex);
554
555 /* we need to make sure the subdir is a directory */
556 ASSERT(subdir->d_inode);
557
558 if (!S_ISDIR(subdir->d_inode->i_mode)) {
559 kerror("%s is not a directory", dirname);
560 ret = -EIO;
561 goto check_error;
562 }
563
564 ret = -EPERM;
565 if (!subdir->d_inode->i_op ||
566 !subdir->d_inode->i_op->setxattr ||
567 !subdir->d_inode->i_op->getxattr ||
568 !subdir->d_inode->i_op->lookup ||
569 !subdir->d_inode->i_op->mkdir ||
570 !subdir->d_inode->i_op->create ||
571 !subdir->d_inode->i_op->rename ||
572 !subdir->d_inode->i_op->rmdir ||
573 !subdir->d_inode->i_op->unlink)
574 goto check_error;
575
576 _leave(" = [%lu]", subdir->d_inode->i_ino);
577 return subdir;
578
579check_error:
580 dput(subdir);
581 _leave(" = %d [check]", ret);
582 return ERR_PTR(ret);
583
584mkdir_error:
585 mutex_unlock(&dir->d_inode->i_mutex);
586 dput(subdir);
587 kerror("mkdir %s failed with error %d", dirname, ret);
588 return ERR_PTR(ret);
589
590lookup_error:
591 mutex_unlock(&dir->d_inode->i_mutex);
592 ret = PTR_ERR(subdir);
593 kerror("Lookup %s failed with error %d", dirname, ret);
594 return ERR_PTR(ret);
595
596nomem_d_alloc:
597 mutex_unlock(&dir->d_inode->i_mutex);
598 _leave(" = -ENOMEM");
599 return ERR_PTR(-ENOMEM);
600}
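/*
 * (editor's note: a minimal usage sketch, not part of the original commit;
 * cachefiles_setup_graveyard() is a hypothetical stand-in for whatever the
 * cache-binding code actually does, though cache->graveyard and the
 * cachefiles_get_directory() signature are taken from this file)
 */
#if 0	/* illustrative only - not compiled */
static int cachefiles_setup_graveyard(struct cachefiles_cache *cache,
				      struct dentry *root)
{
	struct dentry *graveyard;

	/* look up (or create) the graveyard subdir under the cache root */
	graveyard = cachefiles_get_directory(cache, root, "graveyard");
	if (IS_ERR(graveyard))
		return PTR_ERR(graveyard);	/* -EIO, -EPERM, -ENOMEM, ... */

	cache->graveyard = graveyard;	/* keeps the returned reference */
	return 0;
}
#endif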
601
602/*
603 * find out if an object is in use or not
 * - if the object is found and it's not in use:
605 * - returns a pointer to the object and a reference on it
606 * - returns with the directory locked
607 */
608static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
609 struct dentry *dir,
610 char *filename)
611{
612 struct cachefiles_object *object;
613 struct rb_node *_n;
614 struct dentry *victim;
615 unsigned long start;
616 int ret;
617
618 //_enter(",%*.*s/,%s",
619 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
620
621 /* look up the victim */
622 mutex_lock_nested(&dir->d_inode->i_mutex, 1);
623
624 start = jiffies;
625 victim = lookup_one_len(filename, dir, strlen(filename));
626 cachefiles_hist(cachefiles_lookup_histogram, start);
627 if (IS_ERR(victim))
628 goto lookup_error;
629
630 //_debug("victim -> %p %s",
631 // victim, victim->d_inode ? "positive" : "negative");
632
633 /* if the object is no longer there then we probably retired the object
634 * at the netfs's request whilst the cull was in progress
635 */
636 if (!victim->d_inode) {
637 mutex_unlock(&dir->d_inode->i_mutex);
638 dput(victim);
639 _leave(" = -ENOENT [absent]");
640 return ERR_PTR(-ENOENT);
641 }
642
643 /* check to see if we're using this object */
644 read_lock(&cache->active_lock);
645
646 _n = cache->active_nodes.rb_node;
647
648 while (_n) {
649 object = rb_entry(_n, struct cachefiles_object, active_node);
650
651 if (object->dentry > victim)
652 _n = _n->rb_left;
653 else if (object->dentry < victim)
654 _n = _n->rb_right;
655 else
656 goto object_in_use;
657 }
658
659 read_unlock(&cache->active_lock);
660
661 //_leave(" = %p", victim);
662 return victim;
663
664object_in_use:
665 read_unlock(&cache->active_lock);
666 mutex_unlock(&dir->d_inode->i_mutex);
667 dput(victim);
668 //_leave(" = -EBUSY [in use]");
669 return ERR_PTR(-EBUSY);
670
671lookup_error:
672 mutex_unlock(&dir->d_inode->i_mutex);
673 ret = PTR_ERR(victim);
674 if (ret == -ENOENT) {
675 /* file or dir now absent - probably retired by netfs */
676 _leave(" = -ESTALE [absent]");
677 return ERR_PTR(-ESTALE);
678 }
679
680 if (ret == -EIO) {
681 cachefiles_io_error(cache, "Lookup failed");
682 } else if (ret != -ENOMEM) {
683 kerror("Internal error: %d", ret);
684 ret = -EIO;
685 }
686
687 _leave(" = %d", ret);
688 return ERR_PTR(ret);
689}
690
691/*
692 * cull an object if it's not in use
693 * - called only by cache manager daemon
694 */
695int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
696 char *filename)
697{
698 struct dentry *victim;
699 int ret;
700
701 _enter(",%*.*s/,%s",
702 dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
703
704 victim = cachefiles_check_active(cache, dir, filename);
705 if (IS_ERR(victim))
706 return PTR_ERR(victim);
707
708 _debug("victim -> %p %s",
709 victim, victim->d_inode ? "positive" : "negative");
710
711 /* okay... the victim is not being used so we can cull it
712 * - start by marking it as stale
713 */
714 _debug("victim is cullable");
715
716 ret = cachefiles_remove_object_xattr(cache, victim);
717 if (ret < 0)
718 goto error_unlock;
719
720 /* actually remove the victim (drops the dir mutex) */
721 _debug("bury");
722
723 ret = cachefiles_bury_object(cache, dir, victim);
724 if (ret < 0)
725 goto error;
726
727 dput(victim);
728 _leave(" = 0");
729 return 0;
730
731error_unlock:
732 mutex_unlock(&dir->d_inode->i_mutex);
733error:
734 dput(victim);
735 if (ret == -ENOENT) {
736 /* file or dir now absent - probably retired by netfs */
737 _leave(" = -ESTALE [absent]");
738 return -ESTALE;
739 }
740
741 if (ret != -ENOMEM) {
742 kerror("Internal error: %d", ret);
743 ret = -EIO;
744 }
745
746 _leave(" = %d", ret);
747 return ret;
748}
749
750/*
751 * find out if an object is in use or not
752 * - called only by cache manager daemon
 * - returns -EBUSY if the object is in use, or 0 if it is not
754 */
755int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
756 char *filename)
757{
758 struct dentry *victim;
759
760 //_enter(",%*.*s/,%s",
761 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
762
763 victim = cachefiles_check_active(cache, dir, filename);
764 if (IS_ERR(victim))
765 return PTR_ERR(victim);
766
767 mutex_unlock(&dir->d_inode->i_mutex);
768 dput(victim);
769 //_leave(" = 0");
770 return 0;
771}
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
new file mode 100644
index 000000000000..eccd33941199
--- /dev/null
+++ b/fs/cachefiles/proc.c
@@ -0,0 +1,134 @@
1/* CacheFiles statistics
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/proc_fs.h>
14#include <linux/seq_file.h>
15#include "internal.h"
16
17atomic_t cachefiles_lookup_histogram[HZ];
18atomic_t cachefiles_mkdir_histogram[HZ];
19atomic_t cachefiles_create_histogram[HZ];
20
21/*
22 * display the latency histogram
23 */
24static int cachefiles_histogram_show(struct seq_file *m, void *v)
25{
26 unsigned long index;
27 unsigned x, y, z, t;
28
29 switch ((unsigned long) v) {
30 case 1:
31 seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n");
32 return 0;
33 case 2:
34 seq_puts(m, "===== ===== ========= ========= =========\n");
35 return 0;
36 default:
37 index = (unsigned long) v - 3;
38 x = atomic_read(&cachefiles_lookup_histogram[index]);
39 y = atomic_read(&cachefiles_mkdir_histogram[index]);
40 z = atomic_read(&cachefiles_create_histogram[index]);
41 if (x == 0 && y == 0 && z == 0)
42 return 0;
43
44 t = (index * 1000) / HZ;
45
46 seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z);
47 return 0;
48 }
49}
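/* (editor's note: illustrative output, assuming HZ=250) a bucket at index 5
 * holding 12 lookups, 3 mkdirs and 1 create would print as:
 *     5 0.020        12         3         1
 */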
50
51/*
52 * set up the iterator to start reading from the first line
53 */
54static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos)
55{
56 if ((unsigned long long)*_pos >= HZ + 2)
57 return NULL;
58 if (*_pos == 0)
59 *_pos = 1;
60 return (void *)(unsigned long) *_pos;
61}
62
63/*
64 * move to the next line
65 */
66static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos)
67{
68 (*pos)++;
69 return (unsigned long long)*pos > HZ + 2 ?
70 NULL : (void *)(unsigned long) *pos;
71}
72
73/*
74 * clean up after reading
75 */
76static void cachefiles_histogram_stop(struct seq_file *m, void *v)
77{
78}
79
80static const struct seq_operations cachefiles_histogram_ops = {
81 .start = cachefiles_histogram_start,
82 .stop = cachefiles_histogram_stop,
83 .next = cachefiles_histogram_next,
84 .show = cachefiles_histogram_show,
85};
86
87/*
 * open "/proc/fs/cachefiles/XXX" which provides statistics summaries
89 */
90static int cachefiles_histogram_open(struct inode *inode, struct file *file)
91{
92 return seq_open(file, &cachefiles_histogram_ops);
93}
94
95static const struct file_operations cachefiles_histogram_fops = {
96 .owner = THIS_MODULE,
97 .open = cachefiles_histogram_open,
98 .read = seq_read,
99 .llseek = seq_lseek,
100 .release = seq_release,
101};
102
103/*
104 * initialise the /proc/fs/cachefiles/ directory
105 */
106int __init cachefiles_proc_init(void)
107{
108 _enter("");
109
110 if (!proc_mkdir("fs/cachefiles", NULL))
111 goto error_dir;
112
113 if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
114 &cachefiles_histogram_fops))
115 goto error_histogram;
116
117 _leave(" = 0");
118 return 0;
119
120error_histogram:
121 remove_proc_entry("fs/cachefiles", NULL);
122error_dir:
123 _leave(" = -ENOMEM");
124 return -ENOMEM;
125}
126
127/*
128 * clean up the /proc/fs/cachefiles/ directory
129 */
130void cachefiles_proc_cleanup(void)
131{
132 remove_proc_entry("fs/cachefiles/histogram", NULL);
133 remove_proc_entry("fs/cachefiles", NULL);
134}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
new file mode 100644
index 000000000000..a69787e7dd96
--- /dev/null
+++ b/fs/cachefiles/rdwr.c
@@ -0,0 +1,879 @@
1/* Storage object read/write
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/mount.h>
13#include <linux/file.h>
14#include "internal.h"
15
16/*
17 * detect wake up events generated by the unlocking of pages in which we're
18 * interested
19 * - we use this to detect read completion of backing pages
20 * - the caller holds the waitqueue lock
21 */
22static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
23 int sync, void *_key)
24{
25 struct cachefiles_one_read *monitor =
26 container_of(wait, struct cachefiles_one_read, monitor);
27 struct cachefiles_object *object;
28 struct wait_bit_key *key = _key;
29 struct page *page = wait->private;
30
31 ASSERT(key);
32
33 _enter("{%lu},%u,%d,{%p,%u}",
34 monitor->netfs_page->index, mode, sync,
35 key->flags, key->bit_nr);
36
37 if (key->flags != &page->flags ||
38 key->bit_nr != PG_locked)
39 return 0;
40
41 _debug("--- monitor %p %lx ---", page, page->flags);
42
43 if (!PageUptodate(page) && !PageError(page))
44 dump_stack();
45
46 /* remove from the waitqueue */
47 list_del(&wait->task_list);
48
49 /* move onto the action list and queue for FS-Cache thread pool */
50 ASSERT(monitor->op);
51
52 object = container_of(monitor->op->op.object,
53 struct cachefiles_object, fscache);
54
55 spin_lock(&object->work_lock);
56 list_add_tail(&monitor->op_link, &monitor->op->to_do);
57 spin_unlock(&object->work_lock);
58
59 fscache_enqueue_retrieval(monitor->op);
60 return 0;
61}
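/*
 * (editor's note: summary added for clarity) a monitor's lifecycle: it is
 * allocated by the read paths below, hooked onto the backing page's wait
 * queue, moved onto op->to_do by the waiter above when the page unlocks,
 * then drained, copied and freed by cachefiles_read_copier()
 */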
62
63/*
64 * copy data from backing pages to netfs pages to complete a read operation
65 * - driven by FS-Cache's thread pool
66 */
67static void cachefiles_read_copier(struct fscache_operation *_op)
68{
69 struct cachefiles_one_read *monitor;
70 struct cachefiles_object *object;
71 struct fscache_retrieval *op;
72 struct pagevec pagevec;
73 int error, max;
74
75 op = container_of(_op, struct fscache_retrieval, op);
76 object = container_of(op->op.object,
77 struct cachefiles_object, fscache);
78
79 _enter("{ino=%lu}", object->backer->d_inode->i_ino);
80
81 pagevec_init(&pagevec, 0);
82
83 max = 8;
84 spin_lock_irq(&object->work_lock);
85
86 while (!list_empty(&op->to_do)) {
87 monitor = list_entry(op->to_do.next,
88 struct cachefiles_one_read, op_link);
89 list_del(&monitor->op_link);
90
91 spin_unlock_irq(&object->work_lock);
92
93 _debug("- copy {%lu}", monitor->back_page->index);
94
95 error = -EIO;
96 if (PageUptodate(monitor->back_page)) {
97 copy_highpage(monitor->netfs_page, monitor->back_page);
98
99 pagevec_add(&pagevec, monitor->netfs_page);
100 fscache_mark_pages_cached(monitor->op, &pagevec);
101 error = 0;
102 }
103
104 if (error)
105 cachefiles_io_error_obj(
106 object,
107 "Readpage failed on backing file %lx",
108 (unsigned long) monitor->back_page->flags);
109
110 page_cache_release(monitor->back_page);
111
112 fscache_end_io(op, monitor->netfs_page, error);
113 page_cache_release(monitor->netfs_page);
114 fscache_put_retrieval(op);
115 kfree(monitor);
116
117 /* let the thread pool have some air occasionally */
118 max--;
119 if (max < 0 || need_resched()) {
120 if (!list_empty(&op->to_do))
121 fscache_enqueue_retrieval(op);
122 _leave(" [maxed out]");
123 return;
124 }
125
126 spin_lock_irq(&object->work_lock);
127 }
128
129 spin_unlock_irq(&object->work_lock);
130 _leave("");
131}
132
133/*
 * read the page corresponding to the given netfs page from the backing file
135 * - an uncertain page is simply discarded, to be tried again another time
136 */
137static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
138 struct fscache_retrieval *op,
139 struct page *netpage,
140 struct pagevec *pagevec)
141{
142 struct cachefiles_one_read *monitor;
143 struct address_space *bmapping;
144 struct page *newpage, *backpage;
145 int ret;
146
147 _enter("");
148
149 pagevec_reinit(pagevec);
150
151 _debug("read back %p{%lu,%d}",
152 netpage, netpage->index, page_count(netpage));
153
154 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
155 if (!monitor)
156 goto nomem;
157
158 monitor->netfs_page = netpage;
159 monitor->op = fscache_get_retrieval(op);
160
161 init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
162
163 /* attempt to get hold of the backing page */
164 bmapping = object->backer->d_inode->i_mapping;
165 newpage = NULL;
166
167 for (;;) {
168 backpage = find_get_page(bmapping, netpage->index);
169 if (backpage)
170 goto backing_page_already_present;
171
172 if (!newpage) {
173 newpage = page_cache_alloc_cold(bmapping);
174 if (!newpage)
175 goto nomem_monitor;
176 }
177
178 ret = add_to_page_cache(newpage, bmapping,
179 netpage->index, GFP_KERNEL);
180 if (ret == 0)
181 goto installed_new_backing_page;
182 if (ret != -EEXIST)
183 goto nomem_page;
184 }
185
186 /* we've installed a new backing page, so now we need to add it
187 * to the LRU list and start it reading */
188installed_new_backing_page:
189 _debug("- new %p", newpage);
190
191 backpage = newpage;
192 newpage = NULL;
193
194 page_cache_get(backpage);
195 pagevec_add(pagevec, backpage);
196 __pagevec_lru_add_file(pagevec);
197
198read_backing_page:
199 ret = bmapping->a_ops->readpage(NULL, backpage);
200 if (ret < 0)
201 goto read_error;
202
203 /* set the monitor to transfer the data across */
204monitor_backing_page:
205 _debug("- monitor add");
206
207 /* install the monitor */
208 page_cache_get(monitor->netfs_page);
209 page_cache_get(backpage);
210 monitor->back_page = backpage;
211 monitor->monitor.private = backpage;
212 add_page_wait_queue(backpage, &monitor->monitor);
213 monitor = NULL;
214
215 /* but the page may have been read before the monitor was installed, so
216 * the monitor may miss the event - so we have to ensure that we do get
217 * one in such a case */
218 if (trylock_page(backpage)) {
219 _debug("jumpstart %p {%lx}", backpage, backpage->flags);
220 unlock_page(backpage);
221 }
222 goto success;
223
224 /* if the backing page is already present, it can be in one of
225 * three states: read in progress, read failed or read okay */
226backing_page_already_present:
227 _debug("- present");
228
229 if (newpage) {
230 page_cache_release(newpage);
231 newpage = NULL;
232 }
233
234 if (PageError(backpage))
235 goto io_error;
236
237 if (PageUptodate(backpage))
238 goto backing_page_already_uptodate;
239
240 if (!trylock_page(backpage))
241 goto monitor_backing_page;
242 _debug("read %p {%lx}", backpage, backpage->flags);
243 goto read_backing_page;
244
245 /* the backing page is already up to date, attach the netfs
246 * page to the pagecache and LRU and copy the data across */
247backing_page_already_uptodate:
248 _debug("- uptodate");
249
250 pagevec_add(pagevec, netpage);
251 fscache_mark_pages_cached(op, pagevec);
252
253 copy_highpage(netpage, backpage);
254 fscache_end_io(op, netpage, 0);
255
256success:
257 _debug("success");
258 ret = 0;
259
260out:
261 if (backpage)
262 page_cache_release(backpage);
263 if (monitor) {
264 fscache_put_retrieval(monitor->op);
265 kfree(monitor);
266 }
267 _leave(" = %d", ret);
268 return ret;
269
270read_error:
271 _debug("read error %d", ret);
272 if (ret == -ENOMEM)
273 goto out;
274io_error:
275 cachefiles_io_error_obj(object, "Page read error on backing file");
276 ret = -ENOBUFS;
277 goto out;
278
279nomem_page:
280 page_cache_release(newpage);
281nomem_monitor:
282 fscache_put_retrieval(monitor->op);
283 kfree(monitor);
284nomem:
285 _leave(" = -ENOMEM");
286 return -ENOMEM;
287}
288
289/*
290 * read a page from the cache or allocate a block in which to store it
291 * - cache withdrawal is prevented by the caller
292 * - returns -EINTR if interrupted
293 * - returns -ENOMEM if ran out of memory
294 * - returns -ENOBUFS if no buffers can be made available
295 * - returns -ENOBUFS if page is beyond EOF
296 * - if the page is backed by a block in the cache:
297 * - a read will be started which will call the callback on completion
298 * - 0 will be returned
299 * - else if the page is unbacked:
300 * - the metadata will be retained
301 * - -ENODATA will be returned
302 */
303int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
304 struct page *page,
305 gfp_t gfp)
306{
307 struct cachefiles_object *object;
308 struct cachefiles_cache *cache;
309 struct pagevec pagevec;
310 struct inode *inode;
311 sector_t block0, block;
312 unsigned shift;
313 int ret;
314
315 object = container_of(op->op.object,
316 struct cachefiles_object, fscache);
317 cache = container_of(object->fscache.cache,
318 struct cachefiles_cache, cache);
319
320 _enter("{%p},{%lx},,,", object, page->index);
321
322 if (!object->backer)
323 return -ENOBUFS;
324
325 inode = object->backer->d_inode;
326 ASSERT(S_ISREG(inode->i_mode));
327 ASSERT(inode->i_mapping->a_ops->bmap);
328 ASSERT(inode->i_mapping->a_ops->readpages);
329
330 /* calculate the shift required to use bmap */
331 if (inode->i_sb->s_blocksize > PAGE_SIZE)
332 return -ENOBUFS;
333
334 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
335
336 op->op.flags = FSCACHE_OP_FAST;
337 op->op.processor = cachefiles_read_copier;
338
339 pagevec_init(&pagevec, 0);
340
341 /* we assume the absence or presence of the first block is a good
342 * enough indication for the page as a whole
343 * - TODO: don't use bmap() for this as it is _not_ actually good
344 * enough for this as it doesn't indicate errors, but it's all we've
345 * got for the moment
346 */
347 block0 = page->index;
348 block0 <<= shift;
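	/* (editor's note: illustrative arithmetic) with 4KiB pages and a
	 * 1KiB backing blocksize, shift = 12 - 10 = 2, so page index 5 maps
	 * to backing file block 5 << 2 = 20 */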
349
350 block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
351 _debug("%llx -> %llx",
352 (unsigned long long) block0,
353 (unsigned long long) block);
354
355 if (block) {
356 /* submit the apparently valid page to the backing fs to be
357 * read from disk */
358 ret = cachefiles_read_backing_file_one(object, op, page,
359 &pagevec);
360 } else if (cachefiles_has_space(cache, 0, 1) == 0) {
361 /* there's space in the cache we can use */
362 pagevec_add(&pagevec, page);
363 fscache_mark_pages_cached(op, &pagevec);
364 ret = -ENODATA;
365 } else {
366 ret = -ENOBUFS;
367 }
368
369 _leave(" = %d", ret);
370 return ret;
371}
372
373/*
 * read the pages corresponding to the given set from the backing file
375 * - any uncertain pages are simply discarded, to be tried again another time
376 */
377static int cachefiles_read_backing_file(struct cachefiles_object *object,
378 struct fscache_retrieval *op,
379 struct list_head *list,
380 struct pagevec *mark_pvec)
381{
382 struct cachefiles_one_read *monitor = NULL;
383 struct address_space *bmapping = object->backer->d_inode->i_mapping;
384 struct pagevec lru_pvec;
385 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
386 int ret = 0;
387
388 _enter("");
389
390 pagevec_init(&lru_pvec, 0);
391
392 list_for_each_entry_safe(netpage, _n, list, lru) {
393 list_del(&netpage->lru);
394
395 _debug("read back %p{%lu,%d}",
396 netpage, netpage->index, page_count(netpage));
397
398 if (!monitor) {
399 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
400 if (!monitor)
401 goto nomem;
402
403 monitor->op = fscache_get_retrieval(op);
404 init_waitqueue_func_entry(&monitor->monitor,
405 cachefiles_read_waiter);
406 }
407
408 for (;;) {
409 backpage = find_get_page(bmapping, netpage->index);
410 if (backpage)
411 goto backing_page_already_present;
412
413 if (!newpage) {
414 newpage = page_cache_alloc_cold(bmapping);
415 if (!newpage)
416 goto nomem;
417 }
418
419 ret = add_to_page_cache(newpage, bmapping,
420 netpage->index, GFP_KERNEL);
421 if (ret == 0)
422 goto installed_new_backing_page;
423 if (ret != -EEXIST)
424 goto nomem;
425 }
426
427 /* we've installed a new backing page, so now we need to add it
428 * to the LRU list and start it reading */
429 installed_new_backing_page:
430 _debug("- new %p", newpage);
431
432 backpage = newpage;
433 newpage = NULL;
434
435 page_cache_get(backpage);
436 if (!pagevec_add(&lru_pvec, backpage))
437 __pagevec_lru_add_file(&lru_pvec);
438
439 reread_backing_page:
440 ret = bmapping->a_ops->readpage(NULL, backpage);
441 if (ret < 0)
442 goto read_error;
443
444 /* add the netfs page to the pagecache and LRU, and set the
445 * monitor to transfer the data across */
446 monitor_backing_page:
447 _debug("- monitor add");
448
449 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
450 GFP_KERNEL);
451 if (ret < 0) {
452 if (ret == -EEXIST) {
453 page_cache_release(netpage);
454 continue;
455 }
456 goto nomem;
457 }
458
459 page_cache_get(netpage);
460 if (!pagevec_add(&lru_pvec, netpage))
461 __pagevec_lru_add_file(&lru_pvec);
462
463 /* install a monitor */
464 page_cache_get(netpage);
465 monitor->netfs_page = netpage;
466
467 page_cache_get(backpage);
468 monitor->back_page = backpage;
469 monitor->monitor.private = backpage;
470 add_page_wait_queue(backpage, &monitor->monitor);
471 monitor = NULL;
472
473 /* but the page may have been read before the monitor was
474 * installed, so the monitor may miss the event - so we have to
475 * ensure that we do get one in such a case */
476 if (trylock_page(backpage)) {
477 _debug("2unlock %p {%lx}", backpage, backpage->flags);
478 unlock_page(backpage);
479 }
480
481 page_cache_release(backpage);
482 backpage = NULL;
483
484 page_cache_release(netpage);
485 netpage = NULL;
486 continue;
487
488 /* if the backing page is already present, it can be in one of
489 * three states: read in progress, read failed or read okay */
490 backing_page_already_present:
491 _debug("- present %p", backpage);
492
493 if (PageError(backpage))
494 goto io_error;
495
496 if (PageUptodate(backpage))
497 goto backing_page_already_uptodate;
498
499 _debug("- not ready %p{%lx}", backpage, backpage->flags);
500
501 if (!trylock_page(backpage))
502 goto monitor_backing_page;
503
504 if (PageError(backpage)) {
505 _debug("error %lx", backpage->flags);
506 unlock_page(backpage);
507 goto io_error;
508 }
509
510 if (PageUptodate(backpage))
511 goto backing_page_already_uptodate_unlock;
512
513 /* we've locked a page that's neither up to date nor erroneous,
514 * so we need to attempt to read it again */
515 goto reread_backing_page;
516
517 /* the backing page is already up to date, attach the netfs
518 * page to the pagecache and LRU and copy the data across */
519 backing_page_already_uptodate_unlock:
520 _debug("uptodate %lx", backpage->flags);
521 unlock_page(backpage);
522 backing_page_already_uptodate:
523 _debug("- uptodate");
524
525 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
526 GFP_KERNEL);
527 if (ret < 0) {
528 if (ret == -EEXIST) {
529 page_cache_release(netpage);
530 continue;
531 }
532 goto nomem;
533 }
534
535 copy_highpage(netpage, backpage);
536
537 page_cache_release(backpage);
538 backpage = NULL;
539
540 if (!pagevec_add(mark_pvec, netpage))
541 fscache_mark_pages_cached(op, mark_pvec);
542
543 page_cache_get(netpage);
544 if (!pagevec_add(&lru_pvec, netpage))
545 __pagevec_lru_add_file(&lru_pvec);
546
547 fscache_end_io(op, netpage, 0);
548 page_cache_release(netpage);
549 netpage = NULL;
550 continue;
551 }
552
553 netpage = NULL;
554
555 _debug("out");
556
557out:
558 /* tidy up */
559 pagevec_lru_add_file(&lru_pvec);
560
561 if (newpage)
562 page_cache_release(newpage);
563 if (netpage)
564 page_cache_release(netpage);
565 if (backpage)
566 page_cache_release(backpage);
567 if (monitor) {
568 fscache_put_retrieval(op);
569 kfree(monitor);
570 }
571
572 list_for_each_entry_safe(netpage, _n, list, lru) {
573 list_del(&netpage->lru);
574 page_cache_release(netpage);
575 }
576
577 _leave(" = %d", ret);
578 return ret;
579
580nomem:
581 _debug("nomem");
582 ret = -ENOMEM;
583 goto out;
584
585read_error:
586 _debug("read error %d", ret);
587 if (ret == -ENOMEM)
588 goto out;
589io_error:
590 cachefiles_io_error_obj(object, "Page read error on backing file");
591 ret = -ENOBUFS;
592 goto out;
593}
594
595/*
596 * read a list of pages from the cache or allocate blocks in which to store
597 * them
598 */
599int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
600 struct list_head *pages,
601 unsigned *nr_pages,
602 gfp_t gfp)
603{
604 struct cachefiles_object *object;
605 struct cachefiles_cache *cache;
606 struct list_head backpages;
607 struct pagevec pagevec;
608 struct inode *inode;
609 struct page *page, *_n;
610 unsigned shift, nrbackpages;
611 int ret, ret2, space;
612
613 object = container_of(op->op.object,
614 struct cachefiles_object, fscache);
615 cache = container_of(object->fscache.cache,
616 struct cachefiles_cache, cache);
617
618 _enter("{OBJ%x,%d},,%d,,",
619 object->fscache.debug_id, atomic_read(&op->op.usage),
620 *nr_pages);
621
622 if (!object->backer)
623 return -ENOBUFS;
624
625 space = 1;
626 if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
627 space = 0;
628
629 inode = object->backer->d_inode;
630 ASSERT(S_ISREG(inode->i_mode));
631 ASSERT(inode->i_mapping->a_ops->bmap);
632 ASSERT(inode->i_mapping->a_ops->readpages);
633
634 /* calculate the shift required to use bmap */
635 if (inode->i_sb->s_blocksize > PAGE_SIZE)
636 return -ENOBUFS;
637
638 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
639
640 pagevec_init(&pagevec, 0);
641
642 op->op.flags = FSCACHE_OP_FAST;
643 op->op.processor = cachefiles_read_copier;
644
645 INIT_LIST_HEAD(&backpages);
646 nrbackpages = 0;
647
648 ret = space ? -ENODATA : -ENOBUFS;
649 list_for_each_entry_safe(page, _n, pages, lru) {
650 sector_t block0, block;
651
652 /* we assume the absence or presence of the first block is a
653 * good enough indication for the page as a whole
654 * - TODO: don't use bmap() for this as it is _not_ actually
655 * good enough for this as it doesn't indicate errors, but
656 * it's all we've got for the moment
657 */
658 block0 = page->index;
659 block0 <<= shift;
660
661 block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
662 block0);
663 _debug("%llx -> %llx",
664 (unsigned long long) block0,
665 (unsigned long long) block);
666
667 if (block) {
668 /* we have data - add it to the list to give to the
669 * backing fs */
670 list_move(&page->lru, &backpages);
671 (*nr_pages)--;
672 nrbackpages++;
673 } else if (space && pagevec_add(&pagevec, page) == 0) {
674 fscache_mark_pages_cached(op, &pagevec);
675 ret = -ENODATA;
676 }
677 }
678
679 if (pagevec_count(&pagevec) > 0)
680 fscache_mark_pages_cached(op, &pagevec);
681
682 if (list_empty(pages))
683 ret = 0;
684
685 /* submit the apparently valid pages to the backing fs to be read from
686 * disk */
687 if (nrbackpages > 0) {
688 ret2 = cachefiles_read_backing_file(object, op, &backpages,
689 &pagevec);
690 if (ret2 == -ENOMEM || ret2 == -EINTR)
691 ret = ret2;
692 }
693
694 if (pagevec_count(&pagevec) > 0)
695 fscache_mark_pages_cached(op, &pagevec);
696
697 _leave(" = %d [nr=%u%s]",
698 ret, *nr_pages, list_empty(pages) ? " empty" : "");
699 return ret;
700}
701
702/*
703 * allocate a block in the cache in which to store a page
704 * - cache withdrawal is prevented by the caller
705 * - returns -EINTR if interrupted
 * - returns -ENOMEM if we ran out of memory
707 * - returns -ENOBUFS if no buffers can be made available
708 * - returns -ENOBUFS if page is beyond EOF
709 * - otherwise:
710 * - the metadata will be retained
711 * - 0 will be returned
712 */
713int cachefiles_allocate_page(struct fscache_retrieval *op,
714 struct page *page,
715 gfp_t gfp)
716{
717 struct cachefiles_object *object;
718 struct cachefiles_cache *cache;
719 struct pagevec pagevec;
720 int ret;
721
722 object = container_of(op->op.object,
723 struct cachefiles_object, fscache);
724 cache = container_of(object->fscache.cache,
725 struct cachefiles_cache, cache);
726
727 _enter("%p,{%lx},", object, page->index);
728
729 ret = cachefiles_has_space(cache, 0, 1);
730 if (ret == 0) {
731 pagevec_init(&pagevec, 0);
732 pagevec_add(&pagevec, page);
733 fscache_mark_pages_cached(op, &pagevec);
734 } else {
735 ret = -ENOBUFS;
736 }
737
738 _leave(" = %d", ret);
739 return ret;
740}
741
742/*
743 * allocate blocks in the cache in which to store a set of pages
744 * - cache withdrawal is prevented by the caller
745 * - returns -EINTR if interrupted
 * - returns -ENOMEM if we ran out of memory
747 * - returns -ENOBUFS if some buffers couldn't be made available
748 * - returns -ENOBUFS if some pages are beyond EOF
749 * - otherwise:
750 * - -ENODATA will be returned
751 * - metadata will be retained for any page marked
752 */
753int cachefiles_allocate_pages(struct fscache_retrieval *op,
754 struct list_head *pages,
755 unsigned *nr_pages,
756 gfp_t gfp)
757{
758 struct cachefiles_object *object;
759 struct cachefiles_cache *cache;
760 struct pagevec pagevec;
761 struct page *page;
762 int ret;
763
764 object = container_of(op->op.object,
765 struct cachefiles_object, fscache);
766 cache = container_of(object->fscache.cache,
767 struct cachefiles_cache, cache);
768
769 _enter("%p,,,%d,", object, *nr_pages);
770
771 ret = cachefiles_has_space(cache, 0, *nr_pages);
772 if (ret == 0) {
773 pagevec_init(&pagevec, 0);
774
775 list_for_each_entry(page, pages, lru) {
776 if (pagevec_add(&pagevec, page) == 0)
777 fscache_mark_pages_cached(op, &pagevec);
778 }
779
780 if (pagevec_count(&pagevec) > 0)
781 fscache_mark_pages_cached(op, &pagevec);
782 ret = -ENODATA;
783 } else {
784 ret = -ENOBUFS;
785 }
786
787 _leave(" = %d", ret);
788 return ret;
789}
790
791/*
792 * request a page be stored in the cache
793 * - cache withdrawal is prevented by the caller
794 * - this request may be ignored if there's no cache block available, in which
795 * case -ENOBUFS will be returned
796 * - if the op is in progress, 0 will be returned
797 */
798int cachefiles_write_page(struct fscache_storage *op, struct page *page)
799{
800 struct cachefiles_object *object;
801 struct cachefiles_cache *cache;
802 mm_segment_t old_fs;
803 struct file *file;
804 loff_t pos;
805 void *data;
806 int ret;
807
808 ASSERT(op != NULL);
809 ASSERT(page != NULL);
810
811 object = container_of(op->op.object,
812 struct cachefiles_object, fscache);
813
814 _enter("%p,%p{%lx},,,", object, page, page->index);
815
816 if (!object->backer) {
817 _leave(" = -ENOBUFS");
818 return -ENOBUFS;
819 }
820
821 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
822
823 cache = container_of(object->fscache.cache,
824 struct cachefiles_cache, cache);
825
826 /* write the page to the backing filesystem and let it store it in its
827 * own time */
828 dget(object->backer);
829 mntget(cache->mnt);
830 file = dentry_open(object->backer, cache->mnt, O_RDWR,
831 cache->cache_cred);
832 if (IS_ERR(file)) {
833 ret = PTR_ERR(file);
834 } else {
835 ret = -EIO;
836 if (file->f_op->write) {
837 pos = (loff_t) page->index << PAGE_SHIFT;
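			/* (editor's note: illustrative arithmetic) with
			 * 4KiB pages, index 3 gives a byte offset of
			 * 3 << 12 = 12288 */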
838 data = kmap(page);
839 old_fs = get_fs();
840 set_fs(KERNEL_DS);
841 ret = file->f_op->write(
842 file, (const void __user *) data, PAGE_SIZE,
843 &pos);
844 set_fs(old_fs);
845 kunmap(page);
846 if (ret != PAGE_SIZE)
847 ret = -EIO;
848 }
849 fput(file);
850 }
851
852 if (ret < 0) {
853 if (ret == -EIO)
854 cachefiles_io_error_obj(
855 object, "Write page to backing file failed");
856 ret = -ENOBUFS;
857 }
858
859 _leave(" = %d", ret);
860 return ret;
861}
862
863/*
864 * detach a backing block from a page
865 * - cache withdrawal is prevented by the caller
866 */
867void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
868{
869 struct cachefiles_object *object;
870 struct cachefiles_cache *cache;
871
872 object = container_of(_object, struct cachefiles_object, fscache);
873 cache = container_of(object->fscache.cache,
874 struct cachefiles_cache, cache);
875
876 _enter("%p,{%lu}", object, page->index);
877
878 spin_unlock(&object->fscache.cookie->lock);
879}
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
new file mode 100644
index 000000000000..b5808cdb2232
--- /dev/null
+++ b/fs/cachefiles/security.c
@@ -0,0 +1,116 @@
1/* CacheFiles security management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/cred.h>
14#include "internal.h"
15
16/*
17 * determine the security context within which we access the cache from within
18 * the kernel
19 */
20int cachefiles_get_security_ID(struct cachefiles_cache *cache)
21{
22 struct cred *new;
23 int ret;
24
25 _enter("{%s}", cache->secctx);
26
27 new = prepare_kernel_cred(current);
28 if (!new) {
29 ret = -ENOMEM;
30 goto error;
31 }
32
33 if (cache->secctx) {
34 ret = set_security_override_from_ctx(new, cache->secctx);
35 if (ret < 0) {
36 put_cred(new);
37 printk(KERN_ERR "CacheFiles:"
38 " Security denies permission to nominate"
39 " security context: error %d\n",
40 ret);
41 goto error;
42 }
43 }
44
45 cache->cache_cred = new;
46 ret = 0;
47error:
48 _leave(" = %d", ret);
49 return ret;
50}
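/*
 * (editor's note: a minimal sketch of how the cache credentials set up above
 * are typically consumed, not part of the original commit; the function name
 * is hypothetical, and the body of the privileged section is a placeholder,
 * but the begin/end calls match their uses later in this file)
 */
#if 0	/* illustrative only - not compiled */
static int cachefiles_example_secure_op(struct cachefiles_cache *cache)
{
	const struct cred *saved_cred;
	int ret = 0;

	/* switch to the cache's security context... */
	cachefiles_begin_secure(cache, &saved_cred);

	/* ... perform filesystem operations as the cache identity ... */

	/* ... and restore the caller's credentials afterwards */
	cachefiles_end_secure(cache, saved_cred);
	return ret;
}
#endif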
51
52/*
53 * see if mkdir and create can be performed in the root directory
54 */
55static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
56 struct dentry *root)
57{
58 int ret;
59
60 ret = security_inode_mkdir(root->d_inode, root, 0);
61 if (ret < 0) {
62 printk(KERN_ERR "CacheFiles:"
63 " Security denies permission to make dirs: error %d",
64 ret);
65 return ret;
66 }
67
68 ret = security_inode_create(root->d_inode, root, 0);
69 if (ret < 0)
70 printk(KERN_ERR "CacheFiles:"
71 " Security denies permission to create files: error %d",
72 ret);
73
74 return ret;
75}
76
77/*
78 * check the security details of the on-disk cache
79 * - must be called with security override in force
80 */
81int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
82 struct dentry *root,
83 const struct cred **_saved_cred)
84{
85 struct cred *new;
86 int ret;
87
88 _enter("");
89
90 /* duplicate the cache creds for COW (the override is currently in
91 * force, so we can use prepare_creds() to do this) */
92 new = prepare_creds();
93 if (!new)
94 return -ENOMEM;
95
96 cachefiles_end_secure(cache, *_saved_cred);
97
98 /* use the cache root dir's security context as the basis with
 * which to create files */
100 ret = set_create_files_as(new, root->d_inode);
101 if (ret < 0) {
102 _leave(" = %d [cfa]", ret);
103 return ret;
104 }
105
106 put_cred(cache->cache_cred);
107 cache->cache_cred = new;
108
109 cachefiles_begin_secure(cache, _saved_cred);
110 ret = cachefiles_check_cache_dir(cache, root);
111
112 if (ret == -EOPNOTSUPP)
113 ret = 0;
114 _leave(" = %d", ret);
115 return ret;
116}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
new file mode 100644
index 000000000000..f3e7a0bf068b
--- /dev/null
+++ b/fs/cachefiles/xattr.c
@@ -0,0 +1,291 @@
1/* CacheFiles extended attribute management
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/file.h>
15#include <linux/fs.h>
16#include <linux/fsnotify.h>
17#include <linux/quotaops.h>
18#include <linux/xattr.h>
19#include "internal.h"
20
21static const char cachefiles_xattr_cache[] =
22 XATTR_USER_PREFIX "CacheFiles.cache";
23
24/*
25 * check the type label on an object
26 * - done using xattrs
27 */
28int cachefiles_check_object_type(struct cachefiles_object *object)
29{
30 struct dentry *dentry = object->dentry;
31 char type[3], xtype[3];
32 int ret;
33
34 ASSERT(dentry);
35 ASSERT(dentry->d_inode);
36
37 if (!object->fscache.cookie)
38 strcpy(type, "C3");
39 else
40 snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
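	/* (editor's note: illustrative values) a cookie type of 1 produces
	 * the label "01"; cookie-less objects get the fixed label "C3" */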
41
42 _enter("%p{%s}", object, type);
43
44 /* attempt to install a type label directly */
45 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
46 XATTR_CREATE);
47 if (ret == 0) {
48 _debug("SET"); /* we succeeded */
49 goto error;
50 }
51
52 if (ret != -EEXIST) {
53 kerror("Can't set xattr on %*.*s [%lu] (err %d)",
54 dentry->d_name.len, dentry->d_name.len,
55 dentry->d_name.name, dentry->d_inode->i_ino,
56 -ret);
57 goto error;
58 }
59
60 /* read the current type label */
61 ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
62 if (ret < 0) {
63 if (ret == -ERANGE)
64 goto bad_type_length;
65
66 kerror("Can't read xattr on %*.*s [%lu] (err %d)",
67 dentry->d_name.len, dentry->d_name.len,
68 dentry->d_name.name, dentry->d_inode->i_ino,
69 -ret);
70 goto error;
71 }
72
73 /* check the type is what we're expecting */
74 if (ret != 2)
75 goto bad_type_length;
76
77 if (xtype[0] != type[0] || xtype[1] != type[1])
78 goto bad_type;
79
80 ret = 0;
81
82error:
83 _leave(" = %d", ret);
84 return ret;
85
86bad_type_length:
87 kerror("Cache object %lu type xattr length incorrect",
88 dentry->d_inode->i_ino);
89 ret = -EIO;
90 goto error;
91
92bad_type:
93 xtype[2] = 0;
94 kerror("Cache object %*.*s [%lu] type %s not %s",
95 dentry->d_name.len, dentry->d_name.len,
96 dentry->d_name.name, dentry->d_inode->i_ino,
97 xtype, type);
98 ret = -EIO;
99 goto error;
100}
101
102/*
103 * set the state xattr on a cache file
104 */
105int cachefiles_set_object_xattr(struct cachefiles_object *object,
106 struct cachefiles_xattr *auxdata)
107{
108 struct dentry *dentry = object->dentry;
109 int ret;
110
111 ASSERT(object->fscache.cookie);
112 ASSERT(dentry);
113
114 _enter("%p,#%d", object, auxdata->len);
115
116 /* attempt to install the cache metadata directly */
117 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
118
119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
120 &auxdata->type, auxdata->len,
121 XATTR_CREATE);
122 if (ret < 0 && ret != -ENOMEM)
123 cachefiles_io_error_obj(
124 object,
125 "Failed to set xattr with error %d", ret);
126
127 _leave(" = %d", ret);
128 return ret;
129}
130
131/*
132 * update the state xattr on a cache file
133 */
134int cachefiles_update_object_xattr(struct cachefiles_object *object,
135 struct cachefiles_xattr *auxdata)
136{
137 struct dentry *dentry = object->dentry;
138 int ret;
139
140 ASSERT(object->fscache.cookie);
141 ASSERT(dentry);
142
143 _enter("%p,#%d", object, auxdata->len);
144
145 /* attempt to install the cache metadata directly */
146 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
147
148 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
149 &auxdata->type, auxdata->len,
150 XATTR_REPLACE);
151 if (ret < 0 && ret != -ENOMEM)
152 cachefiles_io_error_obj(
153 object,
154 "Failed to update xattr with error %d", ret);
155
156 _leave(" = %d", ret);
157 return ret;
158}
159
160/*
161 * check the state xattr on a cache file
162 * - return -ESTALE if the object should be deleted
163 */
164int cachefiles_check_object_xattr(struct cachefiles_object *object,
165 struct cachefiles_xattr *auxdata)
166{
167 struct cachefiles_xattr *auxbuf;
168 struct dentry *dentry = object->dentry;
169 int ret;
170
171 _enter("%p,#%d", object, auxdata->len);
172
173 ASSERT(dentry);
174 ASSERT(dentry->d_inode);
175
176 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
177 if (!auxbuf) {
178 _leave(" = -ENOMEM");
179 return -ENOMEM;
180 }
181
182 /* read the current type label */
183 ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
184 &auxbuf->type, 512 + 1);
185 if (ret < 0) {
186 if (ret == -ENODATA)
187 goto stale; /* no attribute - power went off
188 * mid-cull? */
189
190 if (ret == -ERANGE)
191 goto bad_type_length;
192
193 cachefiles_io_error_obj(object,
194 "Can't read xattr on %lu (err %d)",
195 dentry->d_inode->i_ino, -ret);
196 goto error;
197 }
198
199 /* check the on-disk object */
200 if (ret < 1)
201 goto bad_type_length;
202
203 if (auxbuf->type != auxdata->type)
204 goto stale;
205
206 auxbuf->len = ret;
207
208 /* consult the netfs */
209 if (object->fscache.cookie->def->check_aux) {
210 enum fscache_checkaux result;
211 unsigned int dlen;
212
213 dlen = auxbuf->len - 1;
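		/* (editor's note: clarifying comment) auxbuf->len counts the
		 * leading type byte plus the netfs's auxiliary data, so the
		 * data handed to check_aux is len - 1 bytes */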
214
215 _debug("checkaux %s #%u",
216 object->fscache.cookie->def->name, dlen);
217
218 result = fscache_check_aux(&object->fscache,
219 &auxbuf->data, dlen);
220
221 switch (result) {
222 /* entry okay as is */
223 case FSCACHE_CHECKAUX_OKAY:
224 goto okay;
225
226 /* entry requires update */
227 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
228 break;
229
230 /* entry requires deletion */
231 case FSCACHE_CHECKAUX_OBSOLETE:
232 goto stale;
233
234 default:
235 BUG();
236 }
237
238 /* update the current label */
239 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
240 &auxdata->type, auxdata->len,
241 XATTR_REPLACE);
242 if (ret < 0) {
243 cachefiles_io_error_obj(object,
244 "Can't update xattr on %lu"
245 " (error %d)",
246 dentry->d_inode->i_ino, -ret);
247 goto error;
248 }
249 }
250
251okay:
252 ret = 0;
253
254error:
255 kfree(auxbuf);
256 _leave(" = %d", ret);
257 return ret;
258
259bad_type_length:
260 kerror("Cache object %lu xattr length incorrect",
261 dentry->d_inode->i_ino);
262 ret = -EIO;
263 goto error;
264
265stale:
266 ret = -ESTALE;
267 goto error;
268}
269
270/*
271 * remove the object's xattr to mark it stale
272 */
273int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
274 struct dentry *dentry)
275{
276 int ret;
277
278 ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
279 if (ret < 0) {
280 if (ret == -ENOENT || ret == -ENODATA)
281 ret = 0;
282 else if (ret != -ENOMEM)
283 cachefiles_io_error(cache,
284 "Can't remove xattr from %lu"
285 " (error %d)",
286 dentry->d_inode->i_ino, -ret);
287 }
288
289 _leave(" = %d", ret);
290 return ret;
291}
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 851388fafc73..65984006192c 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -6,7 +6,16 @@ the server to treat subsequent connections, especially those that
 are authenticated as guest, as reconnections, invalidating the earlier
 user's smb session. This fix allows cifs to mount multiple times to the
 same server with different userids without risking invalidating earlier
-established security contexts.
+established security contexts. fsync now sends SMB Flush operation
+to better ensure that we wait for server to write all of the data to
+server disk (not just write it over the network). Add new mount
+parameter to allow user to disable sending the (slow) SMB flush on
+fsync if desired (fsync still flushes all cached write data to the server).
+Posix file open support added (turned off after one attempt if server
+fails to support it properly, as with Samba server versions prior to 3.3.2)
+Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
+little memory for the "nativeFileSystem" field returned by the server
+during mount).
 
 Version 1.56
 ------------
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 341a98965bd0..6994a0f54f02 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -118,6 +118,18 @@ config CIFS_DEBUG2
	  option can be turned off unless you are debugging
	  cifs problems. If unsure, say N.
 
+config CIFS_DFS_UPCALL
+	bool "DFS feature support"
+	depends on CIFS && KEYS
+	help
+	  Distributed File System (DFS) support is used to access shares
+	  transparently in an enterprise name space, even if the share
+	  moves to a different server. This feature also enables
+	  an upcall mechanism for CIFS which contacts userspace helper
+	  utilities to provide server name resolution (host names to
+	  IP addresses) which is needed for implicit mounts of DFS junction
+	  points. If unsure, say N.
+
 config CIFS_EXPERIMENTAL
 	bool "CIFS Experimental Features (EXPERIMENTAL)"
 	depends on CIFS && EXPERIMENTAL
@@ -131,12 +143,3 @@ config CIFS_EXPERIMENTAL
	  (which is disabled by default). See the file fs/cifs/README
	  for more details. If unsure, say N.
 
-config CIFS_DFS_UPCALL
-	bool "DFS feature support (EXPERIMENTAL)"
-	depends on CIFS_EXPERIMENTAL
-	depends on KEYS
-	help
-	  Enables an upcall mechanism for CIFS which contacts userspace
-	  helper utilities to provide server name resolution (host names to
-	  IP addresses) which is needed for implicit mounts of DFS junction
-	  points. If unsure, say N.
diff --git a/fs/cifs/README b/fs/cifs/README
index da4515e3be20..07434181623b 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -472,6 +472,19 @@ A partial list of the supported mount options follows:
 		even if the cifs server would support posix advisory locks.
 		"forcemand" is accepted as a shorter form of this mount
 		option.
+ nostrictsync   If this mount option is set, when an application does an
+		fsync call then the cifs client does not send an SMB Flush
+		to the server (to force the server to write all dirty data
+		for this file immediately to disk), although cifs still sends
+		all dirty (cached) file data to the server and waits for the
+		server to respond to the write. Since SMB Flush can be
+		very slow, and some servers may be reliable enough (to risk
+		delaying slightly flushing the data to disk on the server),
+		turning on this option may be useful to improve performance for
+		applications that fsync too much, at a small risk of server
+		crash. If this mount option is not set, by default cifs will
+		send an SMB flush request (and wait for a response) on every
+		fsync call.
  nodfs          Disable DFS (global name space support) even if the
 		server claims to support it. This can help work around
 		a problem with parsing of DFS paths with Samba server
@@ -692,13 +705,14 @@ require this helper. Note that NTLMv2 security (which does not require the
 cifs.upcall helper program), instead of using Kerberos, is sufficient for
 some use cases.
 
-Enabling DFS support (used to access shares transparently in an MS-DFS
-global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. In
-addition, DFS support for target shares which are specified as UNC
+DFS support allows transparent redirection to shares in an MS-DFS name space.
+In addition, DFS support for target shares which are specified as UNC
 names which begin with host names (rather than IP addresses) requires
 a user space helper (such as cifs.upcall) to be present in order to
 translate host names to ip address, and the user space helper must also
-be configured in the file /etc/request-key.conf
+be configured in the file /etc/request-key.conf. Samba, Windows servers and
+many NAS appliances support DFS as a way of constructing a global name
+space to ease network configuration and improve reliability.
 
 To use cifs Kerberos and DFS support, the Linux keyutils package should be
 installed and something like the following lines should be added to the
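The nostrictsync text above comes down to a single decision in the client's fsync path. A condensed sketch of that logic (not the exact kernel code; CIFS_MOUNT_NOSSYNC, CIFSSMBFlush() and the cifsFileInfo netfid all come from hunks later in this patch):

	/* Sketch of the fsync behavior described above: write-back of dirty
	 * pages always happens; only the extra SMB Flush (asking the server
	 * to commit to disk) is skipped when nostrictsync is set. */
	static int cifs_fsync_sketch(struct file *file, int xid)
	{
		struct inode *inode = file->f_path.dentry->d_inode;
		struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
		struct cifsFileInfo *smbfile = file->private_data;
		int rc = filemap_fdatawrite(inode->i_mapping); /* send cached writes */

		if (rc == 0 && !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
			rc = CIFSSMBFlush(xid, cifs_sb->tcon, smbfile->netfid);
		return rc;
	}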
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 490e34bbf27a..7f19fefd3d45 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -340,6 +340,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 			seq_printf(m, "\nWrites: %d Bytes: %lld",
 				   atomic_read(&tcon->num_writes),
 				   (long long)(tcon->bytes_written));
+			seq_printf(m, "\nFlushes: %d",
+				   atomic_read(&tcon->num_flushes));
 			seq_printf(m, "\nLocks: %d HardLinks: %d "
 				      "Symlinks: %d",
 				   atomic_read(&tcon->num_locks),
@@ -402,7 +404,6 @@ cifs_proc_init(void)
 	if (proc_fs_cifs == NULL)
 		return;
 
-	proc_fs_cifs->owner = THIS_MODULE;
 	proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops);
 
 #ifdef CONFIG_CIFS_STATS
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 85c0a74d034d..5fdbf8a14472 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -104,9 +104,9 @@ static char *cifs_get_share_name(const char *node_name)
 
 
 /**
- * compose_mount_options - creates mount options for referral
+ * cifs_compose_mount_options - creates mount options for referral
  * @sb_mountdata: parent/root DFS mount options (template)
- * @dentry: point where we are going to mount
+ * @fullpath: full path in UNC format
  * @ref: server's referral
  * @devname: pointer for saving device name
 *
@@ -116,8 +116,8 @@ static char *cifs_get_share_name(const char *node_name)
 * Returns: pointer to new mount options or ERR_PTR.
 * Caller is responsible for freeing returned value if it is not error.
 */
-static char *compose_mount_options(const char *sb_mountdata,
-				   struct dentry *dentry,
+char *cifs_compose_mount_options(const char *sb_mountdata,
+				 const char *fullpath,
 				   const struct dfs_info3_param *ref,
 				   char **devname)
 {
@@ -128,7 +128,6 @@ static char *compose_mount_options(const char *sb_mountdata,
 	char *srvIP = NULL;
 	char sep = ',';
 	int off, noff;
-	char *fullpath;
 
 	if (sb_mountdata == NULL)
 		return ERR_PTR(-EINVAL);
@@ -202,17 +201,6 @@ static char *compose_mount_options(const char *sb_mountdata,
 		goto compose_mount_options_err;
 	}
 
-	/*
-	 * this function gives us a path with a double backslash prefix. We
-	 * require a single backslash for DFS. Temporarily increment fullpath
-	 * to put it in the proper form and decrement before freeing it.
-	 */
-	fullpath = build_path_from_dentry(dentry);
-	if (!fullpath) {
-		rc = -ENOMEM;
-		goto compose_mount_options_err;
-	}
-	++fullpath;
 	tkn_e = strchr(tkn_e + 1, '\\');
 	if (tkn_e || (strlen(fullpath) - ref->path_consumed)) {
 		strncat(mountdata, &sep, 1);
@@ -221,8 +209,6 @@ static char *compose_mount_options(const char *sb_mountdata,
 		strcat(mountdata, tkn_e + 1);
 		strcat(mountdata, fullpath + ref->path_consumed);
 	}
-	--fullpath;
-	kfree(fullpath);
 
 	/*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
 	/*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
@@ -245,10 +231,20 @@ static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
 	struct vfsmount *mnt;
 	char *mountdata;
 	char *devname = NULL;
+	char *fullpath;
 
 	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
-	mountdata = compose_mount_options(cifs_sb->mountdata,
-						dentry, ref, &devname);
+	/*
+	 * this function gives us a path with a double backslash prefix. We
+	 * require a single backslash for DFS.
+	 */
+	fullpath = build_path_from_dentry(dentry);
+	if (!fullpath)
+		return ERR_PTR(-ENOMEM);
+
+	mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
+			fullpath + 1, ref, &devname);
+	kfree(fullpath);
 
 	if (IS_ERR(mountdata))
 		return (struct vfsmount *)mountdata;
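The comment moved into cifs_dfs_do_refmount() above is the heart of this change: build_path_from_dentry() yields a path with a double backslash prefix, and passing fullpath + 1 hands the DFS code the single-backslash form without the old increment/decrement dance. A standalone toy illustration (ordinary userspace C, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		/* stand-in for what build_path_from_dentry() returns */
		const char *fullpath = "\\\\subdir\\file";

		printf("as built: %s\n", fullpath);	/* \\subdir\file */
		printf("for DFS:  %s\n", fullpath + 1);	/* \subdir\file  */
		return 0;
	}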
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index c4c306f7b06f..4797787c6a44 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -32,6 +32,7 @@
 #define CIFS_MOUNT_OVERR_GID	0x800 /* override gid returned from server */
 #define CIFS_MOUNT_DYNPERM	0x1000 /* allow in-memory only mode setting */
 #define CIFS_MOUNT_NOPOSIXBRL	0x2000 /* mandatory not posix byte range lock */
+#define CIFS_MOUNT_NOSSYNC	0x4000 /* don't do slow SMBflush on every sync*/
 
 struct cifs_sb_info {
 	struct cifsTconInfo *tcon;	/* primary mount */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 13ea53251dcf..38491fd3871d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -606,7 +606,8 @@ cifs_get_sb(struct file_system_type *fs_type,
 		return rc;
 	}
 	sb->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 }
 
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2b1d28a9ee28..77e190dc2883 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -78,8 +78,8 @@ extern int cifs_dir_open(struct inode *inode, struct file *file);
 extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
 
 /* Functions related to dir entries */
-extern struct dentry_operations cifs_dentry_ops;
-extern struct dentry_operations cifs_ci_dentry_ops;
+extern const struct dentry_operations cifs_dentry_ops;
+extern const struct dentry_operations cifs_ci_dentry_ops;
 
 /* Functions related to symlinks */
 extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e004f6db5fc8..9fbf4dff5da6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -254,6 +254,7 @@ struct cifsTconInfo {
 	atomic_t num_smbs_sent;
 	atomic_t num_writes;
 	atomic_t num_reads;
+	atomic_t num_flushes;
 	atomic_t num_oplock_brks;
 	atomic_t num_opens;
 	atomic_t num_closes;
@@ -298,6 +299,7 @@ struct cifsTconInfo {
 	bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol
 				for this mount even if server would support */
 	bool local_lease:1; /* check leases (only) on local system not remote */
+	bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
 	bool need_reconnect:1; /* connection reset, tid now invalid */
 /* BB add field for back pointer to sb struct(s)? */
 };
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b4e2e9f0ee3d..b370489c8da5 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1,7 +1,7 @@
 /*
  * fs/cifs/cifspdu.h
  *
- * Copyright (c) International Business Machines  Corp., 2002,2008
+ * Copyright (c) International Business Machines  Corp., 2002,2009
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  * This library is free software; you can redistribute it and/or modify
@@ -23,6 +23,7 @@
 #define _CIFSPDU_H
 
 #include <net/sock.h>
+#include "smbfsctl.h"
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 #define LANMAN_PROT 0
@@ -34,15 +35,15 @@
 #define POSIX_PROT  (CIFS_PROT+1)
 #define BAD_PROT 0xFFFF
 
-/* SMB command codes */
-/*
- * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+/* SMB command codes:
+ * Note some commands have minimal (wct=0,bcc=0), or uninteresting, responses
  * (ie which include no useful data other than the SMB error code itself).
- * Knowing this helps avoid response buffer allocations and copy in some cases
+ * This can allow us to avoid response buffer allocations and copy in some cases
  */
 #define SMB_COM_CREATE_DIRECTORY      0x00 /* trivial response */
 #define SMB_COM_DELETE_DIRECTORY      0x01 /* trivial response */
 #define SMB_COM_CLOSE                 0x04 /* triv req/rsp, timestamp ignored */
+#define SMB_COM_FLUSH                 0x05 /* triv req/rsp */
 #define SMB_COM_DELETE                0x06 /* trivial response */
 #define SMB_COM_RENAME                0x07 /* trivial response */
 #define SMB_COM_QUERY_INFORMATION     0x08 /* aka getattr */
@@ -790,6 +791,12 @@ typedef struct smb_com_close_rsp {
 	__u16 ByteCount;	/* bct = 0 */
 } __attribute__((packed)) CLOSE_RSP;
 
+typedef struct smb_com_flush_req {
+	struct smb_hdr hdr;	/* wct = 1 */
+	__u16 FileID;
+	__u16 ByteCount;	/* 0 */
+} __attribute__((packed)) FLUSH_REQ;
+
 typedef struct smb_com_findclose_req {
 	struct smb_hdr hdr; /* wct = 1 */
 	__u16 FileID;
@@ -1924,19 +1931,19 @@ typedef struct smb_com_transaction2_get_dfs_refer_req {
 #define DFS_TYPE_ROOT 0x0001
 
 /* Referral Entry Flags */
-#define DFS_NAME_LIST_REF 0x0200
+#define DFS_NAME_LIST_REF 0x0200 /* set for domain or DC referral responses */
+#define DFS_TARGET_SET_BOUNDARY 0x0400 /* only valid with version 4 dfs req */
 
-typedef struct dfs_referral_level_3 {
-	__le16 VersionNumber;
+typedef struct dfs_referral_level_3 { /* version 4 is same, + one flag bit */
+	__le16 VersionNumber;  /* must be 3 or 4 */
 	__le16 Size;
 	__le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */
-	__le16 ReferralEntryFlags; /* 0x0200 bit set only for domain
-				      or DC referral responce */
+	__le16 ReferralEntryFlags;
 	__le32 TimeToLive;
 	__le16 DfsPathOffset;
 	__le16 DfsAlternatePathOffset;
 	__le16 NetworkAddressOffset; /* offset of the link target */
-	__le16 ServiceSiteGuid;
+	__u8   ServiceSiteGuid[16];  /* MBZ, ignored */
 } __attribute__((packed)) REFERRAL3;
 
 typedef struct smb_com_transaction_get_dfs_refer_rsp {
@@ -1946,48 +1953,15 @@ typedef struct smb_com_transaction_get_dfs_refer_rsp {
 	__u8 Pad;
 	__le16 PathConsumed;
 	__le16 NumberOfReferrals;
-	__le16 DFSFlags;
-	__u16 Pad2;
+	__le32 DFSFlags;
 	REFERRAL3 referrals[1];	/* array of level 3 dfs_referral structures */
 	/* followed by the strings pointed to by the referral structures */
 } __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_RSP;
 
 /* DFS Flags */
-#define DFSREF_REFERRAL_SERVER  0x0001
-#define DFSREF_STORAGE_SERVER   0x0002
-
-/* IOCTL information */
-/*
- * List of ioctl function codes that look to be of interest to remote clients
- * like this one. Need to do some experimentation to make sure they all work
- * remotely. Some of the following, such as the encryption/compression ones
- * would be invoked from tools via a specialized hook into the VFS rather
- * than via the standard vfs entry points
- */
-#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
-#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
-#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
-#define FSCTL_LOCK_VOLUME 0x00090018
-#define FSCTL_UNLOCK_VOLUME 0x0009001C
-#define FSCTL_GET_COMPRESSION 0x0009003C
-#define FSCTL_SET_COMPRESSION 0x0009C040
-#define FSCTL_REQUEST_FILTER_OPLOCK 0x0009008C
-#define FSCTL_FILESYS_GET_STATISTICS 0x00090090
-#define FSCTL_SET_REPARSE_POINT 0x000900A4
-#define FSCTL_GET_REPARSE_POINT 0x000900A8
-#define FSCTL_DELETE_REPARSE_POINT 0x000900AC
-#define FSCTL_SET_SPARSE 0x000900C4
-#define FSCTL_SET_ZERO_DATA 0x000900C8
-#define FSCTL_SET_ENCRYPTION 0x000900D7
-#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB
-#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF
-#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3
-#define FSCTL_SIS_COPYFILE 0x00090100
-#define FSCTL_SIS_LINK_FILES 0x0009C104
-
-#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
-#define IO_REPARSE_TAG_HSM 0xC0000004
-#define IO_REPARSE_TAG_SIS 0x80000007
+#define DFSREF_REFERRAL_SERVER  0x00000001 /* all targets are DFS roots */
+#define DFSREF_STORAGE_SERVER   0x00000002 /* no further ref requests needed */
+#define DFSREF_TARGET_FAILBACK  0x00000004 /* only for DFS referral version 4 */
 
 /*
  ************************************************************************
@@ -2508,8 +2482,6 @@ struct data_blob {
 	6) Use nanosecond timestamps throughout all time fields if
 	   corresponding attribute flag is set
 	7) sendfile - handle based copy
-	8) Direct i/o
-	9) Misc fcntls?
 
 	what about fixing 64 bit alignment
 
@@ -2628,7 +2600,5 @@ typedef struct file_chattr_info {
 	__le64	mode; /* list of actual attribute bits on this inode */
 } __attribute__((packed)) FILE_CHATTR_INFO;  /* ext attributes
 					     (chattr, chflags) level 0x206 */
-
-#endif
-
+#endif /* POSIX */
 #endif /* _CIFSPDU_H */
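For reference, the FLUSH_REQ added above is about as small as an SMB request gets: one parameter word (the file id) and no data bytes. A userspace model of the body that follows the fixed SMB header (assumed layout for illustration; fields are little-endian on the wire):

	#include <stdint.h>

	/* Body of SMB_COM_FLUSH after the smb_hdr: wct = 1, bcc = 0. */
	struct flush_req_body {
		uint8_t  word_count;	/* 1: one parameter word follows */
		uint16_t file_id;	/* netfid of the file the server should flush */
		uint16_t byte_count;	/* 0: no data area */
	} __attribute__((packed));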
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 083dfc57c7a3..4167716d32f2 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -44,6 +44,9 @@ extern void _FreeXid(unsigned int);
 extern char *build_path_from_dentry(struct dentry *);
 extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
+extern char *cifs_compose_mount_options(const char *sb_mountdata,
+		const char *fullpath, const struct dfs_info3_param *ref,
+		char **devname);
 /* extern void renew_parental_timestamps(struct dentry *direntry);*/
 extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
 			struct smb_hdr * /* input */ ,
@@ -92,6 +95,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
 extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
 
+extern int cifs_posix_open(char *full_path, struct inode **pinode,
+			   struct super_block *sb, int mode, int oflags,
+			   int *poplock, __u16 *pnetfid, int xid);
 extern void posix_fill_in_inode(struct inode *tmp_inode,
 			FILE_UNIX_BASIC_INFO *pData, int isNewInode);
 extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
@@ -281,6 +287,9 @@ extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
 			const int smb_file_id);
 
+extern int CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon,
+			const int smb_file_id);
+
 extern int CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
 			const int netfid, unsigned int count,
 			const __u64 lseek, unsigned int *nbytes, char **buf,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 939e2f76b959..bc09c998631f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1934,6 +1934,27 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 }
 
 int
+CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
+{
+	int rc = 0;
+	FLUSH_REQ *pSMB = NULL;
+	cFYI(1, ("In CIFSSMBFlush"));
+
+	rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
+	if (rc)
+		return rc;
+
+	pSMB->FileID = (__u16) smb_file_id;
+	pSMB->ByteCount = 0;
+	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+	cifs_stats_inc(&tcon->num_flushes);
+	if (rc)
+		cERROR(1, ("Send error in Flush = %d", rc));
+
+	return rc;
+}
+
+int
 CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
 	      const char *fromName, const char *toName,
 	      const struct nls_table *nls_codepage, int remap)
@@ -2356,8 +2377,10 @@ winCreateHardLinkRetry:
 					 PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-		pSMB->OldFileName[name_len] = 0;	/* pad */
-		pSMB->OldFileName[name_len + 1] = 0x04;
+
+		/* protocol specifies ASCII buffer format (0x04) for unicode */
+		pSMB->OldFileName[name_len] = 0x04;
+		pSMB->OldFileName[name_len + 1] = 0x00;	/* pad */
 		name_len2 =
 		    cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
 				     toName, PATH_MAX, nls_codepage, remap);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index da0f4ffa0613..0de3b5615a22 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -95,6 +95,7 @@ struct smb_vol {
 	bool local_lease:1; /* check leases only on local system, not remote */
 	bool noblocksnd:1;
 	bool noautotune:1;
+	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
 	unsigned int rsize;
 	unsigned int wsize;
 	unsigned int sockopt;
@@ -1274,6 +1275,10 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->intr = 0;
 		} else if (strnicmp(data, "intr", 4) == 0) {
 			vol->intr = 1;
+		} else if (strnicmp(data, "nostrictsync", 12) == 0) {
+			vol->nostrictsync = 1;
+		} else if (strnicmp(data, "strictsync", 10) == 0) {
+			vol->nostrictsync = 0;
 		} else if (strnicmp(data, "serverino", 7) == 0) {
 			vol->server_ino = 1;
 		} else if (strnicmp(data, "noserverino", 9) == 0) {
@@ -2160,6 +2165,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
 		if (pvolume_info->nobrl)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+		if (pvolume_info->nostrictsync)
+			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
 		if (pvolume_info->mand_lock)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
 		if (pvolume_info->cifs_acl)
@@ -3667,7 +3674,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		    BCC(smb_buffer_response)) {
 			kfree(tcon->nativeFileSystem);
 			tcon->nativeFileSystem =
-			    kzalloc(length + 2, GFP_KERNEL);
+			    kzalloc(2*(length + 1), GFP_KERNEL);
 			if (tcon->nativeFileSystem)
 				cifs_strfromUCS_le(
 					tcon->nativeFileSystem,
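The kzalloc change above is the "redzone overwritten" fix mentioned in the CHANGES entry. A rough model of why: if length counts the UCS-2 code points of the server-supplied name, a multibyte local charset can emit more than one byte per code point, so a length + 2 output buffer can be overrun while 2*(length + 1) leaves room for two bytes per code point plus a NUL. Illustrative arithmetic only, under that assumed meaning of length (the kernel converts through its nls tables):

	#include <stdio.h>

	int main(void)
	{
		unsigned int codepoints = 10;	/* length of the UCS-2 name */
		unsigned int two_byte_out = 2 * codepoints + 1;	/* 21: e.g. 2-byte codepage */
		unsigned int old_bound = codepoints + 2;	/* 12: can be overrun */
		unsigned int new_bound = 2 * (codepoints + 1);	/* 22: fits */

		printf("may need %u, old buffer %u, new buffer %u\n",
		       two_byte_out, old_bound, new_bound);
		return 0;
	}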
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 89fb72832652..54dce78fbb73 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,7 +129,7 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
-static int cifs_posix_open(char *full_path, struct inode **pinode,
+int cifs_posix_open(char *full_path, struct inode **pinode,
 		    struct super_block *sb, int mode, int oflags,
 		    int *poplock, __u16 *pnetfid, int xid)
 {
@@ -187,7 +187,9 @@ static int cifs_posix_open(char *full_path, struct inode **pinode,
 	if (!pinode)
 		goto posix_open_ret; /* caller does not need info */
 
-	*pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+	if (*pinode == NULL)
+		*pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+	/* else an inode was passed in. Update its info, don't create one */
 
 	/* We do not need to close the file if new_inode fails since
 	   the caller will retry qpathinfo as long as inode is null */
@@ -252,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		return -ENOMEM;
 	}
 
-	mode &= ~current->fs->umask;
+	mode &= ~current_umask();
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
 
@@ -477,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 		rc = -ENOMEM;
 	else if (pTcon->unix_ext) {
 		struct cifs_unix_set_info_args args = {
-			.mode	= mode & ~current->fs->umask,
+			.mode	= mode & ~current_umask(),
 			.ctime	= NO_CHANGE_64,
 			.atime	= NO_CHANGE_64,
 			.mtime	= NO_CHANGE_64,
@@ -699,7 +701,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 	return rc;
 }	*/
 
-struct dentry_operations cifs_dentry_ops = {
+const struct dentry_operations cifs_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };
@@ -737,7 +739,7 @@ static int cifs_ci_compare(struct dentry *dentry, struct qstr *a,
 	return 1;
 }
 
-struct dentry_operations cifs_ci_dentry_ops = {
+const struct dentry_operations cifs_ci_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
 	.d_hash = cifs_ci_hash,
 	.d_compare = cifs_ci_compare,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 12bb656fbe75..81747acca4c4 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -78,8 +78,36 @@ static inline int cifs_convert_flags(unsigned int flags)
 	return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES |
 		FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA |
 		FILE_READ_DATA);
+}
 
+static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
+{
+	fmode_t posix_flags = 0;
 
+	if ((flags & O_ACCMODE) == O_RDONLY)
+		posix_flags = FMODE_READ;
+	else if ((flags & O_ACCMODE) == O_WRONLY)
+		posix_flags = FMODE_WRITE;
+	else if ((flags & O_ACCMODE) == O_RDWR) {
+		/* GENERIC_ALL is too much permission to request
+		   can cause unnecessary access denied on create */
+		/* return GENERIC_ALL; */
+		posix_flags = FMODE_READ | FMODE_WRITE;
+	}
+	/* can not map O_CREAT or O_EXCL or O_TRUNC flags when
+	   reopening a file. They had their effect on the original open */
+	if (flags & O_APPEND)
+		posix_flags |= (fmode_t)O_APPEND;
+	if (flags & O_SYNC)
+		posix_flags |= (fmode_t)O_SYNC;
+	if (flags & O_DIRECTORY)
+		posix_flags |= (fmode_t)O_DIRECTORY;
+	if (flags & O_NOFOLLOW)
+		posix_flags |= (fmode_t)O_NOFOLLOW;
+	if (flags & O_DIRECT)
+		posix_flags |= (fmode_t)O_DIRECT;
+
+	return posix_flags;
 }
 
 static inline int cifs_get_disposition(unsigned int flags)
@@ -97,6 +125,80 @@ static inline int cifs_get_disposition(unsigned int flags)
 }
 
 /* all arguments to this function must be checked for validity in caller */
+static inline int cifs_posix_open_inode_helper(struct inode *inode,
+			struct file *file, struct cifsInodeInfo *pCifsInode,
+			struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+/*	struct timespec temp; */   /* BB REMOVEME BB */
+
+	file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+	if (file->private_data == NULL)
+		return -ENOMEM;
+	pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
+	write_lock(&GlobalSMBSeslock);
+	list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
+
+	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
+	if (pCifsInode == NULL) {
+		write_unlock(&GlobalSMBSeslock);
+		return -EINVAL;
+	}
+
+	/* want handles we can use to read with first
+	   in the list so we do not have to walk the
+	   list to search for one in write_begin */
+	if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
+		list_add_tail(&pCifsFile->flist,
+			      &pCifsInode->openFileList);
+	} else {
+		list_add(&pCifsFile->flist,
+			 &pCifsInode->openFileList);
+	}
+
+	if (pCifsInode->clientCanCacheRead) {
+		/* we have the inode open somewhere else
+		   no need to discard cache data */
+		goto psx_client_can_cache;
+	}
+
+	/* BB FIXME need to fix this check to move it earlier into posix_open
+	   BB  fIX following section BB FIXME */
+
+	/* if not oplocked, invalidate inode pages if mtime or file
+	   size changed */
+/*	temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
+	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
+			   (file->f_path.dentry->d_inode->i_size ==
+			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
+		cFYI(1, ("inode unchanged on server"));
+	} else {
+		if (file->f_path.dentry->d_inode->i_mapping) {
+			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
+			if (rc != 0)
+				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
+		}
+		cFYI(1, ("invalidating remote inode since open detected it "
+			 "changed"));
+		invalidate_remote_inode(file->f_path.dentry->d_inode);
+	} */
+
+psx_client_can_cache:
+	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+		pCifsInode->clientCanCacheAll = true;
+		pCifsInode->clientCanCacheRead = true;
+		cFYI(1, ("Exclusive Oplock granted on inode %p",
+			 file->f_path.dentry->d_inode));
+	} else if ((oplock & 0xF) == OPLOCK_READ)
+		pCifsInode->clientCanCacheRead = true;
+
+	/* will have to change the unlock if we reenable the
+	   filemap_fdatawrite (which does not seem necessary */
+	write_unlock(&GlobalSMBSeslock);
+	return 0;
+}
+
+/* all arguments to this function must be checked for validity in caller */
 static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
 	struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
@@ -167,7 +269,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	int rc = -EACCES;
 	int xid, oplock;
 	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
 	struct list_head *tmp;
@@ -180,7 +282,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tcon = cifs_sb->tcon;
 
 	if (file->f_flags & O_CREAT) {
 		/* search inode for this file and fill in file->private_data */
@@ -220,6 +322,45 @@ int cifs_open(struct inode *inode, struct file *file)
 
 	cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
 		 inode, file->f_flags, full_path));
+
+	if (oplockEnabled)
+		oplock = REQ_OPLOCK;
+	else
+		oplock = 0;
+
+	if (!tcon->broken_posix_open && tcon->unix_ext &&
+	    (tcon->ses->capabilities & CAP_UNIX) &&
+	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+		/* can not refresh inode info since size could be stale */
+		rc = cifs_posix_open(full_path, &inode, inode->i_sb,
+				     cifs_sb->mnt_file_mode /* ignored */,
+				     oflags, &oplock, &netfid, xid);
+		if (rc == 0) {
+			cFYI(1, ("posix open succeeded"));
+			/* no need for special case handling of setting mode
+			   on read only files needed here */
+
+			cifs_posix_open_inode_helper(inode, file, pCifsInode,
+						     pCifsFile, oplock, netfid);
+			goto out;
+		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+			if (tcon->ses->serverNOS)
+				cERROR(1, ("server %s of type %s returned"
+					   " unexpected error on SMB posix open"
+					   ", disabling posix open support."
+					   " Check if server update available.",
+					   tcon->ses->serverName,
+					   tcon->ses->serverNOS));
+			tcon->broken_posix_open = true;
+		} else if ((rc != -EIO) && (rc != -EREMOTE) &&
+			 (rc != -EOPNOTSUPP)) /* path not found or net err */
+			goto out;
+		/* else fallthrough to retry open the old way on network i/o
+		   or DFS errors */
+	}
+
 	desiredAccess = cifs_convert_flags(file->f_flags);
 
 /*********************************************************************
@@ -248,11 +389,6 @@ int cifs_open(struct inode *inode, struct file *file)
 
 	disposition = cifs_get_disposition(file->f_flags);
 
-	if (oplockEnabled)
-		oplock = REQ_OPLOCK;
-	else
-		oplock = 0;
-
 	/* BB pass O_SYNC flag through on file attributes .. BB */
 
 	/* Also refresh inode by passing in file_info buf returned by SMBOpen
@@ -269,7 +405,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	}
 
 	if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
-		rc = CIFSSMBOpen(xid, pTcon, full_path, disposition,
+		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
 			desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
 			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
 				& CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -278,7 +414,7 @@ int cifs_open(struct inode *inode, struct file *file)
 
 	if (rc == -EIO) {
 		/* Old server, try legacy style OpenX */
-		rc = SMBLegacyOpen(xid, pTcon, full_path, disposition,
+		rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
 			desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
 			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
 				& CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -295,12 +431,12 @@ int cifs_open(struct inode *inode, struct file *file)
 	}
 	pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
 	write_lock(&GlobalSMBSeslock);
-	list_add(&pCifsFile->tlist, &pTcon->openFileList);
+	list_add(&pCifsFile->tlist, &tcon->openFileList);
 
 	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
 	if (pCifsInode) {
 		rc = cifs_open_inode_helper(inode, file, pCifsInode,
-					    pCifsFile, pTcon,
+					    pCifsFile, tcon,
 					    &oplock, buf, full_path, xid);
 	} else {
 		write_unlock(&GlobalSMBSeslock);
@@ -309,7 +445,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	if (oplock & CIFS_CREATE_ACTION) {
 		/* time to set mode which we can not set earlier due to
 		   problems creating new read-only files */
-		if (pTcon->unix_ext) {
+		if (tcon->unix_ext) {
 			struct cifs_unix_set_info_args args = {
 				.mode	= inode->i_mode,
 				.uid	= NO_CHANGE_64,
@@ -319,7 +455,7 @@ int cifs_open(struct inode *inode, struct file *file)
 				.mtime	= NO_CHANGE_64,
 				.device	= 0,
 			};
-			CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+			CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
 					   cifs_sb->local_nls,
 					   cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -349,7 +485,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	int rc = -EACCES;
 	int xid, oplock;
 	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
 	struct inode *inode;
@@ -387,7 +523,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	}
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tcon = cifs_sb->tcon;
 
 /* can not grab rename sem here because various ops, including
    those that already have the rename sem can end up causing writepage
@@ -404,20 +540,37 @@ reopen_error_exit:
 
 	cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
 		 inode, file->f_flags, full_path));
-	desiredAccess = cifs_convert_flags(file->f_flags);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
 	else
 		oplock = 0;
 
+	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
+	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+		/* can not refresh inode info since size could be stale */
+		rc = cifs_posix_open(full_path, NULL, inode->i_sb,
+				     cifs_sb->mnt_file_mode /* ignored */,
+				     oflags, &oplock, &netfid, xid);
+		if (rc == 0) {
+			cFYI(1, ("posix reopen succeeded"));
+			goto reopen_success;
+		}
+		/* fallthrough to retry open the old way on errors, especially
+		   in the reconnect path it is important to retry hard */
+	}
+
+	desiredAccess = cifs_convert_flags(file->f_flags);
+
 	/* Can not refresh inode by passing in file_info buf to be returned
 	   by SMBOpen and then calling get_inode_info with returned buf
 	   since file might have write behind data that needs to be flushed
 	   and server version of file size can be stale. If we knew for sure
 	   that inode was not dirty locally we could do this */
 
-	rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess,
+	rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess,
 			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
 			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -426,6 +579,7 @@ reopen_error_exit:
 		cFYI(1, ("cifs_open returned 0x%x", rc));
 		cFYI(1, ("oplock: %d", oplock));
 	} else {
+reopen_success:
 		pCifsFile->netfid = netfid;
 		pCifsFile->invalidHandle = false;
 		up(&pCifsFile->fh_sem);
@@ -439,7 +593,7 @@ reopen_error_exit:
 			   go to server to get inode info */
 			pCifsInode->clientCanCacheAll = false;
 			pCifsInode->clientCanCacheRead = false;
-			if (pTcon->unix_ext)
+			if (tcon->unix_ext)
 				rc = cifs_get_inode_info_unix(&inode,
 					full_path, inode->i_sb, xid);
 			else
@@ -467,7 +621,6 @@ reopen_error_exit:
 			cifs_relock_file(pCifsFile);
 		}
 	}
-
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1523,6 +1676,9 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
 	int xid;
 	int rc = 0;
+	struct cifsTconInfo *tcon;
+	struct cifsFileInfo *smbfile =
+		(struct cifsFileInfo *)file->private_data;
 	struct inode *inode = file->f_path.dentry->d_inode;
 
 	xid = GetXid();
@@ -1534,7 +1690,12 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	if (rc == 0) {
 		rc = CIFS_I(inode)->write_behind_rc;
 		CIFS_I(inode)->write_behind_rc = 0;
+		tcon = CIFS_SB(inode->i_sb)->tcon;
+		if (!rc && tcon && smbfile &&
+		   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 	}
+
 	FreeXid(xid);
 	return rc;
 }
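The open-path hunks above all follow one pattern: probe the POSIX open once, remember a server that botches it, and otherwise fall back to the classic SMB open only for errors a retry could help. A condensed restatement of that control flow (sketch only; the real code also threads oplock, netfid and inode state through the same branches):

	/* Sketch of the fallback policy added to cifs_open() above:
	 * 0 = posix open worked, >0 = retry with legacy open, <0 = give up. */
	static int posix_probe_policy(struct cifsTconInfo *tcon, int rc)
	{
		if (rc == 0)
			return 0;			/* posix open worked */
		if (rc == -EINVAL || rc == -EOPNOTSUPP)
			tcon->broken_posix_open = true;	/* never probe again */
		else if (rc != -EIO && rc != -EREMOTE)
			return rc;			/* real failure */
		return 1;				/* fall back to legacy open */
	}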
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4690a360c855..f121a80fdd6f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -763,6 +763,9 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	struct cifsTconInfo *pTcon = cifs_sb->tcon;
 	FILE_BASIC_INFO	info_buf;
 
+	if (attrs == NULL)
+		return -EINVAL;
+
 	if (attrs->ia_valid & ATTR_ATIME) {
 		set_time = true;
 		info_buf.LastAccessTime =
@@ -1122,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			goto mkdir_out;
 		}
 
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
 				     mode, NULL /* netfid */, pInfo, &oplock,
 				     full_path, cifs_sb->local_nls,
@@ -1201,7 +1204,7 @@ mkdir_get_info:
 	if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
 		direntry->d_inode->i_nlink = 2;
 
-	mode &= ~current->fs->umask;
+	mode &= ~current_umask();
 	/* must turn on setgid bit if parent dir has it */
 	if (inode->i_mode & S_ISGID)
 		mode |= S_ISGID;
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
new file mode 100644
index 000000000000..7056b891e087
--- /dev/null
+++ b/fs/cifs/smbfsctl.h
@@ -0,0 +1,84 @@
+/*
+ * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
+ *
+ * Copyright (c) International Business Machines  Corp., 2002,2009
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* IOCTL information */
+/*
+ * List of ioctl/fsctl function codes that are or could be useful in the
+ * future to remote clients like cifs or SMB2 client.  There is probably
+ * a slightly larger set of fsctls that NTFS local filesystem could handle,
+ * including the seven below that we do not have struct definitions for.
+ * Even with protocol definitions for most of these now available, we still
+ * need to do some experimentation to identify which are practical to do
+ * remotely.  Some of the following, such as the encryption/compression ones
+ * could be invoked from tools via a specialized hook into the VFS rather
+ * than via the standard vfs entry points
+ */
+#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
+#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
+#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
+#define FSCTL_LOCK_VOLUME 0x00090018
+#define FSCTL_UNLOCK_VOLUME 0x0009001C
+#define FSCTL_IS_PATHNAME_VALID 0x0009002C /* BB add struct */
+#define FSCTL_GET_COMPRESSION 0x0009003C /* BB add struct */
+#define FSCTL_SET_COMPRESSION 0x0009C040 /* BB add struct */
+#define FSCTL_QUERY_FAT_BPB 0x00090058 /* BB add struct */
+/* Verify the next FSCTL number, we had it as 0x00090090 before */
+#define FSCTL_FILESYSTEM_GET_STATS 0x00090060 /* BB add struct */
+#define FSCTL_GET_NTFS_VOLUME_DATA 0x00090064 /* BB add struct */
+#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */
+#define FSCTL_IS_VOLUME_DIRTY 0x00090078 /* BB add struct */
+#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */
+#define FSCTL_REQUEST_FILTER_OPLOCK 0x0009008C
+#define FSCTL_FIND_FILES_BY_SID 0x0009008F /* BB add struct */
+#define FSCTL_SET_OBJECT_ID 0x00090098 /* BB add struct */
+#define FSCTL_GET_OBJECT_ID 0x0009009C /* BB add struct */
+#define FSCTL_DELETE_OBJECT_ID 0x000900A0 /* BB add struct */
+#define FSCTL_SET_REPARSE_POINT 0x000900A4 /* BB add struct */
+#define FSCTL_GET_REPARSE_POINT 0x000900A8 /* BB add struct */
+#define FSCTL_DELETE_REPARSE_POINT 0x000900AC /* BB add struct */
+#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
+#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
+#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */
+#define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */
+#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */
+#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */
+#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */
+#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */
+#define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */
+#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */
+#define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */
+#define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */
+#define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */
+#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */
+#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
+#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
+#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
+#define FSCTL_SIS_LINK_FILES 0x0009C104
+#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
+#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */
+/* strange that the number for this op is not sequential with previous op */
+#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */
+#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
+#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
+
+#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
+#define IO_REPARSE_TAG_HSM 0xC0000004
+#define IO_REPARSE_TAG_SIS 0x80000007
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 75b1fa90b2cb..4bb9d0a5decc 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -59,7 +59,7 @@ static int coda_return_EIO(void)
 }
 #define CODA_EIO_ERROR ((void *) (coda_return_EIO))
 
-static struct dentry_operations coda_dentry_operations =
+static const struct dentry_operations coda_dentry_operations =
 {
 	.d_revalidate	= coda_dentry_revalidate,
 	.d_delete	= coda_dentry_delete,
diff --git a/fs/compat.c b/fs/compat.c
index d0145ca27572..3f84d5f15889 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -51,6 +51,7 @@
 #include <linux/poll.h>
 #include <linux/mm.h>
 #include <linux/eventpoll.h>
+#include <linux/fs_struct.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -378,6 +379,34 @@ out:
 	return error;
 }
 
+/*
+ * This is a copy of sys_ustat, just dealing with a structure layout.
+ * Given how simple this syscall is that approach is more maintainable
+ * than the various conversion hacks.
+ */
+asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
+{
+	struct super_block *sb;
+	struct compat_ustat tmp;
+	struct kstatfs sbuf;
+	int err;
+
+	sb = user_get_super(new_decode_dev(dev));
+	if (!sb)
+		return -EINVAL;
+	err = vfs_statfs(sb->s_root, &sbuf);
+	drop_super(sb);
+	if (err)
+		return err;
+
+	memset(&tmp, 0, sizeof(struct compat_ustat));
+	tmp.f_tfree = sbuf.f_bfree;
+	tmp.f_tinode = sbuf.f_ffree;
+	if (copy_to_user(u, &tmp, sizeof(struct compat_ustat)))
+		return -EFAULT;
+	return 0;
+}
+
 static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
 {
 	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
@@ -1167,16 +1196,12 @@ out:
1167 return ret; 1196 return ret;
1168} 1197}
1169 1198
1170asmlinkage ssize_t 1199static size_t compat_readv(struct file *file,
1171compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1200 const struct compat_iovec __user *vec,
1201 unsigned long vlen, loff_t *pos)
1172{ 1202{
1173 struct file *file;
1174 ssize_t ret = -EBADF; 1203 ssize_t ret = -EBADF;
1175 1204
1176 file = fget(fd);
1177 if (!file)
1178 return -EBADF;
1179
1180 if (!(file->f_mode & FMODE_READ)) 1205 if (!(file->f_mode & FMODE_READ))
1181 goto out; 1206 goto out;
1182 1207
@@ -1184,25 +1209,56 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
1184 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) 1209 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
1185 goto out; 1210 goto out;
1186 1211
1187 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); 1212 ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
1188 1213
1189out: 1214out:
1190 if (ret > 0) 1215 if (ret > 0)
1191 add_rchar(current, ret); 1216 add_rchar(current, ret);
1192 inc_syscr(current); 1217 inc_syscr(current);
1193 fput(file);
1194 return ret; 1218 return ret;
1195} 1219}
1196 1220
1197asmlinkage ssize_t 1221asmlinkage ssize_t
1198compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen) 1222compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
1223 unsigned long vlen)
1199{ 1224{
1200 struct file *file; 1225 struct file *file;
1201 ssize_t ret = -EBADF; 1226 int fput_needed;
1227 ssize_t ret;
1202 1228
1203 file = fget(fd); 1229 file = fget_light(fd, &fput_needed);
1204 if (!file) 1230 if (!file)
1205 return -EBADF; 1231 return -EBADF;
1232 ret = compat_readv(file, vec, vlen, &file->f_pos);
1233 fput_light(file, fput_needed);
1234 return ret;
1235}
1236
1237asmlinkage ssize_t
1238compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
1239 unsigned long vlen, u32 pos_low, u32 pos_high)
1240{
1241 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1242 struct file *file;
1243 int fput_needed;
1244 ssize_t ret;
1245
1246 if (pos < 0)
1247 return -EINVAL;
1248 file = fget_light(fd, &fput_needed);
1249 if (!file)
1250 return -EBADF;
1251 ret = compat_readv(file, vec, vlen, &pos);
1252 fput_light(file, fput_needed);
1253 return ret;
1254}
1255
1256static ssize_t compat_writev(struct file *file,
1257 const struct compat_iovec __user *vec,
1258 unsigned long vlen, loff_t *pos)
1259{
1260 ssize_t ret = -EBADF;
1261
1206 if (!(file->f_mode & FMODE_WRITE)) 1262 if (!(file->f_mode & FMODE_WRITE))
1207 goto out; 1263 goto out;
1208 1264
@@ -1210,13 +1266,47 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
1210 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) 1266 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1211 goto out; 1267 goto out;
1212 1268
1213 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); 1269 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1214 1270
1215out: 1271out:
1216 if (ret > 0) 1272 if (ret > 0)
1217 add_wchar(current, ret); 1273 add_wchar(current, ret);
1218 inc_syscw(current); 1274 inc_syscw(current);
1219 fput(file); 1275 return ret;
1276}
1277
1278asmlinkage ssize_t
1279compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
1280 unsigned long vlen)
1281{
1282 struct file *file;
1283 int fput_needed;
1284 ssize_t ret;
1285
1286 file = fget_light(fd, &fput_needed);
1287 if (!file)
1288 return -EBADF;
1289 ret = compat_writev(file, vec, vlen, &file->f_pos);
1290 fput_light(file, fput_needed);
1291 return ret;
1292}
1293
1294asmlinkage ssize_t
1295compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
1296 unsigned long vlen, u32 pos_low, u32 pos_high)
1297{
1298 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1299 struct file *file;
1300 int fput_needed;
1301 ssize_t ret;
1302
1303 if (pos < 0)
1304 return -EINVAL;
1305 file = fget_light(fd, &fput_needed);
1306 if (!file)
1307 return -EBADF;
1308 ret = compat_writev(file, vec, vlen, &pos);
1309 fput_light(file, fput_needed);
1220 return ret; 1310 return ret;
1221} 1311}
1222 1312
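
The pos_low/pos_high pair exists because 32-bit tasks cannot pass a 64-bit offset in one register, so preadv/pwritev take it as two halves and the compat layer reassembles it with the shift-and-or above. A standalone sketch of that round trip:

#include <stdint.h>
#include <assert.h>

int main(void)
{
        int64_t pos = 0x123456789abcdef0LL;
        uint32_t pos_low  = (uint32_t)pos;         /* low 32 bits  */
        uint32_t pos_high = (uint32_t)(pos >> 32); /* high 32 bits */

        /* same reconstruction as compat_sys_preadv/pwritev */
        int64_t again = ((int64_t)pos_high << 32) | pos_low;
        assert(again == pos);
        return 0;
}

The subsequent pos < 0 check then rejects offsets whose reconstructed sign bit is set.
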
@@ -1392,27 +1482,36 @@ int compat_do_execve(char * filename,
1392{ 1482{
1393 struct linux_binprm *bprm; 1483 struct linux_binprm *bprm;
1394 struct file *file; 1484 struct file *file;
1485 struct files_struct *displaced;
1395 int retval; 1486 int retval;
1396 1487
1488 retval = unshare_files(&displaced);
1489 if (retval)
1490 goto out_ret;
1491
1397 retval = -ENOMEM; 1492 retval = -ENOMEM;
1398 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); 1493 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1399 if (!bprm) 1494 if (!bprm)
1400 goto out_ret; 1495 goto out_files;
1401 1496
1402 retval = mutex_lock_interruptible(&current->cred_exec_mutex); 1497 retval = mutex_lock_interruptible(&current->cred_exec_mutex);
1403 if (retval < 0) 1498 if (retval < 0)
1404 goto out_free; 1499 goto out_free;
1500 current->in_execve = 1;
1405 1501
1406 retval = -ENOMEM; 1502 retval = -ENOMEM;
1407 bprm->cred = prepare_exec_creds(); 1503 bprm->cred = prepare_exec_creds();
1408 if (!bprm->cred) 1504 if (!bprm->cred)
1409 goto out_unlock; 1505 goto out_unlock;
1410 check_unsafe_exec(bprm, current->files); 1506
1507 retval = check_unsafe_exec(bprm);
1508 if (retval)
1509 goto out_unlock;
1411 1510
1412 file = open_exec(filename); 1511 file = open_exec(filename);
1413 retval = PTR_ERR(file); 1512 retval = PTR_ERR(file);
1414 if (IS_ERR(file)) 1513 if (IS_ERR(file))
1415 goto out_unlock; 1514 goto out_unmark;
1416 1515
1417 sched_exec(); 1516 sched_exec();
1418 1517
@@ -1454,9 +1553,15 @@ int compat_do_execve(char * filename,
1454 goto out; 1553 goto out;
1455 1554
1456 /* execve succeeded */ 1555 /* execve succeeded */
1556 write_lock(&current->fs->lock);
1557 current->fs->in_exec = 0;
1558 write_unlock(&current->fs->lock);
1559 current->in_execve = 0;
1457 mutex_unlock(&current->cred_exec_mutex); 1560 mutex_unlock(&current->cred_exec_mutex);
1458 acct_update_integrals(current); 1561 acct_update_integrals(current);
1459 free_bprm(bprm); 1562 free_bprm(bprm);
1563 if (displaced)
1564 put_files_struct(displaced);
1460 return retval; 1565 return retval;
1461 1566
1462out: 1567out:
@@ -1469,12 +1574,21 @@ out_file:
1469 fput(bprm->file); 1574 fput(bprm->file);
1470 } 1575 }
1471 1576
1577out_unmark:
1578 write_lock(&current->fs->lock);
1579 current->fs->in_exec = 0;
1580 write_unlock(&current->fs->lock);
1581
1472out_unlock: 1582out_unlock:
1583 current->in_execve = 0;
1473 mutex_unlock(&current->cred_exec_mutex); 1584 mutex_unlock(&current->cred_exec_mutex);
1474 1585
1475out_free: 1586out_free:
1476 free_bprm(bprm); 1587 free_bprm(bprm);
1477 1588
1589out_files:
1590 if (displaced)
1591 reset_files_struct(displaced);
1478out_ret: 1592out_ret:
1479 return retval; 1593 return retval;
1480} 1594}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 45e59d3c7f1f..3e87ce443ea2 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,7 @@
23#include <linux/if.h> 23#include <linux/if.h>
24#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/raid/md.h> 26#include <linux/raid/md_u.h>
27#include <linux/kd.h> 27#include <linux/kd.h>
28#include <linux/route.h> 28#include <linux/route.h>
29#include <linux/in6.h> 29#include <linux/in6.h>
@@ -522,6 +522,11 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
522 if (err) 522 if (err)
523 return -EFAULT; 523 return -EFAULT;
524 break; 524 break;
525 case SIOCSHWTSTAMP:
526 if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
527 return -EFAULT;
528 ifr.ifr_data = compat_ptr(uifr32->ifr_ifru.ifru_data);
529 break;
525 default: 530 default:
526 if (copy_from_user(&ifr, uifr32, sizeof(*uifr32))) 531 if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
527 return -EFAULT; 532 return -EFAULT;
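
SIOCSHWTSTAMP needs its own case because struct ifreq carries a user pointer (ifr_data) that must be widened with compat_ptr() before reaching the native handler. A hedged sketch of the 32-bit caller this serves; the interface name "eth0" and the filter choice are illustrative:

#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>              /* struct ifreq, IFNAMSIZ */
#include <linux/sockios.h>       /* SIOCSHWTSTAMP */
#include <linux/net_tstamp.h>    /* struct hwtstamp_config */

static int enable_rx_timestamps(int sock)
{
        struct hwtstamp_config cfg;
        struct ifreq ifr;

        memset(&cfg, 0, sizeof(cfg));
        cfg.tx_type = HWTSTAMP_TX_OFF;
        cfg.rx_filter = HWTSTAMP_FILTER_ALL;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1); /* assumed NIC */
        ifr.ifr_data = (char *)&cfg; /* the pointer the compat layer widens */

        return ioctl(sock, SIOCSHWTSTAMP, &ifr);
}

int main(void)
{
        int s = socket(AF_INET, SOCK_DGRAM, 0);
        return s < 0 ? 1 : enable_rx_timestamps(s);
}
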
@@ -1993,6 +1998,8 @@ COMPATIBLE_IOCTL(TUNSETGROUP)
1993COMPATIBLE_IOCTL(TUNGETFEATURES) 1998COMPATIBLE_IOCTL(TUNGETFEATURES)
1994COMPATIBLE_IOCTL(TUNSETOFFLOAD) 1999COMPATIBLE_IOCTL(TUNSETOFFLOAD)
1995COMPATIBLE_IOCTL(TUNSETTXFILTER) 2000COMPATIBLE_IOCTL(TUNSETTXFILTER)
2001COMPATIBLE_IOCTL(TUNGETSNDBUF)
2002COMPATIBLE_IOCTL(TUNSETSNDBUF)
1996/* Big V */ 2003/* Big V */
1997COMPATIBLE_IOCTL(VT_SETMODE) 2004COMPATIBLE_IOCTL(VT_SETMODE)
1998COMPATIBLE_IOCTL(VT_GETMODE) 2005COMPATIBLE_IOCTL(VT_GETMODE)
@@ -2566,6 +2573,7 @@ HANDLE_IOCTL(SIOCSIFMAP, dev_ifsioc)
2566HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc) 2573HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc)
2567HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc) 2574HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc)
2568HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc) 2575HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc)
2576HANDLE_IOCTL(SIOCSHWTSTAMP, dev_ifsioc)
2569 2577
2570/* ioctls used by appletalk ddp.c */ 2578/* ioctls used by appletalk ddp.c */
2571HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc) 2579HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e93341f3e82..05373db21a4e 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -72,7 +72,7 @@ static int configfs_d_delete(struct dentry *dentry)
72 return 1; 72 return 1;
73} 73}
74 74
75static struct dentry_operations configfs_dentry_ops = { 75static const struct dentry_operations configfs_dentry_ops = {
76 .d_iput = configfs_d_iput, 76 .d_iput = configfs_d_iput,
77 /* simple_delete_dentry() isn't exported */ 77 /* simple_delete_dentry() isn't exported */
78 .d_delete = configfs_d_delete, 78 .d_delete = configfs_d_delete,
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a07338d2d140..dd3634e4c967 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -318,6 +318,7 @@ out:
318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) 318static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
319{ 319{
320 struct super_block *sb = dentry->d_sb; 320 struct super_block *sb = dentry->d_sb;
321 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
321 322
322 buf->f_type = CRAMFS_MAGIC; 323 buf->f_type = CRAMFS_MAGIC;
323 buf->f_bsize = PAGE_CACHE_SIZE; 324 buf->f_bsize = PAGE_CACHE_SIZE;
@@ -326,6 +327,8 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
326 buf->f_bavail = 0; 327 buf->f_bavail = 0;
327 buf->f_files = CRAMFS_SB(sb)->files; 328 buf->f_files = CRAMFS_SB(sb)->files;
328 buf->f_ffree = 0; 329 buf->f_ffree = 0;
330 buf->f_fsid.val[0] = (u32)id;
331 buf->f_fsid.val[1] = (u32)(id >> 32);
329 buf->f_namelen = CRAMFS_MAXPATHLEN; 332 buf->f_namelen = CRAMFS_MAXPATHLEN;
330 return 0; 333 return 0;
331} 334}
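
f_fsid is a pair of 32-bit words, so the 64-bit encoded device number is split exactly as above: low half in val[0], high half in val[1]. A standalone sketch of the round trip (the constant stands in for huge_encode_dev(), which is kernel-internal):

#include <stdint.h>
#include <assert.h>

int main(void)
{
        uint64_t id = 0x0000000800000001ULL; /* stand-in for huge_encode_dev() */
        uint32_t val[2];

        val[0] = (uint32_t)id;         /* low word  */
        val[1] = (uint32_t)(id >> 32); /* high word */

        assert((((uint64_t)val[1] << 32) | val[0]) == id);
        return 0;
}
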
@@ -459,11 +462,14 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
459static int cramfs_readpage(struct file *file, struct page * page) 462static int cramfs_readpage(struct file *file, struct page * page)
460{ 463{
461 struct inode *inode = page->mapping->host; 464 struct inode *inode = page->mapping->host;
462 u32 maxblock, bytes_filled; 465 u32 maxblock;
466 int bytes_filled;
463 void *pgdata; 467 void *pgdata;
464 468
465 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 469 maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
466 bytes_filled = 0; 470 bytes_filled = 0;
471 pgdata = kmap(page);
472
467 if (page->index < maxblock) { 473 if (page->index < maxblock) {
468 struct super_block *sb = inode->i_sb; 474 struct super_block *sb = inode->i_sb;
469 u32 blkptr_offset = OFFSET(inode) + page->index*4; 475 u32 blkptr_offset = OFFSET(inode) + page->index*4;
@@ -472,30 +478,43 @@ static int cramfs_readpage(struct file *file, struct page * page)
472 start_offset = OFFSET(inode) + maxblock*4; 478 start_offset = OFFSET(inode) + maxblock*4;
473 mutex_lock(&read_mutex); 479 mutex_lock(&read_mutex);
474 if (page->index) 480 if (page->index)
475 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, 4); 481 start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
476 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - start_offset); 482 4);
483 compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
484 start_offset);
477 mutex_unlock(&read_mutex); 485 mutex_unlock(&read_mutex);
478 pgdata = kmap(page); 486
479 if (compr_len == 0) 487 if (compr_len == 0)
480 ; /* hole */ 488 ; /* hole */
481 else if (compr_len > (PAGE_CACHE_SIZE << 1)) 489 else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
482 printk(KERN_ERR "cramfs: bad compressed blocksize %u\n", compr_len); 490 pr_err("cramfs: bad compressed blocksize %u\n",
483 else { 491 compr_len);
492 goto err;
493 } else {
484 mutex_lock(&read_mutex); 494 mutex_lock(&read_mutex);
485 bytes_filled = cramfs_uncompress_block(pgdata, 495 bytes_filled = cramfs_uncompress_block(pgdata,
486 PAGE_CACHE_SIZE, 496 PAGE_CACHE_SIZE,
487 cramfs_read(sb, start_offset, compr_len), 497 cramfs_read(sb, start_offset, compr_len),
488 compr_len); 498 compr_len);
489 mutex_unlock(&read_mutex); 499 mutex_unlock(&read_mutex);
500 if (unlikely(bytes_filled < 0))
501 goto err;
490 } 502 }
491 } else 503 }
492 pgdata = kmap(page); 504
493 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled); 505 memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
494 kunmap(page);
495 flush_dcache_page(page); 506 flush_dcache_page(page);
507 kunmap(page);
496 SetPageUptodate(page); 508 SetPageUptodate(page);
497 unlock_page(page); 509 unlock_page(page);
498 return 0; 510 return 0;
511
512err:
513 kunmap(page);
514 ClearPageUptodate(page);
515 SetPageError(page);
516 unlock_page(page);
517 return 0;
499} 518}
500 519
501static const struct address_space_operations cramfs_aops = { 520static const struct address_space_operations cramfs_aops = {
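
The reshuffled readpage now maps the page once up front, funnels every failure through a single err label, and reports trouble via the page flags instead of silently exposing stale data; together with the uncompress.c hunk that follows, a negative return from the decompressor finally surfaces as a page error. A reduced, userspace-compilable sketch of that shape (struct page and the helpers here are stand-ins, not kernel APIs):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* stand-ins for kmap()/kunmap() and the page flags */
struct page { char data[PAGE_SIZE]; int uptodate; int error; };
static void *map_page(struct page *p)  { return p->data; }
static void unmap_page(struct page *p) { (void)p; }

/* stand-in for cramfs_uncompress_block(): < 0 now means failure */
static int decompress(void *dst) { memset(dst, 'x', 100); return 100; }

static int fill_page(struct page *p)
{
        void *pgdata = map_page(p);     /* mapped once, up front */
        int filled = decompress(pgdata);

        if (filled < 0)
                goto err;
        memset((char *)pgdata + filled, 0, PAGE_SIZE - filled);
        unmap_page(p);                  /* one unmap on success... */
        p->uptodate = 1;
        return 0;
err:
        unmap_page(p);                  /* ...and one on failure */
        p->error = 1;
        return 0;                       /* status travels via the page flags */
}

int main(void)
{
        struct page p = { .uptodate = 0, .error = 0 };
        fill_page(&p);
        printf("uptodate=%d error=%d\n", p.uptodate, p.error);
        return 0;
}
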
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index fc3ccb74626f..023329800d2e 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -50,7 +50,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
50err: 50err:
51 printk("Error %d while decompressing!\n", err); 51 printk("Error %d while decompressing!\n", err);
52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); 52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
53 return 0; 53 return -EIO;
54} 54}
55 55
56int cramfs_uncompress_init(void) 56int cramfs_uncompress_init(void)
diff --git a/fs/dcache.c b/fs/dcache.c
index 07e2d4a44bda..761d30be2683 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -17,7 +17,6 @@
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/fdtable.h>
21#include <linux/fs.h> 20#include <linux/fs.h>
22#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
23#include <linux/slab.h> 22#include <linux/slab.h>
@@ -32,6 +31,7 @@
32#include <linux/seqlock.h> 31#include <linux/seqlock.h>
33#include <linux/swap.h> 32#include <linux/swap.h>
34#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h>
35#include "internal.h" 35#include "internal.h"
36 36
37int sysctl_vfs_cache_pressure __read_mostly = 100; 37int sysctl_vfs_cache_pressure __read_mostly = 100;
@@ -1247,15 +1247,18 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1247 struct dentry *found; 1247 struct dentry *found;
1248 struct dentry *new; 1248 struct dentry *new;
1249 1249
1250 /* Does a dentry matching the name exist already? */ 1250 /*
1251 * First check if a dentry matching the name already exists,
1252 * if not go ahead and create it now.
1253 */
1251 found = d_hash_and_lookup(dentry->d_parent, name); 1254 found = d_hash_and_lookup(dentry->d_parent, name);
1252 /* If not, create it now and return */
1253 if (!found) { 1255 if (!found) {
1254 new = d_alloc(dentry->d_parent, name); 1256 new = d_alloc(dentry->d_parent, name);
1255 if (!new) { 1257 if (!new) {
1256 error = -ENOMEM; 1258 error = -ENOMEM;
1257 goto err_out; 1259 goto err_out;
1258 } 1260 }
1261
1259 found = d_splice_alias(inode, new); 1262 found = d_splice_alias(inode, new);
1260 if (found) { 1263 if (found) {
1261 dput(new); 1264 dput(new);
@@ -1263,61 +1266,46 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1263 } 1266 }
1264 return new; 1267 return new;
1265 } 1268 }
1266 /* Matching dentry exists, check if it is negative. */ 1269
1270 /*
1271 * If a matching dentry exists and it's not negative, use it.
1272 *
1273 * Decrement the reference count to balance the iget() done
1274 * earlier on.
1275 */
1267 if (found->d_inode) { 1276 if (found->d_inode) {
1268 if (unlikely(found->d_inode != inode)) { 1277 if (unlikely(found->d_inode != inode)) {
1269 /* This can't happen because bad inodes are unhashed. */ 1278 /* This can't happen because bad inodes are unhashed. */
1270 BUG_ON(!is_bad_inode(inode)); 1279 BUG_ON(!is_bad_inode(inode));
1271 BUG_ON(!is_bad_inode(found->d_inode)); 1280 BUG_ON(!is_bad_inode(found->d_inode));
1272 } 1281 }
1273 /*
1274 * Already have the inode and the dentry attached, decrement
1275 * the reference count to balance the iget() done
1276 * earlier on. We found the dentry using d_lookup() so it
1277 * cannot be disconnected and thus we do not need to worry
1278 * about any NFS/disconnectedness issues here.
1279 */
1280 iput(inode); 1282 iput(inode);
1281 return found; 1283 return found;
1282 } 1284 }
1285
1283 /* 1286 /*
1284 * Negative dentry: instantiate it unless the inode is a directory and 1287 * Negative dentry: instantiate it unless the inode is a directory and
1285 * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED), 1288 * already has a dentry.
1286 * in which case d_move() that in place of the found dentry.
1287 */ 1289 */
1288 if (!S_ISDIR(inode->i_mode)) {
1289 /* Not a directory; everything is easy. */
1290 d_instantiate(found, inode);
1291 return found;
1292 }
1293 spin_lock(&dcache_lock); 1290 spin_lock(&dcache_lock);
1294 if (list_empty(&inode->i_dentry)) { 1291 if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
1295 /*
1296 * Directory without a 'disconnected' dentry; we need to do
1297 * d_instantiate() by hand because it takes dcache_lock which
1298 * we already hold.
1299 */
1300 __d_instantiate(found, inode); 1292 __d_instantiate(found, inode);
1301 spin_unlock(&dcache_lock); 1293 spin_unlock(&dcache_lock);
1302 security_d_instantiate(found, inode); 1294 security_d_instantiate(found, inode);
1303 return found; 1295 return found;
1304 } 1296 }
1297
1305 /* 1298 /*
1306 * Directory with a 'disconnected' dentry; get a reference to the 1299 * In case a directory already has a (disconnected) entry, grab a
1307 * 'disconnected' dentry. 1300 * reference to it, move it in place and use it.
1308 */ 1301 */
1309 new = list_entry(inode->i_dentry.next, struct dentry, d_alias); 1302 new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
1310 dget_locked(new); 1303 dget_locked(new);
1311 spin_unlock(&dcache_lock); 1304 spin_unlock(&dcache_lock);
1312 /* Do security vodoo. */
1313 security_d_instantiate(found, inode); 1305 security_d_instantiate(found, inode);
1314 /* Move new in place of found. */
1315 d_move(new, found); 1306 d_move(new, found);
1316 /* Balance the iget() we did above. */
1317 iput(inode); 1307 iput(inode);
1318 /* Throw away found. */
1319 dput(found); 1308 dput(found);
1320 /* Use new as the actual dentry. */
1321 return new; 1309 return new;
1322 1310
1323err_out: 1311err_out:
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 81ae9ea3c6e1..0662ba6de85a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -30,6 +30,7 @@
30 30
31static struct vfsmount *debugfs_mount; 31static struct vfsmount *debugfs_mount;
32static int debugfs_mount_count; 32static int debugfs_mount_count;
33static bool debugfs_registered;
33 34
34static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) 35static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev)
35{ 36{
@@ -496,6 +497,16 @@ exit:
496} 497}
497EXPORT_SYMBOL_GPL(debugfs_rename); 498EXPORT_SYMBOL_GPL(debugfs_rename);
498 499
500/**
501 * debugfs_initialized - Tells whether debugfs has been registered
502 */
503bool debugfs_initialized(void)
504{
505 return debugfs_registered;
506}
507EXPORT_SYMBOL_GPL(debugfs_initialized);
508
509
499static struct kobject *debug_kobj; 510static struct kobject *debug_kobj;
500 511
501static int __init debugfs_init(void) 512static int __init debugfs_init(void)
@@ -509,11 +520,16 @@ static int __init debugfs_init(void)
509 retval = register_filesystem(&debug_fs_type); 520 retval = register_filesystem(&debug_fs_type);
510 if (retval) 521 if (retval)
511 kobject_put(debug_kobj); 522 kobject_put(debug_kobj);
523 else
524 debugfs_registered = true;
525
512 return retval; 526 return retval;
513} 527}
514 528
515static void __exit debugfs_exit(void) 529static void __exit debugfs_exit(void)
516{ 530{
531 debugfs_registered = false;
532
517 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 533 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
518 unregister_filesystem(&debug_fs_type); 534 unregister_filesystem(&debug_fs_type);
519 kobject_put(debug_kobj); 535 kobject_put(debug_kobj);
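
debugfs_initialized() gives early or optional callers a cheap guard before touching debugfs. A plausible module-init use; "mydrv" is an invented name, while debugfs_create_dir()/debugfs_remove() are the real API:

#include <linux/debugfs.h>
#include <linux/module.h>

static struct dentry *mydrv_dir;

static int __init mydrv_init(void)
{
        /* skip debugfs setup entirely if it was never registered */
        if (debugfs_initialized())
                mydrv_dir = debugfs_create_dir("mydrv", NULL);
        return 0;
}

static void __exit mydrv_exit(void)
{
        debugfs_remove(mydrv_dir); /* NULL-safe */
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");
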
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5f3231b9633f..63a4a59e4148 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -198,9 +198,6 @@ static int mknod_ptmx(struct super_block *sb)
198 198
199 fsi->ptmx_dentry = dentry; 199 fsi->ptmx_dentry = dentry;
200 rc = 0; 200 rc = 0;
201
202 printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
203 inode->i_ino);
204out: 201out:
205 mutex_unlock(&root->d_inode->i_mutex); 202 mutex_unlock(&root->d_inode->i_mutex);
206 return rc; 203 return rc;
@@ -325,179 +322,81 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
325} 322}
326 323
327/* 324/*
328 * Safely parse the mount options in @data and update @opts. 325 * devpts_get_sb()
329 * 326 *
330 * devpts ends up parsing options two times during mount, due to the 327 * If the '-o newinstance' mount option was specified, mount a new
331 * two modes of operation it supports. The first parse occurs in 328 * (private) instance of devpts. PTYs created in this instance are
332 * devpts_get_sb() when determining the mode (single-instance or 329 * independent of the PTYs in other devpts instances.
333 * multi-instance mode). The second parse happens in devpts_remount()
334 * or new_pts_mount() depending on the mode.
335 * 330 *
336 * Parsing of options modifies the @data making subsequent parsing 331 * If the '-o newinstance' option was not specified, mount/remount the
337 * incorrect. So make a local copy of @data and parse it. 332 * initial kernel mount of devpts. This type of mount gives the
333 * legacy, single-instance semantics.
338 * 334 *
339 * Return: 0 On success, -errno on error 335 * The 'newinstance' option is needed to support multiple namespace
340 */ 336 * semantics in devpts while preserving backward compatibility of the
361static int safe_parse_mount_options(void *data, struct pts_mount_opts *opts) 337 * current 'single-namespace' semantics. i.e. all mounts of devpts
342{ 338 * without the 'newinstance' mount option should bind to the initial
343 int rc; 339 * kernel mount, like get_sb_single().
344 void *datacp;
345
346 if (!data)
347 return 0;
348
349 /* Use kstrdup() ? */
350 datacp = kmalloc(PAGE_SIZE, GFP_KERNEL);
351 if (!datacp)
352 return -ENOMEM;
353
354 memcpy(datacp, data, PAGE_SIZE);
355 rc = parse_mount_options((char *)datacp, PARSE_MOUNT, opts);
356 kfree(datacp);
357
358 return rc;
359}
360
361/*
362 * Mount a new (private) instance of devpts. PTYs created in this
363 * instance are independent of the PTYs in other devpts instances.
364 */
365static int new_pts_mount(struct file_system_type *fs_type, int flags,
366 void *data, struct vfsmount *mnt)
367{
368 int err;
369 struct pts_fs_info *fsi;
370 struct pts_mount_opts *opts;
371
372 printk(KERN_NOTICE "devpts: newinstance mount\n");
373
374 err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
375 if (err)
376 return err;
377
378 fsi = DEVPTS_SB(mnt->mnt_sb);
379 opts = &fsi->mount_opts;
380
381 err = parse_mount_options(data, PARSE_MOUNT, opts);
382 if (err)
383 goto fail;
384
385 err = mknod_ptmx(mnt->mnt_sb);
386 if (err)
387 goto fail;
388
389 return 0;
390
391fail:
392 dput(mnt->mnt_sb->s_root);
393 deactivate_super(mnt->mnt_sb);
394 return err;
395}
396
397/*
398 * Check if 'newinstance' mount option was specified in @data.
399 * 340 *
400 * Return: -errno on error (eg: invalid mount options specified) 341 * Mounts with 'newinstance' option create a new, private namespace.
401 * : 1 if 'newinstance' mount option was specified
402 * : 0 if 'newinstance' mount option was NOT specified
403 */
404static int is_new_instance_mount(void *data)
405{
406 int rc;
407 struct pts_mount_opts opts;
408
409 if (!data)
410 return 0;
411
412 rc = safe_parse_mount_options(data, &opts);
413 if (!rc)
414 rc = opts.newinstance;
415
416 return rc;
417}
418
419/*
420 * get_init_pts_sb()
421 * 342 *
422 * This interface is needed to support multiple namespace semantics in 343 * NOTE:
423 * devpts while preserving backward compatibility of the current 'single-
424 * namespace' semantics. i.e all mounts of devpts without the 'newinstance'
425 * mount option should bind to the initial kernel mount, like
426 * get_sb_single().
427 * 344 *
428 * Mounts with 'newinstance' option create a new private namespace. 345 * For single-mount semantics, devpts cannot use get_sb_single(),
429 *
430 * But for single-mount semantics, devpts cannot use get_sb_single(),
431 * because get_sb_single()/sget() find and use the super-block from 346 * because get_sb_single()/sget() find and use the super-block from
432 * the most recent mount of devpts. But that recent mount may be a 347 * the most recent mount of devpts. But that recent mount may be a
433 * 'newinstance' mount and get_sb_single() would pick the newinstance 348 * 'newinstance' mount and get_sb_single() would pick the newinstance
434 * super-block instead of the initial super-block. 349 * super-block instead of the initial super-block.
435 *
436 * This interface is identical to get_sb_single() except that it
437 * consistently selects the 'single-namespace' superblock even in the
438 * presence of the private namespace (i.e 'newinstance') super-blocks.
439 */ 350 */
440static int get_init_pts_sb(struct file_system_type *fs_type, int flags, 351static int devpts_get_sb(struct file_system_type *fs_type,
441 void *data, struct vfsmount *mnt) 352 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
442{ 353{
443 struct super_block *s;
444 int error; 354 int error;
355 struct pts_mount_opts opts;
356 struct super_block *s;
357
358 memset(&opts, 0, sizeof(opts));
359 if (data) {
360 error = parse_mount_options(data, PARSE_MOUNT, &opts);
361 if (error)
362 return error;
363 }
364
365 if (opts.newinstance)
366 s = sget(fs_type, NULL, set_anon_super, NULL);
367 else
368 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
445 369
446 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
447 if (IS_ERR(s)) 370 if (IS_ERR(s))
448 return PTR_ERR(s); 371 return PTR_ERR(s);
449 372
450 if (!s->s_root) { 373 if (!s->s_root) {
451 s->s_flags = flags; 374 s->s_flags = flags;
452 error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0); 375 error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
453 if (error) { 376 if (error)
454 up_write(&s->s_umount); 377 goto out_undo_sget;
455 deactivate_super(s);
456 return error;
457 }
458 s->s_flags |= MS_ACTIVE; 378 s->s_flags |= MS_ACTIVE;
459 } 379 }
460 do_remount_sb(s, flags, data, 0);
461 return simple_set_mnt(mnt, s);
462}
463 380
464/* 381 simple_set_mnt(mnt, s);
465 * Mount or remount the initial kernel mount of devpts. This type of
466 * mount maintains the legacy, single-instance semantics, while the
467 * kernel still allows multiple-instances.
468 */
469static int init_pts_mount(struct file_system_type *fs_type, int flags,
470 void *data, struct vfsmount *mnt)
471{
472 int err;
473 382
474 err = get_init_pts_sb(fs_type, flags, data, mnt); 383 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
475 if (err)
476 return err;
477 384
478 err = mknod_ptmx(mnt->mnt_sb); 385 error = mknod_ptmx(s);
479 if (err) { 386 if (error)
480 dput(mnt->mnt_sb->s_root); 387 goto out_dput;
481 deactivate_super(mnt->mnt_sb);
482 }
483 388
484 return err; 389 return 0;
485}
486
487static int devpts_get_sb(struct file_system_type *fs_type,
488 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
489{
490 int new;
491
492 new = is_new_instance_mount(data);
493 if (new < 0)
494 return new;
495 390
496 if (new) 391out_dput:
497 return new_pts_mount(fs_type, flags, data, mnt); 392 dput(s->s_root);
498 393
499 return init_pts_mount(fs_type, flags, data, mnt); 394out_undo_sget:
395 up_write(&s->s_umount);
396 deactivate_super(s);
397 return error;
500} 398}
399
501#else 400#else
502/* 401/*
503 * This supports only the legacy single-instance semantics (no 402 * This supports only the legacy single-instance semantics (no
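
From userspace the two devpts_get_sb() paths above are selected purely by the option string. A sketch using mount(2); the target directories are illustrative:

#include <sys/mount.h>

int main(void)
{
        /* binds to the initial kernel mount: legacy, single instance */
        if (mount("devpts", "/dev/pts", "devpts", 0, NULL) < 0)
                return 1;

        /* creates a new, private instance with its own set of PTYs */
        if (mount("devpts", "/mnt/pts", "devpts", 0, "newinstance") < 0)
                return 1;
        return 0;
}
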
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 92969f879a17..858fba14aaa6 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -156,7 +156,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
156 156
157 bucket = dir_hash(ls, name, namelen); 157 bucket = dir_hash(ls, name, namelen);
158 158
159 write_lock(&ls->ls_dirtbl[bucket].lock); 159 spin_lock(&ls->ls_dirtbl[bucket].lock);
160 160
161 de = search_bucket(ls, name, namelen, bucket); 161 de = search_bucket(ls, name, namelen, bucket);
162 162
@@ -173,7 +173,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
173 list_del(&de->list); 173 list_del(&de->list);
174 kfree(de); 174 kfree(de);
175 out: 175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock); 176 spin_unlock(&ls->ls_dirtbl[bucket].lock);
177} 177}
178 178
179void dlm_dir_clear(struct dlm_ls *ls) 179void dlm_dir_clear(struct dlm_ls *ls)
@@ -185,14 +185,14 @@ void dlm_dir_clear(struct dlm_ls *ls)
185 DLM_ASSERT(list_empty(&ls->ls_recover_list), ); 185 DLM_ASSERT(list_empty(&ls->ls_recover_list), );
186 186
187 for (i = 0; i < ls->ls_dirtbl_size; i++) { 187 for (i = 0; i < ls->ls_dirtbl_size; i++) {
188 write_lock(&ls->ls_dirtbl[i].lock); 188 spin_lock(&ls->ls_dirtbl[i].lock);
189 head = &ls->ls_dirtbl[i].list; 189 head = &ls->ls_dirtbl[i].list;
190 while (!list_empty(head)) { 190 while (!list_empty(head)) {
191 de = list_entry(head->next, struct dlm_direntry, list); 191 de = list_entry(head->next, struct dlm_direntry, list);
192 list_del(&de->list); 192 list_del(&de->list);
193 put_free_de(ls, de); 193 put_free_de(ls, de);
194 } 194 }
195 write_unlock(&ls->ls_dirtbl[i].lock); 195 spin_unlock(&ls->ls_dirtbl[i].lock);
196 } 196 }
197} 197}
198 198
@@ -307,17 +307,17 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
307 307
308 bucket = dir_hash(ls, name, namelen); 308 bucket = dir_hash(ls, name, namelen);
309 309
310 write_lock(&ls->ls_dirtbl[bucket].lock); 310 spin_lock(&ls->ls_dirtbl[bucket].lock);
311 de = search_bucket(ls, name, namelen, bucket); 311 de = search_bucket(ls, name, namelen, bucket);
312 if (de) { 312 if (de) {
313 *r_nodeid = de->master_nodeid; 313 *r_nodeid = de->master_nodeid;
314 write_unlock(&ls->ls_dirtbl[bucket].lock); 314 spin_unlock(&ls->ls_dirtbl[bucket].lock);
315 if (*r_nodeid == nodeid) 315 if (*r_nodeid == nodeid)
316 return -EEXIST; 316 return -EEXIST;
317 return 0; 317 return 0;
318 } 318 }
319 319
320 write_unlock(&ls->ls_dirtbl[bucket].lock); 320 spin_unlock(&ls->ls_dirtbl[bucket].lock);
321 321
322 if (namelen > DLM_RESNAME_MAXLEN) 322 if (namelen > DLM_RESNAME_MAXLEN)
323 return -EINVAL; 323 return -EINVAL;
@@ -330,7 +330,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
330 de->length = namelen; 330 de->length = namelen;
331 memcpy(de->name, name, namelen); 331 memcpy(de->name, name, namelen);
332 332
333 write_lock(&ls->ls_dirtbl[bucket].lock); 333 spin_lock(&ls->ls_dirtbl[bucket].lock);
334 tmp = search_bucket(ls, name, namelen, bucket); 334 tmp = search_bucket(ls, name, namelen, bucket);
335 if (tmp) { 335 if (tmp) {
336 kfree(de); 336 kfree(de);
@@ -339,7 +339,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
339 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); 339 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
340 } 340 }
341 *r_nodeid = de->master_nodeid; 341 *r_nodeid = de->master_nodeid;
342 write_unlock(&ls->ls_dirtbl[bucket].lock); 342 spin_unlock(&ls->ls_dirtbl[bucket].lock);
343 return 0; 343 return 0;
344} 344}
345 345
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 076e86f38bc8..d01ca0a711db 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -99,7 +99,7 @@ struct dlm_direntry {
99 99
100struct dlm_dirtable { 100struct dlm_dirtable {
101 struct list_head list; 101 struct list_head list;
102 rwlock_t lock; 102 spinlock_t lock;
103}; 103};
104 104
105struct dlm_rsbtable { 105struct dlm_rsbtable {
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 01e7d39c5fba..205ec95b347e 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -835,7 +835,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
835 lkb->lkb_wait_count++; 835 lkb->lkb_wait_count++;
836 hold_lkb(lkb); 836 hold_lkb(lkb);
837 837
838 log_debug(ls, "add overlap %x cur %d new %d count %d flags %x", 838 log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
839 lkb->lkb_id, lkb->lkb_wait_type, mstype, 839 lkb->lkb_id, lkb->lkb_wait_type, mstype,
840 lkb->lkb_wait_count, lkb->lkb_flags); 840 lkb->lkb_wait_count, lkb->lkb_flags);
841 goto out; 841 goto out;
@@ -851,7 +851,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
851 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); 851 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
852 out: 852 out:
853 if (error) 853 if (error)
854 log_error(ls, "add_to_waiters %x error %d flags %x %d %d %s", 854 log_error(ls, "addwait error %x %d flags %x %d %d %s",
855 lkb->lkb_id, error, lkb->lkb_flags, mstype, 855 lkb->lkb_id, error, lkb->lkb_flags, mstype,
856 lkb->lkb_wait_type, lkb->lkb_resource->res_name); 856 lkb->lkb_wait_type, lkb->lkb_resource->res_name);
857 mutex_unlock(&ls->ls_waiters_mutex); 857 mutex_unlock(&ls->ls_waiters_mutex);
@@ -863,23 +863,55 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
863 request reply on the requestqueue) between dlm_recover_waiters_pre() which 863 request reply on the requestqueue) between dlm_recover_waiters_pre() which
864 set RESEND and dlm_recover_waiters_post() */ 864 set RESEND and dlm_recover_waiters_post() */
865 865
866static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype) 866static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
867 struct dlm_message *ms)
867{ 868{
868 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 869 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
869 int overlap_done = 0; 870 int overlap_done = 0;
870 871
871 if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) { 872 if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) {
873 log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id);
872 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; 874 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
873 overlap_done = 1; 875 overlap_done = 1;
874 goto out_del; 876 goto out_del;
875 } 877 }
876 878
877 if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) { 879 if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) {
880 log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id);
878 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; 881 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
879 overlap_done = 1; 882 overlap_done = 1;
880 goto out_del; 883 goto out_del;
881 } 884 }
882 885
886 /* Cancel state was preemptively cleared by a successful convert,
887 see next comment, nothing to do. */
888
889 if ((mstype == DLM_MSG_CANCEL_REPLY) &&
890 (lkb->lkb_wait_type != DLM_MSG_CANCEL)) {
891 log_debug(ls, "remwait %x cancel_reply wait_type %d",
892 lkb->lkb_id, lkb->lkb_wait_type);
893 return -1;
894 }
895
896 /* Remove for the convert reply, and preemptively remove for the
897 cancel reply. A convert has been granted while there's still
898 an outstanding cancel on it (the cancel is moot and the result
899 in the cancel reply should be 0). We preempt the cancel reply
900 because the app gets the convert result and then can follow up
901 with another op, like convert. This subsequent op would see the
902 lingering state of the cancel and fail with -EBUSY. */
903
904 if ((mstype == DLM_MSG_CONVERT_REPLY) &&
905 (lkb->lkb_wait_type == DLM_MSG_CONVERT) &&
906 is_overlap_cancel(lkb) && ms && !ms->m_result) {
907 log_debug(ls, "remwait %x convert_reply zap overlap_cancel",
908 lkb->lkb_id);
909 lkb->lkb_wait_type = 0;
910 lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
911 lkb->lkb_wait_count--;
912 goto out_del;
913 }
914
883 /* N.B. type of reply may not always correspond to type of original 915 /* N.B. type of reply may not always correspond to type of original
884 msg due to lookup->request optimization, verify others? */ 916 msg due to lookup->request optimization, verify others? */
885 917
@@ -888,8 +920,8 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
888 goto out_del; 920 goto out_del;
889 } 921 }
890 922
891 log_error(ls, "remove_from_waiters lkid %x flags %x types %d %d", 923 log_error(ls, "remwait error %x reply %d flags %x no wait_type",
892 lkb->lkb_id, lkb->lkb_flags, mstype, lkb->lkb_wait_type); 924 lkb->lkb_id, mstype, lkb->lkb_flags);
893 return -1; 925 return -1;
894 926
895 out_del: 927 out_del:
@@ -899,7 +931,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
899 this would happen */ 931 this would happen */
900 932
901 if (overlap_done && lkb->lkb_wait_type) { 933 if (overlap_done && lkb->lkb_wait_type) {
902 log_error(ls, "remove_from_waiters %x reply %d give up on %d", 934 log_error(ls, "remwait error %x reply %d wait_type %d overlap",
903 lkb->lkb_id, mstype, lkb->lkb_wait_type); 935 lkb->lkb_id, mstype, lkb->lkb_wait_type);
904 lkb->lkb_wait_count--; 936 lkb->lkb_wait_count--;
905 lkb->lkb_wait_type = 0; 937 lkb->lkb_wait_type = 0;
@@ -921,7 +953,7 @@ static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
921 int error; 953 int error;
922 954
923 mutex_lock(&ls->ls_waiters_mutex); 955 mutex_lock(&ls->ls_waiters_mutex);
924 error = _remove_from_waiters(lkb, mstype); 956 error = _remove_from_waiters(lkb, mstype, NULL);
925 mutex_unlock(&ls->ls_waiters_mutex); 957 mutex_unlock(&ls->ls_waiters_mutex);
926 return error; 958 return error;
927} 959}
@@ -936,7 +968,7 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
936 968
937 if (ms != &ls->ls_stub_ms) 969 if (ms != &ls->ls_stub_ms)
938 mutex_lock(&ls->ls_waiters_mutex); 970 mutex_lock(&ls->ls_waiters_mutex);
939 error = _remove_from_waiters(lkb, ms->m_type); 971 error = _remove_from_waiters(lkb, ms->m_type, ms);
940 if (ms != &ls->ls_stub_ms) 972 if (ms != &ls->ls_stub_ms)
941 mutex_unlock(&ls->ls_waiters_mutex); 973 mutex_unlock(&ls->ls_waiters_mutex);
942 return error; 974 return error;
@@ -2083,6 +2115,11 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2083 lkb->lkb_timeout_cs = args->timeout; 2115 lkb->lkb_timeout_cs = args->timeout;
2084 rv = 0; 2116 rv = 0;
2085 out: 2117 out:
2118 if (rv)
2119 log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s",
2120 rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
2121 lkb->lkb_status, lkb->lkb_wait_type,
2122 lkb->lkb_resource->res_name);
2086 return rv; 2123 return rv;
2087} 2124}
2088 2125
@@ -2149,6 +2186,13 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
2149 goto out; 2186 goto out;
2150 } 2187 }
2151 2188
2189 /* there's nothing to cancel */
2190 if (lkb->lkb_status == DLM_LKSTS_GRANTED &&
2191 !lkb->lkb_wait_type) {
2192 rv = -EBUSY;
2193 goto out;
2194 }
2195
2152 switch (lkb->lkb_wait_type) { 2196 switch (lkb->lkb_wait_type) {
2153 case DLM_MSG_LOOKUP: 2197 case DLM_MSG_LOOKUP:
2154 case DLM_MSG_REQUEST: 2198 case DLM_MSG_REQUEST:
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index aa32e5f02493..cd8e2df3c295 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -487,7 +487,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
487 goto out_lkbfree; 487 goto out_lkbfree;
488 for (i = 0; i < size; i++) { 488 for (i = 0; i < size; i++) {
489 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list); 489 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
490 rwlock_init(&ls->ls_dirtbl[i].lock); 490 spin_lock_init(&ls->ls_dirtbl[i].lock);
491 } 491 }
492 492
493 INIT_LIST_HEAD(&ls->ls_waiters); 493 INIT_LIST_HEAD(&ls->ls_waiters);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 103a5ebd1371..609108a83267 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -21,7 +21,7 @@
21 * 21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are 22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to 23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is it's 24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's 25 * responsibility. It is this layer's
26 * responsibility to resolve these into IP address or 26 * responsibility to resolve these into IP address or
27 * whatever it needs for inter-node communication. 27 * whatever it needs for inter-node communication.
@@ -36,9 +36,9 @@
36 * of high load. Also, this way, the sending thread can collect together 36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block. 37 * messages bound for one node and send them in one block.
38 * 38 *
39 * lowcomms will choose to use wither TCP or SCTP as its transport layer 39 * lowcomms will choose to use either TCP or SCTP as its transport layer
40 * depending on the configuration variable 'protocol'. This should be set 40 * depending on the configuration variable 'protocol'. This should be set
41 * to 0 (default) for TCP or 1 for SCTP. It shouldbe configured using a 41 * to 0 (default) for TCP or 1 for SCTP. It should be configured using a
42 * cluster-wide mechanism as it must be the same on all nodes of the cluster 42 * cluster-wide mechanism as it must be the same on all nodes of the cluster
43 * for the DLM to function. 43 * for the DLM to function.
44 * 44 *
@@ -48,11 +48,11 @@
48#include <net/sock.h> 48#include <net/sock.h>
49#include <net/tcp.h> 49#include <net/tcp.h>
50#include <linux/pagemap.h> 50#include <linux/pagemap.h>
51#include <linux/idr.h>
52#include <linux/file.h> 51#include <linux/file.h>
53#include <linux/mutex.h> 52#include <linux/mutex.h>
54#include <linux/sctp.h> 53#include <linux/sctp.h>
55#include <net/sctp/user.h> 54#include <net/sctp/user.h>
55#include <net/ipv6.h>
56 56
57#include "dlm_internal.h" 57#include "dlm_internal.h"
58#include "lowcomms.h" 58#include "lowcomms.h"
@@ -60,6 +60,7 @@
60#include "config.h" 60#include "config.h"
61 61
62#define NEEDED_RMEM (4*1024*1024) 62#define NEEDED_RMEM (4*1024*1024)
63#define CONN_HASH_SIZE 32
63 64
64struct cbuf { 65struct cbuf {
65 unsigned int base; 66 unsigned int base;
@@ -114,6 +115,7 @@ struct connection {
114 int retries; 115 int retries;
115#define MAX_CONNECT_RETRIES 3 116#define MAX_CONNECT_RETRIES 3
116 int sctp_assoc; 117 int sctp_assoc;
118 struct hlist_node list;
117 struct connection *othercon; 119 struct connection *othercon;
118 struct work_struct rwork; /* Receive workqueue */ 120 struct work_struct rwork; /* Receive workqueue */
119 struct work_struct swork; /* Send workqueue */ 121 struct work_struct swork; /* Send workqueue */
@@ -138,14 +140,37 @@ static int dlm_local_count;
138static struct workqueue_struct *recv_workqueue; 140static struct workqueue_struct *recv_workqueue;
139static struct workqueue_struct *send_workqueue; 141static struct workqueue_struct *send_workqueue;
140 142
141static DEFINE_IDR(connections_idr); 143static struct hlist_head connection_hash[CONN_HASH_SIZE];
142static DEFINE_MUTEX(connections_lock); 144static DEFINE_MUTEX(connections_lock);
143static int max_nodeid;
144static struct kmem_cache *con_cache; 145static struct kmem_cache *con_cache;
145 146
146static void process_recv_sockets(struct work_struct *work); 147static void process_recv_sockets(struct work_struct *work);
147static void process_send_sockets(struct work_struct *work); 148static void process_send_sockets(struct work_struct *work);
148 149
150
151/* This is deliberately very simple because most clusters have simple
152 sequential nodeids, so we should be able to go straight to a connection
153 struct in the array */
154static inline int nodeid_hash(int nodeid)
155{
156 return nodeid & (CONN_HASH_SIZE-1);
157}
158
159static struct connection *__find_con(int nodeid)
160{
161 int r;
162 struct hlist_node *h;
163 struct connection *con;
164
165 r = nodeid_hash(nodeid);
166
167 hlist_for_each_entry(con, h, &connection_hash[r], list) {
168 if (con->nodeid == nodeid)
169 return con;
170 }
171 return NULL;
172}
173
149/* 174/*
150 * If 'allocation' is zero then we don't attempt to create a new 175 * If 'allocation' is zero then we don't attempt to create a new
151 * connection structure for this node. 176 * connection structure for this node.
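
The mask in nodeid_hash() is only a modulo because CONN_HASH_SIZE is a power of two; sequential nodeids then land in distinct buckets, which is the "deliberately very simple" property the comment relies on. A standalone sketch with the power-of-two invariant made explicit:

#include <assert.h>
#include <stdio.h>

#define CONN_HASH_SIZE 32

/* same mask as nodeid_hash(); only valid for power-of-two sizes */
static int nodeid_hash(int nodeid)
{
        return nodeid & (CONN_HASH_SIZE - 1);
}

int main(void)
{
        /* power-of-two check: exactly one bit set */
        assert((CONN_HASH_SIZE & (CONN_HASH_SIZE - 1)) == 0);

        for (int n = 1; n <= 4; n++)
                printf("nodeid %d -> bucket %d\n", n, nodeid_hash(n));
        /* nodeids 1..32 fill 32 distinct buckets; 33 collides with 1 */
        return 0;
}
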
@@ -154,31 +179,17 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
154{ 179{
155 struct connection *con = NULL; 180 struct connection *con = NULL;
156 int r; 181 int r;
157 int n;
158 182
159 con = idr_find(&connections_idr, nodeid); 183 con = __find_con(nodeid);
160 if (con || !alloc) 184 if (con || !alloc)
161 return con; 185 return con;
162 186
163 r = idr_pre_get(&connections_idr, alloc);
164 if (!r)
165 return NULL;
166
167 con = kmem_cache_zalloc(con_cache, alloc); 187 con = kmem_cache_zalloc(con_cache, alloc);
168 if (!con) 188 if (!con)
169 return NULL; 189 return NULL;
170 190
171 r = idr_get_new_above(&connections_idr, con, nodeid, &n); 191 r = nodeid_hash(nodeid);
172 if (r) { 192 hlist_add_head(&con->list, &connection_hash[r]);
173 kmem_cache_free(con_cache, con);
174 return NULL;
175 }
176
177 if (n != nodeid) {
178 idr_remove(&connections_idr, n);
179 kmem_cache_free(con_cache, con);
180 return NULL;
181 }
182 193
183 con->nodeid = nodeid; 194 con->nodeid = nodeid;
184 mutex_init(&con->sock_mutex); 195 mutex_init(&con->sock_mutex);
@@ -189,19 +200,30 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
189 200
190 /* Setup action pointers for child sockets */ 201 /* Setup action pointers for child sockets */
191 if (con->nodeid) { 202 if (con->nodeid) {
192 struct connection *zerocon = idr_find(&connections_idr, 0); 203 struct connection *zerocon = __find_con(0);
193 204
194 con->connect_action = zerocon->connect_action; 205 con->connect_action = zerocon->connect_action;
195 if (!con->rx_action) 206 if (!con->rx_action)
196 con->rx_action = zerocon->rx_action; 207 con->rx_action = zerocon->rx_action;
197 } 208 }
198 209
199 if (nodeid > max_nodeid)
200 max_nodeid = nodeid;
201
202 return con; 210 return con;
203} 211}
204 212
213/* Loop round all connections */
214static void foreach_conn(void (*conn_func)(struct connection *c))
215{
216 int i;
217 struct hlist_node *h, *n;
218 struct connection *con;
219
220 for (i = 0; i < CONN_HASH_SIZE; i++) {
221 hlist_for_each_entry_safe(con, h, n, &connection_hash[i], list){
222 conn_func(con);
223 }
224 }
225}
226
205static struct connection *nodeid2con(int nodeid, gfp_t allocation) 227static struct connection *nodeid2con(int nodeid, gfp_t allocation)
206{ 228{
207 struct connection *con; 229 struct connection *con;
@@ -217,14 +239,17 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
217static struct connection *assoc2con(int assoc_id) 239static struct connection *assoc2con(int assoc_id)
218{ 240{
219 int i; 241 int i;
242 struct hlist_node *h;
220 struct connection *con; 243 struct connection *con;
221 244
222 mutex_lock(&connections_lock); 245 mutex_lock(&connections_lock);
223 for (i=0; i<=max_nodeid; i++) { 246
224 con = __nodeid2con(i, 0); 247 for (i = 0 ; i < CONN_HASH_SIZE; i++) {
225 if (con && con->sctp_assoc == assoc_id) { 248 hlist_for_each_entry(con, h, &connection_hash[i], list) {
226 mutex_unlock(&connections_lock); 249 if (con && con->sctp_assoc == assoc_id) {
227 return con; 250 mutex_unlock(&connections_lock);
251 return con;
252 }
228 } 253 }
229 } 254 }
230 mutex_unlock(&connections_lock); 255 mutex_unlock(&connections_lock);
@@ -250,8 +275,7 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
250 } else { 275 } else {
251 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; 276 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
252 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; 277 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
253 memcpy(&ret6->sin6_addr, &in6->sin6_addr, 278 ipv6_addr_copy(&ret6->sin6_addr, &in6->sin6_addr);
254 sizeof(in6->sin6_addr));
255 } 279 }
256 280
257 return 0; 281 return 0;
@@ -376,25 +400,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
376 log_print("send EOF to node failed: %d", ret); 400 log_print("send EOF to node failed: %d", ret);
377} 401}
378 402
403static void sctp_init_failed_foreach(struct connection *con)
404{
405 con->sctp_assoc = 0;
406 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
407 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
408 queue_work(send_workqueue, &con->swork);
409 }
410}
411
379/* INIT failed but we don't know which node... 412/* INIT failed but we don't know which node...
380 restart INIT on all pending nodes */ 413 restart INIT on all pending nodes */
381static void sctp_init_failed(void) 414static void sctp_init_failed(void)
382{ 415{
383 int i;
384 struct connection *con;
385
386 mutex_lock(&connections_lock); 416 mutex_lock(&connections_lock);
387 for (i=1; i<=max_nodeid; i++) { 417
388 con = __nodeid2con(i, 0); 418 foreach_conn(sctp_init_failed_foreach);
389 if (!con) 419
390 continue;
391 con->sctp_assoc = 0;
392 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
393 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
394 queue_work(send_workqueue, &con->swork);
395 }
396 }
397 }
398 mutex_unlock(&connections_lock); 420 mutex_unlock(&connections_lock);
399} 421}
400 422
@@ -1313,13 +1335,10 @@ out_connect:
1313 1335
1314static void clean_one_writequeue(struct connection *con) 1336static void clean_one_writequeue(struct connection *con)
1315{ 1337{
1316 struct list_head *list; 1338 struct writequeue_entry *e, *safe;
1317 struct list_head *temp;
1318 1339
1319 spin_lock(&con->writequeue_lock); 1340 spin_lock(&con->writequeue_lock);
1320 list_for_each_safe(list, temp, &con->writequeue) { 1341 list_for_each_entry_safe(e, safe, &con->writequeue, list) {
1321 struct writequeue_entry *e =
1322 list_entry(list, struct writequeue_entry, list);
1323 list_del(&e->list); 1342 list_del(&e->list);
1324 free_entry(e); 1343 free_entry(e);
1325 } 1344 }
@@ -1369,14 +1388,7 @@ static void process_send_sockets(struct work_struct *work)
1369/* Discard all entries on the write queues */ 1388/* Discard all entries on the write queues */
1370static void clean_writequeues(void) 1389static void clean_writequeues(void)
1371{ 1390{
1372 int nodeid; 1391 foreach_conn(clean_one_writequeue);
1373
1374 for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
1375 struct connection *con = __nodeid2con(nodeid, 0);
1376
1377 if (con)
1378 clean_one_writequeue(con);
1379 }
1380} 1392}
1381 1393
1382static void work_stop(void) 1394static void work_stop(void)
@@ -1406,23 +1418,29 @@ static int work_start(void)
1406 return 0; 1418 return 0;
1407} 1419}
1408 1420
1409void dlm_lowcomms_stop(void) 1421static void stop_conn(struct connection *con)
1410{ 1422{
1411 int i; 1423 con->flags |= 0x0F;
1412 struct connection *con; 1424 if (con->sock)
1425 con->sock->sk->sk_user_data = NULL;
1426}
1413 1427
1428static void free_conn(struct connection *con)
1429{
1430 close_connection(con, true);
1431 if (con->othercon)
1432 kmem_cache_free(con_cache, con->othercon);
1433 hlist_del(&con->list);
1434 kmem_cache_free(con_cache, con);
1435}
1436
1437void dlm_lowcomms_stop(void)
1438{
1414 /* Set all the flags to prevent any 1439 /* Set all the flags to prevent any
1415 socket activity. 1440 socket activity.
1416 */ 1441 */
1417 mutex_lock(&connections_lock); 1442 mutex_lock(&connections_lock);
1418 for (i = 0; i <= max_nodeid; i++) { 1443 foreach_conn(stop_conn);
1419 con = __nodeid2con(i, 0);
1420 if (con) {
1421 con->flags |= 0x0F;
1422 if (con->sock)
1423 con->sock->sk->sk_user_data = NULL;
1424 }
1425 }
1426 mutex_unlock(&connections_lock); 1444 mutex_unlock(&connections_lock);
1427 1445
1428 work_stop(); 1446 work_stop();
@@ -1430,25 +1448,20 @@ void dlm_lowcomms_stop(void)
1430 mutex_lock(&connections_lock); 1448 mutex_lock(&connections_lock);
1431 clean_writequeues(); 1449 clean_writequeues();
1432 1450
1433 for (i = 0; i <= max_nodeid; i++) { 1451 foreach_conn(free_conn);
1434 con = __nodeid2con(i, 0); 1452
1435 if (con) {
1436 close_connection(con, true);
1437 if (con->othercon)
1438 kmem_cache_free(con_cache, con->othercon);
1439 kmem_cache_free(con_cache, con);
1440 }
1441 }
1442 max_nodeid = 0;
1443 mutex_unlock(&connections_lock); 1453 mutex_unlock(&connections_lock);
1444 kmem_cache_destroy(con_cache); 1454 kmem_cache_destroy(con_cache);
1445 idr_init(&connections_idr);
1446} 1455}
1447 1456
1448int dlm_lowcomms_start(void) 1457int dlm_lowcomms_start(void)
1449{ 1458{
1450 int error = -EINVAL; 1459 int error = -EINVAL;
1451 struct connection *con; 1460 struct connection *con;
1461 int i;
1462
1463 for (i = 0; i < CONN_HASH_SIZE; i++)
1464 INIT_HLIST_HEAD(&connection_hash[i]);
1452 1465
1453 init_local(); 1466 init_local();
1454 if (!dlm_local_count) { 1467 if (!dlm_local_count) {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 065149e84f42..ebce994ab0b7 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -84,7 +84,7 @@ struct dlm_lock_result32 {
84 84
85static void compat_input(struct dlm_write_request *kb, 85static void compat_input(struct dlm_write_request *kb,
86 struct dlm_write_request32 *kb32, 86 struct dlm_write_request32 *kb32,
87 size_t count) 87 int namelen)
88{ 88{
89 kb->version[0] = kb32->version[0]; 89 kb->version[0] = kb32->version[0];
90 kb->version[1] = kb32->version[1]; 90 kb->version[1] = kb32->version[1];
@@ -96,8 +96,7 @@ static void compat_input(struct dlm_write_request *kb,
96 kb->cmd == DLM_USER_REMOVE_LOCKSPACE) { 96 kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
97 kb->i.lspace.flags = kb32->i.lspace.flags; 97 kb->i.lspace.flags = kb32->i.lspace.flags;
98 kb->i.lspace.minor = kb32->i.lspace.minor; 98 kb->i.lspace.minor = kb32->i.lspace.minor;
99 memcpy(kb->i.lspace.name, kb32->i.lspace.name, count - 99 memcpy(kb->i.lspace.name, kb32->i.lspace.name, namelen);
100 offsetof(struct dlm_write_request32, i.lspace.name));
101 } else if (kb->cmd == DLM_USER_PURGE) { 100 } else if (kb->cmd == DLM_USER_PURGE) {
102 kb->i.purge.nodeid = kb32->i.purge.nodeid; 101 kb->i.purge.nodeid = kb32->i.purge.nodeid;
103 kb->i.purge.pid = kb32->i.purge.pid; 102 kb->i.purge.pid = kb32->i.purge.pid;
@@ -115,8 +114,7 @@ static void compat_input(struct dlm_write_request *kb,
115 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; 114 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
116 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; 115 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
117 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); 116 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
118 memcpy(kb->i.lock.name, kb32->i.lock.name, count - 117 memcpy(kb->i.lock.name, kb32->i.lock.name, namelen);
119 offsetof(struct dlm_write_request32, i.lock.name));
120 } 118 }
121} 119}
122 120
@@ -539,9 +537,16 @@ static ssize_t device_write(struct file *file, const char __user *buf,
539#ifdef CONFIG_COMPAT 537#ifdef CONFIG_COMPAT
540 if (!kbuf->is64bit) { 538 if (!kbuf->is64bit) {
541 struct dlm_write_request32 *k32buf; 539 struct dlm_write_request32 *k32buf;
540 int namelen = 0;
541
542 if (count > sizeof(struct dlm_write_request32))
543 namelen = count - sizeof(struct dlm_write_request32);
544
542 k32buf = (struct dlm_write_request32 *)kbuf; 545 k32buf = (struct dlm_write_request32 *)kbuf;
543 kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) - 546
544 sizeof(struct dlm_write_request32)), GFP_KERNEL); 547 /* add 1 after namelen so that the name string is terminated */
548 kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
549 GFP_KERNEL);
545 if (!kbuf) { 550 if (!kbuf) {
546 kfree(k32buf); 551 kfree(k32buf);
547 return -ENOMEM; 552 return -ENOMEM;
@@ -549,7 +554,8 @@ static ssize_t device_write(struct file *file, const char __user *buf,
549 554
550 if (proc) 555 if (proc)
551 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); 556 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
552 compat_input(kbuf, k32buf, count + 1); 557
558 compat_input(kbuf, k32buf, namelen);
553 kfree(k32buf); 559 kfree(k32buf);
554 } 560 }
555#endif 561#endif
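The device_write() change above computes the trailing name length from the write size up front and allocates a zeroed native request with one spare byte, so the name copied by compat_input() is always NUL-terminated; the old offsetof() arithmetic inside compat_input() goes away. A userspace sketch of the same allocation logic, using illustrative stand-in structs rather than the real dlm_write_request layouts:

    #include <stdlib.h>
    #include <string.h>

    /* Stand-ins for the fixed-size request headers; layouts are assumed. */
    struct req32 { int cmd; char name[]; };
    struct req   { int cmd; long extra; char name[]; };

    struct req *convert(const struct req32 *in, size_t count)
    {
        size_t namelen = 0;
        struct req *out;

        /* Whatever follows the fixed header is the (unterminated) name. */
        if (count > sizeof(struct req32))
            namelen = count - sizeof(struct req32);

        /* Zeroed allocation plus one byte guarantees NUL termination. */
        out = calloc(1, sizeof(struct req) + namelen + 1);
        if (!out)
            return NULL;

        out->cmd = in->cmd;
        memcpy(out->name, in->name, namelen);
        return out;
    }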
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 3e5637fc3779..b6a719a909f8 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb)
18 18
19 spin_lock(&inode_lock); 19 spin_lock(&inode_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE)) 21 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
22 continue; 22 continue;
23 if (inode->i_mapping->nrpages == 0) 23 if (inode->i_mapping->nrpages == 0)
24 continue; 24 continue;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index f6caeb1d1106..8b65f289ee00 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -946,6 +946,8 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
946 list_for_each_entry(global_auth_tok, 946 list_for_each_entry(global_auth_tok,
947 &mount_crypt_stat->global_auth_tok_list, 947 &mount_crypt_stat->global_auth_tok_list,
948 mount_crypt_stat_list) { 948 mount_crypt_stat_list) {
949 if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK)
950 continue;
949 rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); 951 rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
950 if (rc) { 952 if (rc) {
951 printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); 953 printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
@@ -1322,14 +1324,13 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
1322} 1324}
1323 1325
1324static int 1326static int
1325ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, 1327ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry,
1326 struct dentry *ecryptfs_dentry, 1328 char *virt, size_t virt_len)
1327 char *virt)
1328{ 1329{
1329 int rc; 1330 int rc;
1330 1331
1331 rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, 1332 rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt,
1332 0, crypt_stat->num_header_bytes_at_front); 1333 0, virt_len);
1333 if (rc) 1334 if (rc)
1334 printk(KERN_ERR "%s: Error attempting to write header " 1335 printk(KERN_ERR "%s: Error attempting to write header "
1335 "information to lower file; rc = [%d]\n", __func__, 1336 "information to lower file; rc = [%d]\n", __func__,
@@ -1339,7 +1340,6 @@ ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat,
1339 1340
1340static int 1341static int
1341ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, 1342ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
1342 struct ecryptfs_crypt_stat *crypt_stat,
1343 char *page_virt, size_t size) 1343 char *page_virt, size_t size)
1344{ 1344{
1345 int rc; 1345 int rc;
@@ -1349,6 +1349,17 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
1349 return rc; 1349 return rc;
1350} 1350}
1351 1351
1352static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
1353 unsigned int order)
1354{
1355 struct page *page;
1356
1357 page = alloc_pages(gfp_mask | __GFP_ZERO, order);
1358 if (page)
1359 return (unsigned long) page_address(page);
1360 return 0;
1361}
1362
1352/** 1363/**
1353 * ecryptfs_write_metadata 1364 * ecryptfs_write_metadata
1354 * @ecryptfs_dentry: The eCryptfs dentry 1365 * @ecryptfs_dentry: The eCryptfs dentry
@@ -1365,7 +1376,9 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1365{ 1376{
1366 struct ecryptfs_crypt_stat *crypt_stat = 1377 struct ecryptfs_crypt_stat *crypt_stat =
1367 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; 1378 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
1379 unsigned int order;
1368 char *virt; 1380 char *virt;
1381 size_t virt_len;
1369 size_t size = 0; 1382 size_t size = 0;
1370 int rc = 0; 1383 int rc = 0;
1371 1384
@@ -1381,33 +1394,35 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1381 rc = -EINVAL; 1394 rc = -EINVAL;
1382 goto out; 1395 goto out;
1383 } 1396 }
1397 virt_len = crypt_stat->num_header_bytes_at_front;
1398 order = get_order(virt_len);
1384 /* Released in this function */ 1399 /* Released in this function */
1385 virt = (char *)get_zeroed_page(GFP_KERNEL); 1400 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
1386 if (!virt) { 1401 if (!virt) {
1387 printk(KERN_ERR "%s: Out of memory\n", __func__); 1402 printk(KERN_ERR "%s: Out of memory\n", __func__);
1388 rc = -ENOMEM; 1403 rc = -ENOMEM;
1389 goto out; 1404 goto out;
1390 } 1405 }
1391 rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size, 1406 rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat,
1392 crypt_stat, ecryptfs_dentry); 1407 ecryptfs_dentry);
1393 if (unlikely(rc)) { 1408 if (unlikely(rc)) {
1394 printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", 1409 printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n",
1395 __func__, rc); 1410 __func__, rc);
1396 goto out_free; 1411 goto out_free;
1397 } 1412 }
1398 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 1413 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
1399 rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, 1414 rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
1400 crypt_stat, virt, size); 1415 size);
1401 else 1416 else
1402 rc = ecryptfs_write_metadata_to_contents(crypt_stat, 1417 rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt,
1403 ecryptfs_dentry, virt); 1418 virt_len);
1404 if (rc) { 1419 if (rc) {
1405 printk(KERN_ERR "%s: Error writing metadata out to lower file; " 1420 printk(KERN_ERR "%s: Error writing metadata out to lower file; "
1406 "rc = [%d]\n", __func__, rc); 1421 "rc = [%d]\n", __func__, rc);
1407 goto out_free; 1422 goto out_free;
1408 } 1423 }
1409out_free: 1424out_free:
1410 free_page((unsigned long)virt); 1425 free_pages((unsigned long)virt, order);
1411out: 1426out:
1412 return rc; 1427 return rc;
1413} 1428}
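The allocation rework above stops assuming the header region fits one page: virt_len comes from num_header_bytes_at_front, get_order() converts that byte count into a power-of-two page order for alloc_pages(), and free_pages() must be passed the same order. A userspace restatement of the order computation, assuming 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SIZE  4096UL
    #define PAGE_SHIFT 12

    /* Smallest order such that 2^order pages cover `size` bytes,
     * mirroring the kernel's get_order(). */
    unsigned int order_for(unsigned long size)
    {
        unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned int order = 0;

        while ((1UL << order) < pages)
            order++;
        return order;
    }

    int main(void)
    {
        /* 12 KiB of headers needs 3 pages, so order 2 (4 pages). */
        printf("order(%lu) = %u\n", 12288UL, order_for(12288UL));
        return 0;
    }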
@@ -2206,17 +2221,19 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
2206 struct dentry *ecryptfs_dir_dentry, 2221 struct dentry *ecryptfs_dir_dentry,
2207 const char *name, size_t name_size) 2222 const char *name, size_t name_size)
2208{ 2223{
2224 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2225 &ecryptfs_superblock_to_private(
2226 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2209 char *decoded_name; 2227 char *decoded_name;
2210 size_t decoded_name_size; 2228 size_t decoded_name_size;
2211 size_t packet_size; 2229 size_t packet_size;
2212 int rc = 0; 2230 int rc = 0;
2213 2231
2214 if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) 2232 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
2233 && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
2234 && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
2215 && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, 2235 && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2216 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { 2236 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
2217 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2218 &ecryptfs_superblock_to_private(
2219 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2220 const char *orig_name = name; 2237 const char *orig_name = name;
2221 size_t orig_name_size = name_size; 2238 size_t orig_name_size = name_size;
2222 2239
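The decode path is also reordered so the cheap mount-wide checks run first: the FNEK prefix comparison only happens when the mount encrypts filenames and the encrypted view is disabled. A compact sketch of that gating; the flag values and prefix string below are assumptions for illustration, not the eCryptfs constants:

    #include <string.h>

    #define FLAG_ENCRYPT_FILENAMES 0x1
    #define FLAG_ENCRYPTED_VIEW    0x2
    #define FNEK_PREFIX            "ECRYPTFS_FNEK_ENCRYPTED."

    /* Only attempt filename decryption when the mount enables it, the
     * encrypted view is off, and the name can carry the marker prefix. */
    int should_decrypt(unsigned int flags, const char *name, size_t len)
    {
        return (flags & FLAG_ENCRYPT_FILENAMES) &&
               !(flags & FLAG_ENCRYPTED_VIEW) &&
               len > strlen(FNEK_PREFIX) &&
               strncmp(name, FNEK_PREFIX, strlen(FNEK_PREFIX)) == 0;
    }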
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 5e596583946c..2dda5ade75bc 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -89,7 +89,7 @@ static void ecryptfs_d_release(struct dentry *dentry)
89 return; 89 return;
90} 90}
91 91
92struct dentry_operations ecryptfs_dops = { 92const struct dentry_operations ecryptfs_dops = {
93 .d_revalidate = ecryptfs_d_revalidate, 93 .d_revalidate = ecryptfs_d_revalidate,
94 .d_release = ecryptfs_d_release, 94 .d_release = ecryptfs_d_release,
95}; 95};
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c11fc95714ab..064c5820e4e5 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -328,6 +328,7 @@ struct ecryptfs_dentry_info {
328 */ 328 */
329struct ecryptfs_global_auth_tok { 329struct ecryptfs_global_auth_tok {
330#define ECRYPTFS_AUTH_TOK_INVALID 0x00000001 330#define ECRYPTFS_AUTH_TOK_INVALID 0x00000001
331#define ECRYPTFS_AUTH_TOK_FNEK 0x00000002
331 u32 flags; 332 u32 flags;
332 struct list_head mount_crypt_stat_list; 333 struct list_head mount_crypt_stat_list;
333 struct key *global_auth_tok_key; 334 struct key *global_auth_tok_key;
@@ -579,7 +580,7 @@ extern const struct inode_operations ecryptfs_main_iops;
579extern const struct inode_operations ecryptfs_dir_iops; 580extern const struct inode_operations ecryptfs_dir_iops;
580extern const struct inode_operations ecryptfs_symlink_iops; 581extern const struct inode_operations ecryptfs_symlink_iops;
581extern const struct super_operations ecryptfs_sops; 582extern const struct super_operations ecryptfs_sops;
582extern struct dentry_operations ecryptfs_dops; 583extern const struct dentry_operations ecryptfs_dops;
583extern struct address_space_operations ecryptfs_aops; 584extern struct address_space_operations ecryptfs_aops;
584extern int ecryptfs_verbosity; 585extern int ecryptfs_verbosity;
585extern unsigned int ecryptfs_message_buf_len; 586extern unsigned int ecryptfs_message_buf_len;
@@ -619,7 +620,6 @@ int ecryptfs_interpose(struct dentry *hidden_dentry,
619 u32 flags); 620 u32 flags);
620int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 621int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
621 struct dentry *lower_dentry, 622 struct dentry *lower_dentry,
622 struct ecryptfs_crypt_stat *crypt_stat,
623 struct inode *ecryptfs_dir_inode, 623 struct inode *ecryptfs_dir_inode,
624 struct nameidata *ecryptfs_nd); 624 struct nameidata *ecryptfs_nd);
625int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 625int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
@@ -696,7 +696,7 @@ ecryptfs_write_header_metadata(char *virt,
696int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig); 696int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig);
697int 697int
698ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 698ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
699 char *sig); 699 char *sig, u32 global_auth_tok_flags);
700int ecryptfs_get_global_auth_tok_for_sig( 700int ecryptfs_get_global_auth_tok_for_sig(
701 struct ecryptfs_global_auth_tok **global_auth_tok, 701 struct ecryptfs_global_auth_tok **global_auth_tok,
702 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig); 702 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5697899a168d..55b3145b8072 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -246,7 +246,6 @@ out:
246 */ 246 */
247int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 247int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
248 struct dentry *lower_dentry, 248 struct dentry *lower_dentry,
249 struct ecryptfs_crypt_stat *crypt_stat,
250 struct inode *ecryptfs_dir_inode, 249 struct inode *ecryptfs_dir_inode,
251 struct nameidata *ecryptfs_nd) 250 struct nameidata *ecryptfs_nd)
252{ 251{
@@ -254,6 +253,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
254 struct vfsmount *lower_mnt; 253 struct vfsmount *lower_mnt;
255 struct inode *lower_inode; 254 struct inode *lower_inode;
256 struct ecryptfs_mount_crypt_stat *mount_crypt_stat; 255 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
256 struct ecryptfs_crypt_stat *crypt_stat;
257 char *page_virt = NULL; 257 char *page_virt = NULL;
258 u64 file_size; 258 u64 file_size;
259 int rc = 0; 259 int rc = 0;
@@ -314,6 +314,11 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
314 goto out_free_kmem; 314 goto out_free_kmem;
315 } 315 }
316 } 316 }
317 crypt_stat = &ecryptfs_inode_to_private(
318 ecryptfs_dentry->d_inode)->crypt_stat;
319 /* TODO: lock for crypt_stat comparison */
320 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
321 ecryptfs_set_default_sizes(crypt_stat);
317 rc = ecryptfs_read_and_validate_header_region(page_virt, 322 rc = ecryptfs_read_and_validate_header_region(page_virt,
318 ecryptfs_dentry->d_inode); 323 ecryptfs_dentry->d_inode);
319 if (rc) { 324 if (rc) {
@@ -362,9 +367,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
362{ 367{
363 char *encrypted_and_encoded_name = NULL; 368 char *encrypted_and_encoded_name = NULL;
364 size_t encrypted_and_encoded_name_size; 369 size_t encrypted_and_encoded_name_size;
365 struct ecryptfs_crypt_stat *crypt_stat = NULL;
366 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; 370 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
367 struct ecryptfs_inode_info *inode_info;
368 struct dentry *lower_dir_dentry, *lower_dentry; 371 struct dentry *lower_dir_dentry, *lower_dentry;
369 int rc = 0; 372 int rc = 0;
370 373
@@ -388,26 +391,15 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
388 } 391 }
389 if (lower_dentry->d_inode) 392 if (lower_dentry->d_inode)
390 goto lookup_and_interpose; 393 goto lookup_and_interpose;
391 inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); 394 mount_crypt_stat = &ecryptfs_superblock_to_private(
392 if (inode_info) { 395 ecryptfs_dentry->d_sb)->mount_crypt_stat;
393 crypt_stat = &inode_info->crypt_stat; 396 if (!(mount_crypt_stat
394 /* TODO: lock for crypt_stat comparison */ 397 && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
395 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
396 ecryptfs_set_default_sizes(crypt_stat);
397 }
398 if (crypt_stat)
399 mount_crypt_stat = crypt_stat->mount_crypt_stat;
400 else
401 mount_crypt_stat = &ecryptfs_superblock_to_private(
402 ecryptfs_dentry->d_sb)->mount_crypt_stat;
403 if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
404 && !(mount_crypt_stat && (mount_crypt_stat->flags
405 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
406 goto lookup_and_interpose; 398 goto lookup_and_interpose;
407 dput(lower_dentry); 399 dput(lower_dentry);
408 rc = ecryptfs_encrypt_and_encode_filename( 400 rc = ecryptfs_encrypt_and_encode_filename(
409 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, 401 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
410 crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name, 402 NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name,
411 ecryptfs_dentry->d_name.len); 403 ecryptfs_dentry->d_name.len);
412 if (rc) { 404 if (rc) {
413 printk(KERN_ERR "%s: Error attempting to encrypt and encode " 405 printk(KERN_ERR "%s: Error attempting to encrypt and encode "
@@ -426,7 +418,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
426 } 418 }
427lookup_and_interpose: 419lookup_and_interpose:
428 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, 420 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
429 crypt_stat, ecryptfs_dir_inode, 421 ecryptfs_dir_inode,
430 ecryptfs_nd); 422 ecryptfs_nd);
431 goto out; 423 goto out;
432out_d_drop: 424out_d_drop:
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ff539420cc6f..af737bb56cb7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -740,8 +740,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
740out_release_free_unlock: 740out_release_free_unlock:
741 crypto_free_hash(s->hash_desc.tfm); 741 crypto_free_hash(s->hash_desc.tfm);
742out_free_unlock: 742out_free_unlock:
743 memset(s->block_aligned_filename, 0, s->block_aligned_filename_size); 743 kzfree(s->block_aligned_filename);
744 kfree(s->block_aligned_filename);
745out_unlock: 744out_unlock:
746 mutex_unlock(s->tfm_mutex); 745 mutex_unlock(s->tfm_mutex);
747out: 746out:
@@ -2375,7 +2374,7 @@ struct kmem_cache *ecryptfs_global_auth_tok_cache;
2375 2374
2376int 2375int
2377ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 2376ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2378 char *sig) 2377 char *sig, u32 global_auth_tok_flags)
2379{ 2378{
2380 struct ecryptfs_global_auth_tok *new_auth_tok; 2379 struct ecryptfs_global_auth_tok *new_auth_tok;
2381 int rc = 0; 2380 int rc = 0;
@@ -2389,6 +2388,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2389 goto out; 2388 goto out;
2390 } 2389 }
2391 memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); 2390 memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX);
2391 new_auth_tok->flags = global_auth_tok_flags;
2392 new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; 2392 new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
2393 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); 2393 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
2394 list_add(&new_auth_tok->mount_crypt_stat_list, 2394 list_add(&new_auth_tok->mount_crypt_stat_list,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 789cf2e1be1e..aed56c25539b 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -319,7 +319,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
319 case ecryptfs_opt_ecryptfs_sig: 319 case ecryptfs_opt_ecryptfs_sig:
320 sig_src = args[0].from; 320 sig_src = args[0].from;
321 rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, 321 rc = ecryptfs_add_global_auth_tok(mount_crypt_stat,
322 sig_src); 322 sig_src, 0);
323 if (rc) { 323 if (rc) {
324 printk(KERN_ERR "Error attempting to register " 324 printk(KERN_ERR "Error attempting to register "
325 "global sig; rc = [%d]\n", rc); 325 "global sig; rc = [%d]\n", rc);
@@ -370,7 +370,8 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
370 ECRYPTFS_SIG_SIZE_HEX] = '\0'; 370 ECRYPTFS_SIG_SIZE_HEX] = '\0';
371 rc = ecryptfs_add_global_auth_tok( 371 rc = ecryptfs_add_global_auth_tok(
372 mount_crypt_stat, 372 mount_crypt_stat,
373 mount_crypt_stat->global_default_fnek_sig); 373 mount_crypt_stat->global_default_fnek_sig,
374 ECRYPTFS_AUTH_TOK_FNEK);
374 if (rc) { 375 if (rc) {
375 printk(KERN_ERR "Error attempting to register " 376 printk(KERN_ERR "Error attempting to register "
376 "global fnek sig [%s]; rc = [%d]\n", 377 "global fnek sig [%s]; rc = [%d]\n",
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 96ef51489e01..295e7fa56755 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -291,8 +291,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
291 if (daemon->user_ns) 291 if (daemon->user_ns)
292 put_user_ns(daemon->user_ns); 292 put_user_ns(daemon->user_ns);
293 mutex_unlock(&daemon->mux); 293 mutex_unlock(&daemon->mux);
294 memset(daemon, 0, sizeof(*daemon)); 294 kzfree(daemon);
295 kfree(daemon);
296out: 295out:
297 return rc; 296 return rc;
298} 297}
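Both eCryptfs hunks above collapse a memset-then-kfree pair into kzfree(), which zeroes the buffer before returning it to the allocator so key material does not linger on the heap. A userspace analogue, assuming the caller tracks the size (the kernel version recovers it via ksize()); the volatile function pointer keeps the compiler from discarding the memset as a dead store, the job explicit_bzero() does where available:

    #include <stdlib.h>
    #include <string.h>

    /* Indirect call the optimizer cannot prove side-effect free, so the
     * zeroing survives even though the buffer is freed right after. */
    static void *(*volatile memset_v)(void *, int, size_t) = memset;

    void secure_free(void *p, size_t n)
    {
        if (!p)
            return;
        memset_v(p, 0, n);
        free(p);
    }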
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 73b19cfc91fc..f04942810818 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -329,18 +329,22 @@ out_no_fs:
329} 329}
330 330
331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { 331static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
332 struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb); 332 struct super_block *sb = dentry->d_sb;
333 struct efs_sb_info *sbi = SUPER_INFO(sb);
334 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
333 335
334 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ 336 buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */
335 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ 337 buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */
336 buf->f_blocks = sb->total_groups * /* total data blocks */ 338 buf->f_blocks = sbi->total_groups * /* total data blocks */
337 (sb->group_size - sb->inode_blocks); 339 (sbi->group_size - sbi->inode_blocks);
338 buf->f_bfree = sb->data_free; /* free data blocks */ 340 buf->f_bfree = sbi->data_free; /* free data blocks */
339 buf->f_bavail = sb->data_free; /* free blocks for non-root */ 341 buf->f_bavail = sbi->data_free; /* free blocks for non-root */
340 buf->f_files = sb->total_groups * /* total inodes */ 342 buf->f_files = sbi->total_groups * /* total inodes */
341 sb->inode_blocks * 343 sbi->inode_blocks *
342 (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); 344 (EFS_BLOCKSIZE / sizeof(struct efs_dinode));
343 buf->f_ffree = sb->inode_free; /* free inodes */ 345 buf->f_ffree = sbi->inode_free; /* free inodes */
346 buf->f_fsid.val[0] = (u32)id;
347 buf->f_fsid.val[1] = (u32)(id >> 32);
344 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ 348 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */
345 349
346 return 0; 350 return 0;
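efs_statfs() also begins reporting f_fsid: the backing device number is encoded into a 64-bit identifier and split across the two 32-bit halves of f_fsid, low word in val[0] and high word in val[1]. The split is plain bit arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t id = 0x0000000800000021ULL;  /* illustrative encoded dev_t */
        uint32_t val0 = (uint32_t)id;         /* low 32 bits  -> f_fsid.val[0] */
        uint32_t val1 = (uint32_t)(id >> 32); /* high 32 bits -> f_fsid.val[1] */

        printf("f_fsid = { 0x%08x, 0x%08x }\n", val0, val1);
        return 0;
    }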
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 5de2c2db3aa2..2a701d593d35 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -28,6 +28,7 @@ struct eventfd_ctx {
28 * issue a wakeup. 28 * issue a wakeup.
29 */ 29 */
30 __u64 count; 30 __u64 count;
31 unsigned int flags;
31}; 32};
32 33
33/* 34/*
@@ -50,7 +51,7 @@ int eventfd_signal(struct file *file, int n)
50 n = (int) (ULLONG_MAX - ctx->count); 51 n = (int) (ULLONG_MAX - ctx->count);
51 ctx->count += n; 52 ctx->count += n;
52 if (waitqueue_active(&ctx->wqh)) 53 if (waitqueue_active(&ctx->wqh))
53 wake_up_locked(&ctx->wqh); 54 wake_up_locked_poll(&ctx->wqh, POLLIN);
54 spin_unlock_irqrestore(&ctx->wqh.lock, flags); 55 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
55 56
56 return n; 57 return n;
@@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
87{ 88{
88 struct eventfd_ctx *ctx = file->private_data; 89 struct eventfd_ctx *ctx = file->private_data;
89 ssize_t res; 90 ssize_t res;
90 __u64 ucnt; 91 __u64 ucnt = 0;
91 DECLARE_WAITQUEUE(wait, current); 92 DECLARE_WAITQUEUE(wait, current);
92 93
93 if (count < sizeof(ucnt)) 94 if (count < sizeof(ucnt))
94 return -EINVAL; 95 return -EINVAL;
95 spin_lock_irq(&ctx->wqh.lock); 96 spin_lock_irq(&ctx->wqh.lock);
96 res = -EAGAIN; 97 res = -EAGAIN;
97 ucnt = ctx->count; 98 if (ctx->count > 0)
98 if (ucnt > 0)
99 res = sizeof(ucnt); 99 res = sizeof(ucnt);
100 else if (!(file->f_flags & O_NONBLOCK)) { 100 else if (!(file->f_flags & O_NONBLOCK)) {
101 __add_wait_queue(&ctx->wqh, &wait); 101 __add_wait_queue(&ctx->wqh, &wait);
102 for (res = 0;;) { 102 for (res = 0;;) {
103 set_current_state(TASK_INTERRUPTIBLE); 103 set_current_state(TASK_INTERRUPTIBLE);
104 if (ctx->count > 0) { 104 if (ctx->count > 0) {
105 ucnt = ctx->count;
106 res = sizeof(ucnt); 105 res = sizeof(ucnt);
107 break; 106 break;
108 } 107 }
@@ -117,10 +116,11 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
117 __remove_wait_queue(&ctx->wqh, &wait); 116 __remove_wait_queue(&ctx->wqh, &wait);
118 __set_current_state(TASK_RUNNING); 117 __set_current_state(TASK_RUNNING);
119 } 118 }
120 if (res > 0) { 119 if (likely(res > 0)) {
121 ctx->count = 0; 120 ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
121 ctx->count -= ucnt;
122 if (waitqueue_active(&ctx->wqh)) 122 if (waitqueue_active(&ctx->wqh))
123 wake_up_locked(&ctx->wqh); 123 wake_up_locked_poll(&ctx->wqh, POLLOUT);
124 } 124 }
125 spin_unlock_irq(&ctx->wqh.lock); 125 spin_unlock_irq(&ctx->wqh.lock);
126 if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) 126 if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
@@ -166,10 +166,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
166 __remove_wait_queue(&ctx->wqh, &wait); 166 __remove_wait_queue(&ctx->wqh, &wait);
167 __set_current_state(TASK_RUNNING); 167 __set_current_state(TASK_RUNNING);
168 } 168 }
169 if (res > 0) { 169 if (likely(res > 0)) {
170 ctx->count += ucnt; 170 ctx->count += ucnt;
171 if (waitqueue_active(&ctx->wqh)) 171 if (waitqueue_active(&ctx->wqh))
172 wake_up_locked(&ctx->wqh); 172 wake_up_locked_poll(&ctx->wqh, POLLIN);
173 } 173 }
174 spin_unlock_irq(&ctx->wqh.lock); 174 spin_unlock_irq(&ctx->wqh.lock);
175 175
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
207 BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); 207 BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
208 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); 208 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
209 209
210 if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) 210 if (flags & ~EFD_FLAGS_SET)
211 return -EINVAL; 211 return -EINVAL;
212 212
213 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 213 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
@@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
216 216
217 init_waitqueue_head(&ctx->wqh); 217 init_waitqueue_head(&ctx->wqh);
218 ctx->count = count; 218 ctx->count = count;
219 ctx->flags = flags;
219 220
220 /* 221 /*
221 * When we call this, the initialization must be complete, since 222 * When we call this, the initialization must be complete, since
222 * anon_inode_getfd() will install the fd. 223 * anon_inode_getfd() will install the fd.
223 */ 224 */
224 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, 225 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
225 flags & (O_CLOEXEC | O_NONBLOCK)); 226 flags & EFD_SHARED_FCNTL_FLAGS);
226 if (fd < 0) 227 if (fd < 0)
227 kfree(ctx); 228 kfree(ctx);
228 return fd; 229 return fd;
@@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
232{ 233{
233 return sys_eventfd2(count, 0); 234 return sys_eventfd2(count, 0);
234} 235}
236
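The eventfd changes store the creation flags in the context and consult them in eventfd_read(): with EFD_SEMAPHORE set, a read returns 1 and decrements the counter by one, instead of returning the whole count and resetting it to zero. A small runnable demonstration, which needs a kernel new enough to accept EFD_SEMAPHORE:

    #include <stdio.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/eventfd.h>

    int main(void)
    {
        uint64_t v;
        int fd = eventfd(3, EFD_SEMAPHORE); /* counter starts at 3 */

        if (fd < 0) {
            perror("eventfd");
            return 1;
        }
        /* Semaphore mode: each read yields 1 and decrements by one,
         * so exactly three reads succeed before the fd would block. */
        for (int i = 0; i < 3; i++) {
            if (read(fd, &v, sizeof(v)) == sizeof(v))
                printf("read %llu\n", (unsigned long long)v);
        }
        close(fd);
        return 0;
    }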
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 011b9b8c90c6..a89f370fadb5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * fs/eventpoll.c (Efficent event polling implementation) 2 * fs/eventpoll.c (Efficient event retrieval implementation)
3 * Copyright (C) 2001,...,2007 Davide Libenzi 3 * Copyright (C) 2001,...,2009 Davide Libenzi
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -71,29 +71,11 @@
71 * a better scalability. 71 * a better scalability.
72 */ 72 */
73 73
74#define DEBUG_EPOLL 0
75
76#if DEBUG_EPOLL > 0
77#define DPRINTK(x) printk x
78#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
79#else /* #if DEBUG_EPOLL > 0 */
80#define DPRINTK(x) (void) 0
81#define DNPRINTK(n, x) (void) 0
82#endif /* #if DEBUG_EPOLL > 0 */
83
84#define DEBUG_EPI 0
85
86#if DEBUG_EPI != 0
87#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
88#else /* #if DEBUG_EPI != 0 */
89#define EPI_SLAB_DEBUG 0
90#endif /* #if DEBUG_EPI != 0 */
91
92/* Epoll private bits inside the event mask */ 74/* Epoll private bits inside the event mask */
93#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) 75#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
94 76
95/* Maximum number of poll wake up nests we are allowing */ 77/* Maximum number of nesting allowed inside epoll sets */
96#define EP_MAX_POLLWAKE_NESTS 4 78#define EP_MAX_NESTS 4
97 79
98/* Maximum msec timeout value storeable in a long int */ 80/* Maximum msec timeout value storeable in a long int */
99#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) 81#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
@@ -110,24 +92,21 @@ struct epoll_filefd {
110}; 92};
111 93
112/* 94/*
113 * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". 95 * Structure used to track possible nested calls, guarding against too-deep
114 * It is used to keep track on all tasks that are currently inside the wake_up() code 96 * recursion and loop cycles.
115 * to 1) short-circuit the one coming from the same task and same wait queue head
116 * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
117 * 3) let go the ones coming from other tasks.
118 */ 97 */
119struct wake_task_node { 98struct nested_call_node {
120 struct list_head llink; 99 struct list_head llink;
121 struct task_struct *task; 100 void *cookie;
122 wait_queue_head_t *wq; 101 int cpu;
123}; 102};
124 103
125/* 104/*
126 * This is used to implement the safe poll wake up avoiding to reenter 105 * This structure is used as collector for nested calls, to check for
127 * the poll callback from inside wake_up(). 106 * maximum recursion dept and loop cycles.
128 */ 107 */
129struct poll_safewake { 108struct nested_calls {
130 struct list_head wake_task_list; 109 struct list_head tasks_call_list;
131 spinlock_t lock; 110 spinlock_t lock;
132}; 111};
133 112
@@ -213,7 +192,7 @@ struct eppoll_entry {
213 struct list_head llink; 192 struct list_head llink;
214 193
215 /* The "base" pointer is set to the container "struct epitem" */ 194 /* The "base" pointer is set to the container "struct epitem" */
216 void *base; 195 struct epitem *base;
217 196
218 /* 197 /*
219 * Wait queue item that will be linked to the target file wait 198 * Wait queue item that will be linked to the target file wait
@@ -231,6 +210,12 @@ struct ep_pqueue {
231 struct epitem *epi; 210 struct epitem *epi;
232}; 211};
233 212
213/* Used by the ep_send_events() function as callback private data */
214struct ep_send_events_data {
215 int maxevents;
216 struct epoll_event __user *events;
217};
218
234/* 219/*
235 * Configuration options available inside /proc/sys/fs/epoll/ 220 * Configuration options available inside /proc/sys/fs/epoll/
236 */ 221 */
@@ -242,8 +227,11 @@ static int max_user_watches __read_mostly;
242 */ 227 */
243static DEFINE_MUTEX(epmutex); 228static DEFINE_MUTEX(epmutex);
244 229
245/* Safe wake up implementation */ 230/* Used for safe wake up implementation */
246static struct poll_safewake psw; 231static struct nested_calls poll_safewake_ncalls;
232
233/* Used to call file's f_op->poll() under the nested calls boundaries */
234static struct nested_calls poll_readywalk_ncalls;
247 235
248/* Slab cache used to allocate "struct epitem" */ 236/* Slab cache used to allocate "struct epitem" */
249static struct kmem_cache *epi_cache __read_mostly; 237static struct kmem_cache *epi_cache __read_mostly;
@@ -312,89 +300,230 @@ static inline int ep_op_has_event(int op)
312} 300}
313 301
314/* Initialize the poll safe wake up structure */ 302/* Initialize the poll safe wake up structure */
315static void ep_poll_safewake_init(struct poll_safewake *psw) 303static void ep_nested_calls_init(struct nested_calls *ncalls)
316{ 304{
317 305 INIT_LIST_HEAD(&ncalls->tasks_call_list);
318 INIT_LIST_HEAD(&psw->wake_task_list); 306 spin_lock_init(&ncalls->lock);
319 spin_lock_init(&psw->lock);
320} 307}
321 308
322/* 309/**
323 * Perform a safe wake up of the poll wait list. The problem is that 310 * ep_call_nested - Perform a bounded (possibly nested) call, by checking
324 * with the new callback'd wake up system, it is possible that the 311 * that the recursion limit is not exceeded, and that
325 * poll callback is reentered from inside the call to wake_up() done 312 * the same nested call (identified by the same cookie) is
326 * on the poll wait queue head. The rule is that we cannot reenter the 313 * not re-entered.
327 * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, 314 *
328 * and we cannot reenter the same wait queue head at all. This will 315 * @ncalls: Pointer to the nested_calls structure to be used for this call.
329 * enable to have a hierarchy of epoll file descriptor of no more than 316 * @max_nests: Maximum number of allowed nesting calls.
330 * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock 317 * @nproc: Nested call core function pointer.
331 * because this one gets called by the poll callback, that in turn is called 318 * @priv: Opaque data to be passed to the @nproc callback.
332 * from inside a wake_up(), that might be called from irq context. 319 * @cookie: Cookie to be used to identify this nested call.
320 *
321 * Returns: Returns the code returned by the @nproc callback, or -1 if
322 * the maximum recursion limit has been exceeded.
333 */ 323 */
334static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) 324static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
325 int (*nproc)(void *, void *, int), void *priv,
326 void *cookie)
335{ 327{
336 int wake_nests = 0; 328 int error, call_nests = 0;
337 unsigned long flags; 329 unsigned long flags;
338 struct task_struct *this_task = current; 330 int this_cpu = get_cpu();
339 struct list_head *lsthead = &psw->wake_task_list; 331 struct list_head *lsthead = &ncalls->tasks_call_list;
340 struct wake_task_node *tncur; 332 struct nested_call_node *tncur;
341 struct wake_task_node tnode; 333 struct nested_call_node tnode;
342 334
343 spin_lock_irqsave(&psw->lock, flags); 335 spin_lock_irqsave(&ncalls->lock, flags);
344 336
345 /* Try to see if the current task is already inside this wakeup call */ 337 /*
338 * Try to see if the current task is already inside this wakeup call.
339 * We use a list here, since the population inside this set is always
340 * very much limited.
341 */
346 list_for_each_entry(tncur, lsthead, llink) { 342 list_for_each_entry(tncur, lsthead, llink) {
347 343 if (tncur->cpu == this_cpu &&
348 if (tncur->wq == wq || 344 (tncur->cookie == cookie || ++call_nests > max_nests)) {
349 (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
350 /* 345 /*
351 * Ops ... loop detected or maximum nest level reached. 346 * Ops ... loop detected or maximum nest level reached.
352 * We abort this wake by breaking the cycle itself. 347 * We abort this wake by breaking the cycle itself.
353 */ 348 */
354 spin_unlock_irqrestore(&psw->lock, flags); 349 error = -1;
355 return; 350 goto out_unlock;
356 } 351 }
357 } 352 }
358 353
359 /* Add the current task to the list */ 354 /* Add the current task and cookie to the list */
360 tnode.task = this_task; 355 tnode.cpu = this_cpu;
361 tnode.wq = wq; 356 tnode.cookie = cookie;
362 list_add(&tnode.llink, lsthead); 357 list_add(&tnode.llink, lsthead);
363 358
364 spin_unlock_irqrestore(&psw->lock, flags); 359 spin_unlock_irqrestore(&ncalls->lock, flags);
365 360
366 /* Do really wake up now */ 361 /* Call the nested function */
367 wake_up_nested(wq, 1 + wake_nests); 362 error = (*nproc)(priv, cookie, call_nests);
368 363
369 /* Remove the current task from the list */ 364 /* Remove the current task from the list */
370 spin_lock_irqsave(&psw->lock, flags); 365 spin_lock_irqsave(&ncalls->lock, flags);
371 list_del(&tnode.llink); 366 list_del(&tnode.llink);
372 spin_unlock_irqrestore(&psw->lock, flags); 367 out_unlock:
368 spin_unlock_irqrestore(&ncalls->lock, flags);
369
370 put_cpu();
371 return error;
372}
373
374#ifdef CONFIG_DEBUG_LOCK_ALLOC
375static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
376 unsigned long events, int subclass)
377{
378 unsigned long flags;
379
380 spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
381 wake_up_locked_poll(wqueue, events);
382 spin_unlock_irqrestore(&wqueue->lock, flags);
383}
384#else
385static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
386 unsigned long events, int subclass)
387{
388 wake_up_poll(wqueue, events);
389}
390#endif
391
392static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
393{
394 ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
395 1 + call_nests);
396 return 0;
397}
398
399/*
400 * Perform a safe wake up of the poll wait list. The problem is that
401 * with the new callback'd wake up system, it is possible that the
402 * poll callback is reentered from inside the call to wake_up() done
403 * on the poll wait queue head. The rule is that we cannot reenter the
404 * wake up code from the same task more than EP_MAX_NESTS times,
405 * and we cannot reenter the same wait queue head at all. This will
406 * allow a hierarchy of epoll file descriptors no more than
407 * EP_MAX_NESTS deep.
408 */
409static void ep_poll_safewake(wait_queue_head_t *wq)
410{
411 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
412 ep_poll_wakeup_proc, NULL, wq);
373} 413}
374 414
375/* 415/*
376 * This function unregister poll callbacks from the associated file descriptor. 416 * This function unregisters poll callbacks from the associated file
377 * Since this must be called without holding "ep->lock" the atomic exchange trick 417 * descriptor. Must be called with "mtx" held (or "epmutex" if called from
378 * will protect us from multiple unregister. 418 * ep_free).
379 */ 419 */
380static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) 420static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
381{ 421{
382 int nwait;
383 struct list_head *lsthead = &epi->pwqlist; 422 struct list_head *lsthead = &epi->pwqlist;
384 struct eppoll_entry *pwq; 423 struct eppoll_entry *pwq;
385 424
386 /* This is called without locks, so we need the atomic exchange */ 425 while (!list_empty(lsthead)) {
387 nwait = xchg(&epi->nwait, 0); 426 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
388 427
389 if (nwait) { 428 list_del(&pwq->llink);
390 while (!list_empty(lsthead)) { 429 remove_wait_queue(pwq->whead, &pwq->wait);
391 pwq = list_first_entry(lsthead, struct eppoll_entry, llink); 430 kmem_cache_free(pwq_cache, pwq);
431 }
432}
392 433
393 list_del_init(&pwq->llink); 434/**
394 remove_wait_queue(pwq->whead, &pwq->wait); 435 * ep_scan_ready_list - Scans the ready list in a way that makes it possible for
395 kmem_cache_free(pwq_cache, pwq); 436 * the scan code to call f_op->poll(). Also allows for
396 } 437 * O(NumReady) performance.
438 *
439 * @ep: Pointer to the epoll private data structure.
440 * @sproc: Pointer to the scan callback.
441 * @priv: Private opaque data passed to the @sproc callback.
442 *
443 * Returns: The same integer error code returned by the @sproc callback.
444 */
445static int ep_scan_ready_list(struct eventpoll *ep,
446 int (*sproc)(struct eventpoll *,
447 struct list_head *, void *),
448 void *priv)
449{
450 int error, pwake = 0;
451 unsigned long flags;
452 struct epitem *epi, *nepi;
453 LIST_HEAD(txlist);
454
455 /*
456 * We need to lock this because we could be hit by
457 * eventpoll_release_file() and epoll_ctl().
458 */
459 mutex_lock(&ep->mtx);
460
461 /*
462 * Steal the ready list, and re-init the original one to the
463 * empty list. Also, set ep->ovflist to NULL so that events
464 * happening while looping w/out locks, are not lost. We cannot
465 * have the poll callback to queue directly on ep->rdllist,
466 * because we want the "sproc" callback to be able to do it
467 * in a lockless way.
468 */
469 spin_lock_irqsave(&ep->lock, flags);
470 list_splice_init(&ep->rdllist, &txlist);
471 ep->ovflist = NULL;
472 spin_unlock_irqrestore(&ep->lock, flags);
473
474 /*
475 * Now call the callback function.
476 */
477 error = (*sproc)(ep, &txlist, priv);
478
479 spin_lock_irqsave(&ep->lock, flags);
480 /*
481 * During the time we spent inside the "sproc" callback, some
482 * other events might have been queued by the poll callback.
483 * We re-insert them inside the main ready-list here.
484 */
485 for (nepi = ep->ovflist; (epi = nepi) != NULL;
486 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
487 /*
488 * We need to check if the item is already in the list.
489 * During the "sproc" callback execution time, items are
490 * queued into ->ovflist but the "txlist" might already
491 * contain them, and the list_splice() below takes care of them.
492 */
493 if (!ep_is_linked(&epi->rdllink))
494 list_add_tail(&epi->rdllink, &ep->rdllist);
495 }
496 /*
497 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
498 * releasing the lock, events will be queued in the normal way inside
499 * ep->rdllist.
500 */
501 ep->ovflist = EP_UNACTIVE_PTR;
502
503 /*
504 * Quickly re-inject items left on "txlist".
505 */
506 list_splice(&txlist, &ep->rdllist);
507
508 if (!list_empty(&ep->rdllist)) {
509 /*
510 * Wake up (if active) both the eventpoll wait list and
511 * the ->poll() wait list (delayed after we release the lock).
512 */
513 if (waitqueue_active(&ep->wq))
514 wake_up_locked(&ep->wq);
515 if (waitqueue_active(&ep->poll_wait))
516 pwake++;
397 } 517 }
518 spin_unlock_irqrestore(&ep->lock, flags);
519
520 mutex_unlock(&ep->mtx);
521
522 /* We have to call this outside the lock */
523 if (pwake)
524 ep_poll_safewake(&ep->poll_wait);
525
526 return error;
398} 527}
399 528
400/* 529/*
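ep_call_nested() generalizes the old safewake logic into a reusable guard: before invoking the callback it scans the list of in-flight calls and bails out when the same cookie is already present (a loop through the same object) or when more than max_nests calls are stacked on the current CPU. A single-threaded userspace sketch of that guard, dropping the per-CPU tracking and the spinlock the kernel needs:

    #define MAX_NESTS 4

    struct call_node {
        struct call_node *next;
        void *cookie;
    };

    static struct call_node *active_calls; /* no lock: single-threaded sketch */

    /* Bounded, possibly nested call: -1 on loop or excessive depth,
     * otherwise the callback's return value. */
    int call_nested(int (*nproc)(void *priv, int nests), void *priv,
                    void *cookie)
    {
        int nests = 0;

        for (struct call_node *n = active_calls; n; n = n->next)
            if (n->cookie == cookie || ++nests > MAX_NESTS)
                return -1; /* loop detected or maximum nest level reached */

        struct call_node me = { active_calls, cookie };

        active_calls = &me;           /* push this frame */
        int ret = nproc(priv, nests);
        active_calls = me.next;       /* pop it again */
        return ret;
    }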
@@ -417,10 +546,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
417 ep_unregister_pollwait(ep, epi); 546 ep_unregister_pollwait(ep, epi);
418 547
419 /* Remove the current item from the list of epoll hooks */ 548 /* Remove the current item from the list of epoll hooks */
420 spin_lock(&file->f_ep_lock); 549 spin_lock(&file->f_lock);
421 if (ep_is_linked(&epi->fllink)) 550 if (ep_is_linked(&epi->fllink))
422 list_del_init(&epi->fllink); 551 list_del_init(&epi->fllink);
423 spin_unlock(&file->f_ep_lock); 552 spin_unlock(&file->f_lock);
424 553
425 rb_erase(&epi->rbn, &ep->rbr); 554 rb_erase(&epi->rbn, &ep->rbr);
426 555
@@ -434,9 +563,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
434 563
435 atomic_dec(&ep->user->epoll_watches); 564 atomic_dec(&ep->user->epoll_watches);
436 565
437 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
438 current, ep, file));
439
440 return 0; 566 return 0;
441} 567}
442 568
@@ -447,7 +573,7 @@ static void ep_free(struct eventpoll *ep)
447 573
448 /* We need to release all tasks waiting for these file */ 574 /* We need to release all tasks waiting for these file */
449 if (waitqueue_active(&ep->poll_wait)) 575 if (waitqueue_active(&ep->poll_wait))
450 ep_poll_safewake(&psw, &ep->poll_wait); 576 ep_poll_safewake(&ep->poll_wait);
451 577
452 /* 578 /*
453 * We need to lock this because we could be hit by 579 * We need to lock this because we could be hit by
@@ -492,26 +618,54 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
492 if (ep) 618 if (ep)
493 ep_free(ep); 619 ep_free(ep);
494 620
495 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
496 return 0; 621 return 0;
497} 622}
498 623
624static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
625 void *priv)
626{
627 struct epitem *epi, *tmp;
628
629 list_for_each_entry_safe(epi, tmp, head, rdllink) {
630 if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
631 epi->event.events)
632 return POLLIN | POLLRDNORM;
633 else {
634 /*
635 * Item has been dropped into the ready list by the poll
636 * callback, but it's not actually ready, as far as
637 * caller requested events goes. We can remove it here.
638 */
639 list_del_init(&epi->rdllink);
640 }
641 }
642
643 return 0;
644}
645
646static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
647{
648 return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
649}
650
499static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 651static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
500{ 652{
501 unsigned int pollflags = 0; 653 int pollflags;
502 unsigned long flags;
503 struct eventpoll *ep = file->private_data; 654 struct eventpoll *ep = file->private_data;
504 655
505 /* Insert inside our poll wait queue */ 656 /* Insert inside our poll wait queue */
506 poll_wait(file, &ep->poll_wait, wait); 657 poll_wait(file, &ep->poll_wait, wait);
507 658
508 /* Check our condition */ 659 /*
509 spin_lock_irqsave(&ep->lock, flags); 660 * Proceed to find out if wanted events are really available inside
510 if (!list_empty(&ep->rdllist)) 661 * the ready list. This need to be done under ep_call_nested()
511 pollflags = POLLIN | POLLRDNORM; 662 * supervision, since the call to f_op->poll() done on listed files
512 spin_unlock_irqrestore(&ep->lock, flags); 663 * could re-enter here.
664 */
665 pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
666 ep_poll_readyevents_proc, ep, ep);
513 667
514 return pollflags; 668 return pollflags != -1 ? pollflags : 0;
515} 669}
516 670
517/* File callbacks that implement the eventpoll file behaviour */ 671/* File callbacks that implement the eventpoll file behaviour */
@@ -538,15 +692,17 @@ void eventpoll_release_file(struct file *file)
538 struct epitem *epi; 692 struct epitem *epi;
539 693
540 /* 694 /*
541 * We don't want to get "file->f_ep_lock" because it is not 695 * We don't want to get "file->f_lock" because it is not
542 * necessary. It is not necessary because we're in the "struct file" 696 * necessary. It is not necessary because we're in the "struct file"
543 * cleanup path, and this means that no one is using this file anymore. 697
544 * So, for example, epoll_ctl() cannot hit here sicne if we reach this 698 * So, for example, epoll_ctl() cannot hit here since if we reach this
545 * point, the file counter already went to zero and fget() would fail. 699 * point, the file counter already went to zero and fget() would fail.
546 * The only hit might come from ep_free() but by holding the mutex 700 * The only hit might come from ep_free() but by holding the mutex
547 * will correctly serialize the operation. We do need to acquire 701 * will correctly serialize the operation. We do need to acquire
548 * "ep->mtx" after "epmutex" because ep_remove() requires it when called 702 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
549 * from anywhere but ep_free(). 703 * from anywhere but ep_free().
704 *
705 * Besides, ep_remove() acquires the lock, so we can't hold it here.
550 */ 706 */
551 mutex_lock(&epmutex); 707 mutex_lock(&epmutex);
552 708
@@ -586,8 +742,6 @@ static int ep_alloc(struct eventpoll **pep)
586 742
587 *pep = ep; 743 *pep = ep;
588 744
589 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
590 current, ep));
591 return 0; 745 return 0;
592 746
593free_uid: 747free_uid:
@@ -621,9 +775,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
621 } 775 }
622 } 776 }
623 777
624 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
625 current, file, epir));
626
627 return epir; 778 return epir;
628} 779}
629 780
@@ -639,9 +790,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
639 struct epitem *epi = ep_item_from_wait(wait); 790 struct epitem *epi = ep_item_from_wait(wait);
640 struct eventpoll *ep = epi->ep; 791 struct eventpoll *ep = epi->ep;
641 792
642 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
643 current, epi->ffd.file, epi, ep));
644
645 spin_lock_irqsave(&ep->lock, flags); 793 spin_lock_irqsave(&ep->lock, flags);
646 794
647 /* 795 /*
@@ -654,6 +802,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
654 goto out_unlock; 802 goto out_unlock;
655 803
656 /* 804 /*
805 * Check the events coming with the callback. At this stage, not
806 * every device reports the events in the "key" parameter of the
807 * callback. We need to be able to handle both cases here, hence the
808 * test for "key" != NULL before the event match test.
809 */
810 if (key && !((unsigned long) key & epi->event.events))
811 goto out_unlock;
812
813 /*
657 * If we are transferring events to userspace, we can hold no locks 814
658 * (because we're accessing user memory, and because of linux f_op->poll() 815 * (because we're accessing user memory, and because of linux f_op->poll()
659 * semantics). All the events that happens during that period of time are 816 * semantics). All the events that happens during that period of time are
@@ -668,12 +825,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
668 } 825 }
669 826
670 /* If this file is already in the ready list we exit soon */ 827 /* If this file is already in the ready list we exit soon */
671 if (ep_is_linked(&epi->rdllink)) 828 if (!ep_is_linked(&epi->rdllink))
672 goto is_linked; 829 list_add_tail(&epi->rdllink, &ep->rdllist);
673
674 list_add_tail(&epi->rdllink, &ep->rdllist);
675 830
676is_linked:
677 /* 831 /*
678 * Wake up ( if active ) both the eventpoll wait list and the ->poll() 832 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
679 * wait list. 833 * wait list.
@@ -688,7 +842,7 @@ out_unlock:
688 842
689 /* We have to call this outside the lock */ 843 /* We have to call this outside the lock */
690 if (pwake) 844 if (pwake)
691 ep_poll_safewake(&psw, &ep->poll_wait); 845 ep_poll_safewake(&ep->poll_wait);
692 846
693 return 1; 847 return 1;
694} 848}
@@ -785,9 +939,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
785 goto error_unregister; 939 goto error_unregister;
786 940
787 /* Add the current item to the list of active epoll hook for this file */ 941 /* Add the current item to the list of active epoll hook for this file */
788 spin_lock(&tfile->f_ep_lock); 942 spin_lock(&tfile->f_lock);
789 list_add_tail(&epi->fllink, &tfile->f_ep_links); 943 list_add_tail(&epi->fllink, &tfile->f_ep_links);
790 spin_unlock(&tfile->f_ep_lock); 944 spin_unlock(&tfile->f_lock);
791 945
792 /* 946 /*
793 * Add the current item to the RB tree. All RB tree operations are 947 * Add the current item to the RB tree. All RB tree operations are
@@ -815,10 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
815 969
816 /* We have to call this outside the lock */ 970 /* We have to call this outside the lock */
817 if (pwake) 971 if (pwake)
818 ep_poll_safewake(&psw, &ep->poll_wait); 972 ep_poll_safewake(&ep->poll_wait);
819
820 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
821 current, ep, tfile, fd));
822 973
823 return 0; 974 return 0;
824 975
@@ -849,15 +1000,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
849{ 1000{
850 int pwake = 0; 1001 int pwake = 0;
851 unsigned int revents; 1002 unsigned int revents;
852 unsigned long flags;
853 1003
854 /* 1004 /*
855 * Set the new event interest mask before calling f_op->poll(), otherwise 1005 * Set the new event interest mask before calling f_op->poll();
856 * a potential race might occur. In fact if we do this operation inside 1006 * otherwise we might miss an event that happens between the
857 * the lock, an event might happen between the f_op->poll() call and the 1007 * f_op->poll() call and the new event set registering.
858 * new event set registering.
859 */ 1008 */
860 epi->event.events = event->events; 1009 epi->event.events = event->events;
1010 epi->event.data = event->data; /* protected by mtx */
861 1011
862 /* 1012 /*
863 * Get current event bits. We can safely use the file* here because 1013 * Get current event bits. We can safely use the file* here because
@@ -865,16 +1015,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
865 */ 1015 */
866 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); 1016 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
867 1017
868 spin_lock_irqsave(&ep->lock, flags);
869
870 /* Copy the data member from inside the lock */
871 epi->event.data = event->data;
872
873 /* 1018 /*
874 * If the item is "hot" and it is not registered inside the ready 1019 * If the item is "hot" and it is not registered inside the ready
875 * list, push it inside. 1020 * list, push it inside.
876 */ 1021 */
877 if (revents & event->events) { 1022 if (revents & event->events) {
1023 spin_lock_irq(&ep->lock);
878 if (!ep_is_linked(&epi->rdllink)) { 1024 if (!ep_is_linked(&epi->rdllink)) {
879 list_add_tail(&epi->rdllink, &ep->rdllist); 1025 list_add_tail(&epi->rdllink, &ep->rdllist);
880 1026
@@ -884,142 +1030,84 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
884 if (waitqueue_active(&ep->poll_wait)) 1030 if (waitqueue_active(&ep->poll_wait))
885 pwake++; 1031 pwake++;
886 } 1032 }
1033 spin_unlock_irq(&ep->lock);
887 } 1034 }
888 spin_unlock_irqrestore(&ep->lock, flags);
889 1035
890 /* We have to call this outside the lock */ 1036 /* We have to call this outside the lock */
891 if (pwake) 1037 if (pwake)
892 ep_poll_safewake(&psw, &ep->poll_wait); 1038 ep_poll_safewake(&ep->poll_wait);
893 1039
894 return 0; 1040 return 0;
895} 1041}
896 1042
897static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, 1043static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
898 int maxevents) 1044 void *priv)
899{ 1045{
900 int eventcnt, error = -EFAULT, pwake = 0; 1046 struct ep_send_events_data *esed = priv;
1047 int eventcnt;
901 unsigned int revents; 1048 unsigned int revents;
902 unsigned long flags; 1049 struct epitem *epi;
903 struct epitem *epi, *nepi; 1050 struct epoll_event __user *uevent;
904 struct list_head txlist;
905
906 INIT_LIST_HEAD(&txlist);
907
908 /*
909 * We need to lock this because we could be hit by
910 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
911 */
912 mutex_lock(&ep->mtx);
913
914 /*
915 * Steal the ready list, and re-init the original one to the
916 * empty list. Also, set ep->ovflist to NULL so that events
917 * happening while looping w/out locks, are not lost. We cannot
918 * have the poll callback to queue directly on ep->rdllist,
919 * because we are doing it in the loop below, in a lockless way.
920 */
921 spin_lock_irqsave(&ep->lock, flags);
922 list_splice(&ep->rdllist, &txlist);
923 INIT_LIST_HEAD(&ep->rdllist);
924 ep->ovflist = NULL;
925 spin_unlock_irqrestore(&ep->lock, flags);
926 1051
927 /* 1052 /*
928 * We can loop without lock because this is a task private list. 1053 * We can loop without lock because we are passed a task private list.
929 * We just splice'd out the ep->rdllist in ep_collect_ready_items(). 1054 * Items cannot vanish during the loop because ep_scan_ready_list() is
930 * Items cannot vanish during the loop because we are holding "mtx". 1055 * holding "mtx" during this call.
931 */ 1056 */
932 for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { 1057 for (eventcnt = 0, uevent = esed->events;
933 epi = list_first_entry(&txlist, struct epitem, rdllink); 1058 !list_empty(head) && eventcnt < esed->maxevents;) {
1059 epi = list_first_entry(head, struct epitem, rdllink);
934 1060
935 list_del_init(&epi->rdllink); 1061 list_del_init(&epi->rdllink);
936 1062
937 /* 1063 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
938 * Get the ready file event set. We can safely use the file 1064 epi->event.events;
939 * because we are holding the "mtx" and this will guarantee
940 * that both the file and the item will not vanish.
941 */
942 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
943 revents &= epi->event.events;
944 1065
945 /* 1066 /*
946 * Is the event mask intersect the caller-requested one, 1067 * If the event mask intersect the caller-requested one,
947 * deliver the event to userspace. Again, we are holding 1068 * deliver the event to userspace. Again, ep_scan_ready_list()
948 * "mtx", so no operations coming from userspace can change 1069 * is holding "mtx", so no operations coming from userspace
949 * the item. 1070 * can change the item.
950 */ 1071 */
951 if (revents) { 1072 if (revents) {
952 if (__put_user(revents, 1073 if (__put_user(revents, &uevent->events) ||
953 &events[eventcnt].events) || 1074 __put_user(epi->event.data, &uevent->data)) {
954 __put_user(epi->event.data, 1075 list_add(&epi->rdllink, head);
955 &events[eventcnt].data)) 1076 return eventcnt ? eventcnt : -EFAULT;
956 goto errxit; 1077 }
1078 eventcnt++;
1079 uevent++;
957 if (epi->event.events & EPOLLONESHOT) 1080 if (epi->event.events & EPOLLONESHOT)
958 epi->event.events &= EP_PRIVATE_BITS; 1081 epi->event.events &= EP_PRIVATE_BITS;
959 eventcnt++; 1082 else if (!(epi->event.events & EPOLLET)) {
1083 /*
1084 * If this file has been added with Level
1085 * Trigger mode, we need to insert back inside
1086 * the ready list, so that the next call to
1087 * epoll_wait() will check again the events
 1088 * availability. At this point, no one can insert
1089 * into ep->rdllist besides us. The epoll_ctl()
1090 * callers are locked out by
1091 * ep_scan_ready_list() holding "mtx" and the
1092 * poll callback will queue them in ep->ovflist.
1093 */
1094 list_add_tail(&epi->rdllink, &ep->rdllist);
1095 }
960 } 1096 }
961 /*
962 * At this point, no one can insert into ep->rdllist besides
963 * us. The epoll_ctl() callers are locked out by us holding
964 * "mtx" and the poll callback will queue them in ep->ovflist.
965 */
966 if (!(epi->event.events & EPOLLET) &&
967 (revents & epi->event.events))
968 list_add_tail(&epi->rdllink, &ep->rdllist);
969 }
970 error = 0;
971
972errxit:
973
974 spin_lock_irqsave(&ep->lock, flags);
975 /*
976 * During the time we spent in the loop above, some other events
977 * might have been queued by the poll callback. We re-insert them
978 * inside the main ready-list here.
979 */
980 for (nepi = ep->ovflist; (epi = nepi) != NULL;
981 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
982 /*
983 * If the above loop quit with errors, the epoll item might still
984 * be linked to "txlist", and the list_splice() done below will
985 * take care of those cases.
986 */
987 if (!ep_is_linked(&epi->rdllink))
988 list_add_tail(&epi->rdllink, &ep->rdllist);
989 } 1097 }
990 /*
991 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
992 * releasing the lock, events will be queued in the normal way inside
993 * ep->rdllist.
994 */
995 ep->ovflist = EP_UNACTIVE_PTR;
996 1098
997 /* 1099 return eventcnt;
998 * In case of error in the event-send loop, or in case the number of 1100}
999 * ready events exceeds the userspace limit, we need to splice the
1000 * "txlist" back inside ep->rdllist.
1001 */
1002 list_splice(&txlist, &ep->rdllist);
1003
1004 if (!list_empty(&ep->rdllist)) {
1005 /*
1006 * Wake up (if active) both the eventpoll wait list and the ->poll()
1007 * wait list (delayed after we release the lock).
1008 */
1009 if (waitqueue_active(&ep->wq))
1010 wake_up_locked(&ep->wq);
1011 if (waitqueue_active(&ep->poll_wait))
1012 pwake++;
1013 }
1014 spin_unlock_irqrestore(&ep->lock, flags);
1015 1101
1016 mutex_unlock(&ep->mtx); 1102static int ep_send_events(struct eventpoll *ep,
1103 struct epoll_event __user *events, int maxevents)
1104{
1105 struct ep_send_events_data esed;
1017 1106
1018 /* We have to call this outside the lock */ 1107 esed.maxevents = maxevents;
1019 if (pwake) 1108 esed.events = events;
1020 ep_poll_safewake(&psw, &ep->poll_wait);
1021 1109
1022 return eventcnt == 0 ? error: eventcnt; 1110 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1023} 1111}
1024 1112
1025static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1113static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
@@ -1031,7 +1119,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1031 wait_queue_t wait; 1119 wait_queue_t wait;
1032 1120
1033 /* 1121 /*
1034 * Calculate the timeout by checking for the "infinite" value ( -1 ) 1122 * Calculate the timeout by checking for the "infinite" value (-1)
1035 * and the overflow condition. The passed timeout is in milliseconds, 1123 * and the overflow condition. The passed timeout is in milliseconds,
1036 * that is why (t * HZ) / 1000. 1124 * that is why (t * HZ) / 1000.
1037 */ 1125 */
@@ -1074,9 +1162,8 @@ retry:
1074 1162
1075 set_current_state(TASK_RUNNING); 1163 set_current_state(TASK_RUNNING);
1076 } 1164 }
1077
1078 /* Is it worth trying to dig for events? */ 1165 /* Is it worth trying to dig for events? */
1079 eavail = !list_empty(&ep->rdllist); 1166 eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
1080 1167
1081 spin_unlock_irqrestore(&ep->lock, flags); 1168 spin_unlock_irqrestore(&ep->lock, flags);
1082 1169
@@ -1097,41 +1184,30 @@ retry:
1097 */ 1184 */
1098SYSCALL_DEFINE1(epoll_create1, int, flags) 1185SYSCALL_DEFINE1(epoll_create1, int, flags)
1099{ 1186{
1100 int error, fd = -1; 1187 int error;
1101 struct eventpoll *ep; 1188 struct eventpoll *ep = NULL;
1102 1189
1103 /* Check the EPOLL_* constant for consistency. */ 1190 /* Check the EPOLL_* constant for consistency. */
1104 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1191 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1105 1192
1106 if (flags & ~EPOLL_CLOEXEC) 1193 if (flags & ~EPOLL_CLOEXEC)
1107 return -EINVAL; 1194 return -EINVAL;
1108
1109 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
1110 current, flags));
1111
1112 /* 1195 /*
1113 * Create the internal data structure ( "struct eventpoll" ). 1196 * Create the internal data structure ("struct eventpoll").
1114 */ 1197 */
1115 error = ep_alloc(&ep); 1198 error = ep_alloc(&ep);
1116 if (error < 0) { 1199 if (error < 0)
1117 fd = error; 1200 return error;
1118 goto error_return;
1119 }
1120
1121 /* 1201 /*
1122 * Creates all the items needed to setup an eventpoll file. That is, 1202 * Creates all the items needed to setup an eventpoll file. That is,
1123 * a file structure and a free file descriptor. 1203 * a file structure and a free file descriptor.
1124 */ 1204 */
1125 fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1205 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1126 flags & O_CLOEXEC); 1206 flags & O_CLOEXEC);
1127 if (fd < 0) 1207 if (error < 0)
1128 ep_free(ep); 1208 ep_free(ep);
1129 1209
1130error_return: 1210 return error;
1131 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1132 current, flags, fd));
1133
1134 return fd;
1135} 1211}
1136 1212
1137SYSCALL_DEFINE1(epoll_create, int, size) 1213SYSCALL_DEFINE1(epoll_create, int, size)
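For reference, a minimal userspace sketch exercising the epoll_create1() path above (illustrative only, not part of the patch; error handling is reduced to a perror()):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	int epfd = epoll_create1(EPOLL_CLOEXEC); /* EPOLL_CLOEXEC == O_CLOEXEC */

	if (epfd < 0) {
		perror("epoll_create1"); /* e.g. EINVAL for unknown flags */
		return EXIT_FAILURE;
	}
	printf("epoll instance fd = %d\n", epfd);
	close(epfd);
	return 0;
}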
@@ -1156,9 +1232,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1156 struct epitem *epi; 1232 struct epitem *epi;
1157 struct epoll_event epds; 1233 struct epoll_event epds;
1158 1234
1159 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
1160 current, epfd, op, fd, event));
1161
1162 error = -EFAULT; 1235 error = -EFAULT;
1163 if (ep_op_has_event(op) && 1236 if (ep_op_has_event(op) &&
1164 copy_from_user(&epds, event, sizeof(struct epoll_event))) 1237 copy_from_user(&epds, event, sizeof(struct epoll_event)))
@@ -1209,7 +1282,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1209 case EPOLL_CTL_ADD: 1282 case EPOLL_CTL_ADD:
1210 if (!epi) { 1283 if (!epi) {
1211 epds.events |= POLLERR | POLLHUP; 1284 epds.events |= POLLERR | POLLHUP;
1212
1213 error = ep_insert(ep, &epds, tfile, fd); 1285 error = ep_insert(ep, &epds, tfile, fd);
1214 } else 1286 } else
1215 error = -EEXIST; 1287 error = -EEXIST;
@@ -1235,8 +1307,6 @@ error_tgt_fput:
1235error_fput: 1307error_fput:
1236 fput(file); 1308 fput(file);
1237error_return: 1309error_return:
1238 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
1239 current, epfd, op, fd, event, error));
1240 1310
1241 return error; 1311 return error;
1242} 1312}
@@ -1252,9 +1322,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1252 struct file *file; 1322 struct file *file;
1253 struct eventpoll *ep; 1323 struct eventpoll *ep;
1254 1324
1255 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
1256 current, epfd, events, maxevents, timeout));
1257
1258 /* The maximum number of events must be greater than zero */ 1325 /* The maximum number of events must be greater than zero */
1259 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) 1326 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1260 return -EINVAL; 1327 return -EINVAL;
@@ -1291,8 +1358,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1291error_fput: 1358error_fput:
1292 fput(file); 1359 fput(file);
1293error_return: 1360error_return:
1294 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
1295 current, epfd, events, maxevents, timeout, error));
1296 1361
1297 return error; 1362 return error;
1298} 1363}
@@ -1357,17 +1422,18 @@ static int __init eventpoll_init(void)
1357 EP_ITEM_COST; 1422 EP_ITEM_COST;
1358 1423
1359 /* Initialize the structure used to perform safe poll wait head wake ups */ 1424 /* Initialize the structure used to perform safe poll wait head wake ups */
1360 ep_poll_safewake_init(&psw); 1425 ep_nested_calls_init(&poll_safewake_ncalls);
1426
1427 /* Initialize the structure used to perform file's f_op->poll() calls */
1428 ep_nested_calls_init(&poll_readywalk_ncalls);
1361 1429
1362 /* Allocates slab cache used to allocate "struct epitem" items */ 1430 /* Allocates slab cache used to allocate "struct epitem" items */
1363 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 1431 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
1364 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC, 1432 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1365 NULL);
1366 1433
1367 /* Allocates slab cache used to allocate "struct eppoll_entry" */ 1434 /* Allocates slab cache used to allocate "struct eppoll_entry" */
1368 pwq_cache = kmem_cache_create("eventpoll_pwq", 1435 pwq_cache = kmem_cache_create("eventpoll_pwq",
1369 sizeof(struct eppoll_entry), 0, 1436 sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
1370 EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
1371 1437
1372 return 0; 1438 return 0;
1373} 1439}
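The EPOLLONESHOT and level-trigger handling in ep_send_events_proc() above is exactly what userspace observes across successive epoll_wait() calls: a level-triggered item is re-queued on ep->rdllist, while a oneshot item is disarmed until EPOLL_CTL_MOD re-arms it. A minimal userspace sketch of those semantics (the pipe fds and zero timeouts are assumptions of the demo, not part of the patch):

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	int p[2], epfd, n;
	struct epoll_event ev = { .events = EPOLLIN | EPOLLONESHOT };
	struct epoll_event out;

	if (pipe(p) < 0 || (epfd = epoll_create1(0)) < 0)
		return 1;
	ev.data.fd = p[0];
	epoll_ctl(epfd, EPOLL_CTL_ADD, p[0], &ev);
	write(p[1], "x", 1);

	n = epoll_wait(epfd, &out, 1, 0);	/* reports the event once */
	printf("first wait:  %d event(s)\n", n);
	n = epoll_wait(epfd, &out, 1, 0);	/* oneshot: item is disarmed */
	printf("second wait: %d event(s)\n", n);

	epoll_ctl(epfd, EPOLL_CTL_MOD, p[0], &ev); /* re-arm the oneshot item */
	n = epoll_wait(epfd, &out, 1, 0);
	printf("after re-arm: %d event(s)\n", n);
	return 0;
}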
diff --git a/fs/exec.c b/fs/exec.c
index af1600cfa8c9..e015c0b5a082 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -46,6 +46,7 @@
46#include <linux/proc_fs.h> 46#include <linux/proc_fs.h>
47#include <linux/mount.h> 47#include <linux/mount.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/ima.h>
49#include <linux/syscalls.h> 50#include <linux/syscalls.h>
50#include <linux/tsacct_kern.h> 51#include <linux/tsacct_kern.h>
51#include <linux/cn_proc.h> 52#include <linux/cn_proc.h>
@@ -53,6 +54,7 @@
53#include <linux/tracehook.h> 54#include <linux/tracehook.h>
54#include <linux/kmod.h> 55#include <linux/kmod.h>
55#include <linux/fsnotify.h> 56#include <linux/fsnotify.h>
57#include <linux/fs_struct.h>
56 58
57#include <asm/uaccess.h> 59#include <asm/uaccess.h>
58#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
@@ -128,6 +130,9 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
128 MAY_READ | MAY_EXEC | MAY_OPEN); 130 MAY_READ | MAY_EXEC | MAY_OPEN);
129 if (error) 131 if (error)
130 goto exit; 132 goto exit;
133 error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
134 if (error)
135 goto exit;
131 136
132 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); 137 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
133 error = PTR_ERR(file); 138 error = PTR_ERR(file);
@@ -675,6 +680,9 @@ struct file *open_exec(const char *name)
675 err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN); 680 err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
676 if (err) 681 if (err)
677 goto out_path_put; 682 goto out_path_put;
683 err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
684 if (err)
685 goto out_path_put;
678 686
679 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); 687 file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
680 if (IS_ERR(file)) 688 if (IS_ERR(file))
@@ -1057,32 +1065,35 @@ EXPORT_SYMBOL(install_exec_creds);
1057 * - the caller must hold current->cred_exec_mutex to protect against 1065 * - the caller must hold current->cred_exec_mutex to protect against
1058 * PTRACE_ATTACH 1066 * PTRACE_ATTACH
1059 */ 1067 */
1060void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files) 1068int check_unsafe_exec(struct linux_binprm *bprm)
1061{ 1069{
1062 struct task_struct *p = current, *t; 1070 struct task_struct *p = current, *t;
1063 unsigned long flags; 1071 unsigned long flags;
1064 unsigned n_fs, n_files, n_sighand; 1072 unsigned n_fs;
1073 int res = 0;
1065 1074
1066 bprm->unsafe = tracehook_unsafe_exec(p); 1075 bprm->unsafe = tracehook_unsafe_exec(p);
1067 1076
1068 n_fs = 1; 1077 n_fs = 1;
1069 n_files = 1; 1078 write_lock(&p->fs->lock);
1070 n_sighand = 1;
1071 lock_task_sighand(p, &flags); 1079 lock_task_sighand(p, &flags);
1072 for (t = next_thread(p); t != p; t = next_thread(t)) { 1080 for (t = next_thread(p); t != p; t = next_thread(t)) {
1073 if (t->fs == p->fs) 1081 if (t->fs == p->fs)
1074 n_fs++; 1082 n_fs++;
1075 if (t->files == files)
1076 n_files++;
1077 n_sighand++;
1078 } 1083 }
1079 1084
1080 if (atomic_read(&p->fs->count) > n_fs || 1085 if (p->fs->users > n_fs) {
1081 atomic_read(&p->files->count) > n_files ||
1082 atomic_read(&p->sighand->count) > n_sighand)
1083 bprm->unsafe |= LSM_UNSAFE_SHARE; 1086 bprm->unsafe |= LSM_UNSAFE_SHARE;
1087 } else {
1088 if (p->fs->in_exec)
1089 res = -EAGAIN;
1090 p->fs->in_exec = 1;
1091 }
1084 1092
1085 unlock_task_sighand(p, &flags); 1093 unlock_task_sighand(p, &flags);
1094 write_unlock(&p->fs->lock);
1095
1096 return res;
1086} 1097}
1087 1098
1088/* 1099/*
@@ -1192,6 +1203,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1192 retval = security_bprm_check(bprm); 1203 retval = security_bprm_check(bprm);
1193 if (retval) 1204 if (retval)
1194 return retval; 1205 return retval;
1206 retval = ima_bprm_check(bprm);
1207 if (retval)
1208 return retval;
1195 1209
1196 /* kernel module loader fixup */ 1210 /* kernel module loader fixup */
1197 /* so we don't try to load run modprobe in kernel space. */ 1211 /* so we don't try to load run modprobe in kernel space. */
@@ -1292,17 +1306,21 @@ int do_execve(char * filename,
1292 retval = mutex_lock_interruptible(&current->cred_exec_mutex); 1306 retval = mutex_lock_interruptible(&current->cred_exec_mutex);
1293 if (retval < 0) 1307 if (retval < 0)
1294 goto out_free; 1308 goto out_free;
1309 current->in_execve = 1;
1295 1310
1296 retval = -ENOMEM; 1311 retval = -ENOMEM;
1297 bprm->cred = prepare_exec_creds(); 1312 bprm->cred = prepare_exec_creds();
1298 if (!bprm->cred) 1313 if (!bprm->cred)
1299 goto out_unlock; 1314 goto out_unlock;
1300 check_unsafe_exec(bprm, displaced); 1315
1316 retval = check_unsafe_exec(bprm);
1317 if (retval)
1318 goto out_unlock;
1301 1319
1302 file = open_exec(filename); 1320 file = open_exec(filename);
1303 retval = PTR_ERR(file); 1321 retval = PTR_ERR(file);
1304 if (IS_ERR(file)) 1322 if (IS_ERR(file))
1305 goto out_unlock; 1323 goto out_unmark;
1306 1324
1307 sched_exec(); 1325 sched_exec();
1308 1326
@@ -1345,6 +1363,10 @@ int do_execve(char * filename,
1345 goto out; 1363 goto out;
1346 1364
1347 /* execve succeeded */ 1365 /* execve succeeded */
1366 write_lock(&current->fs->lock);
1367 current->fs->in_exec = 0;
1368 write_unlock(&current->fs->lock);
1369 current->in_execve = 0;
1348 mutex_unlock(&current->cred_exec_mutex); 1370 mutex_unlock(&current->cred_exec_mutex);
1349 acct_update_integrals(current); 1371 acct_update_integrals(current);
1350 free_bprm(bprm); 1372 free_bprm(bprm);
@@ -1362,7 +1384,13 @@ out_file:
1362 fput(bprm->file); 1384 fput(bprm->file);
1363 } 1385 }
1364 1386
1387out_unmark:
1388 write_lock(&current->fs->lock);
1389 current->fs->in_exec = 0;
1390 write_unlock(&current->fs->lock);
1391
1365out_unlock: 1392out_unlock:
1393 current->in_execve = 0;
1366 mutex_unlock(&current->cred_exec_mutex); 1394 mutex_unlock(&current->cred_exec_mutex);
1367 1395
1368out_free: 1396out_free:
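The rewritten check_unsafe_exec() above reduces to a count-and-claim pattern: if the fs_struct is referenced by anyone outside the caller's own threads, flag the exec as LSM_UNSAFE_SHARE; otherwise claim it through in_exec so a second concurrent exec gets -EAGAIN. A toy sketch of that pattern (struct and function names are stand-ins, not kernel API; the locking is omitted):

#include <stdio.h>

struct toy_fs {
	int users;	/* stand-in for fs->users */
	int in_exec;	/* stand-in for fs->in_exec */
};

/* n_fs = 1 + number of sibling threads that share the fs_struct */
static int toy_check_unsafe(struct toy_fs *fs, int n_fs)
{
	if (fs->users > n_fs)
		return 1;	/* shared outside the thread group: unsafe */
	if (fs->in_exec)
		return -1;	/* a competing exec already claimed it: -EAGAIN */
	fs->in_exec = 1;	/* claim the fs_struct for this exec */
	return 0;
}

int main(void)
{
	struct toy_fs fs = { .users = 1, .in_exec = 0 };

	printf("single user: %d\n", toy_check_unsafe(&fs, 1)); /* 0: claimed */
	printf("second exec: %d\n", toy_check_unsafe(&fs, 1)); /* -1: busy */
	fs.users = 3;
	printf("shared fs:   %d\n", toy_check_unsafe(&fs, 1)); /* 1: unsafe */
	return 0;
}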
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644
index 000000000000..1b2d4c63a579
--- /dev/null
+++ b/fs/exofs/BUGS
@@ -0,0 +1,3 @@
1- Out-of-space may cause a severe problem if the object (and directory entry)
2 were written, but writing the inode attributes failed. Then, if the filesystem
3 is unmounted and remounted, the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644
index 000000000000..cc2d22db119c
--- /dev/null
+++ b/fs/exofs/Kbuild
@@ -0,0 +1,16 @@
1#
2# Kbuild for the EXOFS module
3#
4# Copyright (C) 2008 Panasas Inc. All rights reserved.
5#
6# Authors:
7# Boaz Harrosh <bharrosh@panasas.com>
8#
9# This program is free software; you can redistribute it and/or modify
10# it under the terms of the GNU General Public License version 2
11#
12# Kbuild - Gets included from the Kernels Makefile and build system
13#
14
15exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644
index 000000000000..86194b2f799d
--- /dev/null
+++ b/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
1config EXOFS_FS
2 tristate "exofs: OSD based file system support"
3 depends on SCSI_OSD_ULD
4 help
5 EXOFS is a file system that uses an OSD storage device,
6 as its backing storage.
7
8# Debugging-related stuff
9config EXOFS_DEBUG
10 bool "Enable debugging"
11 depends on EXOFS_FS
12 help
13 This option enables EXOFS debug prints.
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644
index 000000000000..b1512c4bb8c7
--- /dev/null
+++ b/fs/exofs/common.h
@@ -0,0 +1,184 @@
1/*
2 * common.h - Common definitions for both Kernel and user-mode utilities
3 *
4 * Copyright (C) 2005, 2006
5 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
6 * Copyright (C) 2005, 2006
7 * International Business Machines
8 * Copyright (C) 2008, 2009
9 * Boaz Harrosh <bharrosh@panasas.com>
10 *
11 * Copyrights for code taken from ext2:
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise Pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 * from
17 * linux/fs/minix/inode.c
18 * Copyright (C) 1991, 1992 Linus Torvalds
19 *
20 * This file is part of exofs.
21 *
22 * exofs is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License as published by
24 * the Free Software Foundation. Since it is based on ext2, and the only
25 * valid version of GPL for the Linux kernel is version 2, the only valid
26 * version of GPL for exofs is version 2.
27 *
28 * exofs is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 * GNU General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public License
34 * along with exofs; if not, write to the Free Software
35 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
36 */
37
38#ifndef __EXOFS_COM_H__
39#define __EXOFS_COM_H__
40
41#include <linux/types.h>
42
43#include <scsi/osd_attributes.h>
44#include <scsi/osd_initiator.h>
45#include <scsi/osd_sec.h>
46
47/****************************************************************************
48 * Object ID related defines
49 * NOTE: inode# = object ID - EXOFS_OBJ_OFF
50 ****************************************************************************/
51#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
52#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
53#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
54#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
55
56/* exofs Application specific page/attribute */
57# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
58# define EXOFS_ATTR_INODE_DATA 1
59
60/*
61 * The maximum number of files we can have is limited by the size of the
62 * inode number. This is the largest object ID that the file system supports.
63 * Object IDs 0, 1, and 2 are always in use (see above defines).
64 */
65enum {
66 EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
67 (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
68 EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
69};
70
71/****************************************************************************
72 * Misc.
73 ****************************************************************************/
74#define EXOFS_BLKSHIFT 12
75#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
76
77/****************************************************************************
78 * superblock-related things
79 ****************************************************************************/
80#define EXOFS_SUPER_MAGIC 0x5DF5
81
82/*
83 * The file system control block - stored in an object's data (mainly, the one
84 * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored
85 * on disk. Right now it just has a magic value, which is basically a sanity
86 * check on our ability to communicate with the object store.
87 */
88struct exofs_fscb {
89 __le64 s_nextid; /* Highest object ID used */
90 __le32 s_numfiles; /* Number of files on fs */
91 __le16 s_magic; /* Magic signature */
92 __le16 s_newfs; /* Non-zero if this is a new fs */
93};
94
95/****************************************************************************
96 * inode-related things
97 ****************************************************************************/
98#define EXOFS_IDATA 5
99
100/*
101 * The file control block - stored in an object's attributes. This is where
102 * the in-memory inode is stored on disk.
103 */
104struct exofs_fcb {
105 __le64 i_size; /* Size of the file */
106 __le16 i_mode; /* File mode */
107 __le16 i_links_count; /* Links count */
108 __le32 i_uid; /* Owner Uid */
109 __le32 i_gid; /* Group Id */
110 __le32 i_atime; /* Access time */
111 __le32 i_ctime; /* Creation time */
112 __le32 i_mtime; /* Modification time */
113 __le32 i_flags; /* File flags (unused for now)*/
114 __le32 i_generation; /* File version (for NFS) */
115 __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
116};
117
118#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
119
120/* This is the Attribute the fcb is stored in */
121static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
122 EXOFS_APAGE_FS_DATA,
123 EXOFS_ATTR_INODE_DATA,
124 EXOFS_INO_ATTR_SIZE);
125
126/****************************************************************************
127 * dentry-related things
128 ****************************************************************************/
129#define EXOFS_NAME_LEN 255
130
131/*
132 * The on-disk directory entry
133 */
134struct exofs_dir_entry {
135 __le64 inode_no; /* inode number */
136 __le16 rec_len; /* directory entry length */
137 u8 name_len; /* name length */
138 u8 file_type; /* umm...file type */
139 char name[EXOFS_NAME_LEN]; /* file name */
140};
141
142enum {
143 EXOFS_FT_UNKNOWN,
144 EXOFS_FT_REG_FILE,
145 EXOFS_FT_DIR,
146 EXOFS_FT_CHRDEV,
147 EXOFS_FT_BLKDEV,
148 EXOFS_FT_FIFO,
149 EXOFS_FT_SOCK,
150 EXOFS_FT_SYMLINK,
151 EXOFS_FT_MAX
152};
153
154#define EXOFS_DIR_PAD 4
155#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
156#define EXOFS_DIR_REC_LEN(name_len) \
157 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
158 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
159
160/*************************
161 * function declarations *
162 *************************/
163/* osd.c */
164void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
165 const struct osd_obj_id *obj);
166
167int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
168static inline int exofs_check_ok(struct osd_request *or)
169{
170 return exofs_check_ok_resid(or, NULL, NULL);
171}
172int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
173int exofs_async_op(struct osd_request *or,
174 osd_req_done_fn *async_done, void *caller_context, u8 *cred);
175
176int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
177
178int osd_req_read_kern(struct osd_request *or,
179 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
180
181int osd_req_write_kern(struct osd_request *or,
182 const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
183
184#endif /*ifndef __EXOFS_COM_H__*/
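The EXOFS_DIR_REC_LEN() macro above sizes each on-disk entry as the fixed header plus the name, rounded up to EXOFS_DIR_PAD (4) bytes. A standalone sketch of that arithmetic (the local struct mirrors exofs_dir_entry purely for offsetof(); the 12-byte header assumes a typical ABI):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct dir_entry {
	uint64_t inode_no;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[255];
};

#define DIR_PAD   4
#define DIR_ROUND (DIR_PAD - 1)
#define DIR_REC_LEN(name_len) \
	(((name_len) + offsetof(struct dir_entry, name) + DIR_ROUND) & ~DIR_ROUND)

int main(void)
{
	/* header is 12 bytes, so "a" -> 16, "abcd" -> 16, "abcde" -> 20 */
	printf("%zu %zu %zu\n", (size_t)DIR_REC_LEN(1),
	       (size_t)DIR_REC_LEN(4), (size_t)DIR_REC_LEN(5));
	return 0;
}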
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644
index 000000000000..65b0c8c776a1
--- /dev/null
+++ b/fs/exofs/dir.c
@@ -0,0 +1,672 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline unsigned exofs_chunk_size(struct inode *inode)
39{
40 return inode->i_sb->s_blocksize;
41}
42
43static inline void exofs_put_page(struct page *page)
44{
45 kunmap(page);
46 page_cache_release(page);
47}
48
49/* Accesses to dir's inode->i_size must be made under the inode lock */
50static inline unsigned long dir_pages(struct inode *inode)
51{
52 return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
53}
54
55static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
56{
57 loff_t last_byte = inode->i_size;
58
59 last_byte -= page_nr << PAGE_CACHE_SHIFT;
60 if (last_byte > PAGE_CACHE_SIZE)
61 last_byte = PAGE_CACHE_SIZE;
62 return last_byte;
63}
64
65static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
66{
67 struct address_space *mapping = page->mapping;
68 struct inode *dir = mapping->host;
69 int err = 0;
70
71 dir->i_version++;
72
73 if (!PageUptodate(page))
74 SetPageUptodate(page);
75
76 if (pos+len > dir->i_size) {
77 i_size_write(dir, pos+len);
78 mark_inode_dirty(dir);
79 }
80 set_page_dirty(page);
81
82 if (IS_DIRSYNC(dir))
83 err = write_one_page(page, 1);
84 else
85 unlock_page(page);
86
87 return err;
88}
89
90static void exofs_check_page(struct page *page)
91{
92 struct inode *dir = page->mapping->host;
93 unsigned chunk_size = exofs_chunk_size(dir);
94 char *kaddr = page_address(page);
95 unsigned offs, rec_len;
96 unsigned limit = PAGE_CACHE_SIZE;
97 struct exofs_dir_entry *p;
98 char *error;
99
100 /* if the page is the last one in the directory */
101 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
102 limit = dir->i_size & ~PAGE_CACHE_MASK;
103 if (limit & (chunk_size - 1))
104 goto Ebadsize;
105 if (!limit)
106 goto out;
107 }
108 for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
109 p = (struct exofs_dir_entry *)(kaddr + offs);
110 rec_len = le16_to_cpu(p->rec_len);
111
112 if (rec_len < EXOFS_DIR_REC_LEN(1))
113 goto Eshort;
114 if (rec_len & 3)
115 goto Ealign;
116 if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
117 goto Enamelen;
118 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
119 goto Espan;
120 }
121 if (offs != limit)
122 goto Eend;
123out:
124 SetPageChecked(page);
125 return;
126
127Ebadsize:
128 EXOFS_ERR("ERROR [exofs_check_page]: "
129 "size of directory #%lu is not a multiple of chunk size",
130 dir->i_ino
131 );
132 goto fail;
133Eshort:
134 error = "rec_len is smaller than minimal";
135 goto bad_entry;
136Ealign:
137 error = "unaligned directory entry";
138 goto bad_entry;
139Enamelen:
140 error = "rec_len is too small for name_len";
141 goto bad_entry;
142Espan:
143 error = "directory entry across blocks";
144 goto bad_entry;
145bad_entry:
146 EXOFS_ERR(
147 "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
148 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
149 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
150 _LLU(le64_to_cpu(p->inode_no)),
151 rec_len, p->name_len);
152 goto fail;
153Eend:
154 p = (struct exofs_dir_entry *)(kaddr + offs);
155 EXOFS_ERR("ERROR [exofs_check_page]: "
156 "entry in directory #%lu spans the page boundary "
157 "offset=%lu, inode=%llu",
158 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
159 _LLU(le64_to_cpu(p->inode_no)));
160fail:
161 SetPageChecked(page);
162 SetPageError(page);
163}
164
165static struct page *exofs_get_page(struct inode *dir, unsigned long n)
166{
167 struct address_space *mapping = dir->i_mapping;
168 struct page *page = read_mapping_page(mapping, n, NULL);
169
170 if (!IS_ERR(page)) {
171 kmap(page);
172 if (!PageChecked(page))
173 exofs_check_page(page);
174 if (PageError(page))
175 goto fail;
176 }
177 return page;
178
179fail:
180 exofs_put_page(page);
181 return ERR_PTR(-EIO);
182}
183
184static inline int exofs_match(int len, const unsigned char *name,
185 struct exofs_dir_entry *de)
186{
187 if (len != de->name_len)
188 return 0;
189 if (!de->inode_no)
190 return 0;
191 return !memcmp(name, de->name, len);
192}
193
194static inline
195struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
196{
197 return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
198}
199
200static inline unsigned
201exofs_validate_entry(char *base, unsigned offset, unsigned mask)
202{
203 struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
204 struct exofs_dir_entry *p =
205 (struct exofs_dir_entry *)(base + (offset&mask));
206 while ((char *)p < (char *)de) {
207 if (p->rec_len == 0)
208 break;
209 p = exofs_next_entry(p);
210 }
211 return (char *)p - base;
212}
213
214static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
215 [EXOFS_FT_UNKNOWN] = DT_UNKNOWN,
216 [EXOFS_FT_REG_FILE] = DT_REG,
217 [EXOFS_FT_DIR] = DT_DIR,
218 [EXOFS_FT_CHRDEV] = DT_CHR,
219 [EXOFS_FT_BLKDEV] = DT_BLK,
220 [EXOFS_FT_FIFO] = DT_FIFO,
221 [EXOFS_FT_SOCK] = DT_SOCK,
222 [EXOFS_FT_SYMLINK] = DT_LNK,
223};
224
225#define S_SHIFT 12
226static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
227 [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE,
228 [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR,
229 [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV,
230 [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV,
231 [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO,
232 [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK,
233 [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK,
234};
235
236static inline
237void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
238{
239 mode_t mode = inode->i_mode;
240 de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
241}
242
243static int
244exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
245{
246 loff_t pos = filp->f_pos;
247 struct inode *inode = filp->f_path.dentry->d_inode;
248 unsigned int offset = pos & ~PAGE_CACHE_MASK;
249 unsigned long n = pos >> PAGE_CACHE_SHIFT;
250 unsigned long npages = dir_pages(inode);
251 unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
252 unsigned char *types = NULL;
253 int need_revalidate = (filp->f_version != inode->i_version);
254
255 if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
256 return 0;
257
258 types = exofs_filetype_table;
259
260 for ( ; n < npages; n++, offset = 0) {
261 char *kaddr, *limit;
262 struct exofs_dir_entry *de;
263 struct page *page = exofs_get_page(inode, n);
264
265 if (IS_ERR(page)) {
266 EXOFS_ERR("ERROR: "
267 "bad page in #%lu",
268 inode->i_ino);
269 filp->f_pos += PAGE_CACHE_SIZE - offset;
270 return PTR_ERR(page);
271 }
272 kaddr = page_address(page);
273 if (unlikely(need_revalidate)) {
274 if (offset) {
275 offset = exofs_validate_entry(kaddr, offset,
276 chunk_mask);
277 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
278 }
279 filp->f_version = inode->i_version;
280 need_revalidate = 0;
281 }
282 de = (struct exofs_dir_entry *)(kaddr + offset);
283 limit = kaddr + exofs_last_byte(inode, n) -
284 EXOFS_DIR_REC_LEN(1);
285 for (; (char *)de <= limit; de = exofs_next_entry(de)) {
286 if (de->rec_len == 0) {
287 EXOFS_ERR("ERROR: "
288 "zero-length directory entry");
289 exofs_put_page(page);
290 return -EIO;
291 }
292 if (de->inode_no) {
293 int over;
294 unsigned char d_type = DT_UNKNOWN;
295
296 if (types && de->file_type < EXOFS_FT_MAX)
297 d_type = types[de->file_type];
298
299 offset = (char *)de - kaddr;
300 over = filldir(dirent, de->name, de->name_len,
301 (n<<PAGE_CACHE_SHIFT) | offset,
302 le64_to_cpu(de->inode_no),
303 d_type);
304 if (over) {
305 exofs_put_page(page);
306 return 0;
307 }
308 }
309 filp->f_pos += le16_to_cpu(de->rec_len);
310 }
311 exofs_put_page(page);
312 }
313
314 return 0;
315}
316
317struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
318 struct dentry *dentry, struct page **res_page)
319{
320 const unsigned char *name = dentry->d_name.name;
321 int namelen = dentry->d_name.len;
322 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
323 unsigned long start, n;
324 unsigned long npages = dir_pages(dir);
325 struct page *page = NULL;
326 struct exofs_i_info *oi = exofs_i(dir);
327 struct exofs_dir_entry *de;
328
329 if (npages == 0)
330 goto out;
331
332 *res_page = NULL;
333
334 start = oi->i_dir_start_lookup;
335 if (start >= npages)
336 start = 0;
337 n = start;
338 do {
339 char *kaddr;
340 page = exofs_get_page(dir, n);
341 if (!IS_ERR(page)) {
342 kaddr = page_address(page);
343 de = (struct exofs_dir_entry *) kaddr;
344 kaddr += exofs_last_byte(dir, n) - reclen;
345 while ((char *) de <= kaddr) {
346 if (de->rec_len == 0) {
347 EXOFS_ERR(
348 "ERROR: exofs_find_entry: "
349 "zero-length directory entry");
350 exofs_put_page(page);
351 goto out;
352 }
353 if (exofs_match(namelen, name, de))
354 goto found;
355 de = exofs_next_entry(de);
356 }
357 exofs_put_page(page);
358 }
359 if (++n >= npages)
360 n = 0;
361 } while (n != start);
362out:
363 return NULL;
364
365found:
366 *res_page = page;
367 oi->i_dir_start_lookup = n;
368 return de;
369}
370
371struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
372{
373 struct page *page = exofs_get_page(dir, 0);
374 struct exofs_dir_entry *de = NULL;
375
376 if (!IS_ERR(page)) {
377 de = exofs_next_entry(
378 (struct exofs_dir_entry *)page_address(page));
379 *p = page;
380 }
381 return de;
382}
383
384ino_t exofs_parent_ino(struct dentry *child)
385{
386 struct page *page;
387 struct exofs_dir_entry *de;
388 ino_t ino;
389
390 de = exofs_dotdot(child->d_inode, &page);
391 if (!de)
392 return 0;
393
394 ino = le64_to_cpu(de->inode_no);
395 exofs_put_page(page);
396 return ino;
397}
398
399ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
400{
401 ino_t res = 0;
402 struct exofs_dir_entry *de;
403 struct page *page;
404
405 de = exofs_find_entry(dir, dentry, &page);
406 if (de) {
407 res = le64_to_cpu(de->inode_no);
408 exofs_put_page(page);
409 }
410 return res;
411}
412
413int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
414 struct page *page, struct inode *inode)
415{
416 loff_t pos = page_offset(page) +
417 (char *) de - (char *) page_address(page);
418 unsigned len = le16_to_cpu(de->rec_len);
419 int err;
420
421 lock_page(page);
422 err = exofs_write_begin(NULL, page->mapping, pos, len,
423 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
424 if (err)
425 EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
426 err);
427
428 de->inode_no = cpu_to_le64(inode->i_ino);
429 exofs_set_de_type(de, inode);
430 if (likely(!err))
431 err = exofs_commit_chunk(page, pos, len);
432 exofs_put_page(page);
433 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
434 mark_inode_dirty(dir);
435 return err;
436}
437
438int exofs_add_link(struct dentry *dentry, struct inode *inode)
439{
440 struct inode *dir = dentry->d_parent->d_inode;
441 const unsigned char *name = dentry->d_name.name;
442 int namelen = dentry->d_name.len;
443 unsigned chunk_size = exofs_chunk_size(dir);
444 unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
445 unsigned short rec_len, name_len;
446 struct page *page = NULL;
447 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
448 struct exofs_dir_entry *de;
449 unsigned long npages = dir_pages(dir);
450 unsigned long n;
451 char *kaddr;
452 loff_t pos;
453 int err;
454
455 for (n = 0; n <= npages; n++) {
456 char *dir_end;
457
458 page = exofs_get_page(dir, n);
459 err = PTR_ERR(page);
460 if (IS_ERR(page))
461 goto out;
462 lock_page(page);
463 kaddr = page_address(page);
464 dir_end = kaddr + exofs_last_byte(dir, n);
465 de = (struct exofs_dir_entry *)kaddr;
466 kaddr += PAGE_CACHE_SIZE - reclen;
467 while ((char *)de <= kaddr) {
468 if ((char *)de == dir_end) {
469 name_len = 0;
470 rec_len = chunk_size;
471 de->rec_len = cpu_to_le16(chunk_size);
472 de->inode_no = 0;
473 goto got_it;
474 }
475 if (de->rec_len == 0) {
476 EXOFS_ERR("ERROR: exofs_add_link: "
477 "zero-length directory entry");
478 err = -EIO;
479 goto out_unlock;
480 }
481 err = -EEXIST;
482 if (exofs_match(namelen, name, de))
483 goto out_unlock;
484 name_len = EXOFS_DIR_REC_LEN(de->name_len);
485 rec_len = le16_to_cpu(de->rec_len);
486 if (!de->inode_no && rec_len >= reclen)
487 goto got_it;
488 if (rec_len >= name_len + reclen)
489 goto got_it;
490 de = (struct exofs_dir_entry *) ((char *) de + rec_len);
491 }
492 unlock_page(page);
493 exofs_put_page(page);
494 }
495
496 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode);
497 return -EINVAL;
498
499got_it:
500 pos = page_offset(page) +
501 (char *)de - (char *)page_address(page);
502 err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
503 &page, NULL);
504 if (err)
505 goto out_unlock;
506 if (de->inode_no) {
507 struct exofs_dir_entry *de1 =
508 (struct exofs_dir_entry *)((char *)de + name_len);
509 de1->rec_len = cpu_to_le16(rec_len - name_len);
510 de->rec_len = cpu_to_le16(name_len);
511 de = de1;
512 }
513 de->name_len = namelen;
514 memcpy(de->name, name, namelen);
515 de->inode_no = cpu_to_le64(inode->i_ino);
516 exofs_set_de_type(de, inode);
517 err = exofs_commit_chunk(page, pos, rec_len);
518 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
519 mark_inode_dirty(dir);
520 sbi->s_numfiles++;
521
522out_put:
523 exofs_put_page(page);
524out:
525 return err;
526out_unlock:
527 unlock_page(page);
528 goto out_put;
529}
530
531int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
532{
533 struct address_space *mapping = page->mapping;
534 struct inode *inode = mapping->host;
535 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
536 char *kaddr = page_address(page);
537 unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
538 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
539 loff_t pos;
540 struct exofs_dir_entry *pde = NULL;
541 struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
542 int err;
543
544 while (de < dir) {
545 if (de->rec_len == 0) {
546 EXOFS_ERR("ERROR: exofs_delete_entry:"
547 "zero-length directory entry");
548 err = -EIO;
549 goto out;
550 }
551 pde = de;
552 de = exofs_next_entry(de);
553 }
554 if (pde)
555 from = (char *)pde - (char *)page_address(page);
556 pos = page_offset(page) + from;
557 lock_page(page);
558 err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
559 &page, NULL);
560 if (err)
561 EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
562 err);
563 if (pde)
564 pde->rec_len = cpu_to_le16(to - from);
565 dir->inode_no = 0;
566 if (likely(!err))
567 err = exofs_commit_chunk(page, pos, to - from);
568 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
569 mark_inode_dirty(inode);
570 sbi->s_numfiles--;
571out:
572 exofs_put_page(page);
573 return err;
574}
575
576/* kept aligned on 4 bytes */
577#define THIS_DIR ".\0\0"
578#define PARENT_DIR "..\0"
579
580int exofs_make_empty(struct inode *inode, struct inode *parent)
581{
582 struct address_space *mapping = inode->i_mapping;
583 struct page *page = grab_cache_page(mapping, 0);
584 unsigned chunk_size = exofs_chunk_size(inode);
585 struct exofs_dir_entry *de;
586 int err;
587 void *kaddr;
588
589 if (!page)
590 return -ENOMEM;
591
592 err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
593 &page, NULL);
594 if (err) {
595 unlock_page(page);
596 goto fail;
597 }
598
599 kaddr = kmap_atomic(page, KM_USER0);
600 de = (struct exofs_dir_entry *)kaddr;
601 de->name_len = 1;
602 de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
603 memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
604 de->inode_no = cpu_to_le64(inode->i_ino);
605 exofs_set_de_type(de, inode);
606
607 de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
608 de->name_len = 2;
609 de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
610 de->inode_no = cpu_to_le64(parent->i_ino);
611 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
612 exofs_set_de_type(de, inode);
613 kunmap_atomic(page, KM_USER0);
614 err = exofs_commit_chunk(page, 0, chunk_size);
615fail:
616 page_cache_release(page);
617 return err;
618}
619
620int exofs_empty_dir(struct inode *inode)
621{
622 struct page *page = NULL;
623 unsigned long i, npages = dir_pages(inode);
624
625 for (i = 0; i < npages; i++) {
626 char *kaddr;
627 struct exofs_dir_entry *de;
628 page = exofs_get_page(inode, i);
629
630 if (IS_ERR(page))
631 continue;
632
633 kaddr = page_address(page);
634 de = (struct exofs_dir_entry *)kaddr;
635 kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
636
637 while ((char *)de <= kaddr) {
638 if (de->rec_len == 0) {
639 EXOFS_ERR("ERROR: exofs_empty_dir: "
640 "zero-length directory entry "
641 "kaddr=%p, de=%p\n", kaddr, de);
642 goto not_empty;
643 }
644 if (de->inode_no != 0) {
645 /* check for . and .. */
646 if (de->name[0] != '.')
647 goto not_empty;
648 if (de->name_len > 2)
649 goto not_empty;
650 if (de->name_len < 2) {
651 if (le64_to_cpu(de->inode_no) !=
652 inode->i_ino)
653 goto not_empty;
654 } else if (de->name[1] != '.')
655 goto not_empty;
656 }
657 de = exofs_next_entry(de);
658 }
659 exofs_put_page(page);
660 }
661 return 1;
662
663not_empty:
664 exofs_put_page(page);
665 return 0;
666}
667
668const struct file_operations exofs_dir_operations = {
669 .llseek = generic_file_llseek,
670 .read = generic_read_dir,
671 .readdir = exofs_readdir,
672};
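Every directory walk in dir.c above chains entries through their little-endian rec_len, and each loop bails out on a zero rec_len so a corrupt chunk cannot spin forever. A self-contained userspace sketch of that walk (the toy struct, host-endian fields, and two-entry chunk are assumptions of the demo):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct toy_de {
	uint64_t inode_no;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[4];
};

int main(void)
{
	struct toy_de chunk[2];
	struct toy_de *de = chunk;
	char *end = (char *)chunk + sizeof(chunk);

	memset(chunk, 0, sizeof(chunk));
	de->inode_no = 2;  de->rec_len = sizeof(*de);
	de->name_len = 1;  memcpy(de->name, ".", 1);
	de = (struct toy_de *)((char *)de + de->rec_len);
	de->inode_no = 1;  de->rec_len = sizeof(*de);
	de->name_len = 2;  memcpy(de->name, "..", 2);

	for (de = chunk; (char *)de < end;
	     de = (struct toy_de *)((char *)de + de->rec_len)) {
		if (de->rec_len == 0)	/* corrupt entry: stop, as dir.c does */
			break;
		printf("ino=%llu name=%.*s\n",
		       (unsigned long long)de->inode_no, de->name_len, de->name);
	}
	return 0;
}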
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
new file mode 100644
index 000000000000..0fd4c7859679
--- /dev/null
+++ b/fs/exofs/exofs.h
@@ -0,0 +1,180 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/fs.h>
37#include <linux/time.h>
38#include "common.h"
39
40#ifndef __EXOFS_H__
41#define __EXOFS_H__
42
43#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
44
45#ifdef CONFIG_EXOFS_DEBUG
46#define EXOFS_DBGMSG(fmt, a...) \
47 printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
48#else
49#define EXOFS_DBGMSG(fmt, a...) \
50 do { if (0) printk(fmt, ##a); } while (0)
51#endif
52
53/* u64 has problems with printk; this casts it to unsigned long long */
54#define _LLU(x) (unsigned long long)(x)
55
56/*
57 * our extension to the in-memory superblock
58 */
59struct exofs_sb_info {
60 struct osd_dev *s_dev; /* returned by get_osd_dev */
61 osd_id s_pid; /* partition ID of file system*/
62 int s_timeout; /* timeout for OSD operations */
63 uint64_t s_nextid; /* highest object ID used */
64 uint32_t s_numfiles; /* number of files on fs */
65 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
66 u32 s_next_generation; /* next gen # to use */
67 atomic_t s_curr_pending; /* number of pending commands */
68 uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */
69};
70
71/*
72 * our extension to the in-memory inode
73 */
74struct exofs_i_info {
75 unsigned long i_flags; /* various atomic flags */
76 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
77 uint32_t i_dir_start_lookup; /* which page to start lookup */
78 wait_queue_head_t i_wq; /* wait queue for inode */
79 uint64_t i_commit_size; /* the object's written length */
80 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
81 struct inode vfs_inode; /* normal in-memory inode */
82};
83
84/*
85 * our inode flags
86 */
87#define OBJ_2BCREATED 0 /* object will be created soon*/
88#define OBJ_CREATED 1 /* object has been created on the osd*/
89
90static inline int obj_2bcreated(struct exofs_i_info *oi)
91{
92 return test_bit(OBJ_2BCREATED, &oi->i_flags);
93}
94
95static inline void set_obj_2bcreated(struct exofs_i_info *oi)
96{
97 set_bit(OBJ_2BCREATED, &oi->i_flags);
98}
99
100static inline int obj_created(struct exofs_i_info *oi)
101{
102 return test_bit(OBJ_CREATED, &oi->i_flags);
103}
104
105static inline void set_obj_created(struct exofs_i_info *oi)
106{
107 set_bit(OBJ_CREATED, &oi->i_flags);
108}
109
110int __exofs_wait_obj_created(struct exofs_i_info *oi);
111static inline int wait_obj_created(struct exofs_i_info *oi)
112{
113 if (likely(obj_created(oi)))
114 return 0;
115
116 return __exofs_wait_obj_created(oi);
117}
118
119/*
120 * get to our inode from the vfs inode
121 */
122static inline struct exofs_i_info *exofs_i(struct inode *inode)
123{
124 return container_of(inode, struct exofs_i_info, vfs_inode);
125}
126
127/*
128 * Maximum count of links to a file
129 */
130#define EXOFS_LINK_MAX 32000
131
132/*************************
133 * function declarations *
134 *************************/
135/* inode.c */
136void exofs_truncate(struct inode *inode);
137int exofs_setattr(struct dentry *, struct iattr *);
138int exofs_write_begin(struct file *file, struct address_space *mapping,
139 loff_t pos, unsigned len, unsigned flags,
140 struct page **pagep, void **fsdata);
141extern struct inode *exofs_iget(struct super_block *, unsigned long);
142struct inode *exofs_new_inode(struct inode *, int);
143extern int exofs_write_inode(struct inode *, int);
144extern void exofs_delete_inode(struct inode *);
145
146/* dir.c: */
147int exofs_add_link(struct dentry *, struct inode *);
148ino_t exofs_inode_by_name(struct inode *, struct dentry *);
149int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
150int exofs_make_empty(struct inode *, struct inode *);
151struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
152 struct page **);
153int exofs_empty_dir(struct inode *);
154struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
155ino_t exofs_parent_ino(struct dentry *child);
156int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
157 struct inode *);
158
159/*********************
160 * operation vectors *
161 *********************/
162/* dir.c: */
163extern const struct file_operations exofs_dir_operations;
164
165/* file.c */
166extern const struct inode_operations exofs_file_inode_operations;
167extern const struct file_operations exofs_file_operations;
168
169/* inode.c */
170extern const struct address_space_operations exofs_aops;
171
172/* namei.c */
173extern const struct inode_operations exofs_dir_inode_operations;
174extern const struct inode_operations exofs_special_inode_operations;
175
176/* symlink.c */
177extern const struct inode_operations exofs_symlink_inode_operations;
178extern const struct inode_operations exofs_fast_symlink_inode_operations;
179
180#endif
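exofs_i() above is the standard container_of() idiom: recover the filesystem-private inode from a pointer to its embedded VFS inode. A userspace demo with stand-in types (toy_inode and toy_i_info are hypothetical, standing in for struct inode and struct exofs_i_info):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_inode { long i_ino; };

struct toy_i_info {
	unsigned long i_flags;
	struct toy_inode vfs_inode;	/* embedded, like exofs_i_info */
};

int main(void)
{
	struct toy_i_info info = { .i_flags = 1, .vfs_inode = { .i_ino = 42 } };
	struct toy_inode *inode = &info.vfs_inode;
	struct toy_i_info *oi = container_of(inode, struct toy_i_info, vfs_inode);

	printf("flags=%lu ino=%ld\n", oi->i_flags, inode->i_ino);
	return 0;
}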
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
new file mode 100644
index 000000000000..6ed7fe484752
--- /dev/null
+++ b/fs/exofs/file.c
@@ -0,0 +1,87 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/buffer_head.h>
37
38#include "exofs.h"
39
40static int exofs_release_file(struct inode *inode, struct file *filp)
41{
42 return 0;
43}
44
45static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
46 int datasync)
47{
48 int ret;
49 struct address_space *mapping = filp->f_mapping;
50
51 ret = filemap_write_and_wait(mapping);
52 if (ret)
53 return ret;
54
55 /* Note: file_fsync below also calls sync_blockdev, which is a no-op
56 * for exofs, but other than that it does sync_inode and
57 * sync_superblock, which is what we need here.
58 */
59 return file_fsync(filp, dentry, datasync);
60}
61
62static int exofs_flush(struct file *file, fl_owner_t id)
63{
64 exofs_file_fsync(file, file->f_path.dentry, 1);
65 /* TODO: Flush the OSD target */
66 return 0;
67}
68
69const struct file_operations exofs_file_operations = {
70 .llseek = generic_file_llseek,
71 .read = do_sync_read,
72 .write = do_sync_write,
73 .aio_read = generic_file_aio_read,
74 .aio_write = generic_file_aio_write,
75 .mmap = generic_file_mmap,
76 .open = generic_file_open,
77 .release = exofs_release_file,
78 .fsync = exofs_file_fsync,
79 .flush = exofs_flush,
80 .splice_read = generic_file_splice_read,
81 .splice_write = generic_file_splice_write,
82};
83
84const struct inode_operations exofs_file_inode_operations = {
85 .truncate = exofs_truncate,
86 .setattr = exofs_setattr,
87};
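The datasync argument that exofs_file_fsync() receives corresponds to the userspace fdatasync()/fsync() split, where fdatasync() may skip metadata-only updates such as timestamps. For reference (the file name below is illustrative):

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	int fd = open("testfile", O_CREAT | O_WRONLY, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "data", 4) != 4) {
		close(fd);
		return 1;
	}
	fdatasync(fd);	/* flush data (and size); may skip mtime-only updates */
	fsync(fd);	/* flush data and all metadata */
	close(fd);
	return 0;
}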
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
new file mode 100644
index 000000000000..ba8d9fab4693
--- /dev/null
+++ b/fs/exofs/inode.c
@@ -0,0 +1,1303 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/writeback.h>
37#include <linux/buffer_head.h>
38#include <scsi/scsi_device.h>
39
40#include "exofs.h"
41
42#ifdef CONFIG_EXOFS_DEBUG
43# define EXOFS_DEBUG_OBJ_ISIZE 1
44#endif
45
46struct page_collect {
47 struct exofs_sb_info *sbi;
48 struct request_queue *req_q;
49 struct inode *inode;
50 unsigned expected_pages;
51
52 struct bio *bio;
53 unsigned nr_pages;
54 unsigned long length;
55 loff_t pg_first; /* keep 64bit also in 32-arches */
56};
57
58static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
59 struct inode *inode)
60{
61 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
62 struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
63
64 pcol->sbi = sbi;
65 pcol->req_q = req_q;
66 pcol->inode = inode;
67 pcol->expected_pages = expected_pages;
68
69 pcol->bio = NULL;
70 pcol->nr_pages = 0;
71 pcol->length = 0;
72 pcol->pg_first = -1;
73
74 EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
75 expected_pages);
76}
77
78static void _pcol_reset(struct page_collect *pcol)
79{
80 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
81
82 pcol->bio = NULL;
83 pcol->nr_pages = 0;
84 pcol->length = 0;
85 pcol->pg_first = -1;
86 EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
87 pcol->inode->i_ino, pcol->expected_pages);
88
89 /* This is probably the end of the loop, but for writes
90 * it might not end here. Don't be left with nothing.
91 */
92 if (!pcol->expected_pages)
93 pcol->expected_pages = 128;
94}
95
96static int pcol_try_alloc(struct page_collect *pcol)
97{
98 int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
99
100 for (; pages; pages >>= 1) {
101 pcol->bio = bio_alloc(GFP_KERNEL, pages);
102 if (likely(pcol->bio))
103 return 0;
104 }
105
106 EXOFS_ERR("Failed to bio_alloc expected_pages=%u\n",
107 pcol->expected_pages);
108 return -ENOMEM;
109}
110
111static void pcol_free(struct page_collect *pcol)
112{
113 bio_put(pcol->bio);
114 pcol->bio = NULL;
115}
116
117static int pcol_add_page(struct page_collect *pcol, struct page *page,
118 unsigned len)
119{
120 int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
121 if (unlikely(len != added_len))
122 return -ENOMEM;
123
124 ++pcol->nr_pages;
125 pcol->length += len;
126 return 0;
127}
128
129static int update_read_page(struct page *page, int ret)
130{
131 if (ret == 0) {
132 /* Everything is OK */
133 SetPageUptodate(page);
134 if (PageError(page))
135 ClearPageError(page);
136 } else if (ret == -EFAULT) {
137 /* In this case we were trying to read something that wasn't on
138 * disk yet - return a page full of zeroes. This should be OK,
139 * because the object should be empty (if there was a write
140 * before this read, the read would be waiting with the page
141 * locked */
142 clear_highpage(page);
143
144 SetPageUptodate(page);
145 if (PageError(page))
146 ClearPageError(page);
147 ret = 0; /* recovered error */
148 EXOFS_DBGMSG("recovered read error\n");
149 } else /* Error */
150 SetPageError(page);
151
152 return ret;
153}
154
155static void update_write_page(struct page *page, int ret)
156{
157 if (ret) {
158 mapping_set_error(page->mapping, ret);
159 SetPageError(page);
160 }
161 end_page_writeback(page);
162}
163
164/* Called at the end of reads, to optionally unlock pages and update their
165 * status.
166 */
167static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
168 bool do_unlock)
169{
170 struct bio_vec *bvec;
171 int i;
172 u64 resid;
173 u64 good_bytes;
174 u64 length = 0;
175 int ret = exofs_check_ok_resid(or, &resid, NULL);
176
177 osd_end_request(or);
178
179 if (likely(!ret))
180 good_bytes = pcol->length;
181 else if (!resid)
182 good_bytes = 0;
183 else
184 good_bytes = pcol->length - resid;
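 /* e.g. a four-page (0x4000 byte) read returning resid=0x1000 gives
  * good_bytes=0x3000: the first three pages are fine and only the
  * last page gets the error status in the loop below
  */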
185
186 EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
187 " length=0x%lx nr_pages=%u\n",
188 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
189 pcol->nr_pages);
190
191 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
192 struct page *page = bvec->bv_page;
193 struct inode *inode = page->mapping->host;
194 int page_stat;
195
196 if (inode != pcol->inode)
197 continue; /* osd might add more pages at end */
198
199 if (likely(length < good_bytes))
200 page_stat = 0;
201 else
202 page_stat = ret;
203
204 EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n",
205 inode->i_ino, page->index,
206 page_stat ? "bad_bytes" : "good_bytes");
207
208 ret = update_read_page(page, page_stat);
209 if (do_unlock)
210 unlock_page(page);
211 length += bvec->bv_len;
212 }
213
214 pcol_free(pcol);
215 EXOFS_DBGMSG("readpages_done END\n");
216 return ret;
217}
218
219/* callback of async reads */
220static void readpages_done(struct osd_request *or, void *p)
221{
222 struct page_collect *pcol = p;
223
224 __readpages_done(or, pcol, true);
225 atomic_dec(&pcol->sbi->s_curr_pending);
226 kfree(p);
227}
228
229static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
230{
231 struct bio_vec *bvec;
232 int i;
233
234 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
235 struct page *page = bvec->bv_page;
236
237 if (rw == READ)
238 update_read_page(page, ret);
239 else
240 update_write_page(page, ret);
241
242 unlock_page(page);
243 }
244 pcol_free(pcol);
245}
246
247static int read_exec(struct page_collect *pcol, bool is_sync)
248{
249 struct exofs_i_info *oi = exofs_i(pcol->inode);
250 struct osd_obj_id obj = {pcol->sbi->s_pid,
251 pcol->inode->i_ino + EXOFS_OBJ_OFF};
252 struct osd_request *or = NULL;
253 struct page_collect *pcol_copy = NULL;
254 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
255 int ret;
256
257 if (!pcol->bio)
258 return 0;
259
260 /* see comment in _readpage() about sync reads */
261 WARN_ON(is_sync && (pcol->nr_pages != 1));
262
263 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
264 if (unlikely(!or)) {
265 ret = -ENOMEM;
266 goto err;
267 }
268
269 osd_req_read(or, &obj, pcol->bio, i_start);
270
271 if (is_sync) {
272 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
273 return __readpages_done(or, pcol, false);
274 }
275
276 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
277 if (!pcol_copy) {
278 ret = -ENOMEM;
279 goto err;
280 }
281
282 *pcol_copy = *pcol;
283 ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
284 if (unlikely(ret))
285 goto err;
286
287 atomic_inc(&pcol->sbi->s_curr_pending);
288
289 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
290 obj.id, _LLU(i_start), pcol->length);
291
292 /* ownership of the pages was passed on to pcol_copy */
293 _pcol_reset(pcol);
294 return 0;
295
296err:
297 if (!is_sync)
298 _unlock_pcol_pages(pcol, ret, READ);
299 kfree(pcol_copy);
300 if (or)
301 osd_end_request(or);
302 return ret;
303}
304
305/* readpage_strip is called either directly from readpage() or by the VFS from
306 * within read_cache_pages(), to add one more page to be read. It will try to
307 * collect as many contiguous pages as possible. If a discontinuity is
308 * encountered, or it runs out of resources, it will submit the previous segment
309 * and start a new collection. Eventually the caller must submit the last
310 * segment, if present.
311 */
312static int readpage_strip(void *data, struct page *page)
313{
314 struct page_collect *pcol = data;
315 struct inode *inode = pcol->inode;
316 struct exofs_i_info *oi = exofs_i(inode);
317 loff_t i_size = i_size_read(inode);
318 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
319 size_t len;
320 int ret;
321
322 /* FIXME: Just for debugging, will be removed */
323 if (PageUptodate(page))
324 EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
325 page->index);
326
327 if (page->index < end_index)
328 len = PAGE_CACHE_SIZE;
329 else if (page->index == end_index)
330 len = i_size & ~PAGE_CACHE_MASK;
331 else
332 len = 0;
333
334 if (!len || !obj_created(oi)) {
335 /* the page is out of bounds, or the object doesn't exist
336 * yet: clear the current page and split the request
337 */
338 clear_highpage(page);
339
340 SetPageUptodate(page);
341 if (PageError(page))
342 ClearPageError(page);
343
344 unlock_page(page);
345 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
346 " splitting\n", inode->i_ino, page->index);
347
348 return read_exec(pcol, false);
349 }
350
351try_again:
352
353 if (unlikely(pcol->pg_first == -1)) {
354 pcol->pg_first = page->index;
355 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
356 page->index)) {
357 /* Discontinuity detected, split the request */
358 ret = read_exec(pcol, false);
359 if (unlikely(ret))
360 goto fail;
361 goto try_again;
362 }
363
364 if (!pcol->bio) {
365 ret = pcol_try_alloc(pcol);
366 if (unlikely(ret))
367 goto fail;
368 }
369
370 if (len != PAGE_CACHE_SIZE)
371 zero_user(page, len, PAGE_CACHE_SIZE - len);
372
373 EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
374 inode->i_ino, page->index, len);
375
376 ret = pcol_add_page(pcol, page, len);
377 if (ret) {
378 EXOFS_DBGMSG("Failed pcol_add_page page=%p "
379 "this_len=0x%zx nr_pages=%u length=0x%lx\n",
380 page, len, pcol->nr_pages, pcol->length);
381
382 /* split the request, and start again with current page */
383 ret = read_exec(pcol, false);
384 if (unlikely(ret))
385 goto fail;
386
387 goto try_again;
388 }
389
390 return 0;
391
392fail:
393 /* SetPageError(page); ??? */
394 unlock_page(page);
395 return ret;
396}
397
398static int exofs_readpages(struct file *file, struct address_space *mapping,
399 struct list_head *pages, unsigned nr_pages)
400{
401 struct page_collect pcol;
402 int ret;
403
404 _pcol_init(&pcol, nr_pages, mapping->host);
405
406 ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
407 if (ret) {
408 EXOFS_ERR("read_cache_pages => %d\n", ret);
409 return ret;
410 }
411
412 return read_exec(&pcol, false);
413}
414
415static int _readpage(struct page *page, bool is_sync)
416{
417 struct page_collect pcol;
418 int ret;
419
420 _pcol_init(&pcol, 1, page->mapping->host);
421
422 /* readpage_strip might call read_exec(pcol, false) internally at several
423 * places, but this is safe even for is_sync=true, since those calls find
424 * an empty pcol (no bio yet) and do nothing.
425 */
426 ret = readpage_strip(&pcol, page);
427 if (ret) {
428 EXOFS_ERR("_readpage => %d\n", ret);
429 return ret;
430 }
431
432 return read_exec(&pcol, is_sync);
433}
434
435/*
436 * We don't need the file argument; the page's mapping has all we need.
437 */
438static int exofs_readpage(struct file *file, struct page *page)
439{
440 return _readpage(page, false);
441}
442
443/* Callback for osd_write. All writes are asynchronous */
444static void writepages_done(struct osd_request *or, void *p)
445{
446 struct page_collect *pcol = p;
447 struct bio_vec *bvec;
448 int i;
449 u64 resid;
450 u64 good_bytes;
451 u64 length = 0;
452
453 int ret = exofs_check_ok_resid(or, NULL, &resid);
454
455 osd_end_request(or);
456 atomic_dec(&pcol->sbi->s_curr_pending);
457
458 if (likely(!ret))
459 good_bytes = pcol->length;
460 else if (!resid)
461 good_bytes = 0;
462 else
463 good_bytes = pcol->length - resid;
464
465 EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
466 " length=0x%lx nr_pages=%u\n",
467 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
468 pcol->nr_pages);
469
470 __bio_for_each_segment(bvec, pcol->bio, i, 0) {
471 struct page *page = bvec->bv_page;
472 struct inode *inode = page->mapping->host;
473 int page_stat;
474
475 if (inode != pcol->inode)
476 continue; /* osd might add more pages to a bio */
477
478 if (likely(length < good_bytes))
479 page_stat = 0;
480 else
481 page_stat = ret;
482
483 update_write_page(page, page_stat);
484 unlock_page(page);
485 EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 inode->i_ino, page->index, page_stat);
487
488 length += bvec->bv_len;
489 }
490
491 pcol_free(pcol);
492 kfree(pcol);
493 EXOFS_DBGMSG("writepages_done END\n");
494}
495
496static int write_exec(struct page_collect *pcol)
497{
498 struct exofs_i_info *oi = exofs_i(pcol->inode);
499 struct osd_obj_id obj = {pcol->sbi->s_pid,
500 pcol->inode->i_ino + EXOFS_OBJ_OFF};
501 struct osd_request *or = NULL;
502 struct page_collect *pcol_copy = NULL;
503 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
504 int ret;
505
506 if (!pcol->bio)
507 return 0;
508
509 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
510 if (unlikely(!or)) {
511 EXOFS_ERR("write_exec: Failed to osd_start_request()\n");
512 ret = -ENOMEM;
513 goto err;
514 }
515
516 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
517 if (!pcol_copy) {
518 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
519 ret = -ENOMEM;
520 goto err;
521 }
522
523 *pcol_copy = *pcol;
524
525 osd_req_write(or, &obj, pcol_copy->bio, i_start);
526 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
527 if (unlikely(ret)) {
528 EXOFS_ERR("write_exec: exofs_async_op() failed\n");
529 goto err;
530 }
531
532 atomic_inc(&pcol->sbi->s_curr_pending);
533 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
534 pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
535 pcol->length);
536 /* ownership of the pages was passed on to pcol_copy */
537 _pcol_reset(pcol);
538 return 0;
539
540err:
541 _unlock_pcol_pages(pcol, ret, WRITE);
542 kfree(pcol_copy);
543 if (or)
544 osd_end_request(or);
545 return ret;
546}
547
548/* writepage_strip is called either directly from writepage() or by the VFS from
549 * within write_cache_pages(), to add one more page to be written to storage.
550 * It will try to collect as many contiguous pages as possible. If a
551 * discontinuity is encountered or it runs out of resources it will submit the
552 * previous segment and will start a new collection.
553 * Eventually the caller must submit the last segment, if present.
554 */
555static int writepage_strip(struct page *page,
556 struct writeback_control *wbc_unused, void *data)
557{
558 struct page_collect *pcol = data;
559 struct inode *inode = pcol->inode;
560 struct exofs_i_info *oi = exofs_i(inode);
561 loff_t i_size = i_size_read(inode);
562 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
563 size_t len;
564 int ret;
565
566 BUG_ON(!PageLocked(page));
567
568 ret = wait_obj_created(oi);
569 if (unlikely(ret))
570 goto fail;
571
572 if (page->index < end_index)
573 /* in this case, the page is within the limits of the file */
574 len = PAGE_CACHE_SIZE;
575 else {
576 len = i_size & ~PAGE_CACHE_MASK;
577
578 if (page->index > end_index || !len) {
579 /* in this case, the page is outside the limits
580 * (truncate in progress)
581 */
582 ret = write_exec(pcol);
583 if (unlikely(ret))
584 goto fail;
585 if (PageError(page))
586 ClearPageError(page);
587 unlock_page(page);
588 return 0;
589 }
590 }
591
592try_again:
593
594 if (unlikely(pcol->pg_first == -1)) {
595 pcol->pg_first = page->index;
596 } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
597 page->index)) {
598 /* Discontinuity detected, split the request */
599 ret = write_exec(pcol);
600 if (unlikely(ret))
601 goto fail;
602 goto try_again;
603 }
604
605 if (!pcol->bio) {
606 ret = pcol_try_alloc(pcol);
607 if (unlikely(ret))
608 goto fail;
609 }
610
611 EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
612 inode->i_ino, page->index, len);
613
614 ret = pcol_add_page(pcol, page, len);
615 if (unlikely(ret)) {
616 EXOFS_DBGMSG("Failed pcol_add_page "
617 "nr_pages=%u total_length=0x%lx\n",
618 pcol->nr_pages, pcol->length);
619
620 /* split the request, next loop will start again */
621 ret = write_exec(pcol);
622 if (unlikely(ret)) {
623 EXOFS_DBGMSG("write_exec failed => %d\n", ret);
624 goto fail;
625 }
626
627 goto try_again;
628 }
629
630 BUG_ON(PageWriteback(page));
631 set_page_writeback(page);
632
633 return 0;
634
635fail:
636 set_bit(AS_EIO, &page->mapping->flags);
637 unlock_page(page);
638 return ret;
639}
640
641static int exofs_writepages(struct address_space *mapping,
642 struct writeback_control *wbc)
643{
644 struct page_collect pcol;
645 long start, end, expected_pages;
646 int ret;
647
648 start = wbc->range_start >> PAGE_CACHE_SHIFT;
649 end = (wbc->range_end == LLONG_MAX) ?
650 start + mapping->nrpages :
651 wbc->range_end >> PAGE_CACHE_SHIFT;
652
653 if (start || end)
654 expected_pages = min(end - start + 1, 32L);
655 else
656 expected_pages = mapping->nrpages;
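 /* a bounded range gets a modest first allocation; _pcol_reset()
  * tops expected_pages back up if the collection keeps going
  */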
657
658 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
659 " m->nrpages=%lu start=0x%lx end=0x%lx\n",
660 mapping->host->i_ino, wbc->range_start, wbc->range_end,
661 mapping->nrpages, start, end);
662
663 _pcol_init(&pcol, expected_pages, mapping->host);
664
665 ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
666 if (ret) {
667 EXOFS_ERR("write_cache_pages => %d\n", ret);
668 return ret;
669 }
670
671 return write_exec(&pcol);
672}
673
674static int exofs_writepage(struct page *page, struct writeback_control *wbc)
675{
676 struct page_collect pcol;
677 int ret;
678
679 _pcol_init(&pcol, 1, page->mapping->host);
680
681 ret = writepage_strip(page, NULL, &pcol);
682 if (ret) {
683 EXOFS_ERR("exofs_writepage => %d\n", ret);
684 return ret;
685 }
686
687 return write_exec(&pcol);
688}
689
690int exofs_write_begin(struct file *file, struct address_space *mapping,
691 loff_t pos, unsigned len, unsigned flags,
692 struct page **pagep, void **fsdata)
693{
694 int ret = 0;
695 struct page *page;
696
697 page = *pagep;
698 if (page == NULL) {
699 ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
700 fsdata);
701 if (ret) {
702 EXOFS_DBGMSG("simple_write_begin failed\n");
703 return ret;
704 }
705
706 page = *pagep;
707 }
708
709 /* read-modify-write: read the page in first so a partial write preserves the rest of it */
710 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
711 ret = _readpage(page, true);
712 if (ret) {
713 /* SetPageError was done by _readpage. Is it ok? */
714 unlock_page(page);
715 EXOFS_DBGMSG("_readpage failed\n");
716 }
717 }
718
719 return ret;
720}
721
722static int exofs_write_begin_export(struct file *file,
723 struct address_space *mapping,
724 loff_t pos, unsigned len, unsigned flags,
725 struct page **pagep, void **fsdata)
726{
727 *pagep = NULL;
728
729 return exofs_write_begin(file, mapping, pos, len, flags, pagep,
730 fsdata);
731}
732
733const struct address_space_operations exofs_aops = {
734 .readpage = exofs_readpage,
735 .readpages = exofs_readpages,
736 .writepage = exofs_writepage,
737 .writepages = exofs_writepages,
738 .write_begin = exofs_write_begin_export,
739 .write_end = simple_write_end,
740};
741
742/******************************************************************************
743 * INODE OPERATIONS
744 *****************************************************************************/
745
746/*
747 * Test whether an inode is a fast symlink.
748 */
749static inline int exofs_inode_is_fast_symlink(struct inode *inode)
750{
751 struct exofs_i_info *oi = exofs_i(inode);
752
753 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
754}
755
756/*
757 * get_block_t - Fill in a buffer_head
758 * An OSD takes care of block allocation so we just fake an allocation by
759 * putting in the inode's sector_t in the buffer_head.
760 * TODO: What about the case of create==0 and @iblock does not exist in the
761 * object?
762 */
763static int exofs_get_block(struct inode *inode, sector_t iblock,
764 struct buffer_head *bh_result, int create)
765{
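 /* the OSD does the real allocation; the only caller here is
  * nobh_truncate_page() in exofs_truncate() below
  */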
766 map_bh(bh_result, inode->i_sb, iblock);
767 return 0;
768}
769
770const struct osd_attr g_attr_logical_length = ATTR_DEF(
771 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
772
773/*
774 * Truncate a file to the specified size - all we have to do is set the size
775 * attribute. We make sure the object exists first.
776 */
777void exofs_truncate(struct inode *inode)
778{
779 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
780 struct exofs_i_info *oi = exofs_i(inode);
781 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
782 struct osd_request *or;
783 struct osd_attr attr;
784 loff_t isize = i_size_read(inode);
785 __be64 newsize;
786 int ret;
787
788 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
789 || S_ISLNK(inode->i_mode)))
790 return;
791 if (exofs_inode_is_fast_symlink(inode))
792 return;
793 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
794 return;
795 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
796
797 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
798
799 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
800 if (unlikely(!or)) {
801 EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
802 goto fail;
803 }
804
805 osd_req_set_attributes(or, &obj);
806
807 newsize = cpu_to_be64((u64)isize);
808 attr = g_attr_logical_length;
809 attr.val_ptr = &newsize;
810 osd_req_add_set_attr_list(or, &attr, 1);
811
812 /* if we are about to truncate an object, and it hasn't been
813 * created yet, wait
814 */
815 if (unlikely(wait_obj_created(oi)))
816 goto fail;
817
818 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
819 osd_end_request(or);
820 if (ret)
821 goto fail;
822
823out:
824 mark_inode_dirty(inode);
825 return;
826fail:
827 make_bad_inode(inode);
828 goto out;
829}
830
831/*
832 * Set inode attributes - just call generic functions.
833 */
834int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
835{
836 struct inode *inode = dentry->d_inode;
837 int error;
838
839 error = inode_change_ok(inode, iattr);
840 if (error)
841 return error;
842
843 error = inode_setattr(inode, iattr);
844 return error;
845}
846
847/*
848 * Read an inode from the OSD, and return it as is. We also return the size
849 * attribute in the 'sanity' argument if we got compiled with debugging turned
850 * on.
851 */
852static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
853 struct exofs_fcb *inode, uint64_t *sanity)
854{
855 struct exofs_sb_info *sbi = sb->s_fs_info;
856 struct osd_request *or;
857 struct osd_attr attr;
858 struct osd_obj_id obj = {sbi->s_pid,
859 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
860 int ret;
861
862 exofs_make_credential(oi->i_cred, &obj);
863
864 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
865 if (unlikely(!or)) {
866 EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
867 return -ENOMEM;
868 }
869 osd_req_get_attributes(or, &obj);
870
871 /* we need the inode attribute */
872 osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
873
874#ifdef EXOFS_DEBUG_OBJ_ISIZE
875 /* we get the size attributes to do a sanity check */
876 osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
877#endif
878
879 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
880 if (ret)
881 goto out;
882
883 attr = g_attr_inode_data;
884 ret = extract_attr_from_req(or, &attr);
885 if (ret) {
886 EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
887 goto out;
888 }
889
890 WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
891 memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
892
893#ifdef EXOFS_DEBUG_OBJ_ISIZE
894 attr = g_attr_logical_length;
895 ret = extract_attr_from_req(or, &attr);
896 if (ret) {
897 EXOFS_ERR("ERROR: extract attr from or failed\n");
898 goto out;
899 }
900 *sanity = get_unaligned_be64(attr.val_ptr);
901#endif
902
903out:
904 osd_end_request(or);
905 return ret;
906}
907
908/*
909 * Fill in an inode read from the OSD and set it up for use
910 */
911struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
912{
913 struct exofs_i_info *oi;
914 struct exofs_fcb fcb;
915 struct inode *inode;
916 uint64_t uninitialized_var(sanity);
917 int ret;
918
919 inode = iget_locked(sb, ino);
920 if (!inode)
921 return ERR_PTR(-ENOMEM);
922 if (!(inode->i_state & I_NEW))
923 return inode;
924 oi = exofs_i(inode);
925
926 /* read the inode from the osd */
927 ret = exofs_get_inode(sb, oi, &fcb, &sanity);
928 if (ret)
929 goto bad_inode;
930
931 init_waitqueue_head(&oi->i_wq);
932 set_obj_created(oi);
933
934 /* copy stuff from on-disk struct to in-memory struct */
935 inode->i_mode = le16_to_cpu(fcb.i_mode);
936 inode->i_uid = le32_to_cpu(fcb.i_uid);
937 inode->i_gid = le32_to_cpu(fcb.i_gid);
938 inode->i_nlink = le16_to_cpu(fcb.i_links_count);
939 inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
940 inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
941 inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
942 inode->i_ctime.tv_nsec =
943 inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
944 oi->i_commit_size = le64_to_cpu(fcb.i_size);
945 i_size_write(inode, oi->i_commit_size);
946 inode->i_blkbits = EXOFS_BLKSHIFT;
947 inode->i_generation = le32_to_cpu(fcb.i_generation);
948
949#ifdef EXOFS_DEBUG_OBJ_ISIZE
950 if ((inode->i_size != sanity) &&
951 (!exofs_inode_is_fast_symlink(inode))) {
952 EXOFS_ERR("WARNING: Size of object from inode and "
953 "attributes differ (%lld != %llu)\n",
954 inode->i_size, _LLU(sanity));
955 }
956#endif
957
958 oi->i_dir_start_lookup = 0;
959
960 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
961 ret = -ESTALE;
962 goto bad_inode;
963 }
964
965 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
966 if (fcb.i_data[0])
967 inode->i_rdev =
968 old_decode_dev(le32_to_cpu(fcb.i_data[0]));
969 else
970 inode->i_rdev =
971 new_decode_dev(le32_to_cpu(fcb.i_data[1]));
972 } else {
973 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
974 }
975
976 if (S_ISREG(inode->i_mode)) {
977 inode->i_op = &exofs_file_inode_operations;
978 inode->i_fop = &exofs_file_operations;
979 inode->i_mapping->a_ops = &exofs_aops;
980 } else if (S_ISDIR(inode->i_mode)) {
981 inode->i_op = &exofs_dir_inode_operations;
982 inode->i_fop = &exofs_dir_operations;
983 inode->i_mapping->a_ops = &exofs_aops;
984 } else if (S_ISLNK(inode->i_mode)) {
985 if (exofs_inode_is_fast_symlink(inode))
986 inode->i_op = &exofs_fast_symlink_inode_operations;
987 else {
988 inode->i_op = &exofs_symlink_inode_operations;
989 inode->i_mapping->a_ops = &exofs_aops;
990 }
991 } else {
992 inode->i_op = &exofs_special_inode_operations;
993 if (fcb.i_data[0])
994 init_special_inode(inode, inode->i_mode,
995 old_decode_dev(le32_to_cpu(fcb.i_data[0])));
996 else
997 init_special_inode(inode, inode->i_mode,
998 new_decode_dev(le32_to_cpu(fcb.i_data[1])));
999 }
1000
1001 unlock_new_inode(inode);
1002 return inode;
1003
1004bad_inode:
1005 iget_failed(inode);
1006 return ERR_PTR(ret);
1007}
1008
1009int __exofs_wait_obj_created(struct exofs_i_info *oi)
1010{
1011 if (!obj_created(oi)) {
1012 BUG_ON(!obj_2bcreated(oi));
1013 wait_event(oi->i_wq, obj_created(oi));
1014 }
1015 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1016}
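/* Object-creation handshake: exofs_new_inode() marks the inode
 * obj_2bcreated and fires an asynchronous OSD create; create_done() below
 * either sets obj_created or marks the inode bad, then wakes i_wq.
 * Paths that touch the object first wait via wait_obj_created().
 */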
1017/*
1018 * Callback function from exofs_new_inode(). The important thing is that we
1019 * set the obj_created flag so that other methods know that the object exists on
1020 * the OSD.
1021 */
1022static void create_done(struct osd_request *or, void *p)
1023{
1024 struct inode *inode = p;
1025 struct exofs_i_info *oi = exofs_i(inode);
1026 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
1027 int ret;
1028
1029 ret = exofs_check_ok(or);
1030 osd_end_request(or);
1031 atomic_dec(&sbi->s_curr_pending);
1032
1033 if (unlikely(ret)) {
1034 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx\n",
1035 _LLU(inode->i_ino + EXOFS_OBJ_OFF), _LLU(sbi->s_pid));
1036 make_bad_inode(inode);
1037 } else
1038 set_obj_created(oi);
1039
1040 atomic_dec(&inode->i_count);
1041 wake_up(&oi->i_wq);
1042}
1043
1044/*
1045 * Set up a new inode and create an object for it on the OSD
1046 */
1047struct inode *exofs_new_inode(struct inode *dir, int mode)
1048{
1049 struct super_block *sb;
1050 struct inode *inode;
1051 struct exofs_i_info *oi;
1052 struct exofs_sb_info *sbi;
1053 struct osd_request *or;
1054 struct osd_obj_id obj;
1055 int ret;
1056
1057 sb = dir->i_sb;
1058 inode = new_inode(sb);
1059 if (!inode)
1060 return ERR_PTR(-ENOMEM);
1061
1062 oi = exofs_i(inode);
1063
1064 init_waitqueue_head(&oi->i_wq);
1065 set_obj_2bcreated(oi);
1066
1067 sbi = sb->s_fs_info;
1068
1069 sb->s_dirt = 1;
1070 inode->i_uid = current->cred->fsuid;
1071 if (dir->i_mode & S_ISGID) {
1072 inode->i_gid = dir->i_gid;
1073 if (S_ISDIR(mode))
1074 mode |= S_ISGID;
1075 } else {
1076 inode->i_gid = current->cred->fsgid;
1077 }
1078 inode->i_mode = mode;
1079
1080 inode->i_ino = sbi->s_nextid++;
1081 inode->i_blkbits = EXOFS_BLKSHIFT;
1082 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1083 oi->i_commit_size = inode->i_size = 0;
1084 spin_lock(&sbi->s_next_gen_lock);
1085 inode->i_generation = sbi->s_next_generation++;
1086 spin_unlock(&sbi->s_next_gen_lock);
1087 insert_inode_hash(inode);
1088
1089 mark_inode_dirty(inode);
1090
1091 obj.partition = sbi->s_pid;
1092 obj.id = inode->i_ino + EXOFS_OBJ_OFF;
1093 exofs_make_credential(oi->i_cred, &obj);
1094
1095 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1096 if (unlikely(!or)) {
1097 EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
1098 return ERR_PTR(-ENOMEM);
1099 }
1100
1101 osd_req_create_object(or, &obj);
1102
1103 /* increment the refcount so that the inode will still be around when we
1104 * reach the callback
1105 */
1106 atomic_inc(&inode->i_count);
1107
1108 ret = exofs_async_op(or, create_done, inode, oi->i_cred);
1109 if (ret) {
1110 atomic_dec(&inode->i_count);
1111 osd_end_request(or);
1112 return ERR_PTR(-EIO);
1113 }
1114 atomic_inc(&sbi->s_curr_pending);
1115
1116 return inode;
1117}
1118
1119/*
1120 * struct to pass two arguments to update_inode's callback
1121 */
1122struct updatei_args {
1123 struct exofs_sb_info *sbi;
1124 struct exofs_fcb fcb;
1125};
1126
1127/*
1128 * Callback function from exofs_update_inode().
1129 */
1130static void updatei_done(struct osd_request *or, void *p)
1131{
1132 struct updatei_args *args = p;
1133
1134 osd_end_request(or);
1135
1136 atomic_dec(&args->sbi->s_curr_pending);
1137
1138 kfree(args);
1139}
1140
1141/*
1142 * Write the inode to the OSD. Just fill up the struct, and set the attribute
1143 * synchronously or asynchronously depending on the do_sync flag.
1144 */
1145static int exofs_update_inode(struct inode *inode, int do_sync)
1146{
1147 struct exofs_i_info *oi = exofs_i(inode);
1148 struct super_block *sb = inode->i_sb;
1149 struct exofs_sb_info *sbi = sb->s_fs_info;
1150 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1151 struct osd_request *or;
1152 struct osd_attr attr;
1153 struct exofs_fcb *fcb;
1154 struct updatei_args *args;
1155 int ret;
1156
1157 args = kzalloc(sizeof(*args), GFP_KERNEL);
1158 if (!args)
1159 return -ENOMEM;
1160
1161 fcb = &args->fcb;
1162
1163 fcb->i_mode = cpu_to_le16(inode->i_mode);
1164 fcb->i_uid = cpu_to_le32(inode->i_uid);
1165 fcb->i_gid = cpu_to_le32(inode->i_gid);
1166 fcb->i_links_count = cpu_to_le16(inode->i_nlink);
1167 fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
1168 fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
1169 fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
1170 oi->i_commit_size = i_size_read(inode);
1171 fcb->i_size = cpu_to_le64(oi->i_commit_size);
1172 fcb->i_generation = cpu_to_le32(inode->i_generation);
1173
1174 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
1175 if (old_valid_dev(inode->i_rdev)) {
1176 fcb->i_data[0] =
1177 cpu_to_le32(old_encode_dev(inode->i_rdev));
1178 fcb->i_data[1] = 0;
1179 } else {
1180 fcb->i_data[0] = 0;
1181 fcb->i_data[1] =
1182 cpu_to_le32(new_encode_dev(inode->i_rdev));
1183 fcb->i_data[2] = 0;
1184 }
1185 } else
1186 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1187
1188 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1189 if (unlikely(!or)) {
1190 EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
1191 ret = -ENOMEM;
1192 goto free_args;
1193 }
1194
1195 osd_req_set_attributes(or, &obj);
1196
1197 attr = g_attr_inode_data;
1198 attr.val_ptr = fcb;
1199 osd_req_add_set_attr_list(or, &attr, 1);
1200
1201 if (!obj_created(oi)) {
1202 EXOFS_DBGMSG("!obj_created\n");
1203 BUG_ON(!obj_2bcreated(oi));
1204 wait_event(oi->i_wq, obj_created(oi));
1205 EXOFS_DBGMSG("wait_event done\n");
1206 }
1207
1208 if (do_sync) {
1209 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
1210 osd_end_request(or);
1211 goto free_args;
1212 } else {
1213 args->sbi = sbi;
1214
1215 ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
1216 if (ret) {
1217 osd_end_request(or);
1218 goto free_args;
1219 }
1220 atomic_inc(&sbi->s_curr_pending);
1221 goto out; /* deallocation in updatei_done */
1222 }
1223
1224free_args:
1225 kfree(args);
1226out:
1227 EXOFS_DBGMSG("ret=>%d\n", ret);
1228 return ret;
1229}
1230
1231int exofs_write_inode(struct inode *inode, int wait)
1232{
1233 return exofs_update_inode(inode, wait);
1234}
1235
1236/*
1237 * Callback function from exofs_delete_inode() - don't have much cleaning up to
1238 * do.
1239 */
1240static void delete_done(struct osd_request *or, void *p)
1241{
1242 struct exofs_sb_info *sbi;
1243 osd_end_request(or);
1244 sbi = p;
1245 atomic_dec(&sbi->s_curr_pending);
1246}
1247
1248/*
1249 * Called when the refcount of an inode reaches zero. We remove the object
1250 * from the OSD here. We make sure the object was created before we try and
1251 * delete it.
1252 */
1253void exofs_delete_inode(struct inode *inode)
1254{
1255 struct exofs_i_info *oi = exofs_i(inode);
1256 struct super_block *sb = inode->i_sb;
1257 struct exofs_sb_info *sbi = sb->s_fs_info;
1258 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
1259 struct osd_request *or;
1260 int ret;
1261
1262 truncate_inode_pages(&inode->i_data, 0);
1263
1264 if (is_bad_inode(inode))
1265 goto no_delete;
1266
1267 mark_inode_dirty(inode);
1268 exofs_update_inode(inode, inode_needs_sync(inode));
1269
1270 inode->i_size = 0;
1271 if (inode->i_blocks)
1272 exofs_truncate(inode);
1273
1274 clear_inode(inode);
1275
1276 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1277 if (unlikely(!or)) {
1278 EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
1279 return;
1280 }
1281
1282 osd_req_remove_object(or, &obj);
1283
1284 /* if we are deleting an obj that hasn't been created yet, wait */
1285 if (!obj_created(oi)) {
1286 BUG_ON(!obj_2bcreated(oi));
1287 wait_event(oi->i_wq, obj_created(oi));
1288 }
1289
1290 ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
1291 if (ret) {
1292 EXOFS_ERR(
1293 "ERROR: @exofs_delete_inode exofs_async_op failed\n");
1294 osd_end_request(or);
1295 return;
1296 }
1297 atomic_inc(&sbi->s_curr_pending);
1298
1299 return;
1300
1301no_delete:
1302 clear_inode(inode);
1303}
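The create, update and delete paths above all share one asynchronous
pattern. A minimal sketch of it, using only names from this file
(done_callback and context stand in for the per-operation callback and
its argument; request building and error paths vary per caller):

	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
	if (unlikely(!or))
		return -ENOMEM;
	/* build the command, e.g. osd_req_remove_object(or, &obj) */
	ret = exofs_async_op(or, done_callback, context, oi->i_cred);
	if (ret) {
		osd_end_request(or);
		return ret;
	}
	atomic_inc(&sbi->s_curr_pending);
	/* done_callback later calls osd_end_request() and decrements
	 * sbi->s_curr_pending, which exofs_put_super() waits to drain */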
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
new file mode 100644
index 000000000000..77fdd765e76d
--- /dev/null
+++ b/fs/exofs/namei.c
@@ -0,0 +1,342 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include "exofs.h"
37
38static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
39{
40 int err = exofs_add_link(dentry, inode);
41 if (!err) {
42 d_instantiate(dentry, inode);
43 return 0;
44 }
45 inode_dec_link_count(inode);
46 iput(inode);
47 return err;
48}
49
50static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
51 struct nameidata *nd)
52{
53 struct inode *inode;
54 ino_t ino;
55
56 if (dentry->d_name.len > EXOFS_NAME_LEN)
57 return ERR_PTR(-ENAMETOOLONG);
58
59 ino = exofs_inode_by_name(dir, dentry);
60 inode = NULL;
61 if (ino) {
62 inode = exofs_iget(dir->i_sb, ino);
63 if (IS_ERR(inode))
64 return ERR_CAST(inode);
65 }
66 return d_splice_alias(inode, dentry);
67}
68
69static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
70 struct nameidata *nd)
71{
72 struct inode *inode = exofs_new_inode(dir, mode);
73 int err = PTR_ERR(inode);
74 if (!IS_ERR(inode)) {
75 inode->i_op = &exofs_file_inode_operations;
76 inode->i_fop = &exofs_file_operations;
77 inode->i_mapping->a_ops = &exofs_aops;
78 mark_inode_dirty(inode);
79 err = exofs_add_nondir(dentry, inode);
80 }
81 return err;
82}
83
84static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
85 dev_t rdev)
86{
87 struct inode *inode;
88 int err;
89
90 if (!new_valid_dev(rdev))
91 return -EINVAL;
92
93 inode = exofs_new_inode(dir, mode);
94 err = PTR_ERR(inode);
95 if (!IS_ERR(inode)) {
96 init_special_inode(inode, inode->i_mode, rdev);
97 mark_inode_dirty(inode);
98 err = exofs_add_nondir(dentry, inode);
99 }
100 return err;
101}
102
103static int exofs_symlink(struct inode *dir, struct dentry *dentry,
104 const char *symname)
105{
106 struct super_block *sb = dir->i_sb;
107 int err = -ENAMETOOLONG;
108 unsigned l = strlen(symname)+1;
109 struct inode *inode;
110 struct exofs_i_info *oi;
111
112 if (l > sb->s_blocksize)
113 goto out;
114
115 inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
116 err = PTR_ERR(inode);
117 if (IS_ERR(inode))
118 goto out;
119
120 oi = exofs_i(inode);
121 if (l > sizeof(oi->i_data)) {
122 /* slow symlink */
123 inode->i_op = &exofs_symlink_inode_operations;
124 inode->i_mapping->a_ops = &exofs_aops;
125 memset(oi->i_data, 0, sizeof(oi->i_data));
126
127 err = page_symlink(inode, symname, l);
128 if (err)
129 goto out_fail;
130 } else {
131 /* fast symlink */
132 inode->i_op = &exofs_fast_symlink_inode_operations;
133 memcpy(oi->i_data, symname, l);
134 inode->i_size = l-1;
135 }
136 mark_inode_dirty(inode);
137
138 err = exofs_add_nondir(dentry, inode);
139out:
140 return err;
141
142out_fail:
143 inode_dec_link_count(inode);
144 iput(inode);
145 goto out;
146}
147
148static int exofs_link(struct dentry *old_dentry, struct inode *dir,
149 struct dentry *dentry)
150{
151 struct inode *inode = old_dentry->d_inode;
152
153 if (inode->i_nlink >= EXOFS_LINK_MAX)
154 return -EMLINK;
155
156 inode->i_ctime = CURRENT_TIME;
157 inode_inc_link_count(inode);
158 atomic_inc(&inode->i_count);
159
160 return exofs_add_nondir(dentry, inode);
161}
162
163static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
164{
165 struct inode *inode;
166 int err = -EMLINK;
167
168 if (dir->i_nlink >= EXOFS_LINK_MAX)
169 goto out;
170
171 inode_inc_link_count(dir);
172
173 inode = exofs_new_inode(dir, S_IFDIR | mode);
174 err = PTR_ERR(inode);
175 if (IS_ERR(inode))
176 goto out_dir;
177
178 inode->i_op = &exofs_dir_inode_operations;
179 inode->i_fop = &exofs_dir_operations;
180 inode->i_mapping->a_ops = &exofs_aops;
181
182 inode_inc_link_count(inode);
183
184 err = exofs_make_empty(inode, dir);
185 if (err)
186 goto out_fail;
187
188 err = exofs_add_link(dentry, inode);
189 if (err)
190 goto out_fail;
191
192 d_instantiate(dentry, inode);
193out:
194 return err;
195
196out_fail:
197 inode_dec_link_count(inode);
198 inode_dec_link_count(inode);
199 iput(inode);
200out_dir:
201 inode_dec_link_count(dir);
202 goto out;
203}
204
205static int exofs_unlink(struct inode *dir, struct dentry *dentry)
206{
207 struct inode *inode = dentry->d_inode;
208 struct exofs_dir_entry *de;
209 struct page *page;
210 int err = -ENOENT;
211
212 de = exofs_find_entry(dir, dentry, &page);
213 if (!de)
214 goto out;
215
216 err = exofs_delete_entry(de, page);
217 if (err)
218 goto out;
219
220 inode->i_ctime = dir->i_ctime;
221 inode_dec_link_count(inode);
222 err = 0;
223out:
224 return err;
225}
226
227static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
228{
229 struct inode *inode = dentry->d_inode;
230 int err = -ENOTEMPTY;
231
232 if (exofs_empty_dir(inode)) {
233 err = exofs_unlink(dir, dentry);
234 if (!err) {
235 inode->i_size = 0;
236 inode_dec_link_count(inode);
237 inode_dec_link_count(dir);
238 }
239 }
240 return err;
241}
242
243static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
244 struct inode *new_dir, struct dentry *new_dentry)
245{
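 /* Two cases below: if the target exists it is replaced in place via
  * exofs_set_link(), otherwise a new link is added via exofs_add_link().
  * A moving directory additionally gets its ".." entry (dir_de)
  * re-pointed at new_dir.
  */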
246 struct inode *old_inode = old_dentry->d_inode;
247 struct inode *new_inode = new_dentry->d_inode;
248 struct page *dir_page = NULL;
249 struct exofs_dir_entry *dir_de = NULL;
250 struct page *old_page;
251 struct exofs_dir_entry *old_de;
252 int err = -ENOENT;
253
254 old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
255 if (!old_de)
256 goto out;
257
258 if (S_ISDIR(old_inode->i_mode)) {
259 err = -EIO;
260 dir_de = exofs_dotdot(old_inode, &dir_page);
261 if (!dir_de)
262 goto out_old;
263 }
264
265 if (new_inode) {
266 struct page *new_page;
267 struct exofs_dir_entry *new_de;
268
269 err = -ENOTEMPTY;
270 if (dir_de && !exofs_empty_dir(new_inode))
271 goto out_dir;
272
273 err = -ENOENT;
274 new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
275 if (!new_de)
276 goto out_dir;
277 inode_inc_link_count(old_inode);
278 err = exofs_set_link(new_dir, new_de, new_page, old_inode);
279 new_inode->i_ctime = CURRENT_TIME;
280 if (dir_de)
281 drop_nlink(new_inode);
282 inode_dec_link_count(new_inode);
283 if (err)
284 goto out_dir;
285 } else {
286 if (dir_de) {
287 err = -EMLINK;
288 if (new_dir->i_nlink >= EXOFS_LINK_MAX)
289 goto out_dir;
290 }
291 inode_inc_link_count(old_inode);
292 err = exofs_add_link(new_dentry, old_inode);
293 if (err) {
294 inode_dec_link_count(old_inode);
295 goto out_dir;
296 }
297 if (dir_de)
298 inode_inc_link_count(new_dir);
299 }
300
301 old_inode->i_ctime = CURRENT_TIME;
302
303 exofs_delete_entry(old_de, old_page);
304 inode_dec_link_count(old_inode);
305
306 if (dir_de) {
307 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
308 inode_dec_link_count(old_dir);
309 if (err)
310 goto out_dir;
311 }
312 return 0;
313
314
315out_dir:
316 if (dir_de) {
317 kunmap(dir_page);
318 page_cache_release(dir_page);
319 }
320out_old:
321 kunmap(old_page);
322 page_cache_release(old_page);
323out:
324 return err;
325}
326
327const struct inode_operations exofs_dir_inode_operations = {
328 .create = exofs_create,
329 .lookup = exofs_lookup,
330 .link = exofs_link,
331 .unlink = exofs_unlink,
332 .symlink = exofs_symlink,
333 .mkdir = exofs_mkdir,
334 .rmdir = exofs_rmdir,
335 .mknod = exofs_mknod,
336 .rename = exofs_rename,
337 .setattr = exofs_setattr,
338};
339
340const struct inode_operations exofs_special_inode_operations = {
341 .setattr = exofs_setattr,
342};
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
new file mode 100644
index 000000000000..b249ae97fb15
--- /dev/null
+++ b/fs/exofs/osd.c
@@ -0,0 +1,153 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * This file is part of exofs.
10 *
11 * exofs is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation. Since it is based on ext2, and the only
14 * valid version of GPL for the Linux kernel is version 2, the only valid
15 * version of GPL for exofs is version 2.
16 *
17 * exofs is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with exofs; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include <scsi/scsi_device.h>
28#include <scsi/osd_sense.h>
29
30#include "exofs.h"
31
32int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
33{
34 struct osd_sense_info osi;
35 int ret = osd_req_decode_sense(or, &osi);
36
37 if (ret) { /* translate to Linux codes */
38 if (osi.additional_code == scsi_invalid_field_in_cdb) {
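 /* a starting-byte CDB error, i.e. the request began past the
  * object's logical length; it maps to -EFAULT, which
  * update_read_page() treats as a hole and zero-fills
  */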
39 if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
40 ret = -EFAULT;
41 else if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
42 ret = -ENOENT;
43 else
44 ret = -EINVAL;
45 } else if (osi.additional_code == osd_quota_error)
46 ret = -ENOSPC;
47 else
48 ret = -EIO;
49 }
50
51 /* FIXME: should be included in osd_sense_info */
52 if (in_resid)
53 *in_resid = or->in.req ? or->in.req->data_len : 0;
54
55 if (out_resid)
56 *out_resid = or->out.req ? or->out.req->data_len : 0;
57
58 return ret;
59}
60
61void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
62{
63 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
64}
65
66/*
67 * Perform a synchronous OSD operation.
68 */
69int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
70{
71 int ret;
72
73 or->timeout = timeout;
74 ret = osd_finalize_request(or, 0, credential, NULL);
75 if (ret) {
76 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
77 return ret;
78 }
79
80 ret = osd_execute_request(or);
81
82 if (ret)
83 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
84 /* osd_req_decode_sense(or, ret); */
85 return ret;
86}
87
88/*
89 * Perform an asynchronous OSD operation.
90 */
91int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
92 void *caller_context, u8 *cred)
93{
94 int ret;
95
96 ret = osd_finalize_request(or, 0, cred, NULL);
97 if (ret) {
98 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
99 return ret;
100 }
101
102 ret = osd_execute_request_async(or, async_done, caller_context);
103
104 if (ret)
105 EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
106 return ret;
107}
108
109int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
110{
111 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
112 void *iter = NULL;
113 int nelem;
114
115 do {
116 nelem = 1;
117 osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
118 if ((cur_attr.attr_page == attr->attr_page) &&
119 (cur_attr.attr_id == attr->attr_id)) {
120 attr->len = cur_attr.len;
121 attr->val_ptr = cur_attr.val_ptr;
122 return 0;
123 }
124 } while (iter);
125
126 return -EIO;
127}
128
129int osd_req_read_kern(struct osd_request *or,
130 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
131{
132 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
133 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
134
135 if (!bio)
136 return -ENOMEM;
137
138 osd_req_read(or, obj, bio, offset);
139 return 0;
140}
141
142int osd_req_write_kern(struct osd_request *or,
143 const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
144{
145 struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
146 struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
147
148 if (!bio)
149 return -ENOMEM;
150
151 osd_req_write(or, obj, bio, offset);
152 return 0;
153}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 000000000000..9f1985e857e2
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,584 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
4 * Copyright (C) 2005, 2006
5 * International Business Machines
6 * Copyright (C) 2008, 2009
7 * Boaz Harrosh <bharrosh@panasas.com>
8 *
9 * Copyrights for code taken from ext2:
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 * from
15 * linux/fs/minix/inode.c
16 * Copyright (C) 1991, 1992 Linus Torvalds
17 *
18 * This file is part of exofs.
19 *
20 * exofs is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation. Since it is based on ext2, and the only
23 * valid version of GPL for the Linux kernel is version 2, the only valid
24 * version of GPL for exofs is version 2.
25 *
26 * exofs is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with exofs; if not, write to the Free Software
33 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
34 */
35
36#include <linux/string.h>
37#include <linux/parser.h>
38#include <linux/vfs.h>
39#include <linux/random.h>
40#include <linux/exportfs.h>
41
42#include "exofs.h"
43
44/******************************************************************************
45 * MOUNT OPTIONS
46 *****************************************************************************/
47
48/*
49 * struct to hold what we get from mount options
50 */
51struct exofs_mountopt {
52 const char *dev_name;
53 uint64_t pid;
54 int timeout;
55};
56
57/*
58 * exofs-specific mount-time options.
59 */
60enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
61
62/*
63 * Our mount-time options. These should ideally be 64-bit unsigned, but the
64 * kernel's parsing functions do not currently support that. 32-bit should be
65 * sufficient for most applications now.
66 */
67static match_table_t tokens = {
68 {Opt_pid, "pid=%u"},
69 {Opt_to, "to=%u"},
70 {Opt_err, NULL}
71};
72
73/*
74 * The main option parsing method. Also makes sure that all of the mandatory
75 * mount options were set.
76 */
77static int parse_options(char *options, struct exofs_mountopt *opts)
78{
79 char *p;
80 substring_t args[MAX_OPT_ARGS];
81 int option;
82 bool s_pid = false;
83
84 EXOFS_DBGMSG("parse_options %s\n", options);
85 /* defaults */
86 memset(opts, 0, sizeof(*opts));
87 opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
88
89 while ((p = strsep(&options, ",")) != NULL) {
90 int token;
91 char str[32];
92
93 if (!*p)
94 continue;
95
96 token = match_token(p, tokens, args);
97 switch (token) {
98 case Opt_pid:
99 if (0 == match_strlcpy(str, &args[0], sizeof(str)))
100 return -EINVAL;
101 opts->pid = simple_strtoull(str, NULL, 0);
102 if (opts->pid < EXOFS_MIN_PID) {
103 EXOFS_ERR("Partition ID must be >= %u",
104 EXOFS_MIN_PID);
105 return -EINVAL;
106 }
107 s_pid = true;
108 break;
109 case Opt_to:
110 if (match_int(&args[0], &option))
111 return -EINVAL;
112 if (option <= 0) {
113 EXOFS_ERR("Timeout must be > 0");
114 return -EINVAL;
115 }
116 opts->timeout = option * HZ;
117 break;
118 }
119 }
120
121 if (!s_pid) {
122 EXOFS_ERR("Need to specify the following options:\n");
123 EXOFS_ERR(" -o pid=pid_no_to_use\n");
124 return -EINVAL;
125 }
126
127 return 0;
128}
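/* A typical mount, assuming an OSD ULD device node at /dev/osd0 (device
 * path and numbers here are only an example):
 *
 *	mount -t exofs -o pid=65536,to=30 /dev/osd0 /mnt/exofs
 *
 * "pid" selects the OSD partition holding the filesystem (mandatory and
 * >= EXOFS_MIN_PID); "to" overrides the command timeout, in seconds.
 */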
129
130/******************************************************************************
131 * INODE CACHE
132 *****************************************************************************/
133
134/*
135 * Our inode cache. Isn't it pretty?
136 */
137static struct kmem_cache *exofs_inode_cachep;
138
139/*
140 * Allocate an inode in the cache
141 */
142static struct inode *exofs_alloc_inode(struct super_block *sb)
143{
144 struct exofs_i_info *oi;
145
146 oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
147 if (!oi)
148 return NULL;
149
150 oi->vfs_inode.i_version = 1;
151 return &oi->vfs_inode;
152}
153
154/*
155 * Remove an inode from the cache
156 */
157static void exofs_destroy_inode(struct inode *inode)
158{
159 kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
160}
161
162/*
163 * Initialize the inode
164 */
165static void exofs_init_once(void *foo)
166{
167 struct exofs_i_info *oi = foo;
168
169 inode_init_once(&oi->vfs_inode);
170}
171
172/*
173 * Create and initialize the inode cache
174 */
175static int init_inodecache(void)
176{
177 exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
178 sizeof(struct exofs_i_info), 0,
179 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
180 exofs_init_once);
181 if (exofs_inode_cachep == NULL)
182 return -ENOMEM;
183 return 0;
184}
185
186/*
187 * Destroy the inode cache
188 */
189static void destroy_inodecache(void)
190{
191 kmem_cache_destroy(exofs_inode_cachep);
192}
193
194/******************************************************************************
195 * SUPERBLOCK FUNCTIONS
196 *****************************************************************************/
197static const struct super_operations exofs_sops;
198static const struct export_operations exofs_export_ops;
199
200/*
201 * Write the superblock to the OSD
202 */
203static void exofs_write_super(struct super_block *sb)
204{
205 struct exofs_sb_info *sbi;
206 struct exofs_fscb *fscb;
207 struct osd_request *or;
208 struct osd_obj_id obj;
209 int ret;
210
211 fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
212 if (!fscb) {
213 EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
214 return;
215 }
216
217 lock_kernel();
218 sbi = sb->s_fs_info;
219 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
220 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
221 fscb->s_magic = cpu_to_le16(sb->s_magic);
222 fscb->s_newfs = 0;
223
224 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
225 if (unlikely(!or)) {
226 EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
227 goto out;
228 }
229
230 obj.partition = sbi->s_pid;
231 obj.id = EXOFS_SUPER_ID;
232 ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
233 if (unlikely(ret)) {
234 EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
235 goto out;
236 }
237
238 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
239 if (unlikely(ret)) {
240 EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
241 goto out;
242 }
243 sb->s_dirt = 0;
244
245out:
246 if (or)
247 osd_end_request(or);
248 unlock_kernel();
249 kfree(fscb);
250}
251
252/*
253 * This function is called when the vfs is freeing the superblock. We just
254 * need to free our own part.
255 */
256static void exofs_put_super(struct super_block *sb)
257{
258 int num_pend;
259 struct exofs_sb_info *sbi = sb->s_fs_info;
260
261 /* make sure there are no pending commands */
262 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
263 num_pend = atomic_read(&sbi->s_curr_pending)) {
264 wait_queue_head_t wq;
265 init_waitqueue_head(&wq);
266 wait_event_timeout(wq,
267 (atomic_read(&sbi->s_curr_pending) == 0),
268 msecs_to_jiffies(100));
269 }
270
271 osduld_put_device(sbi->s_dev);
272 kfree(sb->s_fs_info);
273 sb->s_fs_info = NULL;
274}
275
276/*
277 * Read the superblock from the OSD and fill in the fields
278 */
279static int exofs_fill_super(struct super_block *sb, void *data, int silent)
280{
281 struct inode *root;
282 struct exofs_mountopt *opts = data;
283 struct exofs_sb_info *sbi; /*extended info */
284 struct exofs_fscb fscb; /*on-disk superblock info */
285 struct osd_request *or = NULL;
286 struct osd_obj_id obj;
287 int ret;
288
289 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
290 if (!sbi)
291 return -ENOMEM;
292 sb->s_fs_info = sbi;
293
294 /* use mount options to fill superblock */
295 sbi->s_dev = osduld_path_lookup(opts->dev_name);
296 if (IS_ERR(sbi->s_dev)) {
297 ret = PTR_ERR(sbi->s_dev);
298 sbi->s_dev = NULL;
299 goto free_sbi;
300 }
301
302 sbi->s_pid = opts->pid;
303 sbi->s_timeout = opts->timeout;
304
305 /* fill in some other data by hand */
306 memset(sb->s_id, 0, sizeof(sb->s_id));
307 strcpy(sb->s_id, "exofs");
308 sb->s_blocksize = EXOFS_BLKSIZE;
309 sb->s_blocksize_bits = EXOFS_BLKSHIFT;
310 sb->s_maxbytes = MAX_LFS_FILESIZE;
311 atomic_set(&sbi->s_curr_pending, 0);
312 sb->s_bdev = NULL;
313 sb->s_dev = 0;
314
315 /* read data from on-disk superblock object */
316 obj.partition = sbi->s_pid;
317 obj.id = EXOFS_SUPER_ID;
318 exofs_make_credential(sbi->s_cred, &obj);
319
320 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
321 if (unlikely(!or)) {
322 if (!silent)
323 EXOFS_ERR(
324 "exofs_fill_super: osd_start_request failed.\n");
325 ret = -ENOMEM;
326 goto free_sbi;
327 }
328 ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
329 if (unlikely(ret)) {
330 if (!silent)
331 EXOFS_ERR(
332 "exofs_fill_super: osd_req_read_kern failed.\n");
333 ret = -ENOMEM;
334 goto free_sbi;
335 }
336
337 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
338 if (unlikely(ret)) {
339 if (!silent)
340 EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
341 ret = -EIO;
342 goto free_sbi;
343 }
344
345 sb->s_magic = le16_to_cpu(fscb.s_magic);
346 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
347 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
348
349 /* make sure what we read from the object store is correct */
350 if (sb->s_magic != EXOFS_SUPER_MAGIC) {
351 if (!silent)
352 EXOFS_ERR("ERROR: Bad magic value\n");
353 ret = -EINVAL;
354 goto free_sbi;
355 }
356
357 /* start generation numbers from a random point */
358 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
359 spin_lock_init(&sbi->s_next_gen_lock);
360
361 /* set up operation vectors */
362 sb->s_op = &exofs_sops;
363 sb->s_export_op = &exofs_export_ops;
364 root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
365 if (IS_ERR(root)) {
366 EXOFS_ERR("ERROR: exofs_iget failed\n");
367 ret = PTR_ERR(root);
368 goto free_sbi;
369 }
370 sb->s_root = d_alloc_root(root);
371 if (!sb->s_root) {
372 iput(root);
373 EXOFS_ERR("ERROR: get root inode failed\n");
374 ret = -ENOMEM;
375 goto free_sbi;
376 }
377
378 if (!S_ISDIR(root->i_mode)) {
379 dput(sb->s_root);
380 sb->s_root = NULL;
381 EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
382 root->i_mode);
383 ret = -EINVAL;
384 goto free_sbi;
385 }
386
387 ret = 0;
388out:
389 if (or)
390 osd_end_request(or);
391 return ret;
392
393free_sbi:
394 osduld_put_device(sbi->s_dev); /* NULL safe */
395 kfree(sbi);
396 goto out;
397}
398
399/*
400 * Set up the superblock (calls exofs_fill_super eventually)
401 */
402static int exofs_get_sb(struct file_system_type *type,
403 int flags, const char *dev_name,
404 void *data, struct vfsmount *mnt)
405{
406 struct exofs_mountopt opts;
407 int ret;
408
409 ret = parse_options(data, &opts);
410 if (ret)
411 return ret;
412
413 opts.dev_name = dev_name;
414 return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
415}

/*
 * Return information about the file system state in the buffer.  This is used
 * by the 'df' command, for example.
 */
static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct exofs_sb_info *sbi = sb->s_fs_info;
	struct osd_obj_id obj = {sbi->s_pid, 0};
	struct osd_attr attrs[] = {
		ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
			OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
		ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
			OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
	};
	uint64_t capacity = ULLONG_MAX;
	uint64_t used = ULLONG_MAX;
	struct osd_request *or;
	uint8_t cred_a[OSD_CAP_LEN];
	int ret;

	/* get used/capacity attributes */
	exofs_make_credential(cred_a, &obj);

	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
	if (unlikely(!or)) {
		EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
		return -ENOMEM;
	}

	osd_req_get_attributes(or, &obj);
	osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
	ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
	if (unlikely(ret))
		goto out;

	ret = extract_attr_from_req(or, &attrs[0]);
	if (likely(!ret))
		capacity = get_unaligned_be64(attrs[0].val_ptr);
	else
		EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");

	ret = extract_attr_from_req(or, &attrs[1]);
	if (likely(!ret))
		used = get_unaligned_be64(attrs[1].val_ptr);
	else
		EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");

	/* fill in the stats buffer */
	buf->f_type = EXOFS_SUPER_MAGIC;
	buf->f_bsize = EXOFS_BLKSIZE;
	buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
	buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
	buf->f_bavail = buf->f_bfree;
	buf->f_files = sbi->s_numfiles;
	buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
	buf->f_namelen = EXOFS_NAME_LEN;

out:
	osd_end_request(or);
	return ret;
}
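[Editor's aside] The statfs path above turns two byte-granular OSD attributes (the partition's capacity quota and its used capacity) into block counts by shifting. A standalone sketch of the same arithmetic, assuming a 4 KiB filesystem block; BLKSHIFT here is a stand-in for EXOFS_BLKSHIFT, whose real value lives in exofs.h:

#include <stdint.h>
#include <stdio.h>

#define BLKSHIFT 12	/* hypothetical stand-in for EXOFS_BLKSHIFT */

int main(void)
{
	uint64_t capacity = 100ULL << 30;	/* 100 GiB quota from the OSD */
	uint64_t used = 25ULL << 30;		/* 25 GiB used capacity */

	/* same shifts as the kernel code: bytes -> blocks */
	uint64_t f_blocks = capacity >> BLKSHIFT;
	uint64_t f_bfree = (capacity - used) >> BLKSHIFT;

	printf("f_blocks=%llu f_bfree=%llu\n",
	       (unsigned long long)f_blocks, (unsigned long long)f_bfree);
	return 0;
}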

static const struct super_operations exofs_sops = {
	.alloc_inode    = exofs_alloc_inode,
	.destroy_inode  = exofs_destroy_inode,
	.write_inode    = exofs_write_inode,
	.delete_inode   = exofs_delete_inode,
	.put_super      = exofs_put_super,
	.write_super    = exofs_write_super,
	.statfs         = exofs_statfs,
};

/******************************************************************************
 * EXPORT OPERATIONS
 *****************************************************************************/

struct dentry *exofs_get_parent(struct dentry *child)
{
	unsigned long ino = exofs_parent_ino(child);

	if (!ino)
		return NULL;

	return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
}

static struct inode *exofs_nfs_get_inode(struct super_block *sb,
		u64 ino, u32 generation)
{
	struct inode *inode;

	inode = exofs_iget(sb, ino);
	if (IS_ERR(inode))
		return ERR_CAST(inode);
	if (generation && inode->i_generation != generation) {
		/* we didn't find the right inode.. */
		iput(inode);
		return ERR_PTR(-ESTALE);
	}
	return inode;
}

static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
				struct fid *fid, int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    exofs_nfs_get_inode);
}

static struct dentry *exofs_fh_to_parent(struct super_block *sb,
				struct fid *fid, int fh_len, int fh_type)
{
	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
				    exofs_nfs_get_inode);
}

static const struct export_operations exofs_export_ops = {
	.fh_to_dentry = exofs_fh_to_dentry,
	.fh_to_parent = exofs_fh_to_parent,
	.get_parent = exofs_get_parent,
};
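[Editor's aside] The generation check in exofs_nfs_get_inode() is what makes the randomized s_next_generation seed earlier in fill_super useful: an NFS file handle caches (ino, generation), so when an inode number is recycled for a new file the stale handle must be refused rather than silently resolving to the wrong object. A self-contained illustration of that check:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct fake_inode { uint64_t ino; uint32_t generation; };

static int fh_lookup(const struct fake_inode *cur,
		     uint64_t fh_ino, uint32_t fh_gen)
{
	if (cur->ino != fh_ino)
		return -ENOENT;
	/* same rule as exofs_nfs_get_inode(): generation 0 means "don't care" */
	if (fh_gen && cur->generation != fh_gen)
		return -ESTALE;
	return 0;
}

int main(void)
{
	/* inode number 42 was deleted and reused; generation bumped 6 -> 7 */
	struct fake_inode reused = { .ino = 42, .generation = 7 };

	printf("old handle -> %d (expect -ESTALE=%d)\n",
	       fh_lookup(&reused, 42, 6), -ESTALE);
	printf("new handle -> %d\n", fh_lookup(&reused, 42, 7));
	return 0;
}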

/******************************************************************************
 * INSMOD/RMMOD
 *****************************************************************************/

/*
 * struct that describes this file system
 */
static struct file_system_type exofs_type = {
	.owner          = THIS_MODULE,
	.name           = "exofs",
	.get_sb         = exofs_get_sb,
	.kill_sb        = generic_shutdown_super,
};

static int __init init_exofs(void)
{
	int err;

	err = init_inodecache();
	if (err)
		goto out;

	err = register_filesystem(&exofs_type);
	if (err)
		goto out_d;

	return 0;
out_d:
	destroy_inodecache();
out:
	return err;
}

static void __exit exit_exofs(void)
{
	unregister_filesystem(&exofs_type);
	destroy_inodecache();
}

MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
MODULE_DESCRIPTION("exofs");
MODULE_LICENSE("GPL");

module_init(init_exofs)
module_exit(exit_exofs)
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
new file mode 100644
index 000000000000..36e2d7bc7f7b
--- /dev/null
+++ b/fs/exofs/symlink.c
@@ -0,0 +1,57 @@
/*
 * Copyright (C) 2005, 2006
 * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
 * Copyright (C) 2005, 2006
 * International Business Machines
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
 * Copyrights for code taken from ext2:
 *     Copyright (C) 1992, 1993, 1994, 1995
 *     Remy Card (card@masi.ibp.fr)
 *     Laboratoire MASI - Institut Blaise Pascal
 *     Universite Pierre et Marie Curie (Paris VI)
 *     from
 *     linux/fs/minix/inode.c
 *     Copyright (C) 1991, 1992  Linus Torvalds
 *
 * This file is part of exofs.
 *
 * exofs is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation.  Since it is based on ext2, and the only
 * valid version of GPL for the Linux kernel is version 2, the only valid
 * version of GPL for exofs is version 2.
 *
 * exofs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with exofs; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include <linux/namei.h>

#include "exofs.h"

static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	struct exofs_i_info *oi = exofs_i(dentry->d_inode);

	nd_set_link(nd, (char *)oi->i_data);
	return NULL;
}

const struct inode_operations exofs_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
};

const struct inode_operations exofs_fast_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.follow_link	= exofs_follow_link,
};
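[Editor's aside] The two operation tables encode the usual ext2-style split: a "fast" symlink keeps its target inline in the inode body (oi->i_data, read by exofs_follow_link() above), while longer targets go through the page cache via page_follow_link_light(). The threshold below is hypothetical, purely to illustrate the decision:

#include <stdio.h>
#include <string.h>

#define INLINE_AREA 60	/* hypothetical: bytes available inside the inode */

static const char *symlink_kind(const char *target)
{
	/* +1 for the terminating NUL kept with the inline copy */
	return (strlen(target) + 1 <= INLINE_AREA) ? "fast" : "page";
}

int main(void)
{
	printf("%s\n", symlink_kind("short"));
	printf("%s\n", symlink_kind("a/very/long/target/path/that/clearly/"
				    "does/not/fit/inside/the/inode/itself"));
	return 0;
}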
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index ae8c4f850b27..d46e38cb85c5 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 4a29d6376081..7f8d2e5a7ea6 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -570,7 +570,7 @@ do_more:
 error_return:
 	brelse(bitmap_bh);
 	release_blocks(sb, freed);
-	DQUOT_FREE_BLOCK(inode, freed);
+	vfs_dq_free_block(inode, freed);
 }
 
 /**
@@ -1247,7 +1247,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (DQUOT_ALLOC_BLOCK(inode, num)) {
+	if (vfs_dq_alloc_block(inode, num)) {
 		*errp = -EDQUOT;
 		return 0;
 	}
@@ -1409,7 +1409,7 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	DQUOT_FREE_BLOCK(inode, *count-num);
+	vfs_dq_free_block(inode, *count-num);
 	*count = num;
 	return ret_block;
 
@@ -1420,7 +1420,7 @@ out:
 	 * Undo the block allocation
 	 */
 	if (!performed_allocation)
-		DQUOT_FREE_BLOCK(inode, *count);
+		vfs_dq_free_block(inode, *count);
 	brelse(bitmap_bh);
 	return 0;
 }
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 66321a877e74..15387c9c17d8 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode)
 	if (!is_bad_inode(inode)) {
 		/* Quota is already initialized in iput() */
 		ext2_xattr_delete_inode(inode);
-		DQUOT_FREE_INODE(inode);
-		DQUOT_DROP(inode);
+		vfs_dq_free_inode(inode);
+		vfs_dq_drop(inode);
 	}
 
 	es = EXT2_SB(sb)->s_es;
@@ -586,7 +586,7 @@ got:
 		goto fail_drop;
 	}
 
-	if (DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		err = -EDQUOT;
 		goto fail_drop;
 	}
@@ -605,10 +605,10 @@ got:
 	return inode;
 
 fail_free_drop:
-	DQUOT_FREE_INODE(inode);
+	vfs_dq_free_inode(inode);
 
 fail_drop:
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
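[Editor's aside] Almost all of the ext2/ext3/ext4 quota churn in this merge is a mechanical rename of the old DQUOT_* macro entry points to vfs_dq_* helpers, visible in the hunks above and below. Modeling the mapping with stubs (illustrative only; the real helpers live in the kernel's quota headers):

#include <stdio.h>

struct inode;	/* opaque here; pointers to incomplete types are fine */

static int vfs_dq_alloc_block(struct inode *i, int n)
{ (void)i; printf("charge %d block(s)\n", n); return 0; }
static void vfs_dq_free_block(struct inode *i, int n)
{ (void)i; printf("release %d block(s)\n", n); }

/* the removed macro names, expressed in terms of the new helpers */
#define DQUOT_ALLOC_BLOCK(i, n)	vfs_dq_alloc_block((i), (n))
#define DQUOT_FREE_BLOCK(i, n)	vfs_dq_free_block((i), (n))
/* likewise in the hunks: DQUOT_INIT -> vfs_dq_init, DQUOT_DROP ->
 * vfs_dq_drop, DQUOT_ALLOC_INODE -> vfs_dq_alloc_inode, DQUOT_FREE_INODE ->
 * vfs_dq_free_inode, DQUOT_TRANSFER -> vfs_dq_transfer */

int main(void)
{
	if (!DQUOT_ALLOC_BLOCK(NULL, 1))
		DQUOT_FREE_BLOCK(NULL, 1);
	return 0;
}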
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 23fff2f87783..b43b95563663 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1444,7 +1444,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 		return error;
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
-		error = DQUOT_TRANSFER(inode, iattr) ? -EDQUOT : 0;
+		error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0;
 		if (error)
 			return error;
 	}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7c6e3606f0ec..f983225266dc 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1331,6 +1331,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
 			sb->s_blocksize - offset : toread;
 
 		tmp_bh.b_state = 0;
+		tmp_bh.b_size = sb->s_blocksize;
 		err = ext2_get_block(inode, blk, &tmp_bh, 0);
 		if (err < 0)
 			return err;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 987a5261cc2e..7913531ec6d5 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -642,7 +642,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			ea_bdebug(new_bh, "reusing block");
 
 			error = -EDQUOT;
-			if (DQUOT_ALLOC_BLOCK(inode, 1)) {
+			if (vfs_dq_alloc_block(inode, 1)) {
 				unlock_buffer(new_bh);
 				goto cleanup;
 			}
@@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		 * as if nothing happened and cleanup the unused block */
 		if (error && error != -ENOSPC) {
 			if (new_bh && new_bh != old_bh)
-				DQUOT_FREE_BLOCK(inode, 1);
+				vfs_dq_free_block(inode, 1);
 			goto cleanup;
 		}
 	} else
@@ -731,7 +731,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
 		if (ce)
 			mb_cache_entry_release(ce);
-		DQUOT_FREE_BLOCK(inode, 1);
+		vfs_dq_free_block(inode, 1);
 		mark_buffer_dirty(old_bh);
 		ea_bdebug(old_bh, "refcount now=%d",
 			le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -794,7 +794,7 @@ ext2_xattr_delete_inode(struct inode *inode)
 		mark_buffer_dirty(bh);
 		if (IS_SYNC(inode))
 			sync_dirty_buffer(bh);
-		DQUOT_FREE_BLOCK(inode, 1);
+		vfs_dq_free_block(inode, 1);
 	}
 	EXT2_I(inode)->i_file_acl = 0;
 
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index b60bb241880c..d81ef2fdb08e 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 0dbf1c048475..225202db8974 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -676,7 +676,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
 	}
 	ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
-		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+		vfs_dq_free_block(inode, dquot_freed_blocks);
 	return;
 }
 
@@ -1502,7 +1502,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (DQUOT_ALLOC_BLOCK(inode, num)) {
+	if (vfs_dq_alloc_block(inode, num)) {
 		*errp = -EDQUOT;
 		return 0;
 	}
@@ -1714,7 +1714,7 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	DQUOT_FREE_BLOCK(inode, *count-num);
+	vfs_dq_free_block(inode, *count-num);
 	*count = num;
 	return ret_block;
 
@@ -1729,7 +1729,7 @@ out:
 	 * Undo the block allocation
 	 */
 	if (!performed_allocation)
-		DQUOT_FREE_BLOCK(inode, *count);
+		vfs_dq_free_block(inode, *count);
 	brelse(bitmap_bh);
 	return 0;
 }
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 5853f4440af4..3d724a95882f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
-	.ioctl		= ext3_ioctl,		/* BKL held */
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 3be1e0689c9a..5b49704b231b 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -33,6 +33,10 @@
  */
 static int ext3_release_file (struct inode * inode, struct file * filp)
 {
+	if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
+		filemap_flush(inode->i_mapping);
+		EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
+	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1))
@@ -112,7 +116,7 @@ const struct file_operations ext3_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= ext3_file_write,
-	.ioctl		= ext3_ioctl,
+	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
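[Editor's aside] The new EXT3_STATE_FLUSH_ON_CLOSE protocol spans two hunks: ext3_truncate() (further down in this diff) sets the flag when a file is truncated to zero in writeback mode, and ext3_release_file() above flushes and clears it on close. A toy state machine of that handshake — an interpretation of the patch, not kernel code:

#include <stdio.h>

#define FLUSH_ON_CLOSE 0x1

struct toy_inode { unsigned state; };

static void truncate_to_zero(struct toy_inode *i)
{
	i->state |= FLUSH_ON_CLOSE;	/* as in ext3_truncate() */
}

static void release(struct toy_inode *i)
{
	if (i->state & FLUSH_ON_CLOSE) {
		printf("filemap_flush()\n");	/* push dirty pages out */
		i->state &= ~FLUSH_ON_CLOSE;
	}
}

int main(void)
{
	struct toy_inode i = { 0 };
	truncate_to_zero(&i);
	release(&i);	/* flushes once */
	release(&i);	/* no-op afterwards */
	return 0;
}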
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 8de6c720e510..dd13d60d524b 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
123 * Note: we must free any quota before locking the superblock, 123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well. 124 * as writing the quota to disk may need the lock as well.
125 */ 125 */
126 DQUOT_INIT(inode); 126 vfs_dq_init(inode);
127 ext3_xattr_delete_inode(handle, inode); 127 ext3_xattr_delete_inode(handle, inode);
128 DQUOT_FREE_INODE(inode); 128 vfs_dq_free_inode(inode);
129 DQUOT_DROP(inode); 129 vfs_dq_drop(inode);
130 130
131 is_directory = S_ISDIR(inode->i_mode); 131 is_directory = S_ISDIR(inode->i_mode);
132 132
@@ -589,7 +589,7 @@ got:
589 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 589 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
590 590
591 ret = inode; 591 ret = inode;
592 if(DQUOT_ALLOC_INODE(inode)) { 592 if (vfs_dq_alloc_inode(inode)) {
593 err = -EDQUOT; 593 err = -EDQUOT;
594 goto fail_drop; 594 goto fail_drop;
595 } 595 }
@@ -620,10 +620,10 @@ really_out:
620 return ret; 620 return ret;
621 621
622fail_free_drop: 622fail_free_drop:
623 DQUOT_FREE_INODE(inode); 623 vfs_dq_free_inode(inode);
624 624
625fail_drop: 625fail_drop:
626 DQUOT_DROP(inode); 626 vfs_dq_drop(inode);
627 inode->i_flags |= S_NOQUOTA; 627 inode->i_flags |= S_NOQUOTA;
628 inode->i_nlink = 0; 628 inode->i_nlink = 0;
629 unlock_new_inode(inode); 629 unlock_new_inode(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5fa453b49a64..466a332e0bd1 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1149,12 +1149,15 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
-	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+	int ret;
 	handle_t *handle;
 	int retries = 0;
 	struct page *page;
 	pgoff_t index;
 	unsigned from, to;
+	/* Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason */
+	int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
 
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1184,15 +1187,20 @@ retry:
 	}
 write_begin_failed:
 	if (ret) {
-		ext3_journal_stop(handle);
-		unlock_page(page);
-		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
 		 * i_size_read because we hold i_mutex.
+		 *
+		 * Add inode to orphan list in case we crash before truncate
+		 * finishes.
 		 */
 		if (pos + len > inode->i_size)
+			ext3_orphan_add(handle, inode);
+		ext3_journal_stop(handle);
+		unlock_page(page);
+		page_cache_release(page);
+		if (pos + len > inode->i_size)
 			vmtruncate(inode, inode->i_size);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
@@ -1211,6 +1219,18 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	return err;
 }
 
+/* For ordered writepage and write_end functions */
+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+	/*
+	 * Write could have mapped the buffer but it didn't copy the data in
+	 * yet. So avoid filing such buffer into a transaction.
+	 */
+	if (buffer_mapped(bh) && buffer_uptodate(bh))
+		return ext3_journal_dirty_data(handle, bh);
+	return 0;
+}
+
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
@@ -1221,26 +1241,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * Generic write_end handler for ordered and writeback ext3 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
- * after block_write_end.
+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
+ * for the whole page but later we failed to copy the data in. Update inode
+ * size according to what we managed to copy. The rest is going to be
+ * truncated in write_end function.
  */
-static int ext3_generic_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
 {
-	struct inode *inode = file->f_mapping->host;
-
-	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
+	/* What matters to us is i_disksize. We don't write i_size anywhere */
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	if (pos + copied > EXT3_I(inode)->i_disksize) {
+		EXT3_I(inode)->i_disksize = pos + copied;
 		mark_inode_dirty(inode);
 	}
-
-	return copied;
 }
 
 /*
@@ -1260,35 +1274,29 @@ static int ext3_ordered_write_end(struct file *file,
 	unsigned from, to;
 	int ret = 0, ret2;
 
-	from = pos & (PAGE_CACHE_SIZE - 1);
-	to = from + len;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + copied;
 	ret = walk_page_buffers(handle, page_buffers(page),
-		from, to, NULL, ext3_journal_dirty_data);
+		from, to, NULL, journal_dirty_data_fn);
 
-	if (ret == 0) {
-		/*
-		 * generic_write_end() will run mark_inode_dirty() if i_size
-		 * changes. So let's piggyback the i_disksize mark_inode_dirty
-		 * into that.
-		 */
-		loff_t new_i_size;
-
-		new_i_size = pos + copied;
-		if (new_i_size > EXT3_I(inode)->i_disksize)
-			EXT3_I(inode)->i_disksize = new_i_size;
-		ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-		copied = ret2;
-		if (ret2 < 0)
-			ret = ret2;
-	}
+	if (ret == 0)
+		update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1299,25 +1307,22 @@ static int ext3_writeback_write_end(struct file *file,
 {
 	handle_t *handle = ext3_journal_current_handle();
 	struct inode *inode = file->f_mapping->host;
-	int ret = 0, ret2;
-	loff_t new_i_size;
-
-	new_i_size = pos + copied;
-	if (new_i_size > EXT3_I(inode)->i_disksize)
-		EXT3_I(inode)->i_disksize = new_i_size;
-
-	ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	copied = ret2;
-	if (ret2 < 0)
-		ret = ret2;
+	int ret;
 
-	ret2 = ext3_journal_stop(handle);
-	if (!ret)
-		ret = ret2;
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	update_file_sizes(inode, pos, copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
+	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1338,15 +1343,23 @@ static int ext3_journalled_write_end(struct file *file,
 	if (copied < len) {
 		if (!PageUptodate(page))
 			copied = 0;
-		page_zero_new_buffers(page, from+copied, to);
+		page_zero_new_buffers(page, from + copied, to);
+		to = from + copied;
 	}
 
 	ret = walk_page_buffers(handle, page_buffers(page), from,
 				to, &partial, write_end_fn);
 	if (!partial)
 		SetPageUptodate(page);
-	if (pos+copied > inode->i_size)
-		i_size_write(inode, pos+copied);
+
+	if (pos + copied > inode->i_size)
+		i_size_write(inode, pos + copied);
+	/*
+	 * There may be allocated blocks outside of i_size because
+	 * we failed to copy some data. Prepare for truncate.
+	 */
+	if (pos + len > inode->i_size)
+		ext3_orphan_add(handle, inode);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
 		EXT3_I(inode)->i_disksize = inode->i_size;
@@ -1361,6 +1374,8 @@ static int ext3_journalled_write_end(struct file *file,
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (pos + len > inode->i_size)
+		vmtruncate(inode, inode->i_size);
 	return ret ? ret : copied;
 }
 
@@ -1428,11 +1443,9 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
 {
-	if (buffer_mapped(bh))
-		return ext3_journal_dirty_data(handle, bh);
-	return 0;
+	return !buffer_mapped(bh);
 }
 
 /*
@@ -1505,6 +1518,15 @@ static int ext3_ordered_writepage(struct page *page,
 	if (ext3_journal_current_handle())
 		goto out_fail;
 
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, inode->i_sb->s_blocksize,
+				(1 << BH_Dirty)|(1 << BH_Uptodate));
+	} else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
+		/* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */
+		return block_write_full_page(page, NULL, wbc);
+	}
+	page_bufs = page_buffers(page);
+
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 
 	if (IS_ERR(handle)) {
@@ -1512,11 +1534,6 @@ static int ext3_ordered_writepage(struct page *page,
 		goto out_fail;
 	}
 
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, inode->i_sb->s_blocksize,
-				(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
-	page_bufs = page_buffers(page);
 	walk_page_buffers(handle, page_bufs, 0,
 			PAGE_CACHE_SIZE, NULL, bget_one);
 
@@ -2346,6 +2363,9 @@ void ext3_truncate(struct inode *inode)
 	if (!ext3_can_truncate(inode))
 		return;
 
+	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
+
 	/*
 	 * We have to lock the EOF page here, because lock_page() nests
 	 * outside journal_start().
@@ -3055,7 +3075,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 		error = PTR_ERR(handle);
 		goto err_out;
 	}
-	error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+	error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
 	if (error) {
 		ext3_journal_stop(handle);
 		return error;
@@ -3146,7 +3166,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
 	ret = 2 * (bpp + indirects) + 2;
 
 #ifdef CONFIG_QUOTA
-	/* We know that structure was already allocated during DQUOT_INIT so
+	/* We know that structure was already allocated during vfs_dq_init so
 	 * we will be updating only the data blocks + inodes */
 	ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
 #endif
@@ -3237,7 +3257,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
 	 * i_size has been changed by generic_commit_write() and we thus need
 	 * to include the updated inode in the current transaction.
 	 *
-	 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+	 * Also, vfs_dq_alloc_space() will always dirty the inode when blocks
 	 * are allocated to the file.
 	 *
 	 * If the inode is marked synchronous, we don't honour that here - doing
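[Editor's aside] The common thread in the reworked write_end paths above: if the copy came up short (pos + len ends up beyond i_size), allocated blocks may exist past end-of-file, so the inode is put on the orphan list before the journal handle is closed, and only afterwards truncated back — a crash between the two steps then cannot leak the blocks. A self-contained caricature of that ordering:

#include <stdio.h>

struct toy_inode { long long i_size; long long i_disksize; int on_orphan; };

static void write_end(struct toy_inode *inode, long long pos,
		      unsigned len, unsigned copied)
{
	if (pos + copied > inode->i_size)		/* update_file_sizes() */
		inode->i_size = pos + copied;
	if (pos + copied > inode->i_disksize)
		inode->i_disksize = pos + copied;
	if (pos + (long long)len > inode->i_size)
		inode->on_orphan = 1;			/* ext3_orphan_add() */
	/* ...journal stop, unlock page... */
	if (pos + (long long)len > inode->i_size)
		printf("vmtruncate back to %lld (orphaned=%d)\n",
		       inode->i_size, inode->on_orphan);
}

int main(void)
{
	struct toy_inode inode = { 0, 0, 0 };
	write_end(&inode, 0, 4096, 1024);	/* short copy: 1 KiB of 4 KiB */
	return 0;
}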
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 5e86ce9a86e0..88974814783a 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -15,12 +15,11 @@
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/compat.h>
-#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 
-int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
-		unsigned long arg)
+long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	unsigned int flags;
 	unsigned short rsv_window_size;
@@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		unsigned int oldflags;
 		unsigned int jflag;
 
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
 
-		if (!is_owner_or_cap(inode)) {
-			err = -EACCES;
-			goto flags_out;
-		}
-
-		if (get_user(flags, (int __user *) arg)) {
-			err = -EFAULT;
-			goto flags_out;
-		}
-
 		flags = ext3_mask_flags(inode->i_mode, flags);
 
 		mutex_lock(&inode->i_mutex);
+
 		/* Is it quota file? Do not allow user to mess with it */
-		if (IS_NOQUOTA(inode)) {
-			mutex_unlock(&inode->i_mutex);
-			err = -EPERM;
+		err = -EPERM;
+		if (IS_NOQUOTA(inode))
 			goto flags_out;
-		}
+
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
@@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_LINUX_IMMUTABLE))
 				goto flags_out;
-			}
 		}
 
 		/*
@@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE)) {
-				mutex_unlock(&inode->i_mutex);
-				err = -EPERM;
+			if (!capable(CAP_SYS_RESOURCE))
 				goto flags_out;
-			}
 		}
 
-
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
-			mutex_unlock(&inode->i_mutex);
 			err = PTR_ERR(handle);
 			goto flags_out;
 		}
@@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
 		ext3_journal_stop(handle);
-		if (err) {
-			mutex_unlock(&inode->i_mutex);
-			return err;
-		}
+		if (err)
+			goto flags_out;
 
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
 			err = ext3_change_inode_journal_flag(inode, jflag);
-		mutex_unlock(&inode->i_mutex);
 flags_out:
+		mutex_unlock(&inode->i_mutex);
 		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
@@ -140,6 +125,7 @@ flags_out:
 
 		if (!is_owner_or_cap(inode))
 			return -EPERM;
+
 		err = mnt_want_write(filp->f_path.mnt);
 		if (err)
 			return err;
@@ -147,6 +133,7 @@ flags_out:
 			err = -EFAULT;
 			goto setversion_out;
 		}
+
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
@@ -299,9 +286,6 @@ group_add_out:
 #ifdef CONFIG_COMPAT
 long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
-
 	/* These are just misnamed, they actually get/put from/to user an int */
 	switch (cmd) {
 	case EXT3_IOC32_GETFLAGS:
@@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	default:
 		return -ENOIOCTLCMD;
 	}
-	lock_kernel();
-	ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
-	unlock_kernel();
-	return ret;
+	return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
 }
 #endif
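[Editor's aside] Besides dropping the BKL (->ioctl becomes ->unlocked_ioctl), the rewrite above converges every failure path on a single mutex_unlock at flags_out:, replacing the unlock-before-each-return pattern visible in the removed lines. The shape of that idiom in miniature:

#include <stdio.h>

static int locked;
static void lock(void)   { locked = 1; }
static void unlock(void) { locked = 0; }

static int do_op(int fail_step)
{
	int err = 0;

	lock();
	if (fail_step == 1) { err = -1; goto out; }
	if (fail_step == 2) { err = -2; goto out; }
	/* actual work ... */
out:
	unlock();	/* exactly one unlock, reached on every path */
	return err;
}

int main(void)
{
	printf("%d %d (locked=%d)\n", do_op(1), do_op(0), locked);
	return 0;
}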
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 4db4ffa1edad..6ff7b9730234 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(struct qstr *entry,
 				 struct dx_frame *frame,
 				 int *err);
 static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
 		struct dx_map_entry *offsets, int count);
-static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
@@ -708,14 +708,14 @@ errout:
 * Create map of hash values, offsets, and sizes, stored at end of block.
 * Returns number of entries mapped.
 */
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 			struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
 {
 	int count = 0;
 	char *base = (char *) de;
 	struct dx_hash_info h = *hinfo;
 
-	while ((char *) de < base + size)
+	while ((char *) de < base + blocksize)
 	{
 		if (de->name_len && de->inode) {
 			ext3fs_dirhash(de->name, de->name_len, &h);
@@ -1047,8 +1047,16 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
 			return ERR_PTR(-EIO);
 		}
 		inode = ext3_iget(dir->i_sb, ino);
-		if (IS_ERR(inode))
-			return ERR_CAST(inode);
+		if (unlikely(IS_ERR(inode))) {
+			if (PTR_ERR(inode) == -ESTALE) {
+				ext3_error(dir->i_sb, __func__,
+						"deleted inode referenced: %lu",
+						ino);
+				return ERR_PTR(-EIO);
+			} else {
+				return ERR_CAST(inode);
+			}
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
@@ -1120,13 +1128,14 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
 * Compact each dir entry in the range to the minimal rec_len.
 * Returns pointer to last entry in range.
 */
-static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
 {
-	struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
+	struct ext3_dir_entry_2 *next, *to, *prev;
+	struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
 	unsigned rec_len = 0;
 
 	prev = to = de;
-	while ((char*)de < base + size) {
+	while ((char *)de < base + blocksize) {
 		next = ext3_next_entry(de);
 		if (de->inode && de->name_len) {
 			rec_len = EXT3_DIR_REC_LEN(de->name_len);
@@ -2049,7 +2058,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go in
 	 * separate transaction */
-	DQUOT_INIT(dentry->d_inode);
+	vfs_dq_init(dentry->d_inode);
 	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2108,7 +2117,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
-	DQUOT_INIT(dentry->d_inode);
+	vfs_dq_init(dentry->d_inode);
 	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2265,14 +2274,14 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	struct inode * old_inode, * new_inode;
 	struct buffer_head * old_bh, * new_bh, * dir_bh;
 	struct ext3_dir_entry_2 * old_de, * new_de;
-	int retval;
+	int retval, flush_file = 0;
 
 	old_bh = new_bh = dir_bh = NULL;
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
 	if (new_dentry->d_inode)
-		DQUOT_INIT(new_dentry->d_inode);
+		vfs_dq_init(new_dentry->d_inode);
 	handle = ext3_journal_start(old_dir, 2 *
 			EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
 			EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@ -2401,6 +2410,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		ext3_mark_inode_dirty(handle, new_inode);
 		if (!new_inode->i_nlink)
 			ext3_orphan_add(handle, new_inode);
+		if (ext3_should_writeback_data(new_inode))
+			flush_file = 1;
 	}
 	retval = 0;
 
@@ -2409,6 +2420,8 @@ end_rename:
 	brelse (old_bh);
 	brelse (new_bh);
 	ext3_journal_stop(handle);
+	if (retval == 0 && flush_file)
+		filemap_flush(old_inode->i_mapping);
 	return retval;
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 4a970411a458..9e5b8e387e1e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -707,8 +707,6 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
-static int ext3_dquot_initialize(struct inode *inode, int type);
-static int ext3_dquot_drop(struct inode *inode);
 static int ext3_write_dquot(struct dquot *dquot);
 static int ext3_acquire_dquot(struct dquot *dquot);
 static int ext3_release_dquot(struct dquot *dquot);
@@ -723,8 +721,8 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
 			       const char *data, size_t len, loff_t off);
 
 static struct dquot_operations ext3_quota_operations = {
-	.initialize	= ext3_dquot_initialize,
-	.drop		= ext3_dquot_drop,
+	.initialize	= dquot_initialize,
+	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
 	.alloc_inode	= dquot_alloc_inode,
 	.free_space	= dquot_free_space,
@@ -1438,7 +1436,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 		}
 
 		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 		if (inode->i_nlink) {
 			printk(KERN_DEBUG
 				"%s: truncating inode %lu to %Ld bytes\n",
@@ -2702,7 +2700,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
  * Process 1                         Process 2
  * ext3_create()                     quota_sync()
  *   journal_start()                   write_dquot()
- *   DQUOT_INIT()                        down(dqio_mutex)
+ *   vfs_dq_init()                       down(dqio_mutex)
  *     down(dqio_mutex)                    journal_start()
  *
  */
@@ -2714,44 +2712,6 @@ static inline struct inode *dquot_to_inode(struct dquot *dquot)
 	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
 }
 
-static int ext3_dquot_initialize(struct inode *inode, int type)
-{
-	handle_t *handle;
-	int ret, err;
-
-	/* We may create quota structure so we need to reserve enough blocks */
-	handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	ret = dquot_initialize(inode, type);
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
-static int ext3_dquot_drop(struct inode *inode)
-{
-	handle_t *handle;
-	int ret, err;
-
-	/* We may delete quota structure so we need to reserve enough blocks */
-	handle = ext3_journal_start(inode, 2*EXT3_QUOTA_DEL_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle)) {
-		/*
-		 * We call dquot_drop() anyway to at least release references
-		 * to quota structures so that umount does not hang.
-		 */
-		dquot_drop(inode);
-		return PTR_ERR(handle);
-	}
-	ret = dquot_drop(inode);
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
 static int ext3_write_dquot(struct dquot *dquot)
 {
 	int ret, err;
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 175414ac2210..83b7be849bd5 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -498,7 +498,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
 	error = ext3_journal_dirty_metadata(handle, bh);
 	if (IS_SYNC(inode))
 		handle->h_sync = 1;
-	DQUOT_FREE_BLOCK(inode, 1);
+	vfs_dq_free_block(inode, 1);
 	ea_bdebug(bh, "refcount now=%d; releasing",
 		  le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ce)
@@ -774,7 +774,7 @@ inserted:
 			/* The old block is released after updating
 			   the inode. */
 			error = -EDQUOT;
-			if (DQUOT_ALLOC_BLOCK(inode, 1))
+			if (vfs_dq_alloc_block(inode, 1))
 				goto cleanup;
 			error = ext3_journal_get_write_access(handle,
 							      new_bh);
@@ -848,7 +848,7 @@ cleanup:
 	return error;
 
 cleanup_dquot:
-	DQUOT_FREE_BLOCK(inode, 1);
+	vfs_dq_free_block(inode, 1);
 	goto cleanup;
 
 bad_block:
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 7505482a08fa..418b6f3b0ae8 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -18,7 +18,7 @@ config EXT4_FS
 	  filesystem; while there will be some performance gains from
 	  the delayed allocation and inode table readahead, the best
 	  performance gains will require enabling ext4 features in the
-	  filesystem, or formating a new filesystem as an ext4
+	  filesystem, or formatting a new filesystem as an ext4
 	  filesystem initially.
 
 	  To compile this file system support as a module, choose M here. The
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 694ed6fadcc8..647e0d65a284 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 			return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index de9459b4cb94..53c72ad85877 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 }
 
 static int ext4_group_used_meta_blocks(struct super_block *sb,
-				ext4_group_t block_group)
+				ext4_group_t block_group,
+				struct ext4_group_desc *gdp)
 {
 	ext4_fsblk_t tmp;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
 	int used_blocks = sbi->s_itb_per_group + 2;
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-		struct ext4_group_desc *gdp;
-		struct buffer_head *bh;
-
-		gdp = ext4_get_group_desc(sb, block_group, &bh);
 		if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
 					block_group))
 			used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
 	}
-	return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+	return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
 
 
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
+		atomic_add(blocks_freed,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 	/*
 	 * request to reload the buddy with the
@@ -536,7 +532,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 		ext4_mb_free_blocks(handle, inode, block, count,
 				    metadata, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
-		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+		vfs_dq_free_block(inode, dquot_freed_blocks);
 	return;
 }
 
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01af..b64789929a65 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 			  unsigned int offset)
 {
 	const char *error_msg = NULL;
-	const int rlen = ext4_rec_len_from_disk(de->rec_len);
+	const int rlen = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
 
 	if (rlen < EXT4_DIR_REC_LEN(1))
 		error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
 				 * least that it is non-zero.  A
 				 * failure will be detected in the
 				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len)
-						< EXT4_DIR_REC_LEN(1))
+				if (ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
 					break;
-				i += ext4_rec_len_from_disk(de->rec_len);
+				i += ext4_rec_len_from_disk(de->rec_len,
+							    sb->s_blocksize);
 			}
 			offset = i;
 			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
 				ret = stored;
 				goto out;
 			}
-			offset += ext4_rec_len_from_disk(de->rec_len);
+			offset += ext4_rec_len_from_disk(de->rec_len,
+							 sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
 				/* We might block in the next section
 				 * if the data destination is
@@ -225,7 +228,8 @@ revalidate:
 					goto revalidate;
 				stored++;
 			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+						sb->s_blocksize);
 		}
 		offset = 0;
 		brelse(bh);
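[Editor's aside] ext4_rec_len_from_disk() grows a blocksize argument because with 64 KiB blocks a 16-bit rec_len cannot represent a record spanning the whole block, so a special on-disk encoding is needed and decoding must know the block size. The sketch below keeps only the "reserved value means whole block" part of that scheme; it is an assumption about the encoding for illustration, not a copy of ext4's helper:

#include <stdint.h>
#include <stdio.h>

static unsigned rec_len_from_disk(uint16_t dlen, unsigned blocksize)
{
	unsigned len = dlen;

	/* hypothetical simplification: 0xffff stands for "whole block" */
	if (len == 0xffff && blocksize == 65536)
		return blocksize;
	return len;
}

int main(void)
{
	printf("%u\n", rec_len_from_disk(12, 4096));	/* ordinary entry */
	printf("%u\n", rec_len_from_disk(0xffff, 65536));	/* spans block */
	return 0;
}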
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0c87dce66a3..d0f15ef56de1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -20,6 +20,7 @@
20#include <linux/blkdev.h> 20#include <linux/blkdev.h>
21#include <linux/magic.h> 21#include <linux/magic.h>
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/quota.h>
23#include "ext4_i.h" 24#include "ext4_i.h"
24 25
25/* 26/*
@@ -32,14 +33,6 @@
32#undef EXT4FS_DEBUG 33#undef EXT4FS_DEBUG
33 34
34/* 35/*
35 * Define EXT4_RESERVATION to reserve data blocks for expanding files
36 */
37#define EXT4_DEFAULT_RESERVE_BLOCKS 8
38/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
39#define EXT4_MAX_RESERVE_BLOCKS 1027
40#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
41
42/*
43 * Debug code 36 * Debug code
44 */ 37 */
45#ifdef EXT4FS_DEBUG 38#ifdef EXT4FS_DEBUG
@@ -53,8 +46,6 @@
53#define ext4_debug(f, a...) do {} while (0) 46#define ext4_debug(f, a...) do {} while (0)
54#endif 47#endif
55 48
56#define EXT4_MULTIBLOCK_ALLOCATOR 1
57
58/* prefer goal again. length */ 49/* prefer goal again. length */
59#define EXT4_MB_HINT_MERGE 1 50#define EXT4_MB_HINT_MERGE 1
60/* blocks already reserved */ 51/* blocks already reserved */
@@ -179,8 +170,9 @@ struct ext4_group_desc
179 */ 170 */
180 171
181struct flex_groups { 172struct flex_groups {
182 __u32 free_inodes; 173 atomic_t free_inodes;
183 __u32 free_blocks; 174 atomic_t free_blocks;
175 atomic_t used_dirs;
184}; 176};
185 177
186#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ 178#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
@@ -248,6 +240,30 @@ struct flex_groups {
248#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 240#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
249#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ 241#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
250 242
243/* Flags that should be inherited by new inodes from their parent. */
244#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
245 EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
246 EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
247 EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
248 EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
249
250/* Flags that are appropriate for regular files (all but dir-specific ones). */
251#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
252
253/* Flags that are appropriate for non-directories/regular files. */
254#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
255
256/* Mask out flags that are inappropriate for the given type of inode. */
257static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
258{
259 if (S_ISDIR(mode))
260 return flags;
261 else if (S_ISREG(mode))
262 return flags & EXT4_REG_FLMASK;
263 else
264 return flags & EXT4_OTHER_FLMASK;
265}
266
251/* 267/*
252 * Inode dynamic state flags 268 * Inode dynamic state flags
253 */ 269 */
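The new ext4_mask_flags() helper above filters inherited flags by inode type. A standalone demonstration of the masking behaviour, with just enough of the flag values (copied from this header) to run in user space; the S_IF* constants come from <sys/stat.h>:

    #include <stdio.h>
    #include <sys/stat.h>

    /* Flag values from the ext4 header; only the ones the demo needs. */
    #define EXT4_NODUMP_FL    0x00000040
    #define EXT4_NOATIME_FL   0x00000080
    #define EXT4_DIRSYNC_FL   0x00010000
    #define EXT4_TOPDIR_FL    0x00020000

    #define EXT4_REG_FLMASK   (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
    #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)

    static unsigned int ext4_mask_flags(mode_t mode, unsigned int flags)
    {
        if (S_ISDIR(mode))
            return flags;
        else if (S_ISREG(mode))
            return flags & EXT4_REG_FLMASK;
        else
            return flags & EXT4_OTHER_FLMASK;
    }

    int main(void)
    {
        unsigned int parent = EXT4_DIRSYNC_FL | EXT4_NODUMP_FL;

        /* Directories keep everything; regular files drop dir-only
         * flags; symlinks/devices keep only NODUMP and NOATIME. */
        printf("dir:  %#x\n", ext4_mask_flags(S_IFDIR, parent)); /* 0x10040 */
        printf("file: %#x\n", ext4_mask_flags(S_IFREG, parent)); /* 0x40 */
        printf("link: %#x\n", ext4_mask_flags(S_IFLNK, parent)); /* 0x40 */
        return 0;
    }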
@@ -255,6 +271,7 @@ struct flex_groups {
255#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ 271#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
256#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ 272#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
257#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ 273#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
274#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
258 275
259/* Used to pass group descriptor data when online resize is done */ 276/* Used to pass group descriptor data when online resize is done */
260struct ext4_new_group_input { 277struct ext4_new_group_input {
@@ -302,7 +319,9 @@ struct ext4_new_group_data {
302#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) 319#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
303#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) 320#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
304#define EXT4_IOC_MIGRATE _IO('f', 9) 321#define EXT4_IOC_MIGRATE _IO('f', 9)
322 /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
305 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ 323 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
324#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
306 325
307/* 326/*
308 * ioctl commands in 32 bit emulation 327 * ioctl commands in 32 bit emulation
@@ -530,7 +549,7 @@ do { \
530#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ 549#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
531#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ 550#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
532#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ 551#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
533#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */ 552#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
534#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ 553#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
535#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ 554#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */
536#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 555#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
@@ -665,7 +684,8 @@ struct ext4_super_block {
665 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ 684 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
666 __u8 s_reserved_char_pad2; 685 __u8 s_reserved_char_pad2;
667 __le16 s_reserved_pad; 686 __le16 s_reserved_pad;
668 __u32 s_reserved[162]; /* Padding to the end of the block */ 687 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
688 __u32 s_reserved[160]; /* Padding to the end of the block */
669}; 689};
670 690
671#ifdef __KERNEL__ 691#ifdef __KERNEL__
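The superblock gains s_kbytes_written by carving a __le64 out of the reserved padding, and the ext4_sb_info hunk later in this patch adds s_sectors_written_start plus an in-memory s_kbytes_written to go with it. A hedged sketch of how the current figure is plausibly derived; the part_stat_read() call and the exact field usage are assumptions, not shown in this diff:

    /* Hypothetical helper: lifetime KiB written = the value carried in
     * the superblock, plus sectors written to the device since mount,
     * converted to KiB (one 512-byte sector = half a KiB, hence >> 1). */
    static u64 ext4_lifetime_kbytes_written(struct super_block *sb)
    {
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        return sbi->s_kbytes_written +
            ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
              sbi->s_sectors_written_start) >> 1);
    }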
@@ -813,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
813#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ 833#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
814 834
815/* 835/*
836 * Minimum number of groups in a flexgroup before we separate out
837 * directories into the first block group of a flexgroup
838 */
839#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4
840
841/*
816 * Structure of a directory entry 842 * Structure of a directory entry
817 */ 843 */
818#define EXT4_NAME_LEN 255 844#define EXT4_NAME_LEN 255
@@ -864,24 +890,6 @@ struct ext4_dir_entry_2 {
864 ~EXT4_DIR_ROUND) 890 ~EXT4_DIR_ROUND)
865#define EXT4_MAX_REC_LEN ((1<<16)-1) 891#define EXT4_MAX_REC_LEN ((1<<16)-1)
866 892
867static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
868{
869 unsigned len = le16_to_cpu(dlen);
870
871 if (len == EXT4_MAX_REC_LEN || len == 0)
872 return 1 << 16;
873 return len;
874}
875
876static inline __le16 ext4_rec_len_to_disk(unsigned len)
877{
878 if (len == (1 << 16))
879 return cpu_to_le16(EXT4_MAX_REC_LEN);
880 else if (len > (1 << 16))
881 BUG();
882 return cpu_to_le16(len);
883}
884
885/* 893/*
886 * Hash Tree Directory indexing 894 * Hash Tree Directory indexing
887 * (c) Daniel Phillips, 2001 895 * (c) Daniel Phillips, 2001
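The rec_len helpers above disappear from ext4.h; per the prototypes added near the end of this file's hunk, they move to namei.c and take the blocksize. The old code hard-wired 1 << 16 as the only "whole block" case; with the blocksize passed in, the sentinel can decode correctly for any block size. A minimal self-consistent sketch of the relocated helpers, assuming a 64KB blocksize cap (the in-tree version may differ in detail):

    unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
    {
        unsigned len = le16_to_cpu(dlen);

        /* 0 and EXT4_MAX_REC_LEN are sentinels meaning "this entry
         * spans the whole block"; only the caller's blocksize says
         * how long that actually is. */
        if (len == EXT4_MAX_REC_LEN || len == 0)
            return blocksize;
        return len;
    }

    __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
    {
        BUG_ON(len > blocksize || blocksize > 65536);

        /* 65536 is the one length a __le16 cannot hold directly. */
        if (len == 65536)
            return cpu_to_le16(EXT4_MAX_REC_LEN);
        return cpu_to_le16(len);
    }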
@@ -969,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
969 977
970extern struct proc_dir_entry *ext4_proc_root; 978extern struct proc_dir_entry *ext4_proc_root;
971 979
972#ifdef CONFIG_PROC_FS
973extern const struct file_operations ext4_ui_proc_fops;
974
975#define EXT4_PROC_HANDLER(name, var) \
976do { \
977 proc = proc_create_data(name, mode, sbi->s_proc, \
978 &ext4_ui_proc_fops, &sbi->s_##var); \
979 if (proc == NULL) { \
980 printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
981 goto err_out; \
982 } \
983} while (0)
984#else
985#define EXT4_PROC_HANDLER(name, var)
986#endif
987
988/* 980/*
989 * Function prototypes 981 * Function prototypes
990 */ 982 */
@@ -1091,13 +1083,15 @@ extern int ext4_can_truncate(struct inode *inode);
1091extern void ext4_truncate(struct inode *); 1083extern void ext4_truncate(struct inode *);
1092extern void ext4_set_inode_flags(struct inode *); 1084extern void ext4_set_inode_flags(struct inode *);
1093extern void ext4_get_inode_flags(struct ext4_inode_info *); 1085extern void ext4_get_inode_flags(struct ext4_inode_info *);
1086extern int ext4_alloc_da_blocks(struct inode *inode);
1094extern void ext4_set_aops(struct inode *inode); 1087extern void ext4_set_aops(struct inode *inode);
1095extern int ext4_writepage_trans_blocks(struct inode *); 1088extern int ext4_writepage_trans_blocks(struct inode *);
1096extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); 1089extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
1097extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 1090extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1098extern int ext4_block_truncate_page(handle_t *handle, 1091extern int ext4_block_truncate_page(handle_t *handle,
1099 struct address_space *mapping, loff_t from); 1092 struct address_space *mapping, loff_t from);
1100extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); 1093extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1094extern qsize_t ext4_get_reserved_space(struct inode *inode);
1101 1095
1102/* ioctl.c */ 1096/* ioctl.c */
1103extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1097extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1105,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1105 1099
1106/* migrate.c */ 1100/* migrate.c */
1107extern int ext4_ext_migrate(struct inode *); 1101extern int ext4_ext_migrate(struct inode *);
1102
1108/* namei.c */ 1103/* namei.c */
1104extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
1105extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
1109extern int ext4_orphan_add(handle_t *, struct inode *); 1106extern int ext4_orphan_add(handle_t *, struct inode *);
1110extern int ext4_orphan_del(handle_t *, struct inode *); 1107extern int ext4_orphan_del(handle_t *, struct inode *);
1111extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, 1108extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbbc..f0c3ec85bd48 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
241extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, 241extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
242 ext4_lblk_t *, ext4_fsblk_t *); 242 ext4_lblk_t *, ext4_fsblk_t *);
243extern void ext4_ext_drop_refs(struct ext4_ext_path *); 243extern void ext4_ext_drop_refs(struct ext4_ext_path *);
244extern int ext4_ext_check_inode(struct inode *inode);
244#endif /* _EXT4_EXTENTS */ 245#endif /* _EXT4_EXTENTS */
245 246
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c4..4ce2187123aa 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned int ext4_group_t; 34typedef unsigned int ext4_group_t;
35 35
36#define rsv_start rsv_window._rsv_start
37#define rsv_end rsv_window._rsv_end
38
39/* 36/*
40 * storage for cached extent 37 * storage for cached extent
41 */ 38 */
@@ -125,6 +122,9 @@ struct ext4_inode_info {
125 struct list_head i_prealloc_list; 122 struct list_head i_prealloc_list;
126 spinlock_t i_prealloc_lock; 123 spinlock_t i_prealloc_lock;
127 124
125 /* ialloc */
126 ext4_group_t i_last_alloc_group;
127
128 /* allocation reservation info for delalloc */ 128 /* allocation reservation info for delalloc */
129 unsigned int i_reserved_data_blocks; 129 unsigned int i_reserved_data_blocks;
130 unsigned int i_reserved_meta_blocks; 130 unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a042..57b71fefbccf 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,12 +62,10 @@ struct ext4_sb_info {
62 struct percpu_counter s_freeinodes_counter; 62 struct percpu_counter s_freeinodes_counter;
63 struct percpu_counter s_dirs_counter; 63 struct percpu_counter s_dirs_counter;
64 struct percpu_counter s_dirtyblocks_counter; 64 struct percpu_counter s_dirtyblocks_counter;
65 struct blockgroup_lock s_blockgroup_lock; 65 struct blockgroup_lock *s_blockgroup_lock;
66 struct proc_dir_entry *s_proc; 66 struct proc_dir_entry *s_proc;
67 67 struct kobject s_kobj;
68 /* root of the per fs reservation window tree */ 68 struct completion s_kobj_unregister;
69 spinlock_t s_rsv_window_lock;
70 struct rb_root s_rsv_window_root;
71 69
72 /* Journaling */ 70 /* Journaling */
73 struct inode *s_journal_inode; 71 struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
146 /* locality groups */ 144 /* locality groups */
147 struct ext4_locality_group *s_locality_groups; 145 struct ext4_locality_group *s_locality_groups;
148 146
147 /* for write statistics */
148 unsigned long s_sectors_written_start;
149 u64 s_kbytes_written;
150
149 unsigned int s_log_groups_per_flex; 151 unsigned int s_log_groups_per_flex;
150 struct flex_groups *s_flex_groups; 152 struct flex_groups *s_flex_groups;
151}; 153};
@@ -153,7 +155,7 @@ struct ext4_sb_info {
153static inline spinlock_t * 155static inline spinlock_t *
154sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) 156sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
155{ 157{
156 return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); 158 return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
157} 159}
158 160
159#endif /* _EXT4_SB */ 161#endif /* _EXT4_SB */
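s_blockgroup_lock turning into a pointer (with bgl_lock_ptr() now taking it directly rather than its address) implies the lock array is allocated separately; struct blockgroup_lock is sized by NR_CPUS and can be fairly large, which is presumably the motivation. A hedged sketch of the matching setup in ext4_fill_super(), which this excerpt does not show:

    /* Sketch; error handling abbreviated. */
    sbi->s_blockgroup_lock =
        kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
    if (!sbi->s_blockgroup_lock) {
        kfree(sbi);
        return -ENOMEM;
    }
    bgl_lock_init(sbi->s_blockgroup_lock);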
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e2eab196875f..ac77d8b8251d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
152 ext4_fsblk_t bg_start; 152 ext4_fsblk_t bg_start;
153 ext4_fsblk_t last_block; 153 ext4_fsblk_t last_block;
154 ext4_grpblk_t colour; 154 ext4_grpblk_t colour;
155 ext4_group_t block_group;
156 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
155 int depth; 157 int depth;
156 158
157 if (path) { 159 if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
170 } 172 }
171 173
172 /* OK. use inode's group */ 174 /* OK. use inode's group */
173 bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + 175 block_group = ei->i_block_group;
176 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
177 /*
178 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
179 * block groups per flexgroup, reserve the first block
180 * group for directories and special files. Regular
181 * files will start at the second block group. This
182 * tends to speed up directory access and improves
183 * fsck times.
184 */
185 block_group &= ~(flex_size-1);
186 if (S_ISREG(inode->i_mode))
187 block_group++;
188 }
189 bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
174 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); 190 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
175 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 191 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
176 192
193 /*
194 * If we are doing delayed allocation, we don't need to take
195 * colour into account.
196 */
197 if (test_opt(inode->i_sb, DELALLOC))
198 return bg_start;
199
177 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 200 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
178 colour = (current->pid % 16) * 201 colour = (current->pid % 16) *
179 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 202 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
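The masking in the hunk above depends on flex_size being a power of two: block_group &= ~(flex_size-1) rounds a group number down to the first group of its flexgroup, and regular files are then bumped to the second group. A tiny standalone demonstration of that arithmetic (numbers illustrative):

    #include <stdio.h>

    int main(void)
    {
        unsigned int flex_size = 16;    /* block groups per flexgroup */

        for (unsigned int bg = 0; bg < 48; bg += 23) {
            unsigned int dir_goal = bg & ~(flex_size - 1); /* 1st group */
            unsigned int reg_goal = dir_goal + 1;          /* 2nd group */
            printf("group %2u -> dirs: %2u, regular files: %2u\n",
                   bg, dir_goal, reg_goal);
        }
        return 0;
    }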
@@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth)
301 return max; 324 return max;
302} 325}
303 326
304static int __ext4_ext_check_header(const char *function, struct inode *inode, 327static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
328{
329 ext4_fsblk_t block = ext_pblock(ext);
330 int len = ext4_ext_get_actual_len(ext);
331 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
332 if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
333 ((block + len) > ext4_blocks_count(es))))
334 return 0;
335 else
336 return 1;
337}
338
339static int ext4_valid_extent_idx(struct inode *inode,
340 struct ext4_extent_idx *ext_idx)
341{
342 ext4_fsblk_t block = idx_pblock(ext_idx);
343 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
344 if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
345 (block > ext4_blocks_count(es))))
346 return 0;
347 else
348 return 1;
349}
350
351static int ext4_valid_extent_entries(struct inode *inode,
352 struct ext4_extent_header *eh,
353 int depth)
354{
355 struct ext4_extent *ext;
356 struct ext4_extent_idx *ext_idx;
357 unsigned short entries;
358 if (eh->eh_entries == 0)
359 return 1;
360
361 entries = le16_to_cpu(eh->eh_entries);
362
363 if (depth == 0) {
364 /* leaf entries */
365 ext = EXT_FIRST_EXTENT(eh);
366 while (entries) {
367 if (!ext4_valid_extent(inode, ext))
368 return 0;
369 ext++;
370 entries--;
371 }
372 } else {
373 ext_idx = EXT_FIRST_INDEX(eh);
374 while (entries) {
375 if (!ext4_valid_extent_idx(inode, ext_idx))
376 return 0;
377 ext_idx++;
378 entries--;
379 }
380 }
381 return 1;
382}
383
384static int __ext4_ext_check(const char *function, struct inode *inode,
305 struct ext4_extent_header *eh, 385 struct ext4_extent_header *eh,
306 int depth) 386 int depth)
307{ 387{
@@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
329 error_msg = "invalid eh_entries"; 409 error_msg = "invalid eh_entries";
330 goto corrupted; 410 goto corrupted;
331 } 411 }
412 if (!ext4_valid_extent_entries(inode, eh, depth)) {
413 error_msg = "invalid extent entries";
414 goto corrupted;
415 }
332 return 0; 416 return 0;
333 417
334corrupted: 418corrupted:
335 ext4_error(inode->i_sb, function, 419 ext4_error(inode->i_sb, function,
336 "bad header in inode #%lu: %s - magic %x, " 420 "bad header/extent in inode #%lu: %s - magic %x, "
337 "entries %u, max %u(%u), depth %u(%u)", 421 "entries %u, max %u(%u), depth %u(%u)",
338 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 422 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
339 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 423 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +426,13 @@ corrupted:
342 return -EIO; 426 return -EIO;
343} 427}
344 428
345#define ext4_ext_check_header(inode, eh, depth) \ 429#define ext4_ext_check(inode, eh, depth) \
346 __ext4_ext_check_header(__func__, inode, eh, depth) 430 __ext4_ext_check(__func__, inode, eh, depth)
431
432int ext4_ext_check_inode(struct inode *inode)
433{
434 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
435}
347 436
348#ifdef EXT_DEBUG 437#ifdef EXT_DEBUG
349static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 438static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
547 636
548 eh = ext_inode_hdr(inode); 637 eh = ext_inode_hdr(inode);
549 depth = ext_depth(inode); 638 depth = ext_depth(inode);
550 if (ext4_ext_check_header(inode, eh, depth))
551 return ERR_PTR(-EIO);
552
553 639
554 /* account possible depth increase */ 640 /* account possible depth increase */
555 if (!path) { 641 if (!path) {
@@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
565 i = depth; 651 i = depth;
566 /* walk through the tree */ 652 /* walk through the tree */
567 while (i) { 653 while (i) {
654 int need_to_validate = 0;
655
568 ext_debug("depth %d: num %d, max %d\n", 656 ext_debug("depth %d: num %d, max %d\n",
569 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 657 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
570 658
@@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
573 path[ppos].p_depth = i; 661 path[ppos].p_depth = i;
574 path[ppos].p_ext = NULL; 662 path[ppos].p_ext = NULL;
575 663
576 bh = sb_bread(inode->i_sb, path[ppos].p_block); 664 bh = sb_getblk(inode->i_sb, path[ppos].p_block);
577 if (!bh) 665 if (unlikely(!bh))
578 goto err; 666 goto err;
579 667 if (!bh_uptodate_or_lock(bh)) {
668 if (bh_submit_read(bh) < 0) {
669 put_bh(bh);
670 goto err;
671 }
672 /* validate the extent entries */
673 need_to_validate = 1;
674 }
580 eh = ext_block_hdr(bh); 675 eh = ext_block_hdr(bh);
581 ppos++; 676 ppos++;
582 BUG_ON(ppos > depth); 677 BUG_ON(ppos > depth);
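Replacing sb_bread() with sb_getblk() plus bh_uptodate_or_lock()/bh_submit_read() means the extent block is only re-checked when it actually had to be read from disk; need_to_validate records that. The same pattern, reduced to a skeleton (a sketch only; validate() stands in for ext4_ext_check(), and error handling is abbreviated):

    /* Sketch of the read-and-validate-once pattern used above. */
    static struct buffer_head *
    read_validated_block(struct super_block *sb, sector_t blk,
                         int (*validate)(struct buffer_head *))
    {
        struct buffer_head *bh = sb_getblk(sb, blk);

        if (unlikely(!bh))
            return NULL;
        if (!bh_uptodate_or_lock(bh)) {
            /* Buffer was not cached: read it, then validate it once. */
            if (bh_submit_read(bh) < 0 || validate(bh)) {
                put_bh(bh);
                return NULL;
            }
        }
        return bh;
    }

Blocks already up to date in the buffer cache were validated when first read, so skipping the check for them is safe and saves CPU on deep extent trees.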
@@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
584 path[ppos].p_hdr = eh; 679 path[ppos].p_hdr = eh;
585 i--; 680 i--;
586 681
587 if (ext4_ext_check_header(inode, eh, i)) 682 if (need_to_validate && ext4_ext_check(inode, eh, i))
588 goto err; 683 goto err;
589 } 684 }
590 685
@@ -1122,7 +1217,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1122 struct ext4_extent_idx *ix; 1217 struct ext4_extent_idx *ix;
1123 struct ext4_extent *ex; 1218 struct ext4_extent *ex;
1124 ext4_fsblk_t block; 1219 ext4_fsblk_t block;
1125 int depth, ee_len; 1220 int depth; /* Note, NOT eh_depth; depth from top of tree */
1221 int ee_len;
1126 1222
1127 BUG_ON(path == NULL); 1223 BUG_ON(path == NULL);
1128 depth = path->p_depth; 1224 depth = path->p_depth;
@@ -1179,7 +1275,8 @@ got_index:
1179 if (bh == NULL) 1275 if (bh == NULL)
1180 return -EIO; 1276 return -EIO;
1181 eh = ext_block_hdr(bh); 1277 eh = ext_block_hdr(bh);
1182 if (ext4_ext_check_header(inode, eh, depth)) { 1278 /* subtract from p_depth to get proper eh_depth */
1279 if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1183 put_bh(bh); 1280 put_bh(bh);
1184 return -EIO; 1281 return -EIO;
1185 } 1282 }
@@ -1192,7 +1289,7 @@ got_index:
1192 if (bh == NULL) 1289 if (bh == NULL)
1193 return -EIO; 1290 return -EIO;
1194 eh = ext_block_hdr(bh); 1291 eh = ext_block_hdr(bh);
1195 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { 1292 if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1196 put_bh(bh); 1293 put_bh(bh);
1197 return -EIO; 1294 return -EIO;
1198 } 1295 }
@@ -2135,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2135 return -ENOMEM; 2232 return -ENOMEM;
2136 } 2233 }
2137 path[0].p_hdr = ext_inode_hdr(inode); 2234 path[0].p_hdr = ext_inode_hdr(inode);
2138 if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { 2235 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2139 err = -EIO; 2236 err = -EIO;
2140 goto out; 2237 goto out;
2141 } 2238 }
@@ -2189,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2189 err = -EIO; 2286 err = -EIO;
2190 break; 2287 break;
2191 } 2288 }
2192 if (ext4_ext_check_header(inode, ext_block_hdr(bh), 2289 if (ext4_ext_check(inode, ext_block_hdr(bh),
2193 depth - i - 1)) { 2290 depth - i - 1)) {
2194 err = -EIO; 2291 err = -EIO;
2195 break; 2292 break;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a03..588af8c77246 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,9 +33,14 @@
33 */ 33 */
34static int ext4_release_file(struct inode *inode, struct file *filp) 34static int ext4_release_file(struct inode *inode, struct file *filp)
35{ 35{
36 if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
37 ext4_alloc_da_blocks(inode);
38 EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
39 }
36 /* if we are the last writer on the inode, drop the block reservation */ 40 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) && 41 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 42 (atomic_read(&inode->i_writecount) == 1) &&
43 !EXT4_I(inode)->i_reserved_data_blocks)
39 { 44 {
40 down_write(&EXT4_I(inode)->i_data_sem); 45 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_preallocations(inode); 46 ext4_discard_preallocations(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f18a919be70b..47b84e8df568 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -188,8 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
188 struct ext4_group_desc *gdp; 188 struct ext4_group_desc *gdp;
189 struct ext4_super_block *es; 189 struct ext4_super_block *es;
190 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
191 int fatal = 0, err, count; 191 int fatal = 0, err, count, cleared;
192 ext4_group_t flex_group;
193 192
194 if (atomic_read(&inode->i_count) > 1) { 193 if (atomic_read(&inode->i_count) > 1) {
195 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", 194 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -220,10 +219,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
220 * Note: we must free any quota before locking the superblock, 219 * Note: we must free any quota before locking the superblock,
221 * as writing the quota to disk may need the lock as well. 220 * as writing the quota to disk may need the lock as well.
222 */ 221 */
223 DQUOT_INIT(inode); 222 vfs_dq_init(inode);
224 ext4_xattr_delete_inode(handle, inode); 223 ext4_xattr_delete_inode(handle, inode);
225 DQUOT_FREE_INODE(inode); 224 vfs_dq_free_inode(inode);
226 DQUOT_DROP(inode); 225 vfs_dq_drop(inode);
227 226
228 is_directory = S_ISDIR(inode->i_mode); 227 is_directory = S_ISDIR(inode->i_mode);
229 228
@@ -248,8 +247,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
248 goto error_return; 247 goto error_return;
249 248
250 /* Ok, now we can actually update the inode bitmaps.. */ 249 /* Ok, now we can actually update the inode bitmaps.. */
251 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 250 spin_lock(sb_bgl_lock(sbi, block_group));
252 bit, bitmap_bh->b_data)) 251 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
252 spin_unlock(sb_bgl_lock(sbi, block_group));
253 if (!cleared)
253 ext4_error(sb, "ext4_free_inode", 254 ext4_error(sb, "ext4_free_inode",
254 "bit already cleared for inode %lu", ino); 255 "bit already cleared for inode %lu", ino);
255 else { 256 else {
@@ -266,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
266 if (is_directory) { 267 if (is_directory) {
267 count = ext4_used_dirs_count(sb, gdp) - 1; 268 count = ext4_used_dirs_count(sb, gdp) - 1;
268 ext4_used_dirs_set(sb, gdp, count); 269 ext4_used_dirs_set(sb, gdp, count);
270 if (sbi->s_log_groups_per_flex) {
271 ext4_group_t f;
272
273 f = ext4_flex_group(sbi, block_group);
274 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
275 }
276
269 } 277 }
270 gdp->bg_checksum = ext4_group_desc_csum(sbi, 278 gdp->bg_checksum = ext4_group_desc_csum(sbi,
271 block_group, gdp); 279 block_group, gdp);
@@ -275,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
275 percpu_counter_dec(&sbi->s_dirs_counter); 283 percpu_counter_dec(&sbi->s_dirs_counter);
276 284
277 if (sbi->s_log_groups_per_flex) { 285 if (sbi->s_log_groups_per_flex) {
278 flex_group = ext4_flex_group(sbi, block_group); 286 ext4_group_t f;
279 spin_lock(sb_bgl_lock(sbi, flex_group)); 287
280 sbi->s_flex_groups[flex_group].free_inodes++; 288 f = ext4_flex_group(sbi, block_group);
281 spin_unlock(sb_bgl_lock(sbi, flex_group)); 289 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
282 } 290 }
283 } 291 }
284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); 292 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -358,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
358 sbi->s_log_groups_per_flex; 366 sbi->s_log_groups_per_flex;
359 367
360find_close_to_parent: 368find_close_to_parent:
361 flexbg_free_blocks = flex_group[best_flex].free_blocks; 369 flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
362 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 370 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
363 if (flex_group[best_flex].free_inodes && 371 if (atomic_read(&flex_group[best_flex].free_inodes) &&
364 flex_freeb_ratio > free_block_ratio) 372 flex_freeb_ratio > free_block_ratio)
365 goto found_flexbg; 373 goto found_flexbg;
366 374
@@ -373,24 +381,24 @@ find_close_to_parent:
373 if (i == parent_fbg_group || i == parent_fbg_group - 1) 381 if (i == parent_fbg_group || i == parent_fbg_group - 1)
374 continue; 382 continue;
375 383
376 flexbg_free_blocks = flex_group[i].free_blocks; 384 flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
377 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 385 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
378 386
379 if (flex_freeb_ratio > free_block_ratio && 387 if (flex_freeb_ratio > free_block_ratio &&
380 flex_group[i].free_inodes) { 388 (atomic_read(&flex_group[i].free_inodes))) {
381 best_flex = i; 389 best_flex = i;
382 goto found_flexbg; 390 goto found_flexbg;
383 } 391 }
384 392
385 if (flex_group[best_flex].free_inodes == 0 || 393 if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
386 (flex_group[i].free_blocks > 394 ((atomic_read(&flex_group[i].free_blocks) >
387 flex_group[best_flex].free_blocks && 395 atomic_read(&flex_group[best_flex].free_blocks)) &&
388 flex_group[i].free_inodes)) 396 atomic_read(&flex_group[i].free_inodes)))
389 best_flex = i; 397 best_flex = i;
390 } 398 }
391 399
392 if (!flex_group[best_flex].free_inodes || 400 if (!atomic_read(&flex_group[best_flex].free_inodes) ||
393 !flex_group[best_flex].free_blocks) 401 !atomic_read(&flex_group[best_flex].free_blocks))
394 return -1; 402 return -1;
395 403
396found_flexbg: 404found_flexbg:
@@ -408,6 +416,42 @@ out:
408 return 0; 416 return 0;
409} 417}
410 418
419struct orlov_stats {
420 __u32 free_inodes;
421 __u32 free_blocks;
422 __u32 used_dirs;
423};
424
425/*
426 * Helper function for Orlov's allocator; returns critical information
427 * for a particular block group or flex_bg. If flex_size is 1, then g
428 * is a block group number; otherwise it is a flex_bg number.
429 */
430void get_orlov_stats(struct super_block *sb, ext4_group_t g,
431 int flex_size, struct orlov_stats *stats)
432{
433 struct ext4_group_desc *desc;
434 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
435
436 if (flex_size > 1) {
437 stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
438 stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
439 stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
440 return;
441 }
442
443 desc = ext4_get_group_desc(sb, g, NULL);
444 if (desc) {
445 stats->free_inodes = ext4_free_inodes_count(sb, desc);
446 stats->free_blocks = ext4_free_blks_count(sb, desc);
447 stats->used_dirs = ext4_used_dirs_count(sb, desc);
448 } else {
449 stats->free_inodes = 0;
450 stats->free_blocks = 0;
451 stats->used_dirs = 0;
452 }
453}
454
411/* 455/*
412 * Orlov's allocator for directories. 456 * Orlov's allocator for directories.
413 * 457 *
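get_orlov_stats() above lets the allocator treat a flexgroup exactly like a single block group: the same three numbers come back either way. A short usage sketch (the surrounding variables are assumed to be in scope):

    struct orlov_stats stats;

    /* With flex_size == 1, g is a plain block group number and the
     * group descriptor is consulted; otherwise g indexes s_flex_groups
     * and the per-flexgroup atomic counters are read instead. */
    get_orlov_stats(sb, g, flex_size, &stats);

    if (stats.free_inodes && stats.free_blocks &&
        stats.used_dirs < max_dirs)
        /* g is a viable candidate for the new directory */ ;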
@@ -423,35 +467,34 @@ out:
423 * it has too many directories already (max_dirs) or 467 * it has too many directories already (max_dirs) or
424 * it has too few free inodes left (min_inodes) or 468 * it has too few free inodes left (min_inodes) or
425 * it has too few free blocks left (min_blocks) or 469 * it has too few free blocks left (min_blocks) or
426 * it's already running too large debt (max_debt).
427 * Parent's group is preferred, if it doesn't satisfy these 470 * Parent's group is preferred, if it doesn't satisfy these
428 * conditions we search cyclically through the rest. If none 471 * conditions we search cyclically through the rest. If none
429 * of the groups look good we just look for a group with more 472 * of the groups look good we just look for a group with more
430 * free inodes than average (starting at parent's group). 473 * free inodes than average (starting at parent's group).
431 *
432 * Debt is incremented each time we allocate a directory and decremented
433 * when we allocate an inode, within 0--255.
434 */ 474 */
435 475
436#define INODE_COST 64
437#define BLOCK_COST 256
438
439static int find_group_orlov(struct super_block *sb, struct inode *parent, 476static int find_group_orlov(struct super_block *sb, struct inode *parent,
440 ext4_group_t *group) 477 ext4_group_t *group, int mode)
441{ 478{
442 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 479 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
443 struct ext4_sb_info *sbi = EXT4_SB(sb); 480 struct ext4_sb_info *sbi = EXT4_SB(sb);
444 struct ext4_super_block *es = sbi->s_es;
445 ext4_group_t ngroups = sbi->s_groups_count; 481 ext4_group_t ngroups = sbi->s_groups_count;
446 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 482 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
447 unsigned int freei, avefreei; 483 unsigned int freei, avefreei;
448 ext4_fsblk_t freeb, avefreeb; 484 ext4_fsblk_t freeb, avefreeb;
449 ext4_fsblk_t blocks_per_dir;
450 unsigned int ndirs; 485 unsigned int ndirs;
451 int max_debt, max_dirs, min_inodes; 486 int max_dirs, min_inodes;
452 ext4_grpblk_t min_blocks; 487 ext4_grpblk_t min_blocks;
453 ext4_group_t i; 488 ext4_group_t i, grp, g;
454 struct ext4_group_desc *desc; 489 struct ext4_group_desc *desc;
490 struct orlov_stats stats;
491 int flex_size = ext4_flex_bg_size(sbi);
492
493 if (flex_size > 1) {
494 ngroups = (ngroups + flex_size - 1) >>
495 sbi->s_log_groups_per_flex;
496 parent_group >>= sbi->s_log_groups_per_flex;
497 }
455 498
456 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 499 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
457 avefreei = freei / ngroups; 500 avefreei = freei / ngroups;
@@ -460,71 +503,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
460 do_div(avefreeb, ngroups); 503 do_div(avefreeb, ngroups);
461 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); 504 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
462 505
463 if ((parent == sb->s_root->d_inode) || 506 if (S_ISDIR(mode) &&
464 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { 507 ((parent == sb->s_root->d_inode) ||
508 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
465 int best_ndir = inodes_per_group; 509 int best_ndir = inodes_per_group;
466 ext4_group_t grp;
467 int ret = -1; 510 int ret = -1;
468 511
469 get_random_bytes(&grp, sizeof(grp)); 512 get_random_bytes(&grp, sizeof(grp));
470 parent_group = (unsigned)grp % ngroups; 513 parent_group = (unsigned)grp % ngroups;
471 for (i = 0; i < ngroups; i++) { 514 for (i = 0; i < ngroups; i++) {
472 grp = (parent_group + i) % ngroups; 515 g = (parent_group + i) % ngroups;
473 desc = ext4_get_group_desc(sb, grp, NULL); 516 get_orlov_stats(sb, g, flex_size, &stats);
474 if (!desc || !ext4_free_inodes_count(sb, desc)) 517 if (!stats.free_inodes)
475 continue; 518 continue;
476 if (ext4_used_dirs_count(sb, desc) >= best_ndir) 519 if (stats.used_dirs >= best_ndir)
477 continue; 520 continue;
478 if (ext4_free_inodes_count(sb, desc) < avefreei) 521 if (stats.free_inodes < avefreei)
479 continue; 522 continue;
480 if (ext4_free_blks_count(sb, desc) < avefreeb) 523 if (stats.free_blocks < avefreeb)
481 continue; 524 continue;
482 *group = grp; 525 grp = g;
483 ret = 0; 526 ret = 0;
484 best_ndir = ext4_used_dirs_count(sb, desc); 527 best_ndir = stats.used_dirs;
528 }
529 if (ret)
530 goto fallback;
531 found_flex_bg:
532 if (flex_size == 1) {
533 *group = grp;
534 return 0;
535 }
536
537 /*
538 * We pack inodes at the beginning of the flexgroup's
539 * inode tables. Block allocation decisions will do
540 * something similar, although regular files will
541 * start in the second block group of the flexgroup. See
542 * ext4_ext_find_goal() and ext4_find_near().
543 */
544 grp *= flex_size;
545 for (i = 0; i < flex_size; i++) {
546 if (grp+i >= sbi->s_groups_count)
547 break;
548 desc = ext4_get_group_desc(sb, grp+i, NULL);
549 if (desc && ext4_free_inodes_count(sb, desc)) {
550 *group = grp+i;
551 return 0;
552 }
485 } 553 }
486 if (ret == 0)
487 return ret;
488 goto fallback; 554 goto fallback;
489 } 555 }
490 556
491 blocks_per_dir = ext4_blocks_count(es) - freeb;
492 do_div(blocks_per_dir, ndirs);
493
494 max_dirs = ndirs / ngroups + inodes_per_group / 16; 557 max_dirs = ndirs / ngroups + inodes_per_group / 16;
495 min_inodes = avefreei - inodes_per_group / 4; 558 min_inodes = avefreei - inodes_per_group*flex_size / 4;
496 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; 559 if (min_inodes < 1)
497 560 min_inodes = 1;
498 max_debt = EXT4_BLOCKS_PER_GROUP(sb); 561 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
499 max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); 562
500 if (max_debt * INODE_COST > inodes_per_group) 563 /*
501 max_debt = inodes_per_group / INODE_COST; 564 * Start looking in the flex group where we last allocated an
502 if (max_debt > 255) 565 * inode for this parent directory
503 max_debt = 255; 566 */
504 if (max_debt == 0) 567 if (EXT4_I(parent)->i_last_alloc_group != ~0) {
505 max_debt = 1; 568 parent_group = EXT4_I(parent)->i_last_alloc_group;
569 if (flex_size > 1)
570 parent_group >>= sbi->s_log_groups_per_flex;
571 }
506 572
507 for (i = 0; i < ngroups; i++) { 573 for (i = 0; i < ngroups; i++) {
508 *group = (parent_group + i) % ngroups; 574 grp = (parent_group + i) % ngroups;
509 desc = ext4_get_group_desc(sb, *group, NULL); 575 get_orlov_stats(sb, grp, flex_size, &stats);
510 if (!desc || !ext4_free_inodes_count(sb, desc)) 576 if (stats.used_dirs >= max_dirs)
511 continue;
512 if (ext4_used_dirs_count(sb, desc) >= max_dirs)
513 continue; 577 continue;
514 if (ext4_free_inodes_count(sb, desc) < min_inodes) 578 if (stats.free_inodes < min_inodes)
515 continue; 579 continue;
516 if (ext4_free_blks_count(sb, desc) < min_blocks) 580 if (stats.free_blocks < min_blocks)
517 continue; 581 continue;
518 return 0; 582 goto found_flex_bg;
519 } 583 }
520 584
521fallback: 585fallback:
586 ngroups = sbi->s_groups_count;
587 avefreei = freei / ngroups;
588 parent_group = EXT4_I(parent)->i_block_group;
522 for (i = 0; i < ngroups; i++) { 589 for (i = 0; i < ngroups; i++) {
523 *group = (parent_group + i) % ngroups; 590 grp = (parent_group + i) % ngroups;
524 desc = ext4_get_group_desc(sb, *group, NULL); 591 desc = ext4_get_group_desc(sb, grp, NULL);
525 if (desc && ext4_free_inodes_count(sb, desc) && 592 if (desc && ext4_free_inodes_count(sb, desc) &&
526 ext4_free_inodes_count(sb, desc) >= avefreei) 593 ext4_free_inodes_count(sb, desc) >= avefreei) {
594 *group = grp;
527 return 0; 595 return 0;
596 }
528 } 597 }
529 598
530 if (avefreei) { 599 if (avefreei) {
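The thresholds above now scale with the flexgroup size: min_inodes and min_blocks grow by a factor of flex_size because each "group" under consideration is really flex_size block groups. A small standalone calculation showing the resulting admission thresholds (all numbers are illustrative, not from the patch):

    #include <stdio.h>

    int main(void)
    {
        unsigned int ngroups = 64;            /* flexgroups after scaling */
        unsigned int inodes_per_group = 8192;
        unsigned int flex_size = 16;
        unsigned int freei = 3000000;         /* free inodes filesystem-wide */
        unsigned int ndirs = 12000;           /* directories filesystem-wide */

        unsigned int avefreei = freei / ngroups;
        int max_dirs   = ndirs / ngroups + inodes_per_group / 16;
        int min_inodes = avefreei - inodes_per_group * flex_size / 4;
        if (min_inodes < 1)     /* the patch clamps this, as above */
            min_inodes = 1;

        /* A candidate is rejected if it already holds >= max_dirs
         * directories or has fewer than min_inodes free inodes. */
        printf("max_dirs=%d min_inodes=%d\n", max_dirs, min_inodes);
        return 0;               /* prints max_dirs=699 min_inodes=14107 */
    }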
@@ -540,12 +609,51 @@ fallback:
540} 609}
541 610
542static int find_group_other(struct super_block *sb, struct inode *parent, 611static int find_group_other(struct super_block *sb, struct inode *parent,
543 ext4_group_t *group) 612 ext4_group_t *group, int mode)
544{ 613{
545 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 614 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
546 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 615 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
547 struct ext4_group_desc *desc; 616 struct ext4_group_desc *desc;
548 ext4_group_t i; 617 ext4_group_t i, last;
618 int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
619
620 /*
621 * Try to place the inode in the same flex group as its
622 * parent. If we can't find space, use the Orlov algorithm to
623 * find another flex group, and store that information in the
624 * parent directory's inode so that future allocations use
625 * that flex group.
626 */
627 if (flex_size > 1) {
628 int retry = 0;
629
630 try_again:
631 parent_group &= ~(flex_size-1);
632 last = parent_group + flex_size;
633 if (last > ngroups)
634 last = ngroups;
635 for (i = parent_group; i < last; i++) {
636 desc = ext4_get_group_desc(sb, i, NULL);
637 if (desc && ext4_free_inodes_count(sb, desc)) {
638 *group = i;
639 return 0;
640 }
641 }
642 if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
643 retry = 1;
644 parent_group = EXT4_I(parent)->i_last_alloc_group;
645 goto try_again;
646 }
647 /*
648 * If this didn't work, use the Orlov search algorithm
649 * to find a new flex group; we pass in the mode to
650 * avoid the topdir algorithms.
651 */
652 *group = parent_group + flex_size;
653 if (*group > ngroups)
654 *group = 0;
655 return find_group_orlov(sb, parent, group, mode);
656 }
549 657
550 /* 658 /*
551 * Try to place the inode in its parent directory 659 * Try to place the inode in its parent directory
@@ -663,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb,
663 if (S_ISDIR(mode)) { 771 if (S_ISDIR(mode)) {
664 count = ext4_used_dirs_count(sb, gdp) + 1; 772 count = ext4_used_dirs_count(sb, gdp) + 1;
665 ext4_used_dirs_set(sb, gdp, count); 773 ext4_used_dirs_set(sb, gdp, count);
774 if (sbi->s_log_groups_per_flex) {
775 ext4_group_t f = ext4_flex_group(sbi, group);
776
777 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
778 }
666 } 779 }
667 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 780 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
668err_ret: 781err_ret:
@@ -696,6 +809,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
696 struct inode *ret; 809 struct inode *ret;
697 ext4_group_t i; 810 ext4_group_t i;
698 int free = 0; 811 int free = 0;
812 static int once = 1;
699 ext4_group_t flex_group; 813 ext4_group_t flex_group;
700 814
701 /* Cannot create files in a deleted directory */ 815 /* Cannot create files in a deleted directory */
@@ -713,11 +827,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
713 sbi = EXT4_SB(sb); 827 sbi = EXT4_SB(sb);
714 es = sbi->s_es; 828 es = sbi->s_es;
715 829
716 if (sbi->s_log_groups_per_flex) { 830 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
717 ret2 = find_group_flex(sb, dir, &group); 831 ret2 = find_group_flex(sb, dir, &group);
718 if (ret2 == -1) { 832 if (ret2 == -1) {
719 ret2 = find_group_other(sb, dir, &group); 833 ret2 = find_group_other(sb, dir, &group, mode);
720 if (ret2 == 0 && printk_ratelimit()) 834 if (ret2 == 0 && once)
835 once = 0;
721 printk(KERN_NOTICE "ext4: find_group_flex " 836 printk(KERN_NOTICE "ext4: find_group_flex "
722 "failed, fallback succeeded dir %lu\n", 837 "failed, fallback succeeded dir %lu\n",
723 dir->i_ino); 838 dir->i_ino);
@@ -729,11 +844,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
729 if (test_opt(sb, OLDALLOC)) 844 if (test_opt(sb, OLDALLOC))
730 ret2 = find_group_dir(sb, dir, &group); 845 ret2 = find_group_dir(sb, dir, &group);
731 else 846 else
732 ret2 = find_group_orlov(sb, dir, &group); 847 ret2 = find_group_orlov(sb, dir, &group, mode);
733 } else 848 } else
734 ret2 = find_group_other(sb, dir, &group); 849 ret2 = find_group_other(sb, dir, &group, mode);
735 850
736got_group: 851got_group:
852 EXT4_I(dir)->i_last_alloc_group = group;
737 err = -ENOSPC; 853 err = -ENOSPC;
738 if (ret2 == -1) 854 if (ret2 == -1)
739 goto out; 855 goto out;
@@ -854,9 +970,7 @@ got:
854 970
855 if (sbi->s_log_groups_per_flex) { 971 if (sbi->s_log_groups_per_flex) {
856 flex_group = ext4_flex_group(sbi, group); 972 flex_group = ext4_flex_group(sbi, group);
857 spin_lock(sb_bgl_lock(sbi, flex_group)); 973 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
858 sbi->s_flex_groups[flex_group].free_inodes--;
859 spin_unlock(sb_bgl_lock(sbi, flex_group));
860 } 974 }
861 975
862 inode->i_uid = current_fsuid(); 976 inode->i_uid = current_fsuid();
@@ -881,19 +995,16 @@ got:
881 ei->i_disksize = 0; 995 ei->i_disksize = 0;
882 996
883 /* 997 /*
884 * Don't inherit extent flag from directory. We set extent flag on 998 * Don't inherit extent flag from directory, amongst others. We set
885 * newly created directory and file only if -o extent mount option is 999 * extent flag on newly created directory and file only if -o extent
886 * specified 1000 * mount option is specified
887 */ 1001 */
888 ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); 1002 ei->i_flags =
889 if (S_ISLNK(mode)) 1003 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
890 ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
891 /* dirsync only applies to directories */
892 if (!S_ISDIR(mode))
893 ei->i_flags &= ~EXT4_DIRSYNC_FL;
894 ei->i_file_acl = 0; 1004 ei->i_file_acl = 0;
895 ei->i_dtime = 0; 1005 ei->i_dtime = 0;
896 ei->i_block_group = group; 1006 ei->i_block_group = group;
1007 ei->i_last_alloc_group = ~0;
897 1008
898 ext4_set_inode_flags(inode); 1009 ext4_set_inode_flags(inode);
899 if (IS_DIRSYNC(inode)) 1010 if (IS_DIRSYNC(inode))
@@ -911,7 +1022,7 @@ got:
911 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1022 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
912 1023
913 ret = inode; 1024 ret = inode;
914 if (DQUOT_ALLOC_INODE(inode)) { 1025 if (vfs_dq_alloc_inode(inode)) {
915 err = -EDQUOT; 1026 err = -EDQUOT;
916 goto fail_drop; 1027 goto fail_drop;
917 } 1028 }
@@ -952,10 +1063,10 @@ really_out:
952 return ret; 1063 return ret;
953 1064
954fail_free_drop: 1065fail_free_drop:
955 DQUOT_FREE_INODE(inode); 1066 vfs_dq_free_inode(inode);
956 1067
957fail_drop: 1068fail_drop:
958 DQUOT_DROP(inode); 1069 vfs_dq_drop(inode);
959 inode->i_flags |= S_NOQUOTA; 1070 inode->i_flags |= S_NOQUOTA;
960 inode->i_nlink = 0; 1071 inode->i_nlink = 0;
961 unlock_new_inode(inode); 1072 unlock_new_inode(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c7fed5b18745..a2e7952bc5f9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
371 return n; 371 return n;
372} 372}
373 373
374static int __ext4_check_blockref(const char *function, struct inode *inode,
375 __le32 *p, unsigned int max) {
376
377 unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
378 __le32 *bref = p;
379 while (bref < p+max) {
380 if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
381 ext4_error(inode->i_sb, function,
382 "block reference %u >= max (%u) "
383 "in inode #%lu, offset=%d",
384 le32_to_cpu(*bref), maxblocks,
385 inode->i_ino, (int)(bref-p));
386 return -EIO;
386 return -EIO;
387 }
388 bref++;
389 }
390 return 0;
391}
392
393
394#define ext4_check_indirect_blockref(inode, bh) \
395 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
396 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
397
398#define ext4_check_inode_blockref(inode) \
399 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
400 EXT4_NDIR_BLOCKS)
401
374/** 402/**
375 * ext4_get_branch - read the chain of indirect blocks leading to data 403 * ext4_get_branch - read the chain of indirect blocks leading to data
376 * @inode: inode in question 404 * @inode: inode in question
@@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
415 if (!p->key) 443 if (!p->key)
416 goto no_block; 444 goto no_block;
417 while (--depth) { 445 while (--depth) {
418 bh = sb_bread(sb, le32_to_cpu(p->key)); 446 bh = sb_getblk(sb, le32_to_cpu(p->key));
419 if (!bh) 447 if (unlikely(!bh))
420 goto failure; 448 goto failure;
449
450 if (!bh_uptodate_or_lock(bh)) {
451 if (bh_submit_read(bh) < 0) {
452 put_bh(bh);
453 goto failure;
454 }
455 /* validate block references */
456 if (ext4_check_indirect_blockref(inode, bh)) {
457 put_bh(bh);
458 goto failure;
459 }
460 }
461
421 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 462 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
422 /* Reader: end */ 463 /* Reader: end */
423 if (!p->key) 464 if (!p->key)
@@ -459,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
459 ext4_fsblk_t bg_start; 500 ext4_fsblk_t bg_start;
460 ext4_fsblk_t last_block; 501 ext4_fsblk_t last_block;
461 ext4_grpblk_t colour; 502 ext4_grpblk_t colour;
503 ext4_group_t block_group;
504 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
462 505
463 /* Try to find previous block */ 506 /* Try to find previous block */
464 for (p = ind->p - 1; p >= start; p--) { 507 for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
474 * It is going to be referred to from the inode itself? OK, just put it 517 * It is going to be referred to from the inode itself? OK, just put it
475 * into the same cylinder group then. 518 * into the same cylinder group then.
476 */ 519 */
477 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); 520 block_group = ei->i_block_group;
521 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
522 block_group &= ~(flex_size-1);
523 if (S_ISREG(inode->i_mode))
524 block_group++;
525 }
526 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
478 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 527 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
479 528
529 /*
530 * If we are doing delayed allocation, we don't need to take
531 * colour into account.
532 */
533 if (test_opt(inode->i_sb, DELALLOC))
534 return bg_start;
535
480 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 536 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
481 colour = (current->pid % 16) * 537 colour = (current->pid % 16) *
482 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 538 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -975,6 +1031,17 @@ out:
975 return err; 1031 return err;
976} 1032}
977 1033
1034qsize_t ext4_get_reserved_space(struct inode *inode)
1035{
1036 unsigned long long total;
1037
1038 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1039 total = EXT4_I(inode)->i_reserved_data_blocks +
1040 EXT4_I(inode)->i_reserved_meta_blocks;
1041 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1042
1043 return total;
1044}
978/* 1045/*
979 * Calculate the number of metadata blocks need to reserve 1046 * Calculate the number of metadata blocks need to reserve
980 * to allocate @blocks for non extent file based file 1047 * to allocate @blocks for non extent file based file
@@ -1036,8 +1103,21 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1036 /* update per-inode reservations */ 1103 /* update per-inode reservations */
1037 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); 1104 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
1038 EXT4_I(inode)->i_reserved_data_blocks -= used; 1105 EXT4_I(inode)->i_reserved_data_blocks -= used;
1039
1040 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1106 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1107
1108 /*
1109 * free those over-booking quota for metadata blocks
1110 */
1111 if (mdb_free)
1112 vfs_dq_release_reservation_block(inode, mdb_free);
1113
1114 /*
1115 * If we have done all the pending block allocations and if
1116 * there aren't any writers on the inode, we can discard the
1117 * inode's preallocations.
1118 */
1119 if (!total && (atomic_read(&inode->i_writecount) == 0))
1120 ext4_discard_preallocations(inode);
1041} 1121}
1042 1122
1043/* 1123/*
@@ -1553,8 +1633,8 @@ static int ext4_journalled_write_end(struct file *file,
1553static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1633static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1554{ 1634{
1555 int retries = 0; 1635 int retries = 0;
1556 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1636 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1557 unsigned long md_needed, mdblocks, total = 0; 1637 unsigned long md_needed, mdblocks, total = 0;
1558 1638
1559 /* 1639 /*
1560 * recalculate the amount of metadata blocks to reserve 1640 * recalculate the amount of metadata blocks to reserve
@@ -1570,12 +1650,23 @@ repeat:
1570 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; 1650 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1571 total = md_needed + nrblocks; 1651 total = md_needed + nrblocks;
1572 1652
1653 /*
1654 * Make quota reservation here to prevent quota overflow
1655 * later. Real quota accounting is done at pages writeout
1656 * later. Real quota accounting is done at page writeout
1657 */
1658 if (vfs_dq_reserve_block(inode, total)) {
1659 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1660 return -EDQUOT;
1661 }
1662
1573 if (ext4_claim_free_blocks(sbi, total)) { 1663 if (ext4_claim_free_blocks(sbi, total)) {
1574 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1664 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1575 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1665 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1576 yield(); 1666 yield();
1577 goto repeat; 1667 goto repeat;
1578 } 1668 }
1669 vfs_dq_release_reservation_block(inode, total);
1579 return -ENOSPC; 1670 return -ENOSPC;
1580 } 1671 }
1581 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1672 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
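This hunk reserves quota up front, alongside the in-memory block reservation, and undoes it on the ENOSPC path; the other two hunks in this file release the over-reserved metadata quota once blocks are really allocated, or release everything when reserved pages are dropped. The lifecycle, as a comment-only sketch (the claim step for blocks that do get written is not visible in this excerpt and is an assumption):

    /*
     * Delalloc quota lifecycle introduced by this patch (sketch):
     *
     *   write_begin / page dirtied:
     *       ext4_da_reserve_space()
     *           vfs_dq_reserve_block(inode, data + metadata estimate);
     *
     *   writeout, real blocks allocated:
     *       ext4_da_update_reserve_space()
     *           ...quota for the used blocks is claimed (assumed)...
     *           vfs_dq_release_reservation_block(inode, mdb_free);
     *
     *   dirty page dropped before writeout:
     *       ext4_da_release_space()
     *           vfs_dq_release_reservation_block(inode, release);
     */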
@@ -1629,6 +1720,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1629 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1720 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1630 EXT4_I(inode)->i_reserved_meta_blocks = mdb; 1721 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1631 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1722 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1723
1724 vfs_dq_release_reservation_block(inode, release);
1632} 1725}
1633 1726
1634static void ext4_da_page_release_reservation(struct page *page, 1727static void ext4_da_page_release_reservation(struct page *page,
@@ -1658,9 +1751,10 @@ static void ext4_da_page_release_reservation(struct page *page,
1658 1751
1659struct mpage_da_data { 1752struct mpage_da_data {
1660 struct inode *inode; 1753 struct inode *inode;
1661 struct buffer_head lbh; /* extent of blocks */ 1754 sector_t b_blocknr; /* start block number of extent */
1755 size_t b_size; /* size of extent */
1756 unsigned long b_state; /* state of the extent */
1662 unsigned long first_page, next_page; /* extent of pages */ 1757 unsigned long first_page, next_page; /* extent of pages */
1663 get_block_t *get_block;
1664 struct writeback_control *wbc; 1758 struct writeback_control *wbc;
1665 int io_done; 1759 int io_done;
1666 int pages_written; 1760 int pages_written;
@@ -1674,7 +1768,6 @@ struct mpage_da_data {
1674 * @mpd->inode: inode 1768 * @mpd->inode: inode
1675 * @mpd->first_page: first page of the extent 1769 * @mpd->first_page: first page of the extent
1676 * @mpd->next_page: page after the last page of the extent 1770 * @mpd->next_page: page after the last page of the extent
1677 * @mpd->get_block: the filesystem's block mapper function
1678 * 1771 *
1679 * By the time mpage_da_submit_io() is called we expect all blocks 1772 * By the time mpage_da_submit_io() is called we expect all blocks
1680 * to be allocated. this may be wrong if allocation failed. 1773 * to be allocated. this may be wrong if allocation failed.
@@ -1694,7 +1787,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1694 /* 1787 /*
1695 * We need to start from the first_page to the next_page - 1 1788 * We need to start from the first_page to the next_page - 1
1696 * to make sure we also write the mapped dirty buffer_heads. 1789 * to make sure we also write the mapped dirty buffer_heads.
1697 * If we look at mpd->lbh.b_blocknr we would only be looking 1790 * If we look at mpd->b_blocknr we would only be looking
1698 * at the currently mapped buffer_heads. 1791 * at the currently mapped buffer_heads.
1699 */ 1792 */
1700 index = mpd->first_page; 1793 index = mpd->first_page;
@@ -1884,68 +1977,111 @@ static void ext4_print_free_blocks(struct inode *inode)
1884 return; 1977 return;
1885} 1978}
1886 1979
1980#define EXT4_DELALLOC_RSVED 1
1981static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1982 struct buffer_head *bh_result, int create)
1983{
1984 int ret;
1985 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1986 loff_t disksize = EXT4_I(inode)->i_disksize;
1987 handle_t *handle = NULL;
1988
1989 handle = ext4_journal_current_handle();
1990 BUG_ON(!handle);
1991 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
1992 bh_result, create, 0, EXT4_DELALLOC_RSVED);
1993 if (ret <= 0)
1994 return ret;
1995
1996 bh_result->b_size = (ret << inode->i_blkbits);
1997
1998 if (ext4_should_order_data(inode)) {
1999 int retval;
2000 retval = ext4_jbd2_file_inode(handle, inode);
2001 if (retval)
2002 /*
2003 * Failed to add inode for ordered mode. Don't
2004 * update file size
2005 */
2006 return retval;
2007 }
2008
2009 /*
2010 * Update the on-disk size along with the block allocation; we
2011 * don't use 'extend_disksize' since the size may change within
2012 * an already allocated block -bzzz
2013 */
2014 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2015 if (disksize > i_size_read(inode))
2016 disksize = i_size_read(inode);
2017 if (disksize > EXT4_I(inode)->i_disksize) {
2018 ext4_update_i_disksize(inode, disksize);
2019 ret = ext4_mark_inode_dirty(handle, inode);
2020 return ret;
2021 }
2022 return 0;
2023}
2024
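The i_disksize update above converts the end of the freshly mapped extent back to a byte offset and caps it at the in-core i_size, so the on-disk size never runs ahead of what the VFS considers the file size. A worked example of that arithmetic, assuming a 4KB block size (i_blkbits == 12); the helper name is hypothetical:

    /*
     * Illustration only: new on-disk size for a freshly mapped extent,
     * capped at the in-core i_size. With iblock = 10 and ret = 4 blocks
     * mapped, disksize = (10 + 4) << 12 = 57344 (end of block 13).
     */
    static loff_t da_new_disksize(sector_t iblock, int ret, loff_t i_size)
    {
            loff_t disksize = ((loff_t)iblock + ret) << 12;

            return disksize > i_size ? i_size : disksize;
    }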
1887/* 2025/*
1888 * mpage_da_map_blocks - go through given space 2026 * mpage_da_map_blocks - go through given space
1889 * 2027 *
1890 * @mpd->lbh - bh describing space 2028 * @mpd - bh describing space
1891 * @mpd->get_block - the filesystem's block mapper function
1892 * 2029 *
1893 * The function skips space we know is already mapped to disk blocks. 2030 * The function skips space we know is already mapped to disk blocks.
1894 * 2031 *
1895 */ 2032 */
1896static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2033static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1897{ 2034{
1898 int err = 0; 2035 int err = 0;
1899 struct buffer_head new; 2036 struct buffer_head new;
1900 struct buffer_head *lbh = &mpd->lbh;
1901 sector_t next; 2037 sector_t next;
1902 2038
1903 /* 2039 /*
1904 * We consider only non-mapped and non-allocated blocks 2040 * We consider only non-mapped and non-allocated blocks
1905 */ 2041 */
1906 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 2042 if ((mpd->b_state & (1 << BH_Mapped)) &&
2043 !(mpd->b_state & (1 << BH_Delay)))
1907 return 0; 2044 return 0;
1908 new.b_state = lbh->b_state; 2045 new.b_state = mpd->b_state;
1909 new.b_blocknr = 0; 2046 new.b_blocknr = 0;
1910 new.b_size = lbh->b_size; 2047 new.b_size = mpd->b_size;
1911 next = lbh->b_blocknr; 2048 next = mpd->b_blocknr;
1912 /* 2049 /*
1913 * If we didn't accumulate anything 2050 * If we didn't accumulate anything
1914 * to write, simply return 2051 * to write, simply return
1915 */ 2052 */
1916 if (!new.b_size) 2053 if (!new.b_size)
1917 return 0; 2054 return 0;
1918 err = mpd->get_block(mpd->inode, next, &new, 1);
1919 if (err) {
1920 2055
1921 /* If get block returns with error 2056 err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
1922 * we simply return. Later writepage 2057 if (err) {
1923 * will redirty the page and writepages 2058 /*
1924 * will find the dirty page again 2059 * If get block returns with error we simply
2060 * return. Later writepage will redirty the page and
2061 * writepages will find the dirty page again
1925 */ 2062 */
1926 if (err == -EAGAIN) 2063 if (err == -EAGAIN)
1927 return 0; 2064 return 0;
1928 2065
1929 if (err == -ENOSPC && 2066 if (err == -ENOSPC &&
1930 ext4_count_free_blocks(mpd->inode->i_sb)) { 2067 ext4_count_free_blocks(mpd->inode->i_sb)) {
1931 mpd->retval = err; 2068 mpd->retval = err;
1932 return 0; 2069 return 0;
1933 } 2070 }
1934 2071
1935 /* 2072 /*
1936 * get block failure will cause us 2073 * get block failure will cause us to loop in
1937 * to loop in writepages. Because 2074 * writepages, because a_ops->writepage won't be able
1938 * a_ops->writepage won't be able to 2075 * to make progress. The page will be redirtied by
1939 * make progress. The page will be redirtied 2076 * writepage and writepages will again try to write
1940 * by writepage and writepages will again 2077 * the same.
1941 * try to write the same.
1942 */ 2078 */
1943 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2079 printk(KERN_EMERG "%s block allocation failed for inode %lu "
1944 "at logical offset %llu with max blocks " 2080 "at logical offset %llu with max blocks "
1945 "%zd with error %d\n", 2081 "%zd with error %d\n",
1946 __func__, mpd->inode->i_ino, 2082 __func__, mpd->inode->i_ino,
1947 (unsigned long long)next, 2083 (unsigned long long)next,
1948 lbh->b_size >> mpd->inode->i_blkbits, err); 2084 mpd->b_size >> mpd->inode->i_blkbits, err);
1949 printk(KERN_EMERG "This should not happen!! " 2085 printk(KERN_EMERG "This should not happen!! "
1950 "Data will be lost\n"); 2086 "Data will be lost\n");
1951 if (err == -ENOSPC) { 2087 if (err == -ENOSPC) {
@@ -1953,7 +2089,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1953 } 2089 }
1954 /* invalidate all the pages */ 2090 /* invalidate all the pages */
1955 ext4_da_block_invalidatepages(mpd, next, 2091 ext4_da_block_invalidatepages(mpd, next,
1956 lbh->b_size >> mpd->inode->i_blkbits); 2092 mpd->b_size >> mpd->inode->i_blkbits);
1957 return err; 2093 return err;
1958 } 2094 }
1959 BUG_ON(new.b_size == 0); 2095 BUG_ON(new.b_size == 0);
@@ -1965,7 +2101,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1965 * If blocks are delayed marked, we need to 2101 * If blocks are delayed marked, we need to
1966 * put actual blocknr and drop delayed bit 2102 * put actual blocknr and drop delayed bit
1967 */ 2103 */
1968 if (buffer_delay(lbh) || buffer_unwritten(lbh)) 2104 if ((mpd->b_state & (1 << BH_Delay)) ||
2105 (mpd->b_state & (1 << BH_Unwritten)))
1969 mpage_put_bnr_to_bhs(mpd, next, &new); 2106 mpage_put_bnr_to_bhs(mpd, next, &new);
1970 2107
1971 return 0; 2108 return 0;
@@ -1984,12 +2121,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1984 * the function is used to collect contig. blocks in same state 2121 * the function is used to collect contig. blocks in same state
1985 */ 2122 */
1986static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2123static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1987 sector_t logical, struct buffer_head *bh) 2124 sector_t logical, size_t b_size,
2125 unsigned long b_state)
1988{ 2126{
1989 sector_t next; 2127 sector_t next;
1990 size_t b_size = bh->b_size; 2128 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
1991 struct buffer_head *lbh = &mpd->lbh;
1992 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
1993 2129
1994 /* check if the reserved journal credits might overflow */ 2130 /* check if the reserved journal credits might overflow */
1995 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2131 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2016,19 +2152,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2016 /* 2152 /*
2017 * First block in the extent 2153 * First block in the extent
2018 */ 2154 */
2019 if (lbh->b_size == 0) { 2155 if (mpd->b_size == 0) {
2020 lbh->b_blocknr = logical; 2156 mpd->b_blocknr = logical;
2021 lbh->b_size = b_size; 2157 mpd->b_size = b_size;
2022 lbh->b_state = bh->b_state & BH_FLAGS; 2158 mpd->b_state = b_state & BH_FLAGS;
2023 return; 2159 return;
2024 } 2160 }
2025 2161
2026 next = lbh->b_blocknr + nrblocks; 2162 next = mpd->b_blocknr + nrblocks;
2027 /* 2163 /*
2028 * Can we merge the block to our big extent? 2164 * Can we merge the block to our big extent?
2029 */ 2165 */
2030 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 2166 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
2031 lbh->b_size += b_size; 2167 mpd->b_size += b_size;
2032 return; 2168 return;
2033 } 2169 }
2034 2170
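With the buffer_head parameter gone, mpage_add_bh_to_extent() merges purely by value: a block can join the accumulated extent only if it starts exactly where the extent ends and carries the same BH_FLAGS-masked state. A condensed restatement of that merge rule (hypothetical helper, for illustration):

    /*
     * Illustration only: the extent-merge test used above, factored
     * out. A block joins the current extent iff it is logically
     * adjacent and its masked buffer state matches.
     */
    static int mpd_can_merge(struct mpage_da_data *mpd, sector_t logical,
                             unsigned long b_state)
    {
            sector_t next = mpd->b_blocknr +
                            (mpd->b_size >> mpd->inode->i_blkbits);

            return logical == next && (b_state & BH_FLAGS) == mpd->b_state;
    }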
@@ -2057,7 +2193,7 @@ static int __mpage_da_writepage(struct page *page,
2057{ 2193{
2058 struct mpage_da_data *mpd = data; 2194 struct mpage_da_data *mpd = data;
2059 struct inode *inode = mpd->inode; 2195 struct inode *inode = mpd->inode;
2060 struct buffer_head *bh, *head, fake; 2196 struct buffer_head *bh, *head;
2061 sector_t logical; 2197 sector_t logical;
2062 2198
2063 if (mpd->io_done) { 2199 if (mpd->io_done) {
@@ -2099,9 +2235,9 @@ static int __mpage_da_writepage(struct page *page,
2099 /* 2235 /*
2100 * ... and blocks 2236 * ... and blocks
2101 */ 2237 */
2102 mpd->lbh.b_size = 0; 2238 mpd->b_size = 0;
2103 mpd->lbh.b_state = 0; 2239 mpd->b_state = 0;
2104 mpd->lbh.b_blocknr = 0; 2240 mpd->b_blocknr = 0;
2105 } 2241 }
2106 2242
2107 mpd->next_page = page->index + 1; 2243 mpd->next_page = page->index + 1;
@@ -2109,16 +2245,8 @@ static int __mpage_da_writepage(struct page *page,
2109 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2245 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2110 2246
2111 if (!page_has_buffers(page)) { 2247 if (!page_has_buffers(page)) {
2112 /* 2248 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2113 * There are no attached buffer heads yet (mmap?)
2114 * we treat the page as full of dirty blocks
2115 */
2116 bh = &fake;
2117 bh->b_size = PAGE_CACHE_SIZE;
2118 bh->b_state = 0;
2119 set_buffer_dirty(bh);
2120 set_buffer_uptodate(bh);
2121 mpage_add_bh_to_extent(mpd, logical, bh);
2122 if (mpd->io_done) 2250 if (mpd->io_done)
2123 return MPAGE_DA_EXTENT_TAIL; 2251 return MPAGE_DA_EXTENT_TAIL;
2124 } else { 2252 } else {
@@ -2136,8 +2264,10 @@ static int __mpage_da_writepage(struct page *page,
2136 * with the page in ext4_da_writepage 2264 * with the page in ext4_da_writepage
2137 */ 2265 */
2138 if (buffer_dirty(bh) && 2266 if (buffer_dirty(bh) &&
2139 (!buffer_mapped(bh) || buffer_delay(bh))) { 2267 (!buffer_mapped(bh) || buffer_delay(bh))) {
2140 mpage_add_bh_to_extent(mpd, logical, bh); 2268 mpage_add_bh_to_extent(mpd, logical,
2269 bh->b_size,
2270 bh->b_state);
2141 if (mpd->io_done) 2271 if (mpd->io_done)
2142 return MPAGE_DA_EXTENT_TAIL; 2272 return MPAGE_DA_EXTENT_TAIL;
2143 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2273 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2149,9 +2279,8 @@ static int __mpage_da_writepage(struct page *page,
2149 * unmapped buffer_head later we need to 2279 * unmapped buffer_head later we need to
2150 * use the b_state flag of that buffer_head. 2280 * use the b_state flag of that buffer_head.
2151 */ 2281 */
2152 if (mpd->lbh.b_size == 0) 2282 if (mpd->b_size == 0)
2153 mpd->lbh.b_state = 2283 mpd->b_state = bh->b_state & BH_FLAGS;
2154 bh->b_state & BH_FLAGS;
2155 } 2284 }
2156 logical++; 2285 logical++;
2157 } while ((bh = bh->b_this_page) != head); 2286 } while ((bh = bh->b_this_page) != head);
@@ -2161,51 +2290,6 @@ static int __mpage_da_writepage(struct page *page,
2161} 2290}
2162 2291
2163/* 2292/*
2164 * mpage_da_writepages - walk the list of dirty pages of the given
2165 * address space, allocates non-allocated blocks, maps newly-allocated
2166 * blocks to existing bhs and issue IO on them
2167 *
2168 * @mapping: address space structure to write
2169 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2170 * @get_block: the filesystem's block mapper function.
2171 *
2172 * This is a library function, which implements the writepages()
2173 * address_space_operation.
2174 */
2175static int mpage_da_writepages(struct address_space *mapping,
2176 struct writeback_control *wbc,
2177 struct mpage_da_data *mpd)
2178{
2179 int ret;
2180
2181 if (!mpd->get_block)
2182 return generic_writepages(mapping, wbc);
2183
2184 mpd->lbh.b_size = 0;
2185 mpd->lbh.b_state = 0;
2186 mpd->lbh.b_blocknr = 0;
2187 mpd->first_page = 0;
2188 mpd->next_page = 0;
2189 mpd->io_done = 0;
2190 mpd->pages_written = 0;
2191 mpd->retval = 0;
2192
2193 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
2194 /*
2195 * Handle last extent of pages
2196 */
2197 if (!mpd->io_done && mpd->next_page != mpd->first_page) {
2198 if (mpage_da_map_blocks(mpd) == 0)
2199 mpage_da_submit_io(mpd);
2200
2201 mpd->io_done = 1;
2202 ret = MPAGE_DA_EXTENT_TAIL;
2203 }
2204 wbc->nr_to_write -= mpd->pages_written;
2205 return ret;
2206}
2207
2208/*
2209 * this is a special callback for ->write_begin() only 2293 * this is a special callback for ->write_begin() only
2210 * its intention is to return a mapped block or to reserve space 2294 * its intention is to return a mapped block or to reserve space
2211 */ 2295 */
@@ -2244,51 +2328,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2244 2328
2245 return ret; 2329 return ret;
2246} 2330}
2247#define EXT4_DELALLOC_RSVED 1
2248static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2249 struct buffer_head *bh_result, int create)
2250{
2251 int ret;
2252 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2253 loff_t disksize = EXT4_I(inode)->i_disksize;
2254 handle_t *handle = NULL;
2255
2256 handle = ext4_journal_current_handle();
2257 BUG_ON(!handle);
2258 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2259 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2260 if (ret > 0) {
2261
2262 bh_result->b_size = (ret << inode->i_blkbits);
2263
2264 if (ext4_should_order_data(inode)) {
2265 int retval;
2266 retval = ext4_jbd2_file_inode(handle, inode);
2267 if (retval)
2268 /*
2269 * Failed to add inode for ordered
2270 * mode. Don't update file size
2271 */
2272 return retval;
2273 }
2274
2275 /*
2276 * Update on-disk size along with block allocation
2277 * we don't use 'extend_disksize' as size may change
2278 * within already allocated block -bzzz
2279 */
2280 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2281 if (disksize > i_size_read(inode))
2282 disksize = i_size_read(inode);
2283 if (disksize > EXT4_I(inode)->i_disksize) {
2284 ext4_update_i_disksize(inode, disksize);
2285 ret = ext4_mark_inode_dirty(handle, inode);
2286 return ret;
2287 }
2288 ret = 0;
2289 }
2290 return ret;
2291}
2292 2331
2293static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2332static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2294{ 2333{
@@ -2539,8 +2578,38 @@ retry:
2539 dump_stack(); 2578 dump_stack();
2540 goto out_writepages; 2579 goto out_writepages;
2541 } 2580 }
2542 mpd.get_block = ext4_da_get_block_write; 2581
2543 ret = mpage_da_writepages(mapping, wbc, &mpd); 2582 /*
2583 * Now call __mpage_da_writepage to find the next
2584 * contiguous region of logical blocks that need
2585 * blocks to be allocated by ext4. We don't actually
2586 * submit the blocks for I/O here, even though
2587 * write_cache_pages thinks it will, and will set the
2588 * pages as clean for write before calling
2589 * __mpage_da_writepage().
2590 */
2591 mpd.b_size = 0;
2592 mpd.b_state = 0;
2593 mpd.b_blocknr = 0;
2594 mpd.first_page = 0;
2595 mpd.next_page = 0;
2596 mpd.io_done = 0;
2597 mpd.pages_written = 0;
2598 mpd.retval = 0;
2599 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
2600 &mpd);
2601 /*
2602 * If we have a contiguous extent of pages and we
2603 * haven't done the I/O yet, map the blocks and submit
2604 * them for I/O.
2605 */
2606 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2607 if (mpage_da_map_blocks(&mpd) == 0)
2608 mpage_da_submit_io(&mpd);
2609 mpd.io_done = 1;
2610 ret = MPAGE_DA_EXTENT_TAIL;
2611 }
2612 wbc->nr_to_write -= mpd.pages_written;
2544 2613
2545 ext4_journal_stop(handle); 2614 ext4_journal_stop(handle);
2546 2615
@@ -2816,6 +2885,48 @@ out:
2816 return; 2885 return;
2817} 2886}
2818 2887
2888/*
2889 * Force all delayed allocation blocks to be allocated for a given inode.
2890 */
2891int ext4_alloc_da_blocks(struct inode *inode)
2892{
2893 if (!EXT4_I(inode)->i_reserved_data_blocks &&
2894 !EXT4_I(inode)->i_reserved_meta_blocks)
2895 return 0;
2896
2897 /*
2898 * We do something simple for now. The filemap_flush() will
2899 * also start triggering a write of the data blocks, which is
2900 * not strictly speaking necessary (and for users of
2901 * laptop_mode, not even desirable). However, to do otherwise
2902 * would require replicating code paths in:
2903 *
2904 * ext4_da_writepages() ->
2905 * write_cache_pages() ---> (via passed in callback function)
2906 * __mpage_da_writepage() -->
2907 * mpage_add_bh_to_extent()
2908 * mpage_da_map_blocks()
2909 *
2910 * The problem is that write_cache_pages(), located in
2911 * mm/page-writeback.c, marks pages clean in preparation for
2912 * doing I/O, which is not desirable if we're not planning on
2913 * doing I/O at all.
2914 *
2915 * We could call write_cache_pages(), and then redirty all of
2916 * the pages by calling redirty_page_for_writeback() but that
2917 * would be ugly in the extreme. So instead we would need to
2918 * replicate parts of the code in the above functions,
2919 * simplifying them because we wouldn't actually intend to
2920 * write out the pages, but rather only collect contiguous
2921 * logical block extents, call the multi-block allocator, and
2922 * then update the buffer heads with the block allocations.
2923 *
2924 * For now, though, we'll cheat by calling filemap_flush(),
2925 * which will map the blocks, and start the I/O, but not
2926 * actually wait for the I/O to complete.
2927 */
2928 return filemap_flush(inode->i_mapping);
2929}
2819 2930
2820/* 2931/*
2821 * bmap() is special. It gets used by applications such as lilo and by 2932 * bmap() is special. It gets used by applications such as lilo and by
@@ -3838,6 +3949,9 @@ void ext4_truncate(struct inode *inode)
3838 if (!ext4_can_truncate(inode)) 3949 if (!ext4_can_truncate(inode))
3839 return; 3950 return;
3840 3951
3952 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3953 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3954
3841 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3955 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
3842 ext4_ext_truncate(inode); 3956 ext4_ext_truncate(inode);
3843 return; 3957 return;
@@ -4080,12 +4194,7 @@ make_io:
4080 unsigned num; 4194 unsigned num;
4081 4195
4082 table = ext4_inode_table(sb, gdp); 4196 table = ext4_inode_table(sb, gdp);
4083 /* Make sure s_inode_readahead_blks is a power of 2 */ 4197 /* s_inode_readahead_blks is always a power of 2 */
4084 while (EXT4_SB(sb)->s_inode_readahead_blks &
4085 (EXT4_SB(sb)->s_inode_readahead_blks-1))
4086 EXT4_SB(sb)->s_inode_readahead_blks =
4087 (EXT4_SB(sb)->s_inode_readahead_blks &
4088 (EXT4_SB(sb)->s_inode_readahead_blks-1));
4089 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4198 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
4090 if (table > b) 4199 if (table > b)
4091 b = table; 4200 b = table;
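Since s_inode_readahead_blks is now validated to be a power of two when it is set (at mount or sysfs time), the old fix-up loop can go, and the masking below is safe: block & ~(n-1) rounds down to an n-block boundary only when n is a power of two. A tiny sketch of the idiom (illustration only):

    /* Illustration only: round down to a power-of-two boundary.
     * Valid only when n is a power of 2, e.g. n = 32:
     * 1000 & ~31 = 992, a multiple of 32. */
    static unsigned long align_down_pow2(unsigned long block, unsigned long n)
    {
            return block & ~(n - 1);
    }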
@@ -4257,6 +4366,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4257 ei->i_disksize = inode->i_size; 4366 ei->i_disksize = inode->i_size;
4258 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4367 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4259 ei->i_block_group = iloc.block_group; 4368 ei->i_block_group = iloc.block_group;
4369 ei->i_last_alloc_group = ~0;
4260 /* 4370 /*
4261 * NOTE! The in-memory inode i_data array is in little-endian order 4371 * NOTE! The in-memory inode i_data array is in little-endian order
4262 * even on big-endian machines: we do NOT byteswap the block numbers! 4372 * even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4299,6 +4409,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4299 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4409 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4300 } 4410 }
4301 4411
4412 if (ei->i_flags & EXT4_EXTENTS_FL) {
4413 /* Validate extent which is part of inode */
4414 ret = ext4_ext_check_inode(inode);
4415 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4416 (S_ISLNK(inode->i_mode) &&
4417 !ext4_inode_is_fast_symlink(inode))) {
4418 /* Validate block references which are part of inode */
4419 ret = ext4_check_inode_blockref(inode);
4420 }
4421 if (ret) {
4422 brelse(bh);
4423 goto bad_inode;
4424 }
4425
4302 if (S_ISREG(inode->i_mode)) { 4426 if (S_ISREG(inode->i_mode)) {
4303 inode->i_op = &ext4_file_inode_operations; 4427 inode->i_op = &ext4_file_inode_operations;
4304 inode->i_fop = &ext4_file_operations; 4428 inode->i_fop = &ext4_file_operations;
@@ -4315,7 +4439,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4315 inode->i_op = &ext4_symlink_inode_operations; 4439 inode->i_op = &ext4_symlink_inode_operations;
4316 ext4_set_aops(inode); 4440 ext4_set_aops(inode);
4317 } 4441 }
4318 } else { 4442 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
4443 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
4319 inode->i_op = &ext4_special_inode_operations; 4444 inode->i_op = &ext4_special_inode_operations;
4320 if (raw_inode->i_block[0]) 4445 if (raw_inode->i_block[0])
4321 init_special_inode(inode, inode->i_mode, 4446 init_special_inode(inode, inode->i_mode,
@@ -4323,6 +4448,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4323 else 4448 else
4324 init_special_inode(inode, inode->i_mode, 4449 init_special_inode(inode, inode->i_mode,
4325 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4450 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4451 } else {
4452 brelse(bh);
4453 ret = -EIO;
4454 ext4_error(inode->i_sb, __func__,
4455 "bogus i_mode (%o) for inode=%lu",
4456 inode->i_mode, inode->i_ino);
4457 goto bad_inode;
4326 } 4458 }
4327 brelse(iloc.bh); 4459 brelse(iloc.bh);
4328 ext4_set_inode_flags(inode); 4460 ext4_set_inode_flags(inode);
@@ -4612,7 +4744,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4612 error = PTR_ERR(handle); 4744 error = PTR_ERR(handle);
4613 goto err_out; 4745 goto err_out;
4614 } 4746 }
4615 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; 4747 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
4616 if (error) { 4748 if (error) {
4617 ext4_journal_stop(handle); 4749 ext4_journal_stop(handle);
4618 return error; 4750 return error;
@@ -4991,7 +5123,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4991 * i_size has been changed by generic_commit_write() and we thus need 5123 * i_size has been changed by generic_commit_write() and we thus need
4992 * to include the updated inode in the current transaction. 5124 * to include the updated inode in the current transaction.
4993 * 5125 *
4994 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks 5126 * Also, vfs_dq_alloc_block() will always dirty the inode when blocks
4995 * are allocated to the file. 5127 * are allocated to the file.
4996 * 5128 *
4997 * If the inode is marked synchronous, we don't honour that here - doing 5129 * If the inode is marked synchronous, we don't honour that here - doing
@@ -5116,8 +5248,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5116 return !buffer_mapped(bh); 5248 return !buffer_mapped(bh);
5117} 5249}
5118 5250
5119int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) 5251int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5120{ 5252{
5253 struct page *page = vmf->page;
5121 loff_t size; 5254 loff_t size;
5122 unsigned long len; 5255 unsigned long len;
5123 int ret = -EINVAL; 5256 int ret = -EINVAL;
@@ -5169,6 +5302,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
5169 goto out_unlock; 5302 goto out_unlock;
5170 ret = 0; 5303 ret = 0;
5171out_unlock: 5304out_unlock:
5305 if (ret)
5306 ret = VM_FAULT_SIGBUS;
5172 up_read(&inode->i_alloc_sem); 5307 up_read(&inode->i_alloc_sem);
5173 return ret; 5308 return ret;
5174} 5309}
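The ->page_mkwrite change tracks a VFS/VM API update: the callback now receives a struct vm_fault and is expected to return VM_FAULT_* codes rather than negative errno values, hence the translation of any error to VM_FAULT_SIGBUS before returning. A minimal sketch of a handler under the new convention (hypothetical filesystem, not the ext4 code):

    /* Illustration only: skeleton ->page_mkwrite under the new
     * prototype. Returns VM_FAULT_* codes, never a negative errno. */
    static int myfs_page_mkwrite(struct vm_area_struct *vma,
                                 struct vm_fault *vmf)
    {
            struct page *page = vmf->page;

            if (!page->mapping)
                    return VM_FAULT_SIGBUS;   /* truncated under us */

            /* ... make the page's blocks writable for the fault ... */
            return 0;                         /* success */
    }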
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 42dc83fb247a..91e75f7a9e73 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
48 if (err) 48 if (err)
49 return err; 49 return err;
50 50
51 if (!S_ISDIR(inode->i_mode)) 51 flags = ext4_mask_flags(inode->i_mode, flags);
52 flags &= ~EXT4_DIRSYNC_FL;
53 52
54 err = -EPERM; 53 err = -EPERM;
55 mutex_lock(&inode->i_mutex); 54 mutex_lock(&inode->i_mutex);
@@ -263,6 +262,20 @@ setversion_out:
263 return err; 262 return err;
264 } 263 }
265 264
265 case EXT4_IOC_ALLOC_DA_BLKS:
266 {
267 int err;
268 if (!is_owner_or_cap(inode))
269 return -EACCES;
270
271 err = mnt_want_write(filp->f_path.mnt);
272 if (err)
273 return err;
274 err = ext4_alloc_da_blocks(inode);
275 mnt_drop_write(filp->f_path.mnt);
276 return err;
277 }
278
266 default: 279 default:
267 return -ENOTTY; 280 return -ENOTTY;
268 } 281 }
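EXT4_IOC_ALLOC_DA_BLKS gives userspace a way to force allocation of a file's delayed-allocation blocks (via ext4_alloc_da_blocks() above) without paying for a full fsync. A hedged userspace example; the ioctl number is assumed here to be _IO('f', 12), matching the definition in the kernel's fs/ext4/ext4.h:

    /* Userspace illustration only: force block allocation for an
     * ext4 file's delalloc pages. EXT4_IOC_ALLOC_DA_BLKS is assumed
     * to be _IO('f', 12), as defined in the kernel's ext4.h. */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    #define EXT4_IOC_ALLOC_DA_BLKS  _IO('f', 12)

    int main(int argc, char **argv)
    {
            int fd;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <file>\n", argv[0]);
                    return 1;
            }
            fd = open(argv[1], O_RDONLY);
            if (fd < 0 || ioctl(fd, EXT4_IOC_ALLOC_DA_BLKS) < 0) {
                    perror("EXT4_IOC_ALLOC_DA_BLKS");
                    return 1;
            }
            close(fd);
            return 0;
    }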
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4415beeb0b62..f871677a7984 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -46,22 +46,23 @@
46 * The allocation request involves a request for multiple blocks 46 * The allocation request involves a request for multiple blocks
47 * near the goal (block) value specified. 47 * near the goal (block) value specified.
48 * 48 *
49 * During initialization phase of the allocator we decide to use the group 49 * During initialization phase of the allocator we decide to use the
50 * preallocation or inode preallocation depending on the size file. The 50 * group preallocation or inode preallocation depending on the size of
51 * size of the file could be the resulting file size we would have after 51 * the file. The size of the file could be the resulting file size we
52 * allocation or the current file size whichever is larger. If the size is 52 * would have after allocation, or the current file size, whichever
53 * less that sbi->s_mb_stream_request we select the group 53 * is larger. If the size is less than sbi->s_mb_stream_request we
54 * preallocation. The default value of s_mb_stream_request is 16 54 * select to use the group preallocation. The default value of
55 * blocks. This can also be tuned via 55 * s_mb_stream_request is 16 blocks. This can also be tuned via
56 * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms 56 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
57 * of number of blocks. 57 * terms of number of blocks.
58 * 58 *
59 * The main motivation for having small files use group preallocation is to 59 * The main motivation for having small files use group preallocation is to
60 * ensure that we have small file closer in the disk. 60 * ensure that we have small files closer together on the disk.
61 * 61 *
62 * First stage the allocator looks at the inode prealloc list 62 * First stage the allocator looks at the inode prealloc list,
63 * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for 63 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
64 * this particular inode. The inode prealloc space is represented as: 64 * spaces for this particular inode. The inode prealloc space is
65 * represented as:
65 * 66 *
66 * pa_lstart -> the logical start block for this prealloc space 67 * pa_lstart -> the logical start block for this prealloc space
67 * pa_pstart -> the physical start block for this prealloc space 68 * pa_pstart -> the physical start block for this prealloc space
@@ -121,29 +122,29 @@
121 * list. In case of inode preallocation we follow a list of heuristics 122 * list. In case of inode preallocation we follow a list of heuristics
122 * based on file size. This can be found in ext4_mb_normalize_request. If 123 * based on file size. This can be found in ext4_mb_normalize_request. If
123 * we are doing a group prealloc we try to normalize the request to 124 * we are doing a group prealloc we try to normalize the request to
124 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to 125 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
125 * 512 blocks. This can be tuned via 126 * 512 blocks. This can be tuned via
126 * /proc/fs/ext4/<partition>/group_prealloc. The value is represented in 127 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
127 * terms of number of blocks. If we have mounted the file system with -O 128 * terms of number of blocks. If we have mounted the file system with -O
128 * stripe=<value> option the group prealloc request is normalized to the 129 * stripe=<value> option the group prealloc request is normalized to the
129 * stripe value (sbi->s_stripe) 130 * stripe value (sbi->s_stripe)
130 * 131 *
131 * The regular allocator(using the buddy cache) support few tunables. 132 * The regular allocator (using the buddy cache) supports a few tunables.
132 * 133 *
133 * /proc/fs/ext4/<partition>/min_to_scan 134 * /sys/fs/ext4/<partition>/mb_min_to_scan
134 * /proc/fs/ext4/<partition>/max_to_scan 135 * /sys/fs/ext4/<partition>/mb_max_to_scan
135 * /proc/fs/ext4/<partition>/order2_req 136 * /sys/fs/ext4/<partition>/mb_order2_req
136 * 137 *
137 * The regular allocator use buddy scan only if the request len is power of 138 * The regular allocator uses buddy scan only if the request len is a power of
138 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 139 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
139 * value of s_mb_order2_reqs can be tuned via 140 * value of s_mb_order2_reqs can be tuned via
140 * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to 141 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
141 * stripe size (sbi->s_stripe), we try to search for contiguous blocks in 142 * stripe size (sbi->s_stripe), we try to search for contiguous blocks in
142 * stripe size. This should result in better allocation on RAID setup. If 143 * stripe size. This should result in better allocation on RAID setups. If
143 * not we search in the specific group using bitmap for best extents. The 144 * not, we search in the specific group using bitmap for best extents. The
144 * tunable min_to_scan and max_to_scan controll the behaviour here. 145 * tunable min_to_scan and max_to_scan control the behaviour here.
145 * min_to_scan indicates how long the mballoc __must__ look for a best 146 * min_to_scan indicates how long the mballoc __must__ look for a best
146 * extent and max_to_scanindicate how long the mballoc __can__ look for a 147 * extent and max_to_scan indicates how long the mballoc __can__ look for a
147 * best extent in the found extents. Searching for the blocks starts with 148 * best extent in the found extents. Searching for the blocks starts with
148 * the group specified as the goal value in allocation context via 149 * the group specified as the goal value in allocation context via
149 * ac_g_ex. Each group is first checked based on the criteria whether it 150 * ac_g_ex. Each group is first checked based on the criteria whether it
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
337 ext4_group_t group); 338 ext4_group_t group);
338static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 339static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
339 ext4_group_t group); 340 ext4_group_t group);
340static int ext4_mb_init_per_dev_proc(struct super_block *sb);
341static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
342static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 341static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
343 342
344 343
@@ -1447,7 +1446,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1447 struct ext4_free_extent *gex = &ac->ac_g_ex; 1446 struct ext4_free_extent *gex = &ac->ac_g_ex;
1448 1447
1449 BUG_ON(ex->fe_len <= 0); 1448 BUG_ON(ex->fe_len <= 0);
1450 BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1449 BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1451 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1450 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1452 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 1451 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1453 1452
@@ -1726,6 +1725,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1726{ 1725{
1727 unsigned free, fragments; 1726 unsigned free, fragments;
1728 unsigned i, bits; 1727 unsigned i, bits;
1728 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1729 struct ext4_group_desc *desc; 1729 struct ext4_group_desc *desc;
1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1731 1731
@@ -1747,6 +1747,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) 1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1748 return 0; 1748 return 0;
1749 1749
1750 /* Avoid using the first bg of a flexgroup for data files */
1751 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1752 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1753 ((group % flex_size) == 0))
1754 return 0;
1755
1750 bits = ac->ac_sb->s_blocksize_bits + 1; 1756 bits = ac->ac_sb->s_blocksize_bits + 1;
1751 for (i = ac->ac_2order; i <= bits; i++) 1757 for (i = ac->ac_2order; i <= bits; i++)
1752 if (grp->bb_counters[i] > 0) 1758 if (grp->bb_counters[i] > 0)
@@ -1971,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1971 /* 1977 /*
1972 * We search using buddy data only if the order of the request 1978 * We search using buddy data only if the order of the request
1973 * is greater than equal to the sbi_s_mb_order2_reqs 1979 * is greater than equal to the sbi_s_mb_order2_reqs
1974 * You can tune it via /proc/fs/ext4/<partition>/order2_req 1980 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
1975 */ 1981 */
1976 if (i >= sbi->s_mb_order2_reqs) { 1982 if (i >= sbi->s_mb_order2_reqs) {
1977 /* 1983 /*
@@ -2693,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2693 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2699 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
2694 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2700 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2695 if (sbi->s_mb_maxs == NULL) { 2701 if (sbi->s_mb_maxs == NULL) {
2696 kfree(sbi->s_mb_maxs); 2702 kfree(sbi->s_mb_offsets);
2697 return -ENOMEM; 2703 return -ENOMEM;
2698 } 2704 }
2699 2705
@@ -2746,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2746 spin_lock_init(&lg->lg_prealloc_lock); 2752 spin_lock_init(&lg->lg_prealloc_lock);
2747 } 2753 }
2748 2754
2749 ext4_mb_init_per_dev_proc(sb);
2750 ext4_mb_history_init(sb); 2755 ext4_mb_history_init(sb);
2751 2756
2752 if (sbi->s_journal) 2757 if (sbi->s_journal)
@@ -2829,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
2829 2834
2830 free_percpu(sbi->s_locality_groups); 2835 free_percpu(sbi->s_locality_groups);
2831 ext4_mb_history_release(sb); 2836 ext4_mb_history_release(sb);
2832 ext4_mb_destroy_per_dev_proc(sb);
2833 2837
2834 return 0; 2838 return 0;
2835} 2839}
@@ -2890,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2890 mb_debug("freed %u blocks in %u structures\n", count, count2); 2894 mb_debug("freed %u blocks in %u structures\n", count, count2);
2891} 2895}
2892 2896
2893#define EXT4_MB_STATS_NAME "stats"
2894#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
2895#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
2896#define EXT4_MB_ORDER2_REQ "order2_req"
2897#define EXT4_MB_STREAM_REQ "stream_req"
2898#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2899
2900static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2901{
2902#ifdef CONFIG_PROC_FS
2903 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2904 struct ext4_sb_info *sbi = EXT4_SB(sb);
2905 struct proc_dir_entry *proc;
2906
2907 if (sbi->s_proc == NULL)
2908 return -EINVAL;
2909
2910 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
2911 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
2912 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
2913 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
2914 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
2915 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
2916 return 0;
2917
2918err_out:
2919 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2920 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2921 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2922 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2923 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2924 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2925 return -ENOMEM;
2926#else
2927 return 0;
2928#endif
2929}
2930
2931static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2932{
2933#ifdef CONFIG_PROC_FS
2934 struct ext4_sb_info *sbi = EXT4_SB(sb);
2935
2936 if (sbi->s_proc == NULL)
2937 return -EINVAL;
2938
2939 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2940 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2941 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2942 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2943 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2944 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2945#endif
2946 return 0;
2947}
2948
2949int __init init_ext4_mballoc(void) 2897int __init init_ext4_mballoc(void)
2950{ 2898{
2951 ext4_pspace_cachep = 2899 ext4_pspace_cachep =
@@ -3086,16 +3034,18 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3086 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 3034 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
3087 /* release all the reserved blocks if non delalloc */ 3035 /* release all the reserved blocks if non delalloc */
3088 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); 3036 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
3089 else 3037 else {
3090 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 3038 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
3091 ac->ac_b_ex.fe_len); 3039 ac->ac_b_ex.fe_len);
3040 /* convert reserved quota blocks to real quota blocks */
3041 vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
3042 }
3092 3043
3093 if (sbi->s_log_groups_per_flex) { 3044 if (sbi->s_log_groups_per_flex) {
3094 ext4_group_t flex_group = ext4_flex_group(sbi, 3045 ext4_group_t flex_group = ext4_flex_group(sbi,
3095 ac->ac_b_ex.fe_group); 3046 ac->ac_b_ex.fe_group);
3096 spin_lock(sb_bgl_lock(sbi, flex_group)); 3047 atomic_sub(ac->ac_b_ex.fe_len,
3097 sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; 3048 &sbi->s_flex_groups[flex_group].free_blocks);
3098 spin_unlock(sb_bgl_lock(sbi, flex_group));
3099 } 3049 }
3100 3050
3101 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 3051 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3113,7 +3063,7 @@ out_err:
3113 * here we normalize request for locality group 3063 * here we normalize request for locality group
3114 * Group requests are normalized to s_stripe size if we set the same via mount 3064 * Group requests are normalized to s_stripe size if we set the same via mount
3115 * option. If not we set it to s_mb_group_prealloc which can be configured via 3065 * option. If not we set it to s_mb_group_prealloc which can be configured via
3116 * /proc/fs/ext4/<partition>/group_prealloc 3066 * /sys/fs/ext4/<partition>/mb_group_prealloc
3117 * 3067 *
3118 * XXX: should we try to preallocate more than the group has now? 3068 * XXX: should we try to preallocate more than the group has now?
3119 */ 3069 */
@@ -3292,7 +3242,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3292 } 3242 }
3293 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3243 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3294 start > ac->ac_o_ex.fe_logical); 3244 start > ac->ac_o_ex.fe_logical);
3295 BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 3245 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3296 3246
3297 /* now prepare goal request */ 3247 /* now prepare goal request */
3298 3248
@@ -3589,6 +3539,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3589 struct super_block *sb, struct ext4_prealloc_space *pa) 3539 struct super_block *sb, struct ext4_prealloc_space *pa)
3590{ 3540{
3591 ext4_group_t grp; 3541 ext4_group_t grp;
3542 ext4_fsblk_t grp_blk;
3592 3543
3593 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3544 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3594 return; 3545 return;
@@ -3603,8 +3554,15 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3603 pa->pa_deleted = 1; 3554 pa->pa_deleted = 1;
3604 spin_unlock(&pa->pa_lock); 3555 spin_unlock(&pa->pa_lock);
3605 3556
3606 /* -1 is to protect from crossing allocation group */ 3557 grp_blk = pa->pa_pstart;
3607 ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); 3558 /*
3559 * If doing group-based preallocation, pa_pstart may be in the
3560 * next group when pa is used up
3561 */
3562 if (pa->pa_type == MB_GROUP_PA)
3563 grp_blk--;
3564
3565 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
3608 3566
3609 /* 3567 /*
3610 * possible race: 3568 * possible race:
@@ -3696,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3696 INIT_LIST_HEAD(&pa->pa_inode_list); 3654 INIT_LIST_HEAD(&pa->pa_inode_list);
3697 INIT_LIST_HEAD(&pa->pa_group_list); 3655 INIT_LIST_HEAD(&pa->pa_group_list);
3698 pa->pa_deleted = 0; 3656 pa->pa_deleted = 0;
3699 pa->pa_linear = 0; 3657 pa->pa_type = MB_INODE_PA;
3700 3658
3701 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3659 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3702 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3660 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3759,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3759 INIT_LIST_HEAD(&pa->pa_inode_list); 3717 INIT_LIST_HEAD(&pa->pa_inode_list);
3760 INIT_LIST_HEAD(&pa->pa_group_list); 3718 INIT_LIST_HEAD(&pa->pa_group_list);
3761 pa->pa_deleted = 0; 3719 pa->pa_deleted = 0;
3762 pa->pa_linear = 1; 3720 pa->pa_type = MB_GROUP_PA;
3763 3721
3764 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3722 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3765 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3723 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -4013,7 +3971,7 @@ repeat:
4013 list_del_rcu(&pa->pa_inode_list); 3971 list_del_rcu(&pa->pa_inode_list);
4014 spin_unlock(pa->pa_obj_lock); 3972 spin_unlock(pa->pa_obj_lock);
4015 3973
4016 if (pa->pa_linear) 3974 if (pa->pa_type == MB_GROUP_PA)
4017 ext4_mb_release_group_pa(&e4b, pa, ac); 3975 ext4_mb_release_group_pa(&e4b, pa, ac);
4018 else 3976 else
4019 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3977 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
@@ -4113,7 +4071,7 @@ repeat:
4113 spin_unlock(&ei->i_prealloc_lock); 4071 spin_unlock(&ei->i_prealloc_lock);
4114 4072
4115 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 4073 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4116 BUG_ON(pa->pa_linear != 0); 4074 BUG_ON(pa->pa_type != MB_INODE_PA);
4117 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4075 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4118 4076
4119 err = ext4_mb_load_buddy(sb, group, &e4b); 4077 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -4224,7 +4182,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4224 * file is determined by the current size or the resulting size after 4182 * file is determined by the current size or the resulting size after
4226 * allocation, whichever is larger 4184 * allocation, whichever is larger
4226 * 4184 *
4227 * One can tune this size via /proc/fs/ext4/<partition>/stream_req 4185 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4228 */ 4186 */
4229static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 4187static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4230{ 4188{
@@ -4365,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4365 continue; 4323 continue;
4366 } 4324 }
4367 /* only lg prealloc space */ 4325 /* only lg prealloc space */
4368 BUG_ON(!pa->pa_linear); 4326 BUG_ON(pa->pa_type != MB_GROUP_PA);
4369 4327
4370 /* seems this one can be freed ... */ 4328 /* seems this one can be freed ... */
4371 pa->pa_deleted = 1; 4329 pa->pa_deleted = 1;
@@ -4434,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4434 pa_inode_list) { 4392 pa_inode_list) {
4435 spin_lock(&tmp_pa->pa_lock); 4393 spin_lock(&tmp_pa->pa_lock);
4436 if (tmp_pa->pa_deleted) { 4394 if (tmp_pa->pa_deleted) {
4437 spin_unlock(&pa->pa_lock); 4395 spin_unlock(&tmp_pa->pa_lock);
4438 continue; 4396 continue;
4439 } 4397 }
4440 if (!added && pa->pa_free < tmp_pa->pa_free) { 4398 if (!added && pa->pa_free < tmp_pa->pa_free) {
@@ -4471,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4471{ 4429{
4472 struct ext4_prealloc_space *pa = ac->ac_pa; 4430 struct ext4_prealloc_space *pa = ac->ac_pa;
4473 if (pa) { 4431 if (pa) {
4474 if (pa->pa_linear) { 4432 if (pa->pa_type == MB_GROUP_PA) {
4475 /* see comment in ext4_mb_use_group_pa() */ 4433 /* see comment in ext4_mb_use_group_pa() */
4476 spin_lock(&pa->pa_lock); 4434 spin_lock(&pa->pa_lock);
4477 pa->pa_pstart += ac->ac_b_ex.fe_len; 4435 pa->pa_pstart += ac->ac_b_ex.fe_len;
@@ -4491,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4491 * doesn't grow big. We need to release 4449 * doesn't grow big. We need to release
4492 * alloc_semp before calling ext4_mb_add_n_trim() 4450 * alloc_semp before calling ext4_mb_add_n_trim()
4493 */ 4451 */
4494 if (pa->pa_linear && likely(pa->pa_free)) { 4452 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4495 spin_lock(pa->pa_obj_lock); 4453 spin_lock(pa->pa_obj_lock);
4496 list_del_rcu(&pa->pa_inode_list); 4454 list_del_rcu(&pa->pa_inode_list);
4497 spin_unlock(pa->pa_obj_lock); 4455 spin_unlock(pa->pa_obj_lock);
@@ -4539,7 +4497,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4539 struct ext4_sb_info *sbi; 4497 struct ext4_sb_info *sbi;
4540 struct super_block *sb; 4498 struct super_block *sb;
4541 ext4_fsblk_t block = 0; 4499 ext4_fsblk_t block = 0;
4542 unsigned int inquota; 4500 unsigned int inquota = 0;
4543 unsigned int reserv_blks = 0; 4501 unsigned int reserv_blks = 0;
4544 4502
4545 sb = ar->inode->i_sb; 4503 sb = ar->inode->i_sb;
@@ -4557,9 +4515,17 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4557 (unsigned long long) ar->pleft, 4515 (unsigned long long) ar->pleft,
4558 (unsigned long long) ar->pright); 4516 (unsigned long long) ar->pright);
4559 4517
4560 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4518 /*
4561 /* 4519 * For delayed allocation, we could skip the ENOSPC and
4562 * With delalloc we already reserved the blocks 4520 * EDQUOT check, as blocks and quotas have been already
4521 * reserved when data being copied into pagecache.
4522 */
4523 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4524 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4525 else {
4526 /* Without delayed allocation we need to verify
4527 * there are enough free blocks to do block allocation
4528 * and verify allocation doesn't exceed the quota limits.
4563 */ 4529 */
4564 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4530 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
4565 /* let others free the space */ 4531 /* let others free the space */
@@ -4571,19 +4537,16 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4571 return 0; 4537 return 0;
4572 } 4538 }
4573 reserv_blks = ar->len; 4539 reserv_blks = ar->len;
4540 while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) {
4541 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4542 ar->len--;
4543 }
4544 inquota = ar->len;
4545 if (ar->len == 0) {
4546 *errp = -EDQUOT;
4547 goto out3;
4548 }
4574 } 4549 }
4575 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4576 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4577 ar->len--;
4578 }
4579 if (ar->len == 0) {
4580 *errp = -EDQUOT;
4581 goto out3;
4582 }
4583 inquota = ar->len;
4584
4585 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4586 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4587 4550
4588 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4551 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4589 if (!ac) { 4552 if (!ac) {
@@ -4649,8 +4612,8 @@ repeat:
4649out2: 4612out2:
4650 kmem_cache_free(ext4_ac_cachep, ac); 4613 kmem_cache_free(ext4_ac_cachep, ac);
4651out1: 4614out1:
4652 if (ar->len < inquota) 4615 if (inquota && ar->len < inquota)
4653 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); 4616 vfs_dq_free_block(ar->inode, inquota - ar->len);
4654out3: 4617out3:
4655 if (!ar->len) { 4618 if (!ar->len) {
4656 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4619 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
@@ -4923,9 +4886,7 @@ do_more:
4923 4886
4924 if (sbi->s_log_groups_per_flex) { 4887 if (sbi->s_log_groups_per_flex) {
4925 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4888 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4926 spin_lock(sb_bgl_lock(sbi, flex_group)); 4889 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4927 sbi->s_flex_groups[flex_group].free_blocks += count;
4928 spin_unlock(sb_bgl_lock(sbi, flex_group));
4929 } 4890 }
4930 4891
4931 ext4_mb_release_desc(&e4b); 4892 ext4_mb_release_desc(&e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 10a2921baf14..dd9e6cd5f6cf 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -132,12 +132,15 @@ struct ext4_prealloc_space {
132 ext4_lblk_t pa_lstart; /* log. block */ 132 ext4_lblk_t pa_lstart; /* log. block */
133 unsigned short pa_len; /* len of preallocated chunk */ 133 unsigned short pa_len; /* len of preallocated chunk */
134 unsigned short pa_free; /* how many blocks are free */ 134 unsigned short pa_free; /* how many blocks are free */
135 unsigned short pa_linear; /* consumed in one direction 135 unsigned short pa_type; /* pa type. inode or group */
136 * strictly, for grp prealloc */
137 spinlock_t *pa_obj_lock; 136 spinlock_t *pa_obj_lock;
138 struct inode *pa_inode; /* hack, for history only */ 137 struct inode *pa_inode; /* hack, for history only */
139}; 138};
140 139
140enum {
141 MB_INODE_PA = 0,
142 MB_GROUP_PA = 1
143};
141 144
142struct ext4_free_extent { 145struct ext4_free_extent {
143 ext4_lblk_t fe_logical; 146 ext4_lblk_t fe_logical;
@@ -247,7 +250,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
247 250
248#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 251#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
249 252
250struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
251static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 253static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
252 struct ext4_free_extent *fex) 254 struct ext4_free_extent *fex)
253{ 255{
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ba702bd7910d..22098e1cd085 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
161 struct dx_frame *frame, 161 struct dx_frame *frame,
162 int *err); 162 int *err);
163static void dx_release(struct dx_frame *frames); 163static void dx_release(struct dx_frame *frames);
164static int dx_make_map(struct ext4_dir_entry_2 *de, int size, 164static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
165 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 165 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
166static void dx_sort_map(struct dx_map_entry *map, unsigned count); 166static void dx_sort_map(struct dx_map_entry *map, unsigned count);
167static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, 167static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
168 struct dx_map_entry *offsets, int count); 168 struct dx_map_entry *offsets, int count, unsigned blocksize);
169static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); 169static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
170static void dx_insert_block(struct dx_frame *frame, 170static void dx_insert_block(struct dx_frame *frame,
171 u32 hash, ext4_lblk_t block); 171 u32 hash, ext4_lblk_t block);
172static int ext4_htree_next_block(struct inode *dir, __u32 hash, 172static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
180static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 180static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
181 struct inode *inode); 181 struct inode *inode);
182 182
183unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
184{
185 unsigned len = le16_to_cpu(dlen);
186
187 if (len == EXT4_MAX_REC_LEN || len == 0)
188 return blocksize;
189 return (len & 65532) | ((len & 3) << 16);
190}
191
192__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
193{
194 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
195 BUG();
196 if (len < 65536)
197 return cpu_to_le16(len);
198 if (len == blocksize) {
199 if (blocksize == 65536)
200 return cpu_to_le16(EXT4_MAX_REC_LEN);
201 else
202 return cpu_to_le16(0);
203 }
204 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
205}
206
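The two helpers above let a 16-bit on-disk rec_len describe directory entries in blocks larger than 64KB: since rec_len is always a multiple of 4, its two low bits are free to carry bits 16-17 of the real length, and the special on-disk values 0 and EXT4_MAX_REC_LEN both mean "the rest of the block". A worked round trip, assuming a 128KB blocksize:

    /* Illustration only: round-tripping len = 100000 through the
     * encoding above on a filesystem with 128KB blocks.
     *
     * encode:  100000 & 65532        = 34464
     *          (100000 >> 16) & 3    = 1
     *          on-disk rec_len       = 34464 | 1 = 34465
     *
     * decode:  34465 & 65532         = 34464
     *          (34465 & 3) << 16     = 65536
     *          recovered length      = 34464 | 65536 = 100000
     */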
183/* 207/*
184 * p is at least 6 bytes before the end of page 208 * p is at least 6 bytes before the end of page
185 */ 209 */
186static inline struct ext4_dir_entry_2 * 210static inline struct ext4_dir_entry_2 *
187ext4_next_entry(struct ext4_dir_entry_2 *p) 211ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
188{ 212{
189 return (struct ext4_dir_entry_2 *)((char *)p + 213 return (struct ext4_dir_entry_2 *)((char *)p +
190 ext4_rec_len_from_disk(p->rec_len)); 214 ext4_rec_len_from_disk(p->rec_len, blocksize));
191} 215}
192 216
193/* 217/*
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
294 space += EXT4_DIR_REC_LEN(de->name_len); 318 space += EXT4_DIR_REC_LEN(de->name_len);
295 names++; 319 names++;
296 } 320 }
297 de = ext4_next_entry(de); 321 de = ext4_next_entry(de, size);
298 } 322 }
299 printk("(%i)\n", names); 323 printk("(%i)\n", names);
300 return (struct stats) { names, space, 1 }; 324 return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
585 top = (struct ext4_dir_entry_2 *) ((char *) de + 609 top = (struct ext4_dir_entry_2 *) ((char *) de +
586 dir->i_sb->s_blocksize - 610 dir->i_sb->s_blocksize -
587 EXT4_DIR_REC_LEN(0)); 611 EXT4_DIR_REC_LEN(0));
588 for (; de < top; de = ext4_next_entry(de)) { 612 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
589 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, 613 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
590 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 614 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
591 +((char *)de - bh->b_data))) { 615 +((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
663 } 687 }
664 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { 688 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
665 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; 689 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
666 de = ext4_next_entry(de); 690 de = ext4_next_entry(de, dir->i_sb->s_blocksize);
667 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) 691 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
668 goto errout; 692 goto errout;
669 count++; 693 count++;
@@ -713,15 +737,15 @@ errout:
713 * Create map of hash values, offsets, and sizes, stored at end of block. 737 * Create map of hash values, offsets, and sizes, stored at end of block.
714 * Returns number of entries mapped. 738 * Returns number of entries mapped.
715 */ 739 */
716static int dx_make_map (struct ext4_dir_entry_2 *de, int size, 740static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
717 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) 741 struct dx_hash_info *hinfo,
742 struct dx_map_entry *map_tail)
718{ 743{
719 int count = 0; 744 int count = 0;
720 char *base = (char *) de; 745 char *base = (char *) de;
721 struct dx_hash_info h = *hinfo; 746 struct dx_hash_info h = *hinfo;
722 747
723 while ((char *) de < base + size) 748 while ((char *) de < base + blocksize) {
724 {
725 if (de->name_len && de->inode) { 749 if (de->name_len && de->inode) {
726 ext4fs_dirhash(de->name, de->name_len, &h); 750 ext4fs_dirhash(de->name, de->name_len, &h);
727 map_tail--; 751 map_tail--;
@@ -732,7 +756,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
732 cond_resched(); 756 cond_resched();
733 } 757 }
734 /* XXX: do we need to check rec_len == 0 case? -Chris */ 758 /* XXX: do we need to check rec_len == 0 case? -Chris */
735 de = ext4_next_entry(de); 759 de = ext4_next_entry(de, blocksize);
736 } 760 }
737 return count; 761 return count;
738} 762}
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
832 return 1; 856 return 1;
833 } 857 }
834 /* prevent looping on a bad block */ 858 /* prevent looping on a bad block */
835 de_len = ext4_rec_len_from_disk(de->rec_len); 859 de_len = ext4_rec_len_from_disk(de->rec_len,
860 dir->i_sb->s_blocksize);
836 if (de_len <= 0) 861 if (de_len <= 0)
837 return -1; 862 return -1;
838 offset += de_len; 863 offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
996 de = (struct ext4_dir_entry_2 *) bh->b_data; 1021 de = (struct ext4_dir_entry_2 *) bh->b_data;
997 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - 1022 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
998 EXT4_DIR_REC_LEN(0)); 1023 EXT4_DIR_REC_LEN(0));
999 for (; de < top; de = ext4_next_entry(de)) { 1024 for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
1000 int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) 1025 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
1001 + ((char *) de - bh->b_data); 1026 + ((char *) de - bh->b_data);
1002 1027
@@ -1052,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1052 return ERR_PTR(-EIO); 1077 return ERR_PTR(-EIO);
1053 } 1078 }
1054 inode = ext4_iget(dir->i_sb, ino); 1079 inode = ext4_iget(dir->i_sb, ino);
1055 if (IS_ERR(inode)) 1080 if (unlikely(IS_ERR(inode))) {
1056 return ERR_CAST(inode); 1081 if (PTR_ERR(inode) == -ESTALE) {
1082 ext4_error(dir->i_sb, __func__,
1083 "deleted inode referenced: %u",
1084 ino);
1085 return ERR_PTR(-EIO);
1086 } else {
1087 return ERR_CAST(inode);
1088 }
1089 }
1057 } 1090 }
1058 return d_splice_alias(inode, dentry); 1091 return d_splice_alias(inode, dentry);
1059} 1092}
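
The ext4_lookup() hunk above remaps -ESTALE from ext4_iget() to -EIO, so a directory entry that points at a deleted inode is reported as filesystem corruption rather than as a stale NFS handle. It relies on the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() convention of encoding small negative errnos inside pointer values. A userspace sketch of that convention (the MAX_ERRNO bound of 4095 matches the kernel's; the rest is illustrative):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* Errnos live in the top page of the address space, where no valid
 * pointer can point. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Stand-in for ext4_iget(): fail with -ESTALE for a "deleted" inode. */
static void *fake_iget(unsigned ino)
{
        static int in_core_inode = 42;

        if (ino == 0)
                return ERR_PTR(-ESTALE);
        return &in_core_inode;
}

int main(void)
{
        void *inode = fake_iget(0);

        if (IS_ERR(inode)) {
                /* Mirror the hunk: translate -ESTALE into -EIO. */
                long err = PTR_ERR(inode) == -ESTALE ? -EIO : PTR_ERR(inode);
                printf("lookup failed: %ld\n", err);
        }
        return 0;
}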
@@ -1109,7 +1142,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
1109 * Returns pointer to last entry moved. 1142 * Returns pointer to last entry moved.
1110 */ 1143 */
1111static struct ext4_dir_entry_2 * 1144static struct ext4_dir_entry_2 *
1112dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) 1145dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1146 unsigned blocksize)
1113{ 1147{
1114 unsigned rec_len = 0; 1148 unsigned rec_len = 0;
1115 1149
@@ -1118,7 +1152,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1118 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1152 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1119 memcpy (to, de, rec_len); 1153 memcpy (to, de, rec_len);
1120 ((struct ext4_dir_entry_2 *) to)->rec_len = 1154 ((struct ext4_dir_entry_2 *) to)->rec_len =
1121 ext4_rec_len_to_disk(rec_len); 1155 ext4_rec_len_to_disk(rec_len, blocksize);
1122 de->inode = 0; 1156 de->inode = 0;
1123 map++; 1157 map++;
1124 to += rec_len; 1158 to += rec_len;
@@ -1130,19 +1164,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1130 * Compact each dir entry in the range to the minimal rec_len. 1164 * Compact each dir entry in the range to the minimal rec_len.
1131 * Returns pointer to last entry in range. 1165 * Returns pointer to last entry in range.
1132 */ 1166 */
1133static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) 1167static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
1134{ 1168{
1135 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; 1169 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
1136 unsigned rec_len = 0; 1170 unsigned rec_len = 0;
1137 1171
1138 prev = to = de; 1172 prev = to = de;
1139 while ((char*)de < base + size) { 1173 while ((char*)de < base + blocksize) {
1140 next = ext4_next_entry(de); 1174 next = ext4_next_entry(de, blocksize);
1141 if (de->inode && de->name_len) { 1175 if (de->inode && de->name_len) {
1142 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1176 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1143 if (de > to) 1177 if (de > to)
1144 memmove(to, de, rec_len); 1178 memmove(to, de, rec_len);
1145 to->rec_len = ext4_rec_len_to_disk(rec_len); 1179 to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
1146 prev = to; 1180 prev = to;
1147 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); 1181 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1148 } 1182 }
@@ -1215,10 +1249,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1215 hash2, split, count-split)); 1249 hash2, split, count-split));
1216 1250
1217 /* Fancy dance to stay within two buffers */ 1251 /* Fancy dance to stay within two buffers */
1218 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1252 de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
1219 de = dx_pack_dirents(data1, blocksize); 1253 de = dx_pack_dirents(data1, blocksize);
1220 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1254 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
1221 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); 1255 blocksize);
1256 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
1257 blocksize);
1222 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1258 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1223 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1259 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1224 1260
@@ -1268,6 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1268 const char *name = dentry->d_name.name; 1304 const char *name = dentry->d_name.name;
1269 int namelen = dentry->d_name.len; 1305 int namelen = dentry->d_name.len;
1270 unsigned int offset = 0; 1306 unsigned int offset = 0;
1307 unsigned int blocksize = dir->i_sb->s_blocksize;
1271 unsigned short reclen; 1308 unsigned short reclen;
1272 int nlen, rlen, err; 1309 int nlen, rlen, err;
1273 char *top; 1310 char *top;
@@ -1275,7 +1312,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1275 reclen = EXT4_DIR_REC_LEN(namelen); 1312 reclen = EXT4_DIR_REC_LEN(namelen);
1276 if (!de) { 1313 if (!de) {
1277 de = (struct ext4_dir_entry_2 *)bh->b_data; 1314 de = (struct ext4_dir_entry_2 *)bh->b_data;
1278 top = bh->b_data + dir->i_sb->s_blocksize - reclen; 1315 top = bh->b_data + blocksize - reclen;
1279 while ((char *) de <= top) { 1316 while ((char *) de <= top) {
1280 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1317 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1281 bh, offset)) { 1318 bh, offset)) {
@@ -1287,7 +1324,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1287 return -EEXIST; 1324 return -EEXIST;
1288 } 1325 }
1289 nlen = EXT4_DIR_REC_LEN(de->name_len); 1326 nlen = EXT4_DIR_REC_LEN(de->name_len);
1290 rlen = ext4_rec_len_from_disk(de->rec_len); 1327 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1291 if ((de->inode? rlen - nlen: rlen) >= reclen) 1328 if ((de->inode? rlen - nlen: rlen) >= reclen)
1292 break; 1329 break;
1293 de = (struct ext4_dir_entry_2 *)((char *)de + rlen); 1330 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1343,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1306 1343
1307 /* By now the buffer is marked for journaling */ 1344 /* By now the buffer is marked for journaling */
1308 nlen = EXT4_DIR_REC_LEN(de->name_len); 1345 nlen = EXT4_DIR_REC_LEN(de->name_len);
1309 rlen = ext4_rec_len_from_disk(de->rec_len); 1346 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1310 if (de->inode) { 1347 if (de->inode) {
1311 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); 1348 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1312 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); 1349 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
1313 de->rec_len = ext4_rec_len_to_disk(nlen); 1350 de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
1314 de = de1; 1351 de = de1;
1315 } 1352 }
1316 de->file_type = EXT4_FT_UNKNOWN; 1353 de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1417,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1380 /* The 0th block becomes the root, move the dirents out */ 1417 /* The 0th block becomes the root, move the dirents out */
1381 fde = &root->dotdot; 1418 fde = &root->dotdot;
1382 de = (struct ext4_dir_entry_2 *)((char *)fde + 1419 de = (struct ext4_dir_entry_2 *)((char *)fde +
1383 ext4_rec_len_from_disk(fde->rec_len)); 1420 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1384 if ((char *) de >= (((char *) root) + blocksize)) { 1421 if ((char *) de >= (((char *) root) + blocksize)) {
1385 ext4_error(dir->i_sb, __func__, 1422 ext4_error(dir->i_sb, __func__,
1386 "invalid rec_len for '..' in inode %lu", 1423 "invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1439,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1402 memcpy (data1, de, len); 1439 memcpy (data1, de, len);
1403 de = (struct ext4_dir_entry_2 *) data1; 1440 de = (struct ext4_dir_entry_2 *) data1;
1404 top = data1 + len; 1441 top = data1 + len;
1405 while ((char *)(de2 = ext4_next_entry(de)) < top) 1442 while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
1406 de = de2; 1443 de = de2;
1407 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1444 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
1445 blocksize);
1408 /* Initialize the root; the dot dirents already exist */ 1446 /* Initialize the root; the dot dirents already exist */
1409 de = (struct ext4_dir_entry_2 *) (&root->dotdot); 1447 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1410 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); 1448 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
1449 blocksize);
1411 memset (&root->info, 0, sizeof(root->info)); 1450 memset (&root->info, 0, sizeof(root->info));
1412 root->info.info_length = sizeof(root->info); 1451 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1452 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1527,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1488 return retval; 1527 return retval;
1489 de = (struct ext4_dir_entry_2 *) bh->b_data; 1528 de = (struct ext4_dir_entry_2 *) bh->b_data;
1490 de->inode = 0; 1529 de->inode = 0;
1491 de->rec_len = ext4_rec_len_to_disk(blocksize); 1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1492 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1531 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1493} 1532}
1494 1533
@@ -1551,7 +1590,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1551 goto cleanup; 1590 goto cleanup;
1552 node2 = (struct dx_node *)(bh2->b_data); 1591 node2 = (struct dx_node *)(bh2->b_data);
1553 entries2 = node2->entries; 1592 entries2 = node2->entries;
1554 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); 1593 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
1594 sb->s_blocksize);
1555 node2->fake.inode = 0; 1595 node2->fake.inode = 0;
1556 BUFFER_TRACE(frame->bh, "get_write_access"); 1596 BUFFER_TRACE(frame->bh, "get_write_access");
1557 err = ext4_journal_get_write_access(handle, frame->bh); 1597 err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1679,7 @@ static int ext4_delete_entry(handle_t *handle,
1639 struct buffer_head *bh) 1679 struct buffer_head *bh)
1640{ 1680{
1641 struct ext4_dir_entry_2 *de, *pde; 1681 struct ext4_dir_entry_2 *de, *pde;
1682 unsigned int blocksize = dir->i_sb->s_blocksize;
1642 int i; 1683 int i;
1643 1684
1644 i = 0; 1685 i = 0;
@@ -1652,8 +1693,11 @@ static int ext4_delete_entry(handle_t *handle,
1652 ext4_journal_get_write_access(handle, bh); 1693 ext4_journal_get_write_access(handle, bh);
1653 if (pde) 1694 if (pde)
1654 pde->rec_len = ext4_rec_len_to_disk( 1695 pde->rec_len = ext4_rec_len_to_disk(
1655 ext4_rec_len_from_disk(pde->rec_len) + 1696 ext4_rec_len_from_disk(pde->rec_len,
1656 ext4_rec_len_from_disk(de->rec_len)); 1697 blocksize) +
1698 ext4_rec_len_from_disk(de->rec_len,
1699 blocksize),
1700 blocksize);
1657 else 1701 else
1658 de->inode = 0; 1702 de->inode = 0;
1659 dir->i_version++; 1703 dir->i_version++;
@@ -1661,9 +1705,9 @@ static int ext4_delete_entry(handle_t *handle,
1661 ext4_handle_dirty_metadata(handle, dir, bh); 1705 ext4_handle_dirty_metadata(handle, dir, bh);
1662 return 0; 1706 return 0;
1663 } 1707 }
1664 i += ext4_rec_len_from_disk(de->rec_len); 1708 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
1665 pde = de; 1709 pde = de;
1666 de = ext4_next_entry(de); 1710 de = ext4_next_entry(de, blocksize);
1667 } 1711 }
1668 return -ENOENT; 1712 return -ENOENT;
1669} 1713}
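
ext4_delete_entry() removes a name without moving any bytes: it walks the block by rec_len and either folds the victim's length into the preceding entry's rec_len or, for the first entry, clears the inode field. A toy C sketch of the same scheme; the record layout here is illustrative, not the on-disk ext4 format:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dent {                    /* toy variable-length record */
        uint32_t inode;          /* 0 means "unused" */
        uint16_t rec_len;        /* bytes to the next record */
        char     name[10];
};

/* Delete the entry named `name` by merging it into its predecessor. */
static int delete_entry(char *base, unsigned blocksize, const char *name)
{
        struct dent *de = (struct dent *)base, *pde = NULL;
        unsigned off = 0;

        while (off < blocksize) {
                if (de->inode && !strcmp(de->name, name)) {
                        if (pde)                 /* swallow de's space */
                                pde->rec_len += de->rec_len;
                        else                     /* first entry: just free it */
                                de->inode = 0;
                        return 0;
                }
                off += de->rec_len;
                pde = de;
                de = (struct dent *)(base + off);
        }
        return -1;                               /* -ENOENT in the kernel */
}

int main(void)
{
        char block[64] = {0};
        struct dent *a = (struct dent *)block;
        struct dent *b = (struct dent *)(block + 16);

        *a = (struct dent){ .inode = 11, .rec_len = 16, .name = "a" };
        *b = (struct dent){ .inode = 12, .rec_len = 48, .name = "b" };

        delete_entry(block, sizeof(block), "b");
        printf("a->rec_len = %u (now covers the whole block)\n", a->rec_len);
        return 0;
}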
@@ -1793,6 +1837,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1793 struct inode *inode; 1837 struct inode *inode;
1794 struct buffer_head *dir_block; 1838 struct buffer_head *dir_block;
1795 struct ext4_dir_entry_2 *de; 1839 struct ext4_dir_entry_2 *de;
1840 unsigned int blocksize = dir->i_sb->s_blocksize;
1796 int err, retries = 0; 1841 int err, retries = 0;
1797 1842
1798 if (EXT4_DIR_LINK_MAX(dir)) 1843 if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1869,14 @@ retry:
1824 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1869 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1825 de->inode = cpu_to_le32(inode->i_ino); 1870 de->inode = cpu_to_le32(inode->i_ino);
1826 de->name_len = 1; 1871 de->name_len = 1;
1827 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); 1872 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
1873 blocksize);
1828 strcpy(de->name, "."); 1874 strcpy(de->name, ".");
1829 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1875 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1830 de = ext4_next_entry(de); 1876 de = ext4_next_entry(de, blocksize);
1831 de->inode = cpu_to_le32(dir->i_ino); 1877 de->inode = cpu_to_le32(dir->i_ino);
1832 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - 1878 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
1833 EXT4_DIR_REC_LEN(1)); 1879 blocksize);
1834 de->name_len = 2; 1880 de->name_len = 2;
1835 strcpy(de->name, ".."); 1881 strcpy(de->name, "..");
1836 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1882 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1885,7 +1931,7 @@ static int empty_dir(struct inode *inode)
1885 return 1; 1931 return 1;
1886 } 1932 }
1887 de = (struct ext4_dir_entry_2 *) bh->b_data; 1933 de = (struct ext4_dir_entry_2 *) bh->b_data;
1888 de1 = ext4_next_entry(de); 1934 de1 = ext4_next_entry(de, sb->s_blocksize);
1889 if (le32_to_cpu(de->inode) != inode->i_ino || 1935 if (le32_to_cpu(de->inode) != inode->i_ino ||
1890 !le32_to_cpu(de1->inode) || 1936 !le32_to_cpu(de1->inode) ||
1891 strcmp(".", de->name) || 1937 strcmp(".", de->name) ||
@@ -1896,9 +1942,9 @@ static int empty_dir(struct inode *inode)
1896 brelse(bh); 1942 brelse(bh);
1897 return 1; 1943 return 1;
1898 } 1944 }
1899 offset = ext4_rec_len_from_disk(de->rec_len) + 1945 offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
1900 ext4_rec_len_from_disk(de1->rec_len); 1946 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
1901 de = ext4_next_entry(de1); 1947 de = ext4_next_entry(de1, sb->s_blocksize);
1902 while (offset < inode->i_size) { 1948 while (offset < inode->i_size) {
1903 if (!bh || 1949 if (!bh ||
1904 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1950 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1973,8 @@ static int empty_dir(struct inode *inode)
1927 brelse(bh); 1973 brelse(bh);
1928 return 0; 1974 return 0;
1929 } 1975 }
1930 offset += ext4_rec_len_from_disk(de->rec_len); 1976 offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
1931 de = ext4_next_entry(de); 1977 de = ext4_next_entry(de, sb->s_blocksize);
1932 } 1978 }
1933 brelse(bh); 1979 brelse(bh);
1934 return 1; 1980 return 1;
@@ -2092,7 +2138,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2092 2138
2093 /* Initialize quotas before so that eventual writes go in 2139 /* Initialize quotas before so that eventual writes go in
2094 * separate transaction */ 2140 * separate transaction */
2095 DQUOT_INIT(dentry->d_inode); 2141 vfs_dq_init(dentry->d_inode);
2096 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); 2142 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2097 if (IS_ERR(handle)) 2143 if (IS_ERR(handle))
2098 return PTR_ERR(handle); 2144 return PTR_ERR(handle);
@@ -2151,7 +2197,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2151 2197
2152 /* Initialize quotas before so that eventual writes go 2198 /* Initialize quotas before so that eventual writes go
2153 * in separate transaction */ 2199 * in separate transaction */
2154 DQUOT_INIT(dentry->d_inode); 2200 vfs_dq_init(dentry->d_inode);
2155 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); 2201 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2156 if (IS_ERR(handle)) 2202 if (IS_ERR(handle))
2157 return PTR_ERR(handle); 2203 return PTR_ERR(handle);
@@ -2297,8 +2343,8 @@ retry:
2297 return err; 2343 return err;
2298} 2344}
2299 2345
2300#define PARENT_INO(buffer) \ 2346#define PARENT_INO(buffer, size) \
2301 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) 2347 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
2302 2348
2303/* 2349/*
2304 * Anybody can rename anything with this: the permission checks are left to the 2350 * Anybody can rename anything with this: the permission checks are left to the
@@ -2311,14 +2357,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2311 struct inode *old_inode, *new_inode; 2357 struct inode *old_inode, *new_inode;
2312 struct buffer_head *old_bh, *new_bh, *dir_bh; 2358 struct buffer_head *old_bh, *new_bh, *dir_bh;
2313 struct ext4_dir_entry_2 *old_de, *new_de; 2359 struct ext4_dir_entry_2 *old_de, *new_de;
2314 int retval; 2360 int retval, force_da_alloc = 0;
2315 2361
2316 old_bh = new_bh = dir_bh = NULL; 2362 old_bh = new_bh = dir_bh = NULL;
2317 2363
2318 /* Initialize quotas before so that eventual writes go 2364 /* Initialize quotas before so that eventual writes go
2319 * in separate transaction */ 2365 * in separate transaction */
2320 if (new_dentry->d_inode) 2366 if (new_dentry->d_inode)
2321 DQUOT_INIT(new_dentry->d_inode); 2367 vfs_dq_init(new_dentry->d_inode);
2322 handle = ext4_journal_start(old_dir, 2 * 2368 handle = ext4_journal_start(old_dir, 2 *
2323 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + 2369 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2324 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); 2370 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@ -2358,7 +2404,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2358 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); 2404 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
2359 if (!dir_bh) 2405 if (!dir_bh)
2360 goto end_rename; 2406 goto end_rename;
2361 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) 2407 if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
2408 old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
2362 goto end_rename; 2409 goto end_rename;
2363 retval = -EMLINK; 2410 retval = -EMLINK;
2364 if (!new_inode && new_dir != old_dir && 2411 if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2477,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2430 if (dir_bh) { 2477 if (dir_bh) {
2431 BUFFER_TRACE(dir_bh, "get_write_access"); 2478 BUFFER_TRACE(dir_bh, "get_write_access");
2432 ext4_journal_get_write_access(handle, dir_bh); 2479 ext4_journal_get_write_access(handle, dir_bh);
2433 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2480 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2481 cpu_to_le32(new_dir->i_ino);
2434 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2482 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2435 ext4_handle_dirty_metadata(handle, old_dir, dir_bh); 2483 ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2436 ext4_dec_count(handle, old_dir); 2484 ext4_dec_count(handle, old_dir);
@@ -2449,6 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2449 ext4_mark_inode_dirty(handle, new_inode); 2497 ext4_mark_inode_dirty(handle, new_inode);
2450 if (!new_inode->i_nlink) 2498 if (!new_inode->i_nlink)
2451 ext4_orphan_add(handle, new_inode); 2499 ext4_orphan_add(handle, new_inode);
2500 if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
2501 force_da_alloc = 1;
2452 } 2502 }
2453 retval = 0; 2503 retval = 0;
2454 2504
@@ -2457,6 +2507,8 @@ end_rename:
2457 brelse(old_bh); 2507 brelse(old_bh);
2458 brelse(new_bh); 2508 brelse(new_bh);
2459 ext4_journal_stop(handle); 2509 ext4_journal_stop(handle);
2510 if (retval == 0 && force_da_alloc)
2511 ext4_alloc_da_blocks(old_inode);
2460 return retval; 2512 return retval;
2461} 2513}
2462 2514
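
The force_da_alloc logic added to ext4_rename() flushes the source file's delayed-allocated blocks when a rename replaces an existing file (unless noauto_da_alloc is set), so the common write-temp-then-rename idiom is less likely to leave a zero-length file after a crash. For reference, a sketch of the userspace idiom being protected; careful applications still call fsync() themselves rather than rely on this heuristic:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Atomically replace `path` with new contents via a temp file. */
static int replace_file(const char *path, const char *buf, size_t len)
{
        char tmp[4096];
        int fd;

        snprintf(tmp, sizeof(tmp), "%s.tmp", path);
        fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (fd < 0)
                return -1;
        if (write(fd, buf, len) != (ssize_t)len ||
            fsync(fd) < 0) {           /* the step auto_da_alloc papers over */
                close(fd);
                unlink(tmp);
                return -1;
        }
        close(fd);
        return rename(tmp, path);      /* atomic replacement */
}

int main(void)
{
        return replace_file("config.txt", "key=value\n", 10) ? 1 : 0;
}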
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c06886abd658..546c7dd869e1 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
938 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 938 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
939 ext4_group_t flex_group; 939 ext4_group_t flex_group;
940 flex_group = ext4_flex_group(sbi, input->group); 940 flex_group = ext4_flex_group(sbi, input->group);
941 sbi->s_flex_groups[flex_group].free_blocks += 941 atomic_add(input->free_blocks_count,
942 input->free_blocks_count; 942 &sbi->s_flex_groups[flex_group].free_blocks);
943 sbi->s_flex_groups[flex_group].free_inodes += 943 atomic_add(EXT4_INODES_PER_GROUP(sb),
944 EXT4_INODES_PER_GROUP(sb); 944 &sbi->s_flex_groups[flex_group].free_inodes);
945 } 945 }
946 946
947 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); 947 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
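
The resize.c hunk switches the flex-group counters from plain integer arithmetic to atomic_add(), part of a wider ext4 change that updates these counters without holding a lock. A C11 stdatomic sketch (compile with -pthread) of why the unlocked "+=" had to go:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint free_blocks;          /* was a plain unsigned "+=" */

static void *worker(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++)
                atomic_fetch_add(&free_blocks, 1);  /* lock-free update */
        return NULL;
}

int main(void)
{
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        /* With a plain "free_blocks += 1" this would often print < 400000,
         * because the read-modify-write races between threads. */
        printf("free_blocks = %u\n", atomic_load(&free_blocks));
        return 0;
}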
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 39d1993cfa13..9987bba99db3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
35#include <linux/quotaops.h> 35#include <linux/quotaops.h>
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/ctype.h>
38#include <linux/marker.h> 39#include <linux/marker.h>
39#include <linux/log2.h> 40#include <linux/log2.h>
40#include <linux/crc16.h> 41#include <linux/crc16.h>
@@ -48,6 +49,7 @@
48#include "group.h" 49#include "group.h"
49 50
50struct proc_dir_entry *ext4_proc_root; 51struct proc_dir_entry *ext4_proc_root;
52static struct kset *ext4_kset;
51 53
52static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
53 unsigned long journal_devnum); 55 unsigned long journal_devnum);
@@ -577,9 +579,9 @@ static void ext4_put_super(struct super_block *sb)
577 ext4_commit_super(sb, es, 1); 579 ext4_commit_super(sb, es, 1);
578 } 580 }
579 if (sbi->s_proc) { 581 if (sbi->s_proc) {
580 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
581 remove_proc_entry(sb->s_id, ext4_proc_root); 582 remove_proc_entry(sb->s_id, ext4_proc_root);
582 } 583 }
584 kobject_del(&sbi->s_kobj);
583 585
584 for (i = 0; i < sbi->s_gdb_count; i++) 586 for (i = 0; i < sbi->s_gdb_count; i++)
585 brelse(sbi->s_group_desc[i]); 587 brelse(sbi->s_group_desc[i]);
@@ -615,6 +617,17 @@ static void ext4_put_super(struct super_block *sb)
615 ext4_blkdev_remove(sbi); 617 ext4_blkdev_remove(sbi);
616 } 618 }
617 sb->s_fs_info = NULL; 619 sb->s_fs_info = NULL;
620 /*
621 * Now that we are completely done shutting down the
622 * superblock, we need to actually destroy the kobject.
623 */
624 unlock_kernel();
625 unlock_super(sb);
626 kobject_put(&sbi->s_kobj);
627 wait_for_completion(&sbi->s_kobj_unregister);
628 lock_super(sb);
629 lock_kernel();
630 kfree(sbi->s_blockgroup_lock);
618 kfree(sbi); 631 kfree(sbi);
619 return; 632 return;
620} 633}
@@ -803,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
803 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) 816 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
804 seq_puts(seq, ",noacl"); 817 seq_puts(seq, ",noacl");
805#endif 818#endif
806 if (!test_opt(sb, RESERVATION))
807 seq_puts(seq, ",noreservation");
808 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { 819 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
809 seq_printf(seq, ",commit=%u", 820 seq_printf(seq, ",commit=%u",
810 (unsigned) (sbi->s_commit_interval / HZ)); 821 (unsigned) (sbi->s_commit_interval / HZ));
@@ -855,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
855 if (test_opt(sb, DATA_ERR_ABORT)) 866 if (test_opt(sb, DATA_ERR_ABORT))
856 seq_puts(seq, ",data_err=abort"); 867 seq_puts(seq, ",data_err=abort");
857 868
869 if (test_opt(sb, NO_AUTO_DA_ALLOC))
870 seq_puts(seq, ",noauto_da_alloc");
871
858 ext4_show_quota_options(seq, sb); 872 ext4_show_quota_options(seq, sb);
859 return 0; 873 return 0;
860} 874}
@@ -926,8 +940,6 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_
926#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") 940#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
927#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 941#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
928 942
929static int ext4_dquot_initialize(struct inode *inode, int type);
930static int ext4_dquot_drop(struct inode *inode);
931static int ext4_write_dquot(struct dquot *dquot); 943static int ext4_write_dquot(struct dquot *dquot);
932static int ext4_acquire_dquot(struct dquot *dquot); 944static int ext4_acquire_dquot(struct dquot *dquot);
933static int ext4_release_dquot(struct dquot *dquot); 945static int ext4_release_dquot(struct dquot *dquot);
@@ -942,9 +954,13 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
942 const char *data, size_t len, loff_t off); 954 const char *data, size_t len, loff_t off);
943 955
944static struct dquot_operations ext4_quota_operations = { 956static struct dquot_operations ext4_quota_operations = {
945 .initialize = ext4_dquot_initialize, 957 .initialize = dquot_initialize,
946 .drop = ext4_dquot_drop, 958 .drop = dquot_drop,
947 .alloc_space = dquot_alloc_space, 959 .alloc_space = dquot_alloc_space,
960 .reserve_space = dquot_reserve_space,
961 .claim_space = dquot_claim_space,
962 .release_rsv = dquot_release_reserved_space,
963 .get_reserved_space = ext4_get_reserved_space,
948 .alloc_inode = dquot_alloc_inode, 964 .alloc_inode = dquot_alloc_inode,
949 .free_space = dquot_free_space, 965 .free_space = dquot_free_space,
950 .free_inode = dquot_free_inode, 966 .free_inode = dquot_free_inode,
@@ -1002,7 +1018,7 @@ enum {
1002 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 1018 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
1003 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1019 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
1004 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1020 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
1005 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 1021 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
1006 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 1022 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1007 Opt_journal_update, Opt_journal_dev, 1023 Opt_journal_update, Opt_journal_dev,
1008 Opt_journal_checksum, Opt_journal_async_commit, 1024 Opt_journal_checksum, Opt_journal_async_commit,
@@ -1010,8 +1026,8 @@ enum {
1010 Opt_data_err_abort, Opt_data_err_ignore, 1026 Opt_data_err_abort, Opt_data_err_ignore,
1011 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1027 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1012 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1028 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1013 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 1029 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
1014 Opt_grpquota, Opt_i_version, 1030 Opt_usrquota, Opt_grpquota, Opt_i_version,
1015 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1031 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1016 Opt_inode_readahead_blks, Opt_journal_ioprio 1032 Opt_inode_readahead_blks, Opt_journal_ioprio
1017}; 1033};
@@ -1037,8 +1053,6 @@ static const match_table_t tokens = {
1037 {Opt_nouser_xattr, "nouser_xattr"}, 1053 {Opt_nouser_xattr, "nouser_xattr"},
1038 {Opt_acl, "acl"}, 1054 {Opt_acl, "acl"},
1039 {Opt_noacl, "noacl"}, 1055 {Opt_noacl, "noacl"},
1040 {Opt_reservation, "reservation"},
1041 {Opt_noreservation, "noreservation"},
1042 {Opt_noload, "noload"}, 1056 {Opt_noload, "noload"},
1043 {Opt_nobh, "nobh"}, 1057 {Opt_nobh, "nobh"},
1044 {Opt_bh, "bh"}, 1058 {Opt_bh, "bh"},
@@ -1066,6 +1080,8 @@ static const match_table_t tokens = {
1066 {Opt_quota, "quota"}, 1080 {Opt_quota, "quota"},
1067 {Opt_usrquota, "usrquota"}, 1081 {Opt_usrquota, "usrquota"},
1068 {Opt_barrier, "barrier=%u"}, 1082 {Opt_barrier, "barrier=%u"},
1083 {Opt_barrier, "barrier"},
1084 {Opt_nobarrier, "nobarrier"},
1069 {Opt_i_version, "i_version"}, 1085 {Opt_i_version, "i_version"},
1070 {Opt_stripe, "stripe=%u"}, 1086 {Opt_stripe, "stripe=%u"},
1071 {Opt_resize, "resize"}, 1087 {Opt_resize, "resize"},
@@ -1073,6 +1089,9 @@ static const match_table_t tokens = {
1073 {Opt_nodelalloc, "nodelalloc"}, 1089 {Opt_nodelalloc, "nodelalloc"},
1074 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1090 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1075 {Opt_journal_ioprio, "journal_ioprio=%u"}, 1091 {Opt_journal_ioprio, "journal_ioprio=%u"},
1092 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1093 {Opt_auto_da_alloc, "auto_da_alloc"},
1094 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1076 {Opt_err, NULL}, 1095 {Opt_err, NULL},
1077}; 1096};
1078 1097
@@ -1205,12 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
1205 "not supported\n"); 1224 "not supported\n");
1206 break; 1225 break;
1207#endif 1226#endif
1208 case Opt_reservation:
1209 set_opt(sbi->s_mount_opt, RESERVATION);
1210 break;
1211 case Opt_noreservation:
1212 clear_opt(sbi->s_mount_opt, RESERVATION);
1213 break;
1214 case Opt_journal_update: 1227 case Opt_journal_update:
1215 /* @@@ FIXME */ 1228 /* @@@ FIXME */
1216 /* Eventually we will want to be able to create 1229 /* Eventually we will want to be able to create
@@ -1413,9 +1426,14 @@ set_qf_format:
1413 case Opt_abort: 1426 case Opt_abort:
1414 set_opt(sbi->s_mount_opt, ABORT); 1427 set_opt(sbi->s_mount_opt, ABORT);
1415 break; 1428 break;
1429 case Opt_nobarrier:
1430 clear_opt(sbi->s_mount_opt, BARRIER);
1431 break;
1416 case Opt_barrier: 1432 case Opt_barrier:
1417 if (match_int(&args[0], &option)) 1433 if (match_int(&args[0], &option)) {
1418 return 0; 1434 set_opt(sbi->s_mount_opt, BARRIER);
1435 break;
1436 }
1419 if (option) 1437 if (option)
1420 set_opt(sbi->s_mount_opt, BARRIER); 1438 set_opt(sbi->s_mount_opt, BARRIER);
1421 else 1439 else
@@ -1461,6 +1479,11 @@ set_qf_format:
1461 return 0; 1479 return 0;
1462 if (option < 0 || option > (1 << 30)) 1480 if (option < 0 || option > (1 << 30))
1463 return 0; 1481 return 0;
1482 if (option & (option - 1)) {
1483 printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
1484 " must be a power of 2\n");
1485 return 0;
1486 }
1464 sbi->s_inode_readahead_blks = option; 1487 sbi->s_inode_readahead_blks = option;
1465 break; 1488 break;
1466 case Opt_journal_ioprio: 1489 case Opt_journal_ioprio:
@@ -1471,6 +1494,19 @@ set_qf_format:
1471 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 1494 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1472 option); 1495 option);
1473 break; 1496 break;
1497 case Opt_noauto_da_alloc:
1498 set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1499 break;
1500 case Opt_auto_da_alloc:
1501 if (match_int(&args[0], &option)) {
1502 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1503 break;
1504 }
1505 if (option)
1506 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1507 else
1508 set_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1509 break;
1474 default: 1510 default:
1475 printk(KERN_ERR 1511 printk(KERN_ERR
1476 "EXT4-fs: Unrecognized mount option \"%s\" " 1512 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1610,10 +1646,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
1610 gdp = ext4_get_group_desc(sb, i, &bh); 1646 gdp = ext4_get_group_desc(sb, i, &bh);
1611 1647
1612 flex_group = ext4_flex_group(sbi, i); 1648 flex_group = ext4_flex_group(sbi, i);
1613 sbi->s_flex_groups[flex_group].free_inodes += 1649 atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
1614 ext4_free_inodes_count(sb, gdp); 1650 ext4_free_inodes_count(sb, gdp));
1615 sbi->s_flex_groups[flex_group].free_blocks += 1651 atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
1616 ext4_free_blks_count(sb, gdp); 1652 ext4_free_blks_count(sb, gdp));
1653 atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
1654 ext4_used_dirs_count(sb, gdp));
1617 } 1655 }
1618 1656
1619 return 1; 1657 return 1;
@@ -1802,7 +1840,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1802 } 1840 }
1803 1841
1804 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 1842 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1805 DQUOT_INIT(inode); 1843 vfs_dq_init(inode);
1806 if (inode->i_nlink) { 1844 if (inode->i_nlink) {
1807 printk(KERN_DEBUG 1845 printk(KERN_DEBUG
1808 "%s: truncating inode %lu to %lld bytes\n", 1846 "%s: truncating inode %lu to %lld bytes\n",
@@ -1989,6 +2027,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1989 return 0; 2027 return 0;
1990} 2028}
1991 2029
2030/* sysfs support */
2031
2032struct ext4_attr {
2033 struct attribute attr;
2034 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2035 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2036 const char *, size_t);
2037 int offset;
2038};
2039
2040static int parse_strtoul(const char *buf,
2041 unsigned long max, unsigned long *value)
2042{
2043 char *endp;
2044
2045 while (*buf && isspace(*buf))
2046 buf++;
2047 *value = simple_strtoul(buf, &endp, 0);
2048 while (*endp && isspace(*endp))
2049 endp++;
2050 if (*endp || *value > max)
2051 return -EINVAL;
2052
2053 return 0;
2054}
2055
2056static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
2057 struct ext4_sb_info *sbi,
2058 char *buf)
2059{
2060 return snprintf(buf, PAGE_SIZE, "%llu\n",
2061 (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2062}
2063
2064static ssize_t session_write_kbytes_show(struct ext4_attr *a,
2065 struct ext4_sb_info *sbi, char *buf)
2066{
2067 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2068
2069 return snprintf(buf, PAGE_SIZE, "%lu\n",
2070 (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2071 sbi->s_sectors_written_start) >> 1);
2072}
2073
2074static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2075 struct ext4_sb_info *sbi, char *buf)
2076{
2077 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2078
2079 return snprintf(buf, PAGE_SIZE, "%llu\n",
2080 sbi->s_kbytes_written +
2081 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2082 EXT4_SB(sb)->s_sectors_written_start) >> 1));
2083}
2084
2085static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2086 struct ext4_sb_info *sbi,
2087 const char *buf, size_t count)
2088{
2089 unsigned long t;
2090
2091 if (parse_strtoul(buf, 0x40000000, &t))
2092 return -EINVAL;
2093
2094 /* inode_readahead_blks must be a power of 2 */
2095 if (t & (t-1))
2096 return -EINVAL;
2097
2098 sbi->s_inode_readahead_blks = t;
2099 return count;
2100}
2101
2102static ssize_t sbi_ui_show(struct ext4_attr *a,
2103 struct ext4_sb_info *sbi, char *buf)
2104{
2105 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2106
2107 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2108}
2109
2110static ssize_t sbi_ui_store(struct ext4_attr *a,
2111 struct ext4_sb_info *sbi,
2112 const char *buf, size_t count)
2113{
2114 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2115 unsigned long t;
2116
2117 if (parse_strtoul(buf, 0xffffffff, &t))
2118 return -EINVAL;
2119 *ui = t;
2120 return count;
2121}
2122
2123#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2124static struct ext4_attr ext4_attr_##_name = { \
2125 .attr = {.name = __stringify(_name), .mode = _mode }, \
2126 .show = _show, \
2127 .store = _store, \
2128 .offset = offsetof(struct ext4_sb_info, _elname), \
2129}
2130#define EXT4_ATTR(name, mode, show, store) \
2131static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2132
2133#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2134#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2135#define EXT4_RW_ATTR_SBI_UI(name, elname) \
2136 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2137#define ATTR_LIST(name) &ext4_attr_##name.attr
2138
2139EXT4_RO_ATTR(delayed_allocation_blocks);
2140EXT4_RO_ATTR(session_write_kbytes);
2141EXT4_RO_ATTR(lifetime_write_kbytes);
2142EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2143 inode_readahead_blks_store, s_inode_readahead_blks);
2144EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
2145EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
2146EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2147EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2148EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2149EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2150
2151static struct attribute *ext4_attrs[] = {
2152 ATTR_LIST(delayed_allocation_blocks),
2153 ATTR_LIST(session_write_kbytes),
2154 ATTR_LIST(lifetime_write_kbytes),
2155 ATTR_LIST(inode_readahead_blks),
2156 ATTR_LIST(mb_stats),
2157 ATTR_LIST(mb_max_to_scan),
2158 ATTR_LIST(mb_min_to_scan),
2159 ATTR_LIST(mb_order2_req),
2160 ATTR_LIST(mb_stream_req),
2161 ATTR_LIST(mb_group_prealloc),
2162 NULL,
2163};
2164
2165static ssize_t ext4_attr_show(struct kobject *kobj,
2166 struct attribute *attr, char *buf)
2167{
2168 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2169 s_kobj);
2170 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2171
2172 return a->show ? a->show(a, sbi, buf) : 0;
2173}
2174
2175static ssize_t ext4_attr_store(struct kobject *kobj,
2176 struct attribute *attr,
2177 const char *buf, size_t len)
2178{
2179 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2180 s_kobj);
2181 struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2182
2183 return a->store ? a->store(a, sbi, buf, len) : 0;
2184}
2185
2186static void ext4_sb_release(struct kobject *kobj)
2187{
2188 struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2189 s_kobj);
2190 complete(&sbi->s_kobj_unregister);
2191}
2192
2193
2194static struct sysfs_ops ext4_attr_ops = {
2195 .show = ext4_attr_show,
2196 .store = ext4_attr_store,
2197};
2198
2199static struct kobj_type ext4_ktype = {
2200 .default_attrs = ext4_attrs,
2201 .sysfs_ops = &ext4_attr_ops,
2202 .release = ext4_sb_release,
2203};
2204
1992static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2205static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1993 __releases(kernel_lock) 2206 __releases(kernel_lock)
1994 __acquires(kernel_lock) 2207 __acquires(kernel_lock)
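
The sysfs glue added in the hunk above dispatches every attribute access through container_of(): the kobject embedded in ext4_sb_info recovers the enclosing superblock info, and the attribute recovers the enclosing ext4_attr with its show/store callbacks. A userspace sketch of the embedded-member pattern, with illustrative names:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct kobject { const char *name; };

struct sb_info {
        unsigned ra_blks;
        struct kobject kobj;     /* embedded, not pointed-to */
};

/* A callback that only receives the embedded member... */
static void show(struct kobject *kobj)
{
        /* ...recovers the enclosing object with pointer arithmetic. */
        struct sb_info *sbi = container_of(kobj, struct sb_info, kobj);

        printf("%s: ra_blks=%u\n", kobj->name, sbi->ra_blks);
}

int main(void)
{
        struct sb_info sbi = { .ra_blks = 32, .kobj = { .name = "sda1" } };

        show(&sbi.kobj);
        return 0;
}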
@@ -2019,12 +2232,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2019 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2232 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2020 if (!sbi) 2233 if (!sbi)
2021 return -ENOMEM; 2234 return -ENOMEM;
2235
2236 sbi->s_blockgroup_lock =
2237 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
2238 if (!sbi->s_blockgroup_lock) {
2239 kfree(sbi);
2240 return -ENOMEM;
2241 }
2022 sb->s_fs_info = sbi; 2242 sb->s_fs_info = sbi;
2023 sbi->s_mount_opt = 0; 2243 sbi->s_mount_opt = 0;
2024 sbi->s_resuid = EXT4_DEF_RESUID; 2244 sbi->s_resuid = EXT4_DEF_RESUID;
2025 sbi->s_resgid = EXT4_DEF_RESGID; 2245 sbi->s_resgid = EXT4_DEF_RESGID;
2026 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 2246 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
2027 sbi->s_sb_block = sb_block; 2247 sbi->s_sb_block = sb_block;
2248 sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
2249 sectors[1]);
2028 2250
2029 unlock_kernel(); 2251 unlock_kernel();
2030 2252
@@ -2062,6 +2284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2062 sb->s_magic = le16_to_cpu(es->s_magic); 2284 sb->s_magic = le16_to_cpu(es->s_magic);
2063 if (sb->s_magic != EXT4_SUPER_MAGIC) 2285 if (sb->s_magic != EXT4_SUPER_MAGIC)
2064 goto cantfind_ext4; 2286 goto cantfind_ext4;
2287 sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
2065 2288
2066 /* Set defaults before we parse the mount options */ 2289 /* Set defaults before we parse the mount options */
2067 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 2290 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2099,7 +2322,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2099 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2322 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2100 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2323 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2101 2324
2102 set_opt(sbi->s_mount_opt, RESERVATION);
2103 set_opt(sbi->s_mount_opt, BARRIER); 2325 set_opt(sbi->s_mount_opt, BARRIER);
2104 2326
2105 /* 2327 /*
@@ -2323,14 +2545,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2323#ifdef CONFIG_PROC_FS 2545#ifdef CONFIG_PROC_FS
2324 if (ext4_proc_root) 2546 if (ext4_proc_root)
2325 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 2547 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
2326
2327 if (sbi->s_proc)
2328 proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
2329 &ext4_ui_proc_fops,
2330 &sbi->s_inode_readahead_blks);
2331#endif 2548#endif
2332 2549
2333 bgl_lock_init(&sbi->s_blockgroup_lock); 2550 bgl_lock_init(sbi->s_blockgroup_lock);
2334 2551
2335 for (i = 0; i < db_count; i++) { 2552 for (i = 0; i < db_count; i++) {
2336 block = descriptor_loc(sb, logical_sb_block, i); 2553 block = descriptor_loc(sb, logical_sb_block, i);
@@ -2562,6 +2779,16 @@ no_journal:
2562 goto failed_mount4; 2779 goto failed_mount4;
2563 } 2780 }
2564 2781
2782 sbi->s_kobj.kset = ext4_kset;
2783 init_completion(&sbi->s_kobj_unregister);
2784 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
2785 "%s", sb->s_id);
2786 if (err) {
2787 ext4_mb_release(sb);
2788 ext4_ext_release(sb);
2789 goto failed_mount4;
2790 }
2791
2565 /* 2792 /*
2566 * akpm: core read_super() calls in here with the superblock locked. 2793 * akpm: core read_super() calls in here with the superblock locked.
2567 * That deadlocks, because orphan cleanup needs to lock the superblock 2794 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2616,7 +2843,6 @@ failed_mount2:
2616 kfree(sbi->s_group_desc); 2843 kfree(sbi->s_group_desc);
2617failed_mount: 2844failed_mount:
2618 if (sbi->s_proc) { 2845 if (sbi->s_proc) {
2619 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2620 remove_proc_entry(sb->s_id, ext4_proc_root); 2846 remove_proc_entry(sb->s_id, ext4_proc_root);
2621 } 2847 }
2622#ifdef CONFIG_QUOTA 2848#ifdef CONFIG_QUOTA
@@ -2911,6 +3137,10 @@ static int ext4_commit_super(struct super_block *sb,
2911 set_buffer_uptodate(sbh); 3137 set_buffer_uptodate(sbh);
2912 } 3138 }
2913 es->s_wtime = cpu_to_le32(get_seconds()); 3139 es->s_wtime = cpu_to_le32(get_seconds());
3140 es->s_kbytes_written =
3141 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3142 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3143 EXT4_SB(sb)->s_sectors_written_start) >> 1));
2914 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3144 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
2915 &EXT4_SB(sb)->s_freeblocks_counter)); 3145 &EXT4_SB(sb)->s_freeblocks_counter));
2916 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3146 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
@@ -3367,8 +3597,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3367 * is locked for write. Otherwise there are possible deadlocks: 3597 * is locked for write. Otherwise there are possible deadlocks:
3368 * Process 1 Process 2 3598 * Process 1 Process 2
3369 * ext4_create() quota_sync() 3599 * ext4_create() quota_sync()
3370 * jbd2_journal_start() write_dquot() 3600 * jbd2_journal_start() write_dquot()
3371 * DQUOT_INIT() down(dqio_mutex) 3601 * vfs_dq_init() down(dqio_mutex)
3372 * down(dqio_mutex) jbd2_journal_start() 3602 * down(dqio_mutex) jbd2_journal_start()
3373 * 3603 *
3374 */ 3604 */
@@ -3380,44 +3610,6 @@ static inline struct inode *dquot_to_inode(struct dquot *dquot)
3380 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; 3610 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
3381} 3611}
3382 3612
3383static int ext4_dquot_initialize(struct inode *inode, int type)
3384{
3385 handle_t *handle;
3386 int ret, err;
3387
3388 /* We may create quota structure so we need to reserve enough blocks */
3389 handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
3390 if (IS_ERR(handle))
3391 return PTR_ERR(handle);
3392 ret = dquot_initialize(inode, type);
3393 err = ext4_journal_stop(handle);
3394 if (!ret)
3395 ret = err;
3396 return ret;
3397}
3398
3399static int ext4_dquot_drop(struct inode *inode)
3400{
3401 handle_t *handle;
3402 int ret, err;
3403
3404 /* We may delete quota structure so we need to reserve enough blocks */
3405 handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
3406 if (IS_ERR(handle)) {
3407 /*
3408 * We call dquot_drop() anyway to at least release references
3409 * to quota structures so that umount does not hang.
3410 */
3411 dquot_drop(inode);
3412 return PTR_ERR(handle);
3413 }
3414 ret = dquot_drop(inode);
3415 err = ext4_journal_stop(handle);
3416 if (!ret)
3417 ret = err;
3418 return ret;
3419}
3420
3421static int ext4_write_dquot(struct dquot *dquot) 3613static int ext4_write_dquot(struct dquot *dquot)
3422{ 3614{
3423 int ret, err; 3615 int ret, err;
@@ -3683,45 +3875,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
3683 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3875 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3684} 3876}
3685 3877
3686#ifdef CONFIG_PROC_FS
3687static int ext4_ui_proc_show(struct seq_file *m, void *v)
3688{
3689 unsigned int *p = m->private;
3690
3691 seq_printf(m, "%u\n", *p);
3692 return 0;
3693}
3694
3695static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3696{
3697 return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
3698}
3699
3700static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3701 size_t cnt, loff_t *ppos)
3702{
3703 unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
3704 char str[32];
3705
3706 if (cnt >= sizeof(str))
3707 return -EINVAL;
3708 if (copy_from_user(str, buf, cnt))
3709 return -EFAULT;
3710
3711 *p = simple_strtoul(str, NULL, 0);
3712 return cnt;
3713}
3714
3715const struct file_operations ext4_ui_proc_fops = {
3716 .owner = THIS_MODULE,
3717 .open = ext4_ui_proc_open,
3718 .read = seq_read,
3719 .llseek = seq_lseek,
3720 .release = single_release,
3721 .write = ext4_ui_proc_write,
3722};
3723#endif
3724
3725static struct file_system_type ext4_fs_type = { 3878static struct file_system_type ext4_fs_type = {
3726 .owner = THIS_MODULE, 3879 .owner = THIS_MODULE,
3727 .name = "ext4", 3880 .name = "ext4",
@@ -3755,6 +3908,9 @@ static int __init init_ext4_fs(void)
3755{ 3908{
3756 int err; 3909 int err;
3757 3910
3911 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
3912 if (!ext4_kset)
3913 return -ENOMEM;
3758 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 3914 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3759 err = init_ext4_mballoc(); 3915 err = init_ext4_mballoc();
3760 if (err) 3916 if (err)
@@ -3796,6 +3952,7 @@ static void __exit exit_ext4_fs(void)
3796 exit_ext4_xattr(); 3952 exit_ext4_xattr();
3797 exit_ext4_mballoc(); 3953 exit_ext4_mballoc();
3798 remove_proc_entry("fs/ext4", NULL); 3954 remove_proc_entry("fs/ext4", NULL);
3955 kset_unregister(ext4_kset);
3799} 3956}
3800 3957
3801MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3958MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 157ce6589c54..62b31c246994 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -490,7 +490,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
490 error = ext4_handle_dirty_metadata(handle, inode, bh); 490 error = ext4_handle_dirty_metadata(handle, inode, bh);
491 if (IS_SYNC(inode)) 491 if (IS_SYNC(inode))
492 ext4_handle_sync(handle); 492 ext4_handle_sync(handle);
493 DQUOT_FREE_BLOCK(inode, 1); 493 vfs_dq_free_block(inode, 1);
494 ea_bdebug(bh, "refcount now=%d; releasing", 494 ea_bdebug(bh, "refcount now=%d; releasing",
495 le32_to_cpu(BHDR(bh)->h_refcount)); 495 le32_to_cpu(BHDR(bh)->h_refcount));
496 if (ce) 496 if (ce)
@@ -784,7 +784,7 @@ inserted:
784 /* The old block is released after updating 784 /* The old block is released after updating
785 the inode. */ 785 the inode. */
786 error = -EDQUOT; 786 error = -EDQUOT;
787 if (DQUOT_ALLOC_BLOCK(inode, 1)) 787 if (vfs_dq_alloc_block(inode, 1))
788 goto cleanup; 788 goto cleanup;
789 error = ext4_journal_get_write_access(handle, 789 error = ext4_journal_get_write_access(handle,
790 new_bh); 790 new_bh);
@@ -860,7 +860,7 @@ cleanup:
860 return error; 860 return error;
861 861
862cleanup_dquot: 862cleanup_dquot:
863 DQUOT_FREE_BLOCK(inode, 1); 863 vfs_dq_free_block(inode, 1);
864 goto cleanup; 864 goto cleanup;
865 865
866bad_block: 866bad_block:
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 6b74d09adbe5..296785a0dec8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -202,9 +202,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
202 sector_t blocknr; 202 sector_t blocknr;
203 203
204 /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ 204 /* fat_get_cluster() assumes the requested blocknr isn't truncated. */
205 mutex_lock(&mapping->host->i_mutex); 205 down_read(&mapping->host->i_alloc_sem);
206 blocknr = generic_block_bmap(mapping, block, fat_get_block); 206 blocknr = generic_block_bmap(mapping, block, fat_get_block);
207 mutex_unlock(&mapping->host->i_mutex); 207 up_read(&mapping->host->i_alloc_sem);
208 208
209 return blocknr; 209 return blocknr;
210} 210}
@@ -523,7 +523,9 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
523 523
524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) 524static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
525{ 525{
526 struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); 526 struct super_block *sb = dentry->d_sb;
527 struct msdos_sb_info *sbi = MSDOS_SB(sb);
528 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
527 529
528 /* If the count of free cluster is still unknown, counts it here. */ 530 /* If the count of free cluster is still unknown, counts it here. */
529 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { 531 if (sbi->free_clusters == -1 || !sbi->free_clus_valid) {
@@ -537,6 +539,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
537 buf->f_blocks = sbi->max_cluster - FAT_START_ENT; 539 buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
538 buf->f_bfree = sbi->free_clusters; 540 buf->f_bfree = sbi->free_clusters;
539 buf->f_bavail = sbi->free_clusters; 541 buf->f_bavail = sbi->free_clusters;
542 buf->f_fsid.val[0] = (u32)id;
543 buf->f_fsid.val[1] = (u32)(id >> 32);
540 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 544 buf->f_namelen = sbi->options.isvfat ? 260 : 12;
541 545
542 return 0; 546 return 0;
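
The fat_statfs() hunk fills f_fsid by splitting the 64-bit encoded device number into two 32-bit halves; the split is plain masking and shifting:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t id = 0x0123456789abcdefULL;    /* huge_encode_dev() result */
        uint32_t lo = (uint32_t)id;             /* f_fsid.val[0] */
        uint32_t hi = (uint32_t)(id >> 32);     /* f_fsid.val[1] */

        printf("lo=%08x hi=%08x\n", lo, hi);
        /* Reassemble to verify nothing is lost: */
        printf("roundtrip ok: %d\n", (((uint64_t)hi << 32) | lo) == id);
        return 0;
}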
@@ -930,7 +934,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
930 934
931 opts->fs_uid = current_uid(); 935 opts->fs_uid = current_uid();
932 opts->fs_gid = current_gid(); 936 opts->fs_gid = current_gid();
933 opts->fs_fmask = opts->fs_dmask = current->fs->umask; 937 opts->fs_fmask = current_umask();
934 opts->allow_utime = -1; 938 opts->allow_utime = -1;
935 opts->codepage = fat_default_codepage; 939 opts->codepage = fat_default_codepage;
936 opts->iocharset = fat_default_iocharset; 940 opts->iocharset = fat_default_iocharset;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 7ba03a4acbe0..da3f361a37dd 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -188,7 +188,7 @@ old_compare:
188 goto out; 188 goto out;
189} 189}
190 190
191static struct dentry_operations msdos_dentry_operations = { 191static const struct dentry_operations msdos_dentry_operations = {
192 .d_hash = msdos_hash, 192 .d_hash = msdos_hash,
193 .d_compare = msdos_cmp, 193 .d_compare = msdos_cmp,
194}; 194};
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 8ae32e37673c..a0e00e3a46e9 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -166,13 +166,13 @@ static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
166 return 1; 166 return 1;
167} 167}
168 168
169static struct dentry_operations vfat_ci_dentry_ops = { 169static const struct dentry_operations vfat_ci_dentry_ops = {
170 .d_revalidate = vfat_revalidate_ci, 170 .d_revalidate = vfat_revalidate_ci,
171 .d_hash = vfat_hashi, 171 .d_hash = vfat_hashi,
172 .d_compare = vfat_cmpi, 172 .d_compare = vfat_cmpi,
173}; 173};
174 174
175static struct dentry_operations vfat_dentry_ops = { 175static const struct dentry_operations vfat_dentry_ops = {
176 .d_revalidate = vfat_revalidate, 176 .d_revalidate = vfat_revalidate,
177 .d_hash = vfat_hash, 177 .d_hash = vfat_hash,
178 .d_compare = vfat_cmp, 178 .d_compare = vfat_cmp,
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bd215cc791da..cc8e4de2fee5 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -141,7 +141,7 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
141 return ret; 141 return ret;
142} 142}
143 143
144#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT | O_NOATIME) 144#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
145 145
146static int setfl(int fd, struct file * filp, unsigned long arg) 146static int setfl(int fd, struct file * filp, unsigned long arg)
147{ 147{
@@ -177,21 +177,21 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
177 return error; 177 return error;
178 178
179 /* 179 /*
180 * We still need a lock here for now to keep multiple FASYNC calls 180 * ->fasync() is responsible for setting the FASYNC bit.
181 * from racing with each other.
182 */ 181 */
183 lock_kernel(); 182 if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op &&
184 if ((arg ^ filp->f_flags) & FASYNC) { 183 filp->f_op->fasync) {
185 if (filp->f_op && filp->f_op->fasync) { 184 error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
186 error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); 185 if (error < 0)
187 if (error < 0) 186 goto out;
188 goto out; 187 if (error > 0)
189 } 188 error = 0;
190 } 189 }
191 190 spin_lock(&filp->f_lock);
192 filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); 191 filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
192 spin_unlock(&filp->f_lock);
193
193 out: 194 out:
194 unlock_kernel();
195 return error; 195 return error;
196} 196}
197 197
@@ -516,7 +516,7 @@ static DEFINE_RWLOCK(fasync_lock);
516static struct kmem_cache *fasync_cache __read_mostly; 516static struct kmem_cache *fasync_cache __read_mostly;
517 517
518/* 518/*
519 * fasync_helper() is used by some character device drivers (mainly mice) 519 * fasync_helper() is used by almost all character device drivers
520 * to set up the fasync queue. It returns negative on error, 0 if it did 520 * to set up the fasync queue. It returns negative on error, 0 if it did
521 * no changes and positive if it added/deleted the entry. 521 * no changes and positive if it added/deleted the entry.
522 */ 522 */
@@ -531,6 +531,12 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
 		if (!new)
 			return -ENOMEM;
 	}
+
+	/*
+	 * We need to take f_lock first since it's not an IRQ-safe
+	 * lock.
+	 */
+	spin_lock(&filp->f_lock);
 	write_lock_irq(&fasync_lock);
 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 		if (fa->fa_file == filp) {
@@ -555,7 +561,12 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
 		result = 1;
 	}
 out:
+	if (on)
+		filp->f_flags |= FASYNC;
+	else
+		filp->f_flags &= ~FASYNC;
 	write_unlock_irq(&fasync_lock);
+	spin_unlock(&filp->f_lock);
 	return result;
 }
 
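For orientation, a sketch of the driver side this reworking relies on: a character device's ->fasync() method normally just delegates to fasync_helper(), which after this patch also maintains the FASYNC bit in filp->f_flags under the new f_lock. The names below are hypothetical and not part of the patch.

static struct fasync_struct *example_async_queue;

static int example_fasync(int fd, struct file *filp, int on)
{
	/* fasync_helper() updates both the queue and FASYNC in f_flags,
	 * taking filp->f_lock itself - no BKL needed any more */
	return fasync_helper(fd, filp, on, &example_async_queue);
}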
diff --git a/fs/file_table.c b/fs/file_table.c
index bbeeac6efa1a..54018fe48840 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 #include <linux/eventpoll.h>
 #include <linux/rcupdate.h>
 #include <linux/mount.h>
@@ -127,6 +128,7 @@ struct file *get_empty_filp(void)
 	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	f->f_cred = get_cred(cred);
+	spin_lock_init(&f->f_lock);
 	eventpoll_init_file(f);
 	/* f->f_version: 0 */
 	return f;
@@ -167,7 +169,6 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
 		fmode_t mode, const struct file_operations *fop)
 {
 	struct file *file;
-	struct path;
 
 	file = get_empty_filp();
 	if (!file)
@@ -279,6 +280,7 @@ void __fput(struct file *file)
 	if (file->f_op && file->f_op->release)
 		file->f_op->release(inode, file);
 	security_file_free(file);
+	ima_file_free(file);
 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
 		cdev_put(inode->i_cdev);
 	fops_put(file->f_op);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e5eaa62fd17f..91013ff7dd53 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -196,7 +196,7 @@ static void redirty_tail(struct inode *inode)
 		struct inode *tail_inode;
 
 		tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
-		if (!time_after_eq(inode->dirtied_when,
+		if (time_before(inode->dirtied_when,
 				tail_inode->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
@@ -220,6 +220,21 @@ static void inode_sync_complete(struct inode *inode)
 	wake_up_bit(&inode->i_state, __I_SYNC);
 }
 
+static bool inode_dirtied_after(struct inode *inode, unsigned long t)
+{
+	bool ret = time_after(inode->dirtied_when, t);
+#ifndef CONFIG_64BIT
+	/*
+	 * For inodes being constantly redirtied, dirtied_when can get stuck.
+	 * It _appears_ to be in the future, but is actually in distant past.
+	 * This test is necessary to prevent such wrapped-around relative times
+	 * from permanently stopping the whole pdflush writeback.
+	 */
+	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
+#endif
+	return ret;
+}
+
 /*
  * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
  */
@@ -231,7 +246,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 		struct inode *inode = list_entry(delaying_queue->prev,
 						struct inode, i_list);
 		if (older_than_this &&
-			time_after(inode->dirtied_when, *older_than_this))
+		    inode_dirtied_after(inode, *older_than_this))
 			break;
 		list_move(&inode->i_list, dispatch_queue);
 	}
@@ -274,6 +289,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret;
 
 	BUG_ON(inode->i_state & I_SYNC);
+	WARN_ON(inode->i_state & I_NEW);
 
 	/* Set I_SYNC, reset I_DIRTY */
 	dirty = inode->i_state & I_DIRTY;
@@ -298,6 +314,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
+	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
 		if (!(inode->i_state & I_DIRTY) &&
@@ -418,7 +435,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * If older_than_this is non-NULL, then only write out inodes which
  * had their first dirtying at a time earlier than *older_than_this.
  *
- * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * If we're a pdflush thread, then implement pdflush collision avoidance
  * against the entire list.
  *
  * If `bdi' is non-zero then we're being asked to writeback a specific queue.
@@ -470,6 +487,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
 			break;
 		}
 
+		if (inode->i_state & I_NEW) {
+			requeue_io(inode);
+			continue;
+		}
+
 		if (wbc->nonblocking && bdi_write_congested(bdi)) {
 			wbc->encountered_congestion = 1;
 			if (!sb_is_blkdev_sb(sb))
@@ -485,8 +507,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
 			continue;	/* blockdev has wrong queue */
 		}
 
-		/* Was this inode dirtied after sync_sb_inodes was called? */
-		if (time_after(inode->dirtied_when, start))
+		/*
+		 * Was this inode dirtied after sync_sb_inodes was called?
+		 * This keeps sync from extra jobs and livelock.
+		 */
+		if (inode_dirtied_after(inode, start))
 			break;
 
 		/* Is another pdflush already flushing this queue? */
@@ -531,7 +556,8 @@ void generic_sync_sb_inodes(struct super_block *sb,
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		struct address_space *mapping;
 
-		if (inode->i_state & (I_FREEING|I_WILL_FREE))
+		if (inode->i_state &
+				(I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
 			continue;
 		mapping = inode->i_mapping;
 		if (mapping->nrpages == 0)
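To illustrate the wraparound case that inode_dirtied_after() guards against, here is a sketch with hypothetical values (not part of the patch): on 32-bit, once jiffies advances more than 2^31 ticks (about 25 days at HZ=1000) past a stuck dirtied_when, time_after() starts reporting the stale stamp as being in the future.

static bool example_wrap_demo(void)
{
	unsigned long dirtied_when = 0x00001000;	/* stamped ~25+ days ago */
	unsigned long now = 0x90001000;			/* jiffies, >2^31 ticks later */

	/* the signed-difference trick wraps: the ancient stamp looks future */
	bool looks_future = time_after(dirtied_when, now);	/* true */

	/* the extra check in inode_dirtied_after() rejects it again */
	bool sane = time_before_eq(dirtied_when, now);		/* false */

	return looks_future && !sane;	/* true: the guard was needed */
}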
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
new file mode 100644
index 000000000000..eee059052db5
--- /dev/null
+++ b/fs/fs_struct.c
@@ -0,0 +1,177 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/fs.h>
4#include <linux/path.h>
5#include <linux/slab.h>
6#include <linux/fs_struct.h>
7
8/*
 9 * Replace fs->root with the given path. Put the old root.
10 * It can block.
11 */
12void set_fs_root(struct fs_struct *fs, struct path *path)
13{
14 struct path old_root;
15
16 write_lock(&fs->lock);
17 old_root = fs->root;
18 fs->root = *path;
19 path_get(path);
20 write_unlock(&fs->lock);
21 if (old_root.dentry)
22 path_put(&old_root);
23}
24
25/*
26 * Replace fs->pwd with the given path. Put the old pwd.
27 * It can block.
28 */
29void set_fs_pwd(struct fs_struct *fs, struct path *path)
30{
31 struct path old_pwd;
32
33 write_lock(&fs->lock);
34 old_pwd = fs->pwd;
35 fs->pwd = *path;
36 path_get(path);
37 write_unlock(&fs->lock);
38
39 if (old_pwd.dentry)
40 path_put(&old_pwd);
41}
42
43void chroot_fs_refs(struct path *old_root, struct path *new_root)
44{
45 struct task_struct *g, *p;
46 struct fs_struct *fs;
47 int count = 0;
48
49 read_lock(&tasklist_lock);
50 do_each_thread(g, p) {
51 task_lock(p);
52 fs = p->fs;
53 if (fs) {
54 write_lock(&fs->lock);
55 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root);
58 fs->root = *new_root;
59 count++;
60 }
61 if (fs->pwd.dentry == old_root->dentry
62 && fs->pwd.mnt == old_root->mnt) {
63 path_get(new_root);
64 fs->pwd = *new_root;
65 count++;
66 }
67 write_unlock(&fs->lock);
68 }
69 task_unlock(p);
70 } while_each_thread(g, p);
71 read_unlock(&tasklist_lock);
72 while (count--)
73 path_put(old_root);
74}
75
76void free_fs_struct(struct fs_struct *fs)
77{
78 path_put(&fs->root);
79 path_put(&fs->pwd);
80 kmem_cache_free(fs_cachep, fs);
81}
82
83void exit_fs(struct task_struct *tsk)
84{
85 struct fs_struct *fs = tsk->fs;
86
87 if (fs) {
88 int kill;
89 task_lock(tsk);
90 write_lock(&fs->lock);
91 tsk->fs = NULL;
92 kill = !--fs->users;
93 write_unlock(&fs->lock);
94 task_unlock(tsk);
95 if (kill)
96 free_fs_struct(fs);
97 }
98}
99
100struct fs_struct *copy_fs_struct(struct fs_struct *old)
101{
102 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
103 /* We don't need to lock fs - think why ;-) */
104 if (fs) {
105 fs->users = 1;
106 fs->in_exec = 0;
107 rwlock_init(&fs->lock);
108 fs->umask = old->umask;
109 read_lock(&old->lock);
110 fs->root = old->root;
111 path_get(&old->root);
112 fs->pwd = old->pwd;
113 path_get(&old->pwd);
114 read_unlock(&old->lock);
115 }
116 return fs;
117}
118
119int unshare_fs_struct(void)
120{
121 struct fs_struct *fs = current->fs;
122 struct fs_struct *new_fs = copy_fs_struct(fs);
123 int kill;
124
125 if (!new_fs)
126 return -ENOMEM;
127
128 task_lock(current);
129 write_lock(&fs->lock);
130 kill = !--fs->users;
131 current->fs = new_fs;
132 write_unlock(&fs->lock);
133 task_unlock(current);
134
135 if (kill)
136 free_fs_struct(fs);
137
138 return 0;
139}
140EXPORT_SYMBOL_GPL(unshare_fs_struct);
141
142int current_umask(void)
143{
144 return current->fs->umask;
145}
146EXPORT_SYMBOL(current_umask);
147
148/* to be mentioned only in INIT_TASK */
149struct fs_struct init_fs = {
150 .users = 1,
151 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
152 .umask = 0022,
153};
154
155void daemonize_fs_struct(void)
156{
157 struct fs_struct *fs = current->fs;
158
159 if (fs) {
160 int kill;
161
162 task_lock(current);
163
164 write_lock(&init_fs.lock);
165 init_fs.users++;
166 write_unlock(&init_fs.lock);
167
168 write_lock(&fs->lock);
169 current->fs = &init_fs;
170 kill = !--fs->users;
171 write_unlock(&fs->lock);
172
173 task_unlock(current);
174 if (kill)
175 free_fs_struct(fs);
176 }
177}
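A sketch of how a caller might use the helpers above to detach from a CLONE_FS-shared fs_struct before changing root/cwd behaviour; the function name is hypothetical, and the unlocked read of fs->users is simplified for brevity.

static int example_go_private(void)
{
	/* shared with other threads/processes via CLONE_FS? */
	if (current->fs->users > 1)
		return unshare_fs_struct();	/* 0 or -ENOMEM */

	return 0;	/* already private */
}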
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
new file mode 100644
index 000000000000..9bbb8ce7bea0
--- /dev/null
+++ b/fs/fscache/Kconfig
@@ -0,0 +1,56 @@
1
2config FSCACHE
3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK
6 help
7 This option enables a generic filesystem caching manager that can be
8 used by various network and other filesystems to cache data locally.
9 Different sorts of caches can be plugged in, depending on the
10 resources available.
11
12 See Documentation/filesystems/caching/fscache.txt for more information.
13
14config FSCACHE_STATS
15 bool "Gather statistical information on local caching"
16 depends on FSCACHE && PROC_FS
17 help
18 This option causes statistical information to be gathered on local
19 caching and exported through file:
20
21 /proc/fs/fscache/stats
22
23 The gathering of statistics adds a certain amount of overhead to
24	  execution as there are quite a few stats gathered, and on a
25 multi-CPU system these may be on cachelines that keep bouncing
26 between CPUs. On the other hand, the stats are very useful for
27 debugging purposes. Saying 'Y' here is recommended.
28
29 See Documentation/filesystems/caching/fscache.txt for more information.
30
31config FSCACHE_HISTOGRAM
32 bool "Gather latency information on local caching"
33 depends on FSCACHE && PROC_FS
34 help
35 This option causes latency information to be gathered on local
36 caching and exported through file:
37
38 /proc/fs/fscache/histogram
39
40 The generation of this histogram adds a certain amount of overhead to
41 execution as there are a number of points at which data is gathered,
42 and on a multi-CPU system these may be on cachelines that keep
43 bouncing between CPUs. On the other hand, the histogram may be
44 useful for debugging purposes. Saying 'N' here is recommended.
45
46 See Documentation/filesystems/caching/fscache.txt for more information.
47
48config FSCACHE_DEBUG
49 bool "Debug FS-Cache"
50 depends on FSCACHE
51 help
52 This permits debugging to be dynamically enabled in the local caching
53 management module. If this is set, the debugging output may be
54	  enabled by setting bits in /sys/module/fscache/parameters/debug.
55
56 See Documentation/filesystems/caching/fscache.txt for more information.
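As an illustration, a .config fragment following the recommendations above (FSCACHE selects SLOW_WORK automatically) might read:

CONFIG_SLOW_WORK=y
CONFIG_FSCACHE=m
CONFIG_FSCACHE_STATS=y
# CONFIG_FSCACHE_HISTOGRAM is not set
# CONFIG_FSCACHE_DEBUG is not set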
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
new file mode 100644
index 000000000000..91571b95aacc
--- /dev/null
+++ b/fs/fscache/Makefile
@@ -0,0 +1,19 @@
1#
2# Makefile for general filesystem caching code
3#
4
5fscache-y := \
6 cache.o \
7 cookie.o \
8 fsdef.o \
9 main.o \
10 netfs.o \
11 object.o \
12 operation.o \
13 page.o
14
15fscache-$(CONFIG_PROC_FS) += proc.o
16fscache-$(CONFIG_FSCACHE_STATS) += stats.o
17fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
18
19obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
new file mode 100644
index 000000000000..e21985bbb1fb
--- /dev/null
+++ b/fs/fscache/cache.c
@@ -0,0 +1,415 @@
1/* FS-Cache cache handling
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17LIST_HEAD(fscache_cache_list);
18DECLARE_RWSEM(fscache_addremove_sem);
19DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq);
20EXPORT_SYMBOL(fscache_cache_cleared_wq);
21
22static LIST_HEAD(fscache_cache_tag_list);
23
24/*
25 * look up a cache tag
26 */
27struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name)
28{
29 struct fscache_cache_tag *tag, *xtag;
30
31 /* firstly check for the existence of the tag under read lock */
32 down_read(&fscache_addremove_sem);
33
34 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
35 if (strcmp(tag->name, name) == 0) {
36 atomic_inc(&tag->usage);
37 up_read(&fscache_addremove_sem);
38 return tag;
39 }
40 }
41
42 up_read(&fscache_addremove_sem);
43
44 /* the tag does not exist - create a candidate */
45 xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL);
46 if (!xtag)
47 /* return a dummy tag if out of memory */
48 return ERR_PTR(-ENOMEM);
49
50 atomic_set(&xtag->usage, 1);
51 strcpy(xtag->name, name);
52
53 /* write lock, search again and add if still not present */
54 down_write(&fscache_addremove_sem);
55
56 list_for_each_entry(tag, &fscache_cache_tag_list, link) {
57 if (strcmp(tag->name, name) == 0) {
58 atomic_inc(&tag->usage);
59 up_write(&fscache_addremove_sem);
60 kfree(xtag);
61 return tag;
62 }
63 }
64
65 list_add_tail(&xtag->link, &fscache_cache_tag_list);
66 up_write(&fscache_addremove_sem);
67 return xtag;
68}
69
70/*
71 * release a reference to a cache tag
72 */
73void __fscache_release_cache_tag(struct fscache_cache_tag *tag)
74{
75 if (tag != ERR_PTR(-ENOMEM)) {
76 down_write(&fscache_addremove_sem);
77
78 if (atomic_dec_and_test(&tag->usage))
79 list_del_init(&tag->link);
80 else
81 tag = NULL;
82
83 up_write(&fscache_addremove_sem);
84
85 kfree(tag);
86 }
87}
88
89/*
90 * select a cache in which to store an object
91 * - the cache addremove semaphore must be at least read-locked by the caller
92 * - the object will never be an index
93 */
94struct fscache_cache *fscache_select_cache_for_object(
95 struct fscache_cookie *cookie)
96{
97 struct fscache_cache_tag *tag;
98 struct fscache_object *object;
99 struct fscache_cache *cache;
100
101 _enter("");
102
103 if (list_empty(&fscache_cache_list)) {
104 _leave(" = NULL [no cache]");
105 return NULL;
106 }
107
108 /* we check the parent to determine the cache to use */
109 spin_lock(&cookie->lock);
110
111 /* the first in the parent's backing list should be the preferred
112 * cache */
113 if (!hlist_empty(&cookie->backing_objects)) {
114 object = hlist_entry(cookie->backing_objects.first,
115 struct fscache_object, cookie_link);
116
117 cache = object->cache;
118 if (object->state >= FSCACHE_OBJECT_DYING ||
119 test_bit(FSCACHE_IOERROR, &cache->flags))
120 cache = NULL;
121
122 spin_unlock(&cookie->lock);
123 _leave(" = %p [parent]", cache);
124 return cache;
125 }
126
127 /* the parent is unbacked */
128 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
129 /* cookie not an index and is unbacked */
130 spin_unlock(&cookie->lock);
131 _leave(" = NULL [cookie ub,ni]");
132 return NULL;
133 }
134
135 spin_unlock(&cookie->lock);
136
137 if (!cookie->def->select_cache)
138 goto no_preference;
139
140 /* ask the netfs for its preference */
141 tag = cookie->def->select_cache(cookie->parent->netfs_data,
142 cookie->netfs_data);
143 if (!tag)
144 goto no_preference;
145
146 if (tag == ERR_PTR(-ENOMEM)) {
147 _leave(" = NULL [nomem tag]");
148 return NULL;
149 }
150
151 if (!tag->cache) {
152 _leave(" = NULL [unbacked tag]");
153 return NULL;
154 }
155
156 if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
157 return NULL;
158
159 _leave(" = %p [specific]", tag->cache);
160 return tag->cache;
161
162no_preference:
163 /* netfs has no preference - just select first cache */
164 cache = list_entry(fscache_cache_list.next,
165 struct fscache_cache, link);
166 _leave(" = %p [first]", cache);
167 return cache;
168}
169
170/**
171 * fscache_init_cache - Initialise a cache record
172 * @cache: The cache record to be initialised
173 * @ops: The cache operations to be installed in that record
174 * @idfmt: Format string to define identifier
175 * @...: sprintf-style arguments
176 *
177 * Initialise a record of a cache and fill in the name.
178 *
179 * See Documentation/filesystems/caching/backend-api.txt for a complete
180 * description.
181 */
182void fscache_init_cache(struct fscache_cache *cache,
183 const struct fscache_cache_ops *ops,
184 const char *idfmt,
185 ...)
186{
187 va_list va;
188
189 memset(cache, 0, sizeof(*cache));
190
191 cache->ops = ops;
192
193 va_start(va, idfmt);
194 vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va);
195 va_end(va);
196
197 INIT_WORK(&cache->op_gc, fscache_operation_gc);
198 INIT_LIST_HEAD(&cache->link);
199 INIT_LIST_HEAD(&cache->object_list);
200 INIT_LIST_HEAD(&cache->op_gc_list);
201 spin_lock_init(&cache->object_list_lock);
202 spin_lock_init(&cache->op_gc_list_lock);
203}
204EXPORT_SYMBOL(fscache_init_cache);
205
206/**
207 * fscache_add_cache - Declare a cache as being open for business
208 * @cache: The record describing the cache
209 * @ifsdef: The record of the cache object describing the top-level index
210 * @tagname: The tag describing this cache
211 *
212 * Add a cache to the system, making it available for netfs's to use.
213 *
214 * See Documentation/filesystems/caching/backend-api.txt for a complete
215 * description.
216 */
217int fscache_add_cache(struct fscache_cache *cache,
218 struct fscache_object *ifsdef,
219 const char *tagname)
220{
221 struct fscache_cache_tag *tag;
222
223 BUG_ON(!cache->ops);
224 BUG_ON(!ifsdef);
225
226 cache->flags = 0;
227 ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
228 ifsdef->state = FSCACHE_OBJECT_ACTIVE;
229
230 if (!tagname)
231 tagname = cache->identifier;
232
233 BUG_ON(!tagname[0]);
234
235 _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname);
236
237 /* we use the cache tag to uniquely identify caches */
238 tag = __fscache_lookup_cache_tag(tagname);
239 if (IS_ERR(tag))
240 goto nomem;
241
242 if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags))
243 goto tag_in_use;
244
245 cache->kobj = kobject_create_and_add(tagname, fscache_root);
246 if (!cache->kobj)
247 goto error;
248
249 ifsdef->cookie = &fscache_fsdef_index;
250 ifsdef->cache = cache;
251 cache->fsdef = ifsdef;
252
253 down_write(&fscache_addremove_sem);
254
255 tag->cache = cache;
256 cache->tag = tag;
257
258 /* add the cache to the list */
259 list_add(&cache->link, &fscache_cache_list);
260
261 /* add the cache's netfs definition index object to the cache's
262 * list */
263 spin_lock(&cache->object_list_lock);
264 list_add_tail(&ifsdef->cache_link, &cache->object_list);
265 spin_unlock(&cache->object_list_lock);
266
267 /* add the cache's netfs definition index object to the top level index
268 * cookie as a known backing object */
269 spin_lock(&fscache_fsdef_index.lock);
270
271 hlist_add_head(&ifsdef->cookie_link,
272 &fscache_fsdef_index.backing_objects);
273
274 atomic_inc(&fscache_fsdef_index.usage);
275
276 /* done */
277 spin_unlock(&fscache_fsdef_index.lock);
278 up_write(&fscache_addremove_sem);
279
280 printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n",
281 cache->tag->name, cache->ops->name);
282 kobject_uevent(cache->kobj, KOBJ_ADD);
283
284 _leave(" = 0 [%s]", cache->identifier);
285 return 0;
286
287tag_in_use:
288 printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname);
289 __fscache_release_cache_tag(tag);
290	_leave(" = -EEXIST");
291 return -EEXIST;
292
293error:
294 __fscache_release_cache_tag(tag);
295 _leave(" = -EINVAL");
296 return -EINVAL;
297
298nomem:
299 _leave(" = -ENOMEM");
300 return -ENOMEM;
301}
302EXPORT_SYMBOL(fscache_add_cache);
303
304/**
305 * fscache_io_error - Note a cache I/O error
306 * @cache: The record describing the cache
307 *
308 * Note that an I/O error occurred in a cache and that it should no longer be
309 * used for anything. This also reports the error into the kernel log.
310 *
311 * See Documentation/filesystems/caching/backend-api.txt for a complete
312 * description.
313 */
314void fscache_io_error(struct fscache_cache *cache)
315{
316 set_bit(FSCACHE_IOERROR, &cache->flags);
317
318 printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
319 cache->ops->name);
320}
321EXPORT_SYMBOL(fscache_io_error);
322
323/*
324 * request withdrawal of all the objects in a cache
325 * - all the objects being withdrawn are moved onto the supplied list
326 */
327static void fscache_withdraw_all_objects(struct fscache_cache *cache,
328 struct list_head *dying_objects)
329{
330 struct fscache_object *object;
331
332 spin_lock(&cache->object_list_lock);
333
334 while (!list_empty(&cache->object_list)) {
335 object = list_entry(cache->object_list.next,
336 struct fscache_object, cache_link);
337 list_move_tail(&object->cache_link, dying_objects);
338
339 _debug("withdraw %p", object->cookie);
340
341 spin_lock(&object->lock);
342 spin_unlock(&cache->object_list_lock);
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
344 spin_unlock(&object->lock);
345
346 cond_resched();
347 spin_lock(&cache->object_list_lock);
348 }
349
350 spin_unlock(&cache->object_list_lock);
351}
352
353/**
354 * fscache_withdraw_cache - Withdraw a cache from the active service
355 * @cache: The record describing the cache
356 *
357 * Withdraw a cache from service, unbinding all its cache objects from the
358 * netfs cookies they're currently representing.
359 *
360 * See Documentation/filesystems/caching/backend-api.txt for a complete
361 * description.
362 */
363void fscache_withdraw_cache(struct fscache_cache *cache)
364{
365 LIST_HEAD(dying_objects);
366
367 _enter("");
368
369 printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n",
370 cache->tag->name);
371
372 /* make the cache unavailable for cookie acquisition */
373 if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
374 BUG();
375
376 down_write(&fscache_addremove_sem);
377 list_del_init(&cache->link);
378 cache->tag->cache = NULL;
379 up_write(&fscache_addremove_sem);
380
381 /* make sure all pages pinned by operations on behalf of the netfs are
382 * written to disk */
383 cache->ops->sync_cache(cache);
384
385 /* dissociate all the netfs pages backed by this cache from the block
386 * mappings in the cache */
387 cache->ops->dissociate_pages(cache);
388
389 /* we now have to destroy all the active objects pertaining to this
390 * cache - which we do by passing them off to thread pool to be
391 * disposed of */
392 _debug("destroy");
393
394 fscache_withdraw_all_objects(cache, &dying_objects);
395
396 /* wait for all extant objects to finish their outstanding operations
397 * and go away */
398 _debug("wait for finish");
399 wait_event(fscache_cache_cleared_wq,
400 atomic_read(&cache->object_count) == 0);
401 _debug("wait for clearance");
402 wait_event(fscache_cache_cleared_wq,
403 list_empty(&cache->object_list));
404 _debug("cleared");
405 ASSERT(list_empty(&dying_objects));
406
407 kobject_put(cache->kobj);
408
409 clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags);
410 fscache_release_cache_tag(cache->tag);
411 cache->tag = NULL;
412
413 _leave("");
414}
415EXPORT_SYMBOL(fscache_withdraw_cache);
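Taken together, the intended backend flow looks roughly like the sketch below. All names here are hypothetical; a real backend such as CacheFiles fills in a complete fscache_cache_ops table and allocates a real index object.

static const struct fscache_cache_ops example_cache_ops; /* .name, .alloc_object, ... elided */
static struct fscache_cache example_cache;
static struct fscache_object example_fsdef;

static int example_cache_online(const char *tagname)
{
	/* record the ops table and build the identifier string */
	fscache_init_cache(&example_cache, &example_cache_ops,
			   "example-%s", tagname);

	/* ... initialise example_fsdef, the top-level index object ... */

	/* publish the cache; netfs cookies may now be backed by it */
	return fscache_add_cache(&example_cache, &example_fsdef, tagname);
}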
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
new file mode 100644
index 000000000000..72fd18f6c71f
--- /dev/null
+++ b/fs/fscache/cookie.c
@@ -0,0 +1,500 @@
1/* netfs cookie management
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/netfs-api.txt for more information on
12 * the netfs API.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20struct kmem_cache *fscache_cookie_jar;
21
22static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
23
24static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
25static int fscache_alloc_object(struct fscache_cache *cache,
26 struct fscache_cookie *cookie);
27static int fscache_attach_object(struct fscache_cookie *cookie,
28 struct fscache_object *object);
29
30/*
31 * initialise a cookie jar slab element prior to any use
32 */
33void fscache_cookie_init_once(void *_cookie)
34{
35 struct fscache_cookie *cookie = _cookie;
36
37 memset(cookie, 0, sizeof(*cookie));
38 spin_lock_init(&cookie->lock);
39 INIT_HLIST_HEAD(&cookie->backing_objects);
40}
41
42/*
43 * request a cookie to represent an object (index, datafile, xattr, etc)
44 * - parent specifies the parent object
45 * - the top level index cookie for each netfs is stored in the fscache_netfs
46 * struct upon registration
47 * - def points to the definition
48 * - the netfs_data will be passed to the functions pointed to in *def
49 * - all attached caches will be searched to see if they contain this object
50 * - index objects aren't stored on disk until there's a dependent file that
51 * needs storing
52 * - other objects are stored in a selected cache immediately, and all the
53 * indices forming the path to it are instantiated if necessary
54 * - we never let on to the netfs about errors
55 * - we may set a negative cookie pointer, but that's okay
56 */
57struct fscache_cookie *__fscache_acquire_cookie(
58 struct fscache_cookie *parent,
59 const struct fscache_cookie_def *def,
60 void *netfs_data)
61{
62 struct fscache_cookie *cookie;
63
64 BUG_ON(!def);
65
66 _enter("{%s},{%s},%p",
67 parent ? (char *) parent->def->name : "<no-parent>",
68 def->name, netfs_data);
69
70 fscache_stat(&fscache_n_acquires);
71
72 /* if there's no parent cookie, then we don't create one here either */
73 if (!parent) {
74 fscache_stat(&fscache_n_acquires_null);
75 _leave(" [no parent]");
76 return NULL;
77 }
78
79 /* validate the definition */
80 BUG_ON(!def->get_key);
81 BUG_ON(!def->name[0]);
82
83 BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
84 parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
85
86 /* allocate and initialise a cookie */
87 cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
88 if (!cookie) {
89 fscache_stat(&fscache_n_acquires_oom);
90 _leave(" [ENOMEM]");
91 return NULL;
92 }
93
94 atomic_set(&cookie->usage, 1);
95 atomic_set(&cookie->n_children, 0);
96
97 atomic_inc(&parent->usage);
98 atomic_inc(&parent->n_children);
99
100 cookie->def = def;
101 cookie->parent = parent;
102 cookie->netfs_data = netfs_data;
103 cookie->flags = 0;
104
105 INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
106
107 switch (cookie->def->type) {
108 case FSCACHE_COOKIE_TYPE_INDEX:
109 fscache_stat(&fscache_n_cookie_index);
110 break;
111 case FSCACHE_COOKIE_TYPE_DATAFILE:
112 fscache_stat(&fscache_n_cookie_data);
113 break;
114 default:
115 fscache_stat(&fscache_n_cookie_special);
116 break;
117 }
118
119 /* if the object is an index then we need do nothing more here - we
120 * create indices on disk when we need them as an index may exist in
121 * multiple caches */
122 if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
123 if (fscache_acquire_non_index_cookie(cookie) < 0) {
124 atomic_dec(&parent->n_children);
125 __fscache_cookie_put(cookie);
126 fscache_stat(&fscache_n_acquires_nobufs);
127 _leave(" = NULL");
128 return NULL;
129 }
130 }
131
132 fscache_stat(&fscache_n_acquires_ok);
133 _leave(" = %p", cookie);
134 return cookie;
135}
136EXPORT_SYMBOL(__fscache_acquire_cookie);
137
138/*
139 * acquire a non-index cookie
140 * - this must make sure the index chain is instantiated and instantiate the
141 * object representation too
142 */
143static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
144{
145 struct fscache_object *object;
146 struct fscache_cache *cache;
147 uint64_t i_size;
148 int ret;
149
150 _enter("");
151
152 cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
153
154 /* now we need to see whether the backing objects for this cookie yet
155 * exist, if not there'll be nothing to search */
156 down_read(&fscache_addremove_sem);
157
158 if (list_empty(&fscache_cache_list)) {
159 up_read(&fscache_addremove_sem);
160 _leave(" = 0 [no caches]");
161 return 0;
162 }
163
164 /* select a cache in which to store the object */
165 cache = fscache_select_cache_for_object(cookie->parent);
166 if (!cache) {
167 up_read(&fscache_addremove_sem);
168 fscache_stat(&fscache_n_acquires_no_cache);
169 _leave(" = -ENOMEDIUM [no cache]");
170 return -ENOMEDIUM;
171 }
172
173 _debug("cache %s", cache->tag->name);
174
175 cookie->flags =
176 (1 << FSCACHE_COOKIE_LOOKING_UP) |
177 (1 << FSCACHE_COOKIE_CREATING) |
178 (1 << FSCACHE_COOKIE_NO_DATA_YET);
179
180 /* ask the cache to allocate objects for this cookie and its parent
181 * chain */
182 ret = fscache_alloc_object(cache, cookie);
183 if (ret < 0) {
184 up_read(&fscache_addremove_sem);
185 _leave(" = %d", ret);
186 return ret;
187 }
188
189 /* pass on how big the object we're caching is supposed to be */
190 cookie->def->get_attr(cookie->netfs_data, &i_size);
191
192 spin_lock(&cookie->lock);
193 if (hlist_empty(&cookie->backing_objects)) {
194 spin_unlock(&cookie->lock);
195 goto unavailable;
196 }
197
198 object = hlist_entry(cookie->backing_objects.first,
199 struct fscache_object, cookie_link);
200
201 fscache_set_store_limit(object, i_size);
202
203 /* initiate the process of looking up all the objects in the chain
204 * (done by fscache_initialise_object()) */
205 fscache_enqueue_object(object);
206
207 spin_unlock(&cookie->lock);
208
209 /* we may be required to wait for lookup to complete at this point */
210 if (!fscache_defer_lookup) {
211 _debug("non-deferred lookup %p", &cookie->flags);
212 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
213 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
214 _debug("complete");
215 if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
216 goto unavailable;
217 }
218
219 up_read(&fscache_addremove_sem);
220 _leave(" = 0 [deferred]");
221 return 0;
222
223unavailable:
224 up_read(&fscache_addremove_sem);
225 _leave(" = -ENOBUFS");
226 return -ENOBUFS;
227}
228
229/*
230 * recursively allocate cache object records for a cookie/cache combination
231 * - caller must be holding the addremove sem
232 */
233static int fscache_alloc_object(struct fscache_cache *cache,
234 struct fscache_cookie *cookie)
235{
236 struct fscache_object *object;
237 struct hlist_node *_n;
238 int ret;
239
240 _enter("%p,%p{%s}", cache, cookie, cookie->def->name);
241
242 spin_lock(&cookie->lock);
243 hlist_for_each_entry(object, _n, &cookie->backing_objects,
244 cookie_link) {
245 if (object->cache == cache)
246 goto object_already_extant;
247 }
248 spin_unlock(&cookie->lock);
249
250 /* ask the cache to allocate an object (we may end up with duplicate
251 * objects at this stage, but we sort that out later) */
252 object = cache->ops->alloc_object(cache, cookie);
253 if (IS_ERR(object)) {
254 fscache_stat(&fscache_n_object_no_alloc);
255 ret = PTR_ERR(object);
256 goto error;
257 }
258
259 fscache_stat(&fscache_n_object_alloc);
260
261 object->debug_id = atomic_inc_return(&fscache_object_debug_id);
262
263 _debug("ALLOC OBJ%x: %s {%lx}",
264 object->debug_id, cookie->def->name, object->events);
265
266 ret = fscache_alloc_object(cache, cookie->parent);
267 if (ret < 0)
268 goto error_put;
269
270 /* only attach if we managed to allocate all we needed, otherwise
271 * discard the object we just allocated and instead use the one
272 * attached to the cookie */
273 if (fscache_attach_object(cookie, object) < 0)
274 cache->ops->put_object(object);
275
276 _leave(" = 0");
277 return 0;
278
279object_already_extant:
280 ret = -ENOBUFS;
281 if (object->state >= FSCACHE_OBJECT_DYING) {
282 spin_unlock(&cookie->lock);
283 goto error;
284 }
285 spin_unlock(&cookie->lock);
286 _leave(" = 0 [found]");
287 return 0;
288
289error_put:
290 cache->ops->put_object(object);
291error:
292 _leave(" = %d", ret);
293 return ret;
294}
295
296/*
297 * attach a cache object to a cookie
298 */
299static int fscache_attach_object(struct fscache_cookie *cookie,
300 struct fscache_object *object)
301{
302 struct fscache_object *p;
303 struct fscache_cache *cache = object->cache;
304 struct hlist_node *_n;
305 int ret;
306
307 _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
308
309 spin_lock(&cookie->lock);
310
311 /* there may be multiple initial creations of this object, but we only
312 * want one */
313 ret = -EEXIST;
314 hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) {
315 if (p->cache == object->cache) {
316 if (p->state >= FSCACHE_OBJECT_DYING)
317 ret = -ENOBUFS;
318 goto cant_attach_object;
319 }
320 }
321
322 /* pin the parent object */
323 spin_lock_nested(&cookie->parent->lock, 1);
324 hlist_for_each_entry(p, _n, &cookie->parent->backing_objects,
325 cookie_link) {
326 if (p->cache == object->cache) {
327 if (p->state >= FSCACHE_OBJECT_DYING) {
328 ret = -ENOBUFS;
329 spin_unlock(&cookie->parent->lock);
330 goto cant_attach_object;
331 }
332 object->parent = p;
333 spin_lock(&p->lock);
334 p->n_children++;
335 spin_unlock(&p->lock);
336 break;
337 }
338 }
339 spin_unlock(&cookie->parent->lock);
340
341 /* attach to the cache's object list */
342 if (list_empty(&object->cache_link)) {
343 spin_lock(&cache->object_list_lock);
344 list_add(&object->cache_link, &cache->object_list);
345 spin_unlock(&cache->object_list_lock);
346 }
347
348 /* attach to the cookie */
349 object->cookie = cookie;
350 atomic_inc(&cookie->usage);
351 hlist_add_head(&object->cookie_link, &cookie->backing_objects);
352 ret = 0;
353
354cant_attach_object:
355 spin_unlock(&cookie->lock);
356 _leave(" = %d", ret);
357 return ret;
358}
359
360/*
361 * update the index entries backing a cookie
362 */
363void __fscache_update_cookie(struct fscache_cookie *cookie)
364{
365 struct fscache_object *object;
366 struct hlist_node *_p;
367
368 fscache_stat(&fscache_n_updates);
369
370 if (!cookie) {
371 fscache_stat(&fscache_n_updates_null);
372 _leave(" [no cookie]");
373 return;
374 }
375
376 _enter("{%s}", cookie->def->name);
377
378 BUG_ON(!cookie->def->get_aux);
379
380 spin_lock(&cookie->lock);
381
382 /* update the index entry on disk in each cache backing this cookie */
383 hlist_for_each_entry(object, _p,
384 &cookie->backing_objects, cookie_link) {
385 fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
386 }
387
388 spin_unlock(&cookie->lock);
389 _leave("");
390}
391EXPORT_SYMBOL(__fscache_update_cookie);
392
393/*
394 * release a cookie back to the cache
395 * - the object will be marked as recyclable on disk if retire is true
396 * - all dependents of this cookie must have already been unregistered
397 * (indices/files/pages)
398 */
399void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
400{
401 struct fscache_cache *cache;
402 struct fscache_object *object;
403 unsigned long event;
404
405 fscache_stat(&fscache_n_relinquishes);
406
407 if (!cookie) {
408 fscache_stat(&fscache_n_relinquishes_null);
409 _leave(" [no cookie]");
410 return;
411 }
412
413 _enter("%p{%s,%p},%d",
414 cookie, cookie->def->name, cookie->netfs_data, retire);
415
416 if (atomic_read(&cookie->n_children) != 0) {
417 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
418 cookie->def->name);
419 BUG();
420 }
421
422 /* wait for the cookie to finish being instantiated (or to fail) */
423 if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
424 fscache_stat(&fscache_n_relinquishes_waitcrt);
425 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
426 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
427 }
428
429 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
430
431 /* detach pointers back to the netfs */
432 spin_lock(&cookie->lock);
433
434 cookie->netfs_data = NULL;
435 cookie->def = NULL;
436
437 /* break links with all the active objects */
438 while (!hlist_empty(&cookie->backing_objects)) {
439 object = hlist_entry(cookie->backing_objects.first,
440 struct fscache_object,
441 cookie_link);
442
443 _debug("RELEASE OBJ%x", object->debug_id);
444
445 /* detach each cache object from the object cookie */
446 spin_lock(&object->lock);
447 hlist_del_init(&object->cookie_link);
448
449 cache = object->cache;
450 object->cookie = NULL;
451 fscache_raise_event(object, event);
452 spin_unlock(&object->lock);
453
454 if (atomic_dec_and_test(&cookie->usage))
455 /* the cookie refcount shouldn't be reduced to 0 yet */
456 BUG();
457 }
458
459 spin_unlock(&cookie->lock);
460
461 if (cookie->parent) {
462 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
463 ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
464 atomic_dec(&cookie->parent->n_children);
465 }
466
467 /* finally dispose of the cookie */
468 ASSERTCMP(atomic_read(&cookie->usage), >, 0);
469 fscache_cookie_put(cookie);
470
471 _leave("");
472}
473EXPORT_SYMBOL(__fscache_relinquish_cookie);
474
475/*
476 * destroy a cookie
477 */
478void __fscache_cookie_put(struct fscache_cookie *cookie)
479{
480 struct fscache_cookie *parent;
481
482 _enter("%p", cookie);
483
484 for (;;) {
485 _debug("FREE COOKIE %p", cookie);
486 parent = cookie->parent;
487 BUG_ON(!hlist_empty(&cookie->backing_objects));
488 kmem_cache_free(fscache_cookie_jar, cookie);
489
490 if (!parent)
491 break;
492
493 cookie = parent;
494 BUG_ON(atomic_read(&cookie->usage) <= 0);
495 if (!atomic_dec_and_test(&cookie->usage))
496 break;
497 }
498
499 _leave("");
500}
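From the netfs side, the cookie lifecycle above is driven through the acquire/relinquish wrappers. A hedged sketch follows; example_netfs, the callbacks and the inode type are illustrative assumptions, not part of the patch.

static const struct fscache_cookie_def example_file_def = {
	.name		= "example.file",
	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
	.get_key	= example_get_key,	/* hypothetical callbacks */
	.get_attr	= example_get_attr,
};

static void example_cache_inode(struct example_inode *ei)
{
	/* the parent index cookie was set up by fscache_register_netfs() */
	ei->cookie = fscache_acquire_cookie(example_netfs.primary_index,
					    &example_file_def, ei);
	/* NULL here simply means "uncached" - errors are never reported */
}

static void example_evict_inode(struct example_inode *ei)
{
	fscache_relinquish_cookie(ei->cookie, 0);	/* 0: keep on disk */
}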
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
new file mode 100644
index 000000000000..f5b4baee7352
--- /dev/null
+++ b/fs/fscache/fsdef.c
@@ -0,0 +1,144 @@
1/* Filesystem index definition
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include "internal.h"
15
16static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
17 void *buffer, uint16_t bufmax);
18
19static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
20 void *buffer, uint16_t bufmax);
21
22static
23enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
24 const void *data,
25 uint16_t datalen);
26
27/*
28 * The root index is owned by FS-Cache itself.
29 *
30 * When a netfs requests caching facilities, FS-Cache will, if one doesn't
31 * already exist, create an entry in the root index with the key being the name
32 * of the netfs ("AFS" for example), and the auxiliary data holding the index
33 * structure version supplied by the netfs:
34 *
35 * FSDEF
36 * |
37 * +-----------+
38 * | |
39 * NFS AFS
40 * [v=1] [v=1]
41 *
42 * If an entry with the appropriate name does already exist, the version is
43 * compared. If the version is different, the entire subtree from that entry
44 * will be discarded and a new entry created.
45 *
46 * The new entry will be an index, and a cookie referring to it will be passed
47 * to the netfs. This is then the root handle by which the netfs accesses the
48 * cache. It can create whatever objects it likes in that index, including
49 * further indices.
50 */
51static struct fscache_cookie_def fscache_fsdef_index_def = {
52 .name = ".FS-Cache",
53 .type = FSCACHE_COOKIE_TYPE_INDEX,
54};
55
56struct fscache_cookie fscache_fsdef_index = {
57 .usage = ATOMIC_INIT(1),
58 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
59 .backing_objects = HLIST_HEAD_INIT,
60 .def = &fscache_fsdef_index_def,
61};
62EXPORT_SYMBOL(fscache_fsdef_index);
63
64/*
65 * Definition of an entry in the root index. Each entry is an index, keyed to
66 * a specific netfs and only applicable to a particular version of the index
67 * structure used by that netfs.
68 */
69struct fscache_cookie_def fscache_fsdef_netfs_def = {
70 .name = "FSDEF.netfs",
71 .type = FSCACHE_COOKIE_TYPE_INDEX,
72 .get_key = fscache_fsdef_netfs_get_key,
73 .get_aux = fscache_fsdef_netfs_get_aux,
74 .check_aux = fscache_fsdef_netfs_check_aux,
75};
76
77/*
78 * get the key data for an FSDEF index record - this is the name of the netfs
79 * for which this entry is created
80 */
81static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
82 void *buffer, uint16_t bufmax)
83{
84 const struct fscache_netfs *netfs = cookie_netfs_data;
85 unsigned klen;
86
87 _enter("{%s.%u},", netfs->name, netfs->version);
88
89 klen = strlen(netfs->name);
90 if (klen > bufmax)
91 return 0;
92
93 memcpy(buffer, netfs->name, klen);
94 return klen;
95}
96
97/*
98 * get the auxiliary data for an FSDEF index record - this is the index
99 * structure version number of the netfs for which this version is created
100 */
101static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
102 void *buffer, uint16_t bufmax)
103{
104 const struct fscache_netfs *netfs = cookie_netfs_data;
105 unsigned dlen;
106
107 _enter("{%s.%u},", netfs->name, netfs->version);
108
109 dlen = sizeof(uint32_t);
110 if (dlen > bufmax)
111 return 0;
112
113 memcpy(buffer, &netfs->version, dlen);
114 return dlen;
115}
116
117/*
118 * check that the index structure version number stored in the auxiliary data
119 * matches the one the netfs gave us
120 */
121static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
122 void *cookie_netfs_data,
123 const void *data,
124 uint16_t datalen)
125{
126 struct fscache_netfs *netfs = cookie_netfs_data;
127 uint32_t version;
128
129 _enter("{%s},,%hu", netfs->name, datalen);
130
131 if (datalen != sizeof(version)) {
132 _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version));
133 return FSCACHE_CHECKAUX_OBSOLETE;
134 }
135
136 memcpy(&version, data, sizeof(version));
137 if (version != netfs->version) {
138 _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version);
139 return FSCACHE_CHECKAUX_OBSOLETE;
140 }
141
142 _leave(" = OKAY");
143 return FSCACHE_CHECKAUX_OKAY;
144}
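The netfs counterpart to this index definition is registration. A minimal sketch under stated assumptions (the module is hypothetical; fscache_register_netfs()/fscache_unregister_netfs() are the API this structure serves):

static struct fscache_netfs example_netfs = {
	.name		= "examplefs",
	.version	= 1,	/* bumping this discards the old subtree */
};

static int __init example_init(void)
{
	/* creates/validates the "examplefs" entry under FSDEF; on success
	 * example_netfs.primary_index is the netfs's root index cookie */
	return fscache_register_netfs(&example_netfs);
}

static void __exit example_exit(void)
{
	fscache_unregister_netfs(&example_netfs);
}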
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
new file mode 100644
index 000000000000..bad496748a59
--- /dev/null
+++ b/fs/fscache/histogram.c
@@ -0,0 +1,109 @@
1/* FS-Cache latency histogram
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL THREAD
13#include <linux/module.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
16#include "internal.h"
17
18atomic_t fscache_obj_instantiate_histogram[HZ];
19atomic_t fscache_objs_histogram[HZ];
20atomic_t fscache_ops_histogram[HZ];
21atomic_t fscache_retrieval_delay_histogram[HZ];
22atomic_t fscache_retrieval_histogram[HZ];
23
24/*
25 * display the time-taken histogram
26 */
27static int fscache_histogram_show(struct seq_file *m, void *v)
28{
29 unsigned long index;
30 unsigned n[5], t;
31
32 switch ((unsigned long) v) {
33 case 1:
34 seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS "
35 " RETRV DLY RETRIEVLS\n");
36 return 0;
37 case 2:
38 seq_puts(m, "===== ===== ========= ========= ========="
39 " ========= =========\n");
40 return 0;
41 default:
42 index = (unsigned long) v - 3;
43 n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]);
44 n[1] = atomic_read(&fscache_ops_histogram[index]);
45 n[2] = atomic_read(&fscache_objs_histogram[index]);
46 n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]);
47 n[4] = atomic_read(&fscache_retrieval_histogram[index]);
48 if (!(n[0] | n[1] | n[2] | n[3] | n[4]))
49 return 0;
50
51 t = (index * 1000) / HZ;
52
53 seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n",
54 index, t, n[0], n[1], n[2], n[3], n[4]);
55 return 0;
56 }
57}
58
59/*
60 * set up the iterator to start reading from the first line
61 */
62static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos)
63{
64 if ((unsigned long long)*_pos >= HZ + 2)
65 return NULL;
66 if (*_pos == 0)
67 *_pos = 1;
68 return (void *)(unsigned long) *_pos;
69}
70
71/*
72 * move to the next line
73 */
74static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos)
75{
76 (*pos)++;
77 return (unsigned long long)*pos > HZ + 2 ?
78 NULL : (void *)(unsigned long) *pos;
79}
80
81/*
82 * clean up after reading
83 */
84static void fscache_histogram_stop(struct seq_file *m, void *v)
85{
86}
87
88static const struct seq_operations fscache_histogram_ops = {
89 .start = fscache_histogram_start,
90 .stop = fscache_histogram_stop,
91 .next = fscache_histogram_next,
92 .show = fscache_histogram_show,
93};
94
95/*
96 * open "/proc/fs/fscache/histogram" to provide latency data
97 */
98static int fscache_histogram_open(struct inode *inode, struct file *file)
99{
100 return seq_open(file, &fscache_histogram_ops);
101}
102
103const struct file_operations fscache_histogram_fops = {
104 .owner = THIS_MODULE,
105 .open = fscache_histogram_open,
106 .read = seq_read,
107 .llseek = seq_lseek,
108 .release = seq_release,
109};
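Given the seq_printf() format above, reading /proc/fs/fscache/histogram yields one row per jiffy bucket; a hypothetical excerpt (the counts are invented):

JIFS  SECS  OBJ INST  OP RUNS  OBJ RUNS  RETRV DLY RETRIEVLS
===== ===== ========= ========= ========= ========= =========
   0  0.000        31        42        26         9        14
   1  0.001         5        11         7         2         3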
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
new file mode 100644
index 000000000000..e0cbd16f6dc9
--- /dev/null
+++ b/fs/fscache/internal.h
@@ -0,0 +1,380 @@
1/* Internal definitions for FS-Cache
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12/*
13 * Lock order, in the order in which multiple locks should be obtained:
14 * - fscache_addremove_sem
15 * - cookie->lock
16 * - cookie->parent->lock
17 * - cache->object_list_lock
18 * - object->lock
19 * - object->parent->lock
20 * - fscache_thread_lock
21 *
22 */
23
24#include <linux/fscache-cache.h>
25#include <linux/sched.h>
26
27#define FSCACHE_MIN_THREADS 4
28#define FSCACHE_MAX_THREADS 32
29
30/*
31 * cache.c
32 */
33extern struct list_head fscache_cache_list;
34extern struct rw_semaphore fscache_addremove_sem;
35
36extern struct fscache_cache *fscache_select_cache_for_object(
37 struct fscache_cookie *);
38
39/*
40 * cookie.c
41 */
42extern struct kmem_cache *fscache_cookie_jar;
43
44extern void fscache_cookie_init_once(void *);
45extern void __fscache_cookie_put(struct fscache_cookie *);
46
47/*
48 * fsdef.c
49 */
50extern struct fscache_cookie fscache_fsdef_index;
51extern struct fscache_cookie_def fscache_fsdef_netfs_def;
52
53/*
54 * histogram.c
55 */
56#ifdef CONFIG_FSCACHE_HISTOGRAM
57extern atomic_t fscache_obj_instantiate_histogram[HZ];
58extern atomic_t fscache_objs_histogram[HZ];
59extern atomic_t fscache_ops_histogram[HZ];
60extern atomic_t fscache_retrieval_delay_histogram[HZ];
61extern atomic_t fscache_retrieval_histogram[HZ];
62
63static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
64{
65 unsigned long jif = jiffies - start_jif;
66 if (jif >= HZ)
67 jif = HZ - 1;
68 atomic_inc(&histogram[jif]);
69}
70
71extern const struct file_operations fscache_histogram_fops;
72
73#else
74#define fscache_hist(hist, start_jif) do {} while (0)
75#endif
76
77/*
78 * main.c
79 */
80extern unsigned fscache_defer_lookup;
81extern unsigned fscache_defer_create;
82extern unsigned fscache_debug;
83extern struct kobject *fscache_root;
84
85extern int fscache_wait_bit(void *);
86extern int fscache_wait_bit_interruptible(void *);
87
88/*
89 * object.c
90 */
91extern void fscache_withdrawing_object(struct fscache_cache *,
92 struct fscache_object *);
93extern void fscache_enqueue_object(struct fscache_object *);
94
95/*
96 * operation.c
97 */
98extern int fscache_submit_exclusive_op(struct fscache_object *,
99 struct fscache_operation *);
100extern int fscache_submit_op(struct fscache_object *,
101 struct fscache_operation *);
102extern void fscache_abort_object(struct fscache_object *);
103extern void fscache_start_operations(struct fscache_object *);
104extern void fscache_operation_gc(struct work_struct *);
105
106/*
107 * proc.c
108 */
109#ifdef CONFIG_PROC_FS
110extern int __init fscache_proc_init(void);
111extern void fscache_proc_cleanup(void);
112#else
113#define fscache_proc_init() (0)
114#define fscache_proc_cleanup() do {} while (0)
115#endif
116
117/*
118 * stats.c
119 */
120#ifdef CONFIG_FSCACHE_STATS
121extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
122extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS];
123
124extern atomic_t fscache_n_op_pend;
125extern atomic_t fscache_n_op_run;
126extern atomic_t fscache_n_op_enqueue;
127extern atomic_t fscache_n_op_deferred_release;
128extern atomic_t fscache_n_op_release;
129extern atomic_t fscache_n_op_gc;
130
131extern atomic_t fscache_n_attr_changed;
132extern atomic_t fscache_n_attr_changed_ok;
133extern atomic_t fscache_n_attr_changed_nobufs;
134extern atomic_t fscache_n_attr_changed_nomem;
135extern atomic_t fscache_n_attr_changed_calls;
136
137extern atomic_t fscache_n_allocs;
138extern atomic_t fscache_n_allocs_ok;
139extern atomic_t fscache_n_allocs_wait;
140extern atomic_t fscache_n_allocs_nobufs;
141extern atomic_t fscache_n_alloc_ops;
142extern atomic_t fscache_n_alloc_op_waits;
143
144extern atomic_t fscache_n_retrievals;
145extern atomic_t fscache_n_retrievals_ok;
146extern atomic_t fscache_n_retrievals_wait;
147extern atomic_t fscache_n_retrievals_nodata;
148extern atomic_t fscache_n_retrievals_nobufs;
149extern atomic_t fscache_n_retrievals_intr;
150extern atomic_t fscache_n_retrievals_nomem;
151extern atomic_t fscache_n_retrieval_ops;
152extern atomic_t fscache_n_retrieval_op_waits;
153
154extern atomic_t fscache_n_stores;
155extern atomic_t fscache_n_stores_ok;
156extern atomic_t fscache_n_stores_again;
157extern atomic_t fscache_n_stores_nobufs;
158extern atomic_t fscache_n_stores_oom;
159extern atomic_t fscache_n_store_ops;
160extern atomic_t fscache_n_store_calls;
161
162extern atomic_t fscache_n_marks;
163extern atomic_t fscache_n_uncaches;
164
165extern atomic_t fscache_n_acquires;
166extern atomic_t fscache_n_acquires_null;
167extern atomic_t fscache_n_acquires_no_cache;
168extern atomic_t fscache_n_acquires_ok;
169extern atomic_t fscache_n_acquires_nobufs;
170extern atomic_t fscache_n_acquires_oom;
171
172extern atomic_t fscache_n_updates;
173extern atomic_t fscache_n_updates_null;
174extern atomic_t fscache_n_updates_run;
175
176extern atomic_t fscache_n_relinquishes;
177extern atomic_t fscache_n_relinquishes_null;
178extern atomic_t fscache_n_relinquishes_waitcrt;
179
180extern atomic_t fscache_n_cookie_index;
181extern atomic_t fscache_n_cookie_data;
182extern atomic_t fscache_n_cookie_special;
183
184extern atomic_t fscache_n_object_alloc;
185extern atomic_t fscache_n_object_no_alloc;
186extern atomic_t fscache_n_object_lookups;
187extern atomic_t fscache_n_object_lookups_negative;
188extern atomic_t fscache_n_object_lookups_positive;
189extern atomic_t fscache_n_object_created;
190extern atomic_t fscache_n_object_avail;
191extern atomic_t fscache_n_object_dead;
192
193extern atomic_t fscache_n_checkaux_none;
194extern atomic_t fscache_n_checkaux_okay;
195extern atomic_t fscache_n_checkaux_update;
196extern atomic_t fscache_n_checkaux_obsolete;
197
198static inline void fscache_stat(atomic_t *stat)
199{
200 atomic_inc(stat);
201}
202
203extern const struct file_operations fscache_stats_fops;
204#else
205
206#define fscache_stat(stat) do {} while (0)
207#endif
208
209/*
210 * raise an event on an object
211 * - if the event is not masked for that object, then the object is
212 * queued for attention by the thread pool.
213 */
214static inline void fscache_raise_event(struct fscache_object *object,
215 unsigned event)
216{
217 if (!test_and_set_bit(event, &object->events) &&
218 test_bit(event, &object->event_mask))
219 fscache_enqueue_object(object);
220}
221
222/*
223 * drop a reference to a cookie
224 */
225static inline void fscache_cookie_put(struct fscache_cookie *cookie)
226{
227 BUG_ON(atomic_read(&cookie->usage) <= 0);
228 if (atomic_dec_and_test(&cookie->usage))
229 __fscache_cookie_put(cookie);
230}
231
232/*
233 * get an extra reference to a netfs retrieval context
234 */
235static inline
236void *fscache_get_context(struct fscache_cookie *cookie, void *context)
237{
238 if (cookie->def->get_context)
239 cookie->def->get_context(cookie->netfs_data, context);
240 return context;
241}
242
243/*
244 * release a reference to a netfs retrieval context
245 */
246static inline
247void fscache_put_context(struct fscache_cookie *cookie, void *context)
248{
249 if (cookie->def->put_context)
250 cookie->def->put_context(cookie->netfs_data, context);
251}
252
253/*****************************************************************************/
254/*
255 * debug tracing
256 */
257#define dbgprintk(FMT, ...) \
258 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
259
260/* make sure we maintain the format strings, even when debugging is disabled */
261static inline __attribute__((format(printf, 1, 2)))
262void _dbprintk(const char *fmt, ...)
263{
264}
265
266#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
267#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
268#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
269
270#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
271
272#ifdef __KDEBUG
273#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
274#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
275#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
276
277#elif defined(CONFIG_FSCACHE_DEBUG)
278#define _enter(FMT, ...) \
279do { \
280 if (__do_kdebug(ENTER)) \
281 kenter(FMT, ##__VA_ARGS__); \
282} while (0)
283
284#define _leave(FMT, ...) \
285do { \
286 if (__do_kdebug(LEAVE)) \
287 kleave(FMT, ##__VA_ARGS__); \
288} while (0)
289
290#define _debug(FMT, ...) \
291do { \
292 if (__do_kdebug(DEBUG)) \
293 kdebug(FMT, ##__VA_ARGS__); \
294} while (0)
295
296#else
297#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
298#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
299#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
300#endif
301
302/*
303 * determine whether a particular optional debugging point should be logged
304 * - we need to go through three steps to persuade cpp to correctly join the
305 * shorthand in FSCACHE_DEBUG_LEVEL with its prefix
306 */
307#define ____do_kdebug(LEVEL, POINT) \
308 unlikely((fscache_debug & \
309 (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
310#define ___do_kdebug(LEVEL, POINT) \
311 ____do_kdebug(LEVEL, POINT)
312#define __do_kdebug(POINT) \
313 ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
314
315#define FSCACHE_DEBUG_CACHE 0
316#define FSCACHE_DEBUG_COOKIE 1
317#define FSCACHE_DEBUG_PAGE 2
318#define FSCACHE_DEBUG_OPERATION 3
319
320#define FSCACHE_POINT_ENTER 1
321#define FSCACHE_POINT_LEAVE 2
322#define FSCACHE_POINT_DEBUG 4
323
324#ifndef FSCACHE_DEBUG_LEVEL
325#define FSCACHE_DEBUG_LEVEL CACHE
326#endif
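/*
 * Worked expansion (illustrative, not part of the original commit): in a
 * file that sets FSCACHE_DEBUG_LEVEL to PAGE, __do_kdebug(ENTER) expands
 * through ___do_kdebug() and ____do_kdebug() to:
 *
 *	unlikely((fscache_debug &
 *		  (FSCACHE_POINT_ENTER << (FSCACHE_DEBUG_PAGE * 3))))
 *
 * i.e. bit 6 (0x40) of the fscache_debug module parameter gates function
 * entry tracing for the page level; each debug level owns a group of three
 * bits (enter/leave/debug).
 */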
327
328/*
329 * assertions
330 */
331#if 1 /* defined(__KDEBUGALL) */
332
333#define ASSERT(X) \
334do { \
335 if (unlikely(!(X))) { \
336 printk(KERN_ERR "\n"); \
337 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
338 BUG(); \
339 } \
340} while (0)
341
342#define ASSERTCMP(X, OP, Y) \
343do { \
344 if (unlikely(!((X) OP (Y)))) { \
345 printk(KERN_ERR "\n"); \
346 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
347 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
348 (unsigned long)(X), (unsigned long)(Y)); \
349 BUG(); \
350 } \
351} while (0)
352
353#define ASSERTIF(C, X) \
354do { \
355 if (unlikely((C) && !(X))) { \
356 printk(KERN_ERR "\n"); \
357 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
358 BUG(); \
359 } \
360} while (0)
361
362#define ASSERTIFCMP(C, X, OP, Y) \
363do { \
364 if (unlikely((C) && !((X) OP (Y)))) { \
365 printk(KERN_ERR "\n"); \
366 printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
367 printk(KERN_ERR "%lx " #OP " %lx is false\n", \
368 (unsigned long)(X), (unsigned long)(Y)); \
369 BUG(); \
370 } \
371} while (0)
372
373#else
374
375#define ASSERT(X) do {} while (0)
376#define ASSERTCMP(X, OP, Y) do {} while (0)
377#define ASSERTIF(C, X) do {} while (0)
378#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
379
380#endif /* assert or not */
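/*
 * Illustrative use (not part of the original commit): ASSERTCMP() reports
 * both operand values before calling BUG(), so an inconsistency in the op
 * accounting, e.g.:
 *
 *	ASSERTCMP(object->n_ops, >=, object->n_in_progress);
 *
 * logs "FS-Cache: Assertion failed" followed by the two counter values and
 * the relation that failed, rather than a bare BUG().
 */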
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
new file mode 100644
index 000000000000..4de41b597499
--- /dev/null
+++ b/fs/fscache/main.c
@@ -0,0 +1,124 @@
1/* General filesystem local caching manager
2 *
3 * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL CACHE
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/completion.h>
17#include <linux/slab.h>
18#include "internal.h"
19
20MODULE_DESCRIPTION("FS Cache Manager");
21MODULE_AUTHOR("Red Hat, Inc.");
22MODULE_LICENSE("GPL");
23
24unsigned fscache_defer_lookup = 1;
25module_param_named(defer_lookup, fscache_defer_lookup, uint,
26 S_IWUSR | S_IRUGO);
27MODULE_PARM_DESC(fscache_defer_lookup,
28 "Defer cookie lookup to background thread");
29
30unsigned fscache_defer_create = 1;
31module_param_named(defer_create, fscache_defer_create, uint,
32 S_IWUSR | S_IRUGO);
33MODULE_PARM_DESC(fscache_defer_create,
34 "Defer cookie creation to background thread");
35
36unsigned fscache_debug;
37module_param_named(debug, fscache_debug, uint,
38 S_IWUSR | S_IRUGO);
39MODULE_PARM_DESC(fscache_debug,
40 "FS-Cache debugging mask");
41
42struct kobject *fscache_root;
43
44/*
45 * initialise the fs caching module
46 */
47static int __init fscache_init(void)
48{
49 int ret;
50
51 ret = slow_work_register_user();
52 if (ret < 0)
53 goto error_slow_work;
54
55 ret = fscache_proc_init();
56 if (ret < 0)
57 goto error_proc;
58
59 fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
60 sizeof(struct fscache_cookie),
61 0,
62 0,
63 fscache_cookie_init_once);
64 if (!fscache_cookie_jar) {
65 printk(KERN_NOTICE
66 "FS-Cache: Failed to allocate a cookie jar\n");
67 ret = -ENOMEM;
68 goto error_cookie_jar;
69 }
70	ret = -ENOMEM;
71 fscache_root = kobject_create_and_add("fscache", kernel_kobj);
72 if (!fscache_root)
73 goto error_kobj;
74
75 printk(KERN_NOTICE "FS-Cache: Loaded\n");
76 return 0;
77
78error_kobj:
79 kmem_cache_destroy(fscache_cookie_jar);
80error_cookie_jar:
81 fscache_proc_cleanup();
82error_proc:
83 slow_work_unregister_user();
84error_slow_work:
85 return ret;
86}
87
88fs_initcall(fscache_init);
89
90/*
91 * clean up on module removal
92 */
93static void __exit fscache_exit(void)
94{
95 _enter("");
96
97 kobject_put(fscache_root);
98 kmem_cache_destroy(fscache_cookie_jar);
99 fscache_proc_cleanup();
100 slow_work_unregister_user();
101 printk(KERN_NOTICE "FS-Cache: Unloaded\n");
102}
103
104module_exit(fscache_exit);
105
106/*
107 * wait_on_bit() sleep function for uninterruptible waiting
108 */
109int fscache_wait_bit(void *flags)
110{
111 schedule();
112 return 0;
113}
114EXPORT_SYMBOL(fscache_wait_bit);
115
116/*
117 * wait_on_bit() sleep function for interruptible waiting
118 */
119int fscache_wait_bit_interruptible(void *flags)
120{
121 schedule();
122 return signal_pending(current);
123}
124EXPORT_SYMBOL(fscache_wait_bit_interruptible);
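/*
 * Usage sketch (not part of the original commit): these are wait_on_bit()
 * action functions; the caller passes one together with a task state, and
 * wait_on_bit() invokes it while the bit remains set, e.g. as page.c does
 * when waiting for a deferred cookie lookup:
 *
 *	wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
 *		    fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE);
 */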
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
new file mode 100644
index 000000000000..e028b8eb1c40
--- /dev/null
+++ b/fs/fscache/netfs.c
@@ -0,0 +1,103 @@
1/* FS-Cache netfs (client) registration
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h>
14#include <linux/slab.h>
15#include "internal.h"
16
17static LIST_HEAD(fscache_netfs_list);
18
19/*
20 * register a network filesystem for caching
21 */
22int __fscache_register_netfs(struct fscache_netfs *netfs)
23{
24 struct fscache_netfs *ptr;
25 int ret;
26
27 _enter("{%s}", netfs->name);
28
29 INIT_LIST_HEAD(&netfs->link);
30
31 /* allocate a cookie for the primary index */
32 netfs->primary_index =
33 kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
34
35 if (!netfs->primary_index) {
36 _leave(" = -ENOMEM");
37 return -ENOMEM;
38 }
39
40 /* initialise the primary index cookie */
41 atomic_set(&netfs->primary_index->usage, 1);
42 atomic_set(&netfs->primary_index->n_children, 0);
43
44 netfs->primary_index->def = &fscache_fsdef_netfs_def;
45 netfs->primary_index->parent = &fscache_fsdef_index;
46 netfs->primary_index->netfs_data = netfs;
47
48 atomic_inc(&netfs->primary_index->parent->usage);
49 atomic_inc(&netfs->primary_index->parent->n_children);
50
51 spin_lock_init(&netfs->primary_index->lock);
52 INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
53
54 /* check the netfs type is not already present */
55 down_write(&fscache_addremove_sem);
56
57 ret = -EEXIST;
58 list_for_each_entry(ptr, &fscache_netfs_list, link) {
59 if (strcmp(ptr->name, netfs->name) == 0)
60 goto already_registered;
61 }
62
63 list_add(&netfs->link, &fscache_netfs_list);
64 ret = 0;
65
66 printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n",
67 netfs->name);
68
69already_registered:
70 up_write(&fscache_addremove_sem);
71
72 if (ret < 0) {
73 netfs->primary_index->parent = NULL;
74 __fscache_cookie_put(netfs->primary_index);
75 netfs->primary_index = NULL;
76 }
77
78 _leave(" = %d", ret);
79 return ret;
80}
81EXPORT_SYMBOL(__fscache_register_netfs);
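/*
 * Registration sketch (not part of the original commit): a netfs declares a
 * struct fscache_netfs and registers it once at module init, typically via
 * the fscache_register_netfs() wrapper in linux/fscache.h.  "examplefs" is
 * hypothetical; a duplicate name is rejected with -EEXIST as above.
 */
static struct fscache_netfs examplefs_cache_netfs = {
	.name		= "examplefs",
	.version	= 0,
};

static int __init examplefs_cache_init(void)
{
	return fscache_register_netfs(&examplefs_cache_netfs);
}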
82
83/*
84 * unregister a network filesystem from the cache
85 * - all cookies must have been released first
86 */
87void __fscache_unregister_netfs(struct fscache_netfs *netfs)
88{
89 _enter("{%s.%u}", netfs->name, netfs->version);
90
91 down_write(&fscache_addremove_sem);
92
93 list_del(&netfs->link);
94 fscache_relinquish_cookie(netfs->primary_index, 0);
95
96 up_write(&fscache_addremove_sem);
97
98 printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n",
99 netfs->name);
100
101 _leave("");
102}
103EXPORT_SYMBOL(__fscache_unregister_netfs);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
new file mode 100644
index 000000000000..392a41b1b79d
--- /dev/null
+++ b/fs/fscache/object.c
@@ -0,0 +1,810 @@
1/* FS-Cache object state machine handler
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/object.txt for a description of the
12 * object state machine and the in-kernel representations.
13 */
14
15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h>
17#include "internal.h"
18
19const char *fscache_object_states[] = {
20 [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
21 [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
22 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
23 [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
24 [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
25 [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
26 [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
27 [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
28 [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
29 [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
30 [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
31 [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
32 [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
33};
34EXPORT_SYMBOL(fscache_object_states);
35
36static void fscache_object_slow_work_put_ref(struct slow_work *);
37static int fscache_object_slow_work_get_ref(struct slow_work *);
38static void fscache_object_slow_work_execute(struct slow_work *);
39static void fscache_initialise_object(struct fscache_object *);
40static void fscache_lookup_object(struct fscache_object *);
41static void fscache_object_available(struct fscache_object *);
42static void fscache_release_object(struct fscache_object *);
43static void fscache_withdraw_object(struct fscache_object *);
44static void fscache_enqueue_dependents(struct fscache_object *);
45static void fscache_dequeue_object(struct fscache_object *);
46
47const struct slow_work_ops fscache_object_slow_work_ops = {
48 .get_ref = fscache_object_slow_work_get_ref,
49 .put_ref = fscache_object_slow_work_put_ref,
50 .execute = fscache_object_slow_work_execute,
51};
52EXPORT_SYMBOL(fscache_object_slow_work_ops);
53
54/*
55 * we need to notify the parent when an op completes that we had outstanding
56 * upon it
57 */
58static inline void fscache_done_parent_op(struct fscache_object *object)
59{
60 struct fscache_object *parent = object->parent;
61
62 _enter("OBJ%x {OBJ%x,%x}",
63 object->debug_id, parent->debug_id, parent->n_ops);
64
65 spin_lock_nested(&parent->lock, 1);
66 parent->n_ops--;
67 parent->n_obj_ops--;
68 if (parent->n_ops == 0)
69 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
70 spin_unlock(&parent->lock);
71}
72
73/*
74 * process events that have been sent to an object's state machine
75 * - initiates parent lookup
76 * - does object lookup
77 * - does object creation
78 * - does object recycling and retirement
79 * - does object withdrawal
80 */
81static void fscache_object_state_machine(struct fscache_object *object)
82{
83 enum fscache_object_state new_state;
84
85 ASSERT(object != NULL);
86
87 _enter("{OBJ%x,%s,%lx}",
88 object->debug_id, fscache_object_states[object->state],
89 object->events);
90
91 switch (object->state) {
92 /* wait for the parent object to become ready */
93 case FSCACHE_OBJECT_INIT:
94 object->event_mask =
95 ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
96 fscache_initialise_object(object);
97 goto done;
98
99 /* look up the object metadata on disk */
100 case FSCACHE_OBJECT_LOOKING_UP:
101 fscache_lookup_object(object);
102 goto lookup_transit;
103
104 /* create the object metadata on disk */
105 case FSCACHE_OBJECT_CREATING:
106 fscache_lookup_object(object);
107 goto lookup_transit;
108
109 /* handle an object becoming available; start pending
110 * operations and queue dependent operations for processing */
111 case FSCACHE_OBJECT_AVAILABLE:
112 fscache_object_available(object);
113 goto active_transit;
114
115 /* normal running state */
116 case FSCACHE_OBJECT_ACTIVE:
117 goto active_transit;
118
119 /* update the object metadata on disk */
120 case FSCACHE_OBJECT_UPDATING:
121 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
122 fscache_stat(&fscache_n_updates_run);
123 object->cache->ops->update_object(object);
124 goto active_transit;
125
126 /* handle an object dying during lookup or creation */
127 case FSCACHE_OBJECT_LC_DYING:
128 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
129 object->cache->ops->lookup_complete(object);
130
131 spin_lock(&object->lock);
132 object->state = FSCACHE_OBJECT_DYING;
133 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
134 &object->cookie->flags))
135 wake_up_bit(&object->cookie->flags,
136 FSCACHE_COOKIE_CREATING);
137 spin_unlock(&object->lock);
138
139 fscache_done_parent_op(object);
140
141 /* wait for completion of all active operations on this object
142 * and the death of all child objects of this object */
143 case FSCACHE_OBJECT_DYING:
144 dying:
145 clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
146 spin_lock(&object->lock);
147 _debug("dying OBJ%x {%d,%d}",
148 object->debug_id, object->n_ops, object->n_children);
149 if (object->n_ops == 0 && object->n_children == 0) {
150 object->event_mask &=
151 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
152 object->event_mask |=
153 (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
154 (1 << FSCACHE_OBJECT_EV_RETIRE) |
155 (1 << FSCACHE_OBJECT_EV_RELEASE) |
156 (1 << FSCACHE_OBJECT_EV_ERROR);
157 } else {
158 object->event_mask &=
159 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
160 (1 << FSCACHE_OBJECT_EV_RETIRE) |
161 (1 << FSCACHE_OBJECT_EV_RELEASE) |
162 (1 << FSCACHE_OBJECT_EV_ERROR));
163 object->event_mask |=
164 1 << FSCACHE_OBJECT_EV_CLEARED;
165 }
166 spin_unlock(&object->lock);
167 fscache_enqueue_dependents(object);
168 goto terminal_transit;
169
170 /* handle an abort during initialisation */
171 case FSCACHE_OBJECT_ABORT_INIT:
172 _debug("handle abort init %lx", object->events);
173 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
174
175 spin_lock(&object->lock);
176 fscache_dequeue_object(object);
177
178 object->state = FSCACHE_OBJECT_DYING;
179 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
180 &object->cookie->flags))
181 wake_up_bit(&object->cookie->flags,
182 FSCACHE_COOKIE_CREATING);
183 spin_unlock(&object->lock);
184 goto dying;
185
186 /* handle the netfs releasing an object and possibly marking it
187 * obsolete too */
188 case FSCACHE_OBJECT_RELEASING:
189 case FSCACHE_OBJECT_RECYCLING:
190 object->event_mask &=
191 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
192 (1 << FSCACHE_OBJECT_EV_RETIRE) |
193 (1 << FSCACHE_OBJECT_EV_RELEASE) |
194 (1 << FSCACHE_OBJECT_EV_ERROR));
195 fscache_release_object(object);
196 spin_lock(&object->lock);
197 object->state = FSCACHE_OBJECT_DEAD;
198 spin_unlock(&object->lock);
199 fscache_stat(&fscache_n_object_dead);
200 goto terminal_transit;
201
202 /* handle the parent cache of this object being withdrawn from
203 * active service */
204 case FSCACHE_OBJECT_WITHDRAWING:
205 object->event_mask &=
206 ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
207 (1 << FSCACHE_OBJECT_EV_RETIRE) |
208 (1 << FSCACHE_OBJECT_EV_RELEASE) |
209 (1 << FSCACHE_OBJECT_EV_ERROR));
210 fscache_withdraw_object(object);
211 spin_lock(&object->lock);
212 object->state = FSCACHE_OBJECT_DEAD;
213 spin_unlock(&object->lock);
214 fscache_stat(&fscache_n_object_dead);
215 goto terminal_transit;
216
217 /* complain about the object being woken up once it is
218 * deceased */
219 case FSCACHE_OBJECT_DEAD:
220 printk(KERN_ERR "FS-Cache:"
221 " Unexpected event in dead state %lx\n",
222 object->events & object->event_mask);
223 BUG();
224
225 default:
226 printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
227 object->state);
228 BUG();
229 }
230
231 /* determine the transition from a lookup state */
232lookup_transit:
233 switch (fls(object->events & object->event_mask) - 1) {
234 case FSCACHE_OBJECT_EV_WITHDRAW:
235 case FSCACHE_OBJECT_EV_RETIRE:
236 case FSCACHE_OBJECT_EV_RELEASE:
237 case FSCACHE_OBJECT_EV_ERROR:
238 new_state = FSCACHE_OBJECT_LC_DYING;
239 goto change_state;
240 case FSCACHE_OBJECT_EV_REQUEUE:
241 goto done;
242 case -1:
243 goto done; /* sleep until event */
244 default:
245 goto unsupported_event;
246 }
247
248 /* determine the transition from an active state */
249active_transit:
250 switch (fls(object->events & object->event_mask) - 1) {
251 case FSCACHE_OBJECT_EV_WITHDRAW:
252 case FSCACHE_OBJECT_EV_RETIRE:
253 case FSCACHE_OBJECT_EV_RELEASE:
254 case FSCACHE_OBJECT_EV_ERROR:
255 new_state = FSCACHE_OBJECT_DYING;
256 goto change_state;
257 case FSCACHE_OBJECT_EV_UPDATE:
258 new_state = FSCACHE_OBJECT_UPDATING;
259 goto change_state;
260 case -1:
261 new_state = FSCACHE_OBJECT_ACTIVE;
262 goto change_state; /* sleep until event */
263 default:
264 goto unsupported_event;
265 }
266
267 /* determine the transition from a terminal state */
268terminal_transit:
269 switch (fls(object->events & object->event_mask) - 1) {
270 case FSCACHE_OBJECT_EV_WITHDRAW:
271 new_state = FSCACHE_OBJECT_WITHDRAWING;
272 goto change_state;
273 case FSCACHE_OBJECT_EV_RETIRE:
274 new_state = FSCACHE_OBJECT_RECYCLING;
275 goto change_state;
276 case FSCACHE_OBJECT_EV_RELEASE:
277 new_state = FSCACHE_OBJECT_RELEASING;
278 goto change_state;
279 case FSCACHE_OBJECT_EV_ERROR:
280 new_state = FSCACHE_OBJECT_WITHDRAWING;
281 goto change_state;
282 case FSCACHE_OBJECT_EV_CLEARED:
283 new_state = FSCACHE_OBJECT_DYING;
284 goto change_state;
285 case -1:
286 goto done; /* sleep until event */
287 default:
288 goto unsupported_event;
289 }
290
291change_state:
292 spin_lock(&object->lock);
293 object->state = new_state;
294 spin_unlock(&object->lock);
295
296done:
297 _leave(" [->%s]", fscache_object_states[object->state]);
298 return;
299
300unsupported_event:
301 printk(KERN_ERR "FS-Cache:"
302 " Unsupported event %lx [mask %lx] in state %s\n",
303 object->events, object->event_mask,
304 fscache_object_states[object->state]);
305 BUG();
306}
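/*
 * Worked example (illustrative, not part of the original commit): each
 * transit table selects the pending event with the highest bit number, since
 * fls() returns the position of the topmost set bit counting from 1.  With
 * events & event_mask == 0x22 (bits 1 and 5 set), fls(0x22) - 1 == 5, so
 * event 5 is handled first and bit 1 is left for the next pass; a result of
 * -1 means no unmasked event is outstanding and the object sleeps until one
 * is raised.
 */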
307
308/*
309 * execute an object
310 */
311static void fscache_object_slow_work_execute(struct slow_work *work)
312{
313 struct fscache_object *object =
314 container_of(work, struct fscache_object, work);
315 unsigned long start;
316
317 _enter("{OBJ%x}", object->debug_id);
318
319 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
320
321 start = jiffies;
322 fscache_object_state_machine(object);
323 fscache_hist(fscache_objs_histogram, start);
324 if (object->events & object->event_mask)
325 fscache_enqueue_object(object);
326}
327
328/*
329 * initialise an object
330 * - check the specified object's parent to see if we can make use of it
331 * immediately to do a creation
332 * - we may need to start the process of creating a parent and we need to wait
333 * for the parent's lookup and creation to complete if it's not there yet
334 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
335 * leaf-most cookies of the object and all its children
336 */
337static void fscache_initialise_object(struct fscache_object *object)
338{
339 struct fscache_object *parent;
340
341 _enter("");
342 ASSERT(object->cookie != NULL);
343 ASSERT(object->cookie->parent != NULL);
344 ASSERT(list_empty(&object->work.link));
345
346 if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
347 (1 << FSCACHE_OBJECT_EV_RELEASE) |
348 (1 << FSCACHE_OBJECT_EV_RETIRE) |
349 (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
350 _debug("abort init %lx", object->events);
351 spin_lock(&object->lock);
352 object->state = FSCACHE_OBJECT_ABORT_INIT;
353 spin_unlock(&object->lock);
354 return;
355 }
356
357 spin_lock(&object->cookie->lock);
358 spin_lock_nested(&object->cookie->parent->lock, 1);
359
360 parent = object->parent;
361 if (!parent) {
362 _debug("no parent");
363 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
364 } else {
365 spin_lock(&object->lock);
366 spin_lock_nested(&parent->lock, 1);
367 _debug("parent %s", fscache_object_states[parent->state]);
368
369 if (parent->state >= FSCACHE_OBJECT_DYING) {
370 _debug("bad parent");
371 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
372 } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
373 _debug("wait");
374
375 /* we may get woken up in this state by child objects
376 * binding on to us, so we need to make sure we don't
377 * add ourself to the list multiple times */
378 if (list_empty(&object->dep_link)) {
379 object->cache->ops->grab_object(object);
380 list_add(&object->dep_link,
381 &parent->dependents);
382
383 /* fscache_acquire_non_index_cookie() uses this
384 * to wake the chain up */
385 if (parent->state == FSCACHE_OBJECT_INIT)
386 fscache_enqueue_object(parent);
387 }
388 } else {
389 _debug("go");
390 parent->n_ops++;
391 parent->n_obj_ops++;
392 object->lookup_jif = jiffies;
393 object->state = FSCACHE_OBJECT_LOOKING_UP;
394 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
395 }
396
397 spin_unlock(&parent->lock);
398 spin_unlock(&object->lock);
399 }
400
401 spin_unlock(&object->cookie->parent->lock);
402 spin_unlock(&object->cookie->lock);
403 _leave("");
404}
405
406/*
407 * look an object up in the cache from which it was allocated
408 * - we hold an "access lock" on the parent object, so the parent object cannot
409 * be withdrawn by either party till we've finished
410 * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
411 * leaf-most cookies of the object and all its children
412 */
413static void fscache_lookup_object(struct fscache_object *object)
414{
415 struct fscache_cookie *cookie = object->cookie;
416 struct fscache_object *parent;
417
418 _enter("");
419
420 parent = object->parent;
421 ASSERT(parent != NULL);
422 ASSERTCMP(parent->n_ops, >, 0);
423 ASSERTCMP(parent->n_obj_ops, >, 0);
424
425 /* make sure the parent is still available */
426 ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
427
428 if (parent->state >= FSCACHE_OBJECT_DYING ||
429 test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
430 _debug("unavailable");
431 set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
432 _leave("");
433 return;
434 }
435
436 _debug("LOOKUP \"%s/%s\" in \"%s\"",
437 parent->cookie->def->name, cookie->def->name,
438 object->cache->tag->name);
439
440 fscache_stat(&fscache_n_object_lookups);
441 object->cache->ops->lookup_object(object);
442
443 if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
444 set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
445
446 _leave("");
447}
448
449/**
450 * fscache_object_lookup_negative - Note negative cookie lookup
451 * @object: Object pointing to cookie to mark
452 *
453 * Note negative lookup, permitting those waiting to read data from an already
454 * existing backing object to continue as there's no data for them to read.
455 */
456void fscache_object_lookup_negative(struct fscache_object *object)
457{
458 struct fscache_cookie *cookie = object->cookie;
459
460 _enter("{OBJ%x,%s}",
461 object->debug_id, fscache_object_states[object->state]);
462
463 spin_lock(&object->lock);
464 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
465 fscache_stat(&fscache_n_object_lookups_negative);
466
467 /* transit here to allow write requests to begin stacking up
468 * and read requests to begin returning ENODATA */
469 object->state = FSCACHE_OBJECT_CREATING;
470 spin_unlock(&object->lock);
471
472 set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
473 set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
474
475 _debug("wake up lookup %p", &cookie->flags);
476 smp_mb__before_clear_bit();
477 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
478 smp_mb__after_clear_bit();
479 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
480 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
481 } else {
482 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
483 spin_unlock(&object->lock);
484 }
485
486 _leave("");
487}
488EXPORT_SYMBOL(fscache_object_lookup_negative);
489
490/**
491 * fscache_obtained_object - Note successful object lookup or creation
492 * @object: Object pointing to cookie to mark
493 *
494 * Note successful lookup and/or creation, permitting those waiting to write
495 * data to a backing object to continue.
496 *
497 * Note that after calling this, an object's cookie may be relinquished by the
498 * netfs, and so must be accessed with the object lock held.
499 */
500void fscache_obtained_object(struct fscache_object *object)
501{
502 struct fscache_cookie *cookie = object->cookie;
503
504 _enter("{OBJ%x,%s}",
505 object->debug_id, fscache_object_states[object->state]);
506
507 /* if we were still looking up, then we must have a positive lookup
508 * result, in which case there may be data available */
509 spin_lock(&object->lock);
510 if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
511 fscache_stat(&fscache_n_object_lookups_positive);
512
513 clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
514
515 object->state = FSCACHE_OBJECT_AVAILABLE;
516 spin_unlock(&object->lock);
517
518 smp_mb__before_clear_bit();
519 clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
520 smp_mb__after_clear_bit();
521 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
522 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
523 } else {
524 ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
525 fscache_stat(&fscache_n_object_created);
526
527 object->state = FSCACHE_OBJECT_AVAILABLE;
528 spin_unlock(&object->lock);
529 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
530 smp_wmb();
531 }
532
533 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
534 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
535
536 _leave("");
537}
538EXPORT_SYMBOL(fscache_obtained_object);
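/*
 * Backend sketch (not part of the original commit): a cache's
 * ->lookup_object() implementation reports its result through the two
 * helpers above.  example_lookup_on_disk() and example_create_on_disk() are
 * hypothetical primitives.
 */
static void example_lookup_object(struct fscache_object *object)
{
	if (example_lookup_on_disk(object) < 0) {
		/* not found: let waiting readers see ENODATA, then create */
		fscache_object_lookup_negative(object);
		example_create_on_disk(object);
	}
	fscache_obtained_object(object);
}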
539
540/*
541 * handle an object that has just become available
542 */
543static void fscache_object_available(struct fscache_object *object)
544{
545 _enter("{OBJ%x}", object->debug_id);
546
547 spin_lock(&object->lock);
548
549 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
550 wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
551
552 fscache_done_parent_op(object);
553 if (object->n_in_progress == 0) {
554 if (object->n_ops > 0) {
555 ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
556 ASSERTIF(object->n_ops > object->n_obj_ops,
557 !list_empty(&object->pending_ops));
558 fscache_start_operations(object);
559 } else {
560 ASSERT(list_empty(&object->pending_ops));
561 }
562 }
563 spin_unlock(&object->lock);
564
565 object->cache->ops->lookup_complete(object);
566 fscache_enqueue_dependents(object);
567
568 fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
569 fscache_stat(&fscache_n_object_avail);
570
571 _leave("");
572}
573
574/*
575 * drop an object's attachments
576 */
577static void fscache_drop_object(struct fscache_object *object)
578{
579 struct fscache_object *parent = object->parent;
580 struct fscache_cache *cache = object->cache;
581
582 _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
583
584 spin_lock(&cache->object_list_lock);
585 list_del_init(&object->cache_link);
586 spin_unlock(&cache->object_list_lock);
587
588 cache->ops->drop_object(object);
589
590 if (parent) {
591 _debug("release parent OBJ%x {%d}",
592 parent->debug_id, parent->n_children);
593
594 spin_lock(&parent->lock);
595 parent->n_children--;
596 if (parent->n_children == 0)
597 fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
598 spin_unlock(&parent->lock);
599 object->parent = NULL;
600 }
601
602 /* this just shifts the object release to the slow work processor */
603 object->cache->ops->put_object(object);
604
605 _leave("");
606}
607
608/*
609 * release or recycle an object that the netfs has discarded
610 */
611static void fscache_release_object(struct fscache_object *object)
612{
613 _enter("");
614
615 fscache_drop_object(object);
616}
617
618/*
619 * withdraw an object from active service
620 */
621static void fscache_withdraw_object(struct fscache_object *object)
622{
623 struct fscache_cookie *cookie;
624 bool detached;
625
626 _enter("");
627
628 spin_lock(&object->lock);
629 cookie = object->cookie;
630 if (cookie) {
631 /* need to get the cookie lock before the object lock, starting
632 * from the object pointer */
633 atomic_inc(&cookie->usage);
634 spin_unlock(&object->lock);
635
636 detached = false;
637 spin_lock(&cookie->lock);
638 spin_lock(&object->lock);
639
640 if (object->cookie == cookie) {
641 hlist_del_init(&object->cookie_link);
642 object->cookie = NULL;
643 detached = true;
644 }
645 spin_unlock(&cookie->lock);
646 fscache_cookie_put(cookie);
647 if (detached)
648 fscache_cookie_put(cookie);
649 }
650
651 spin_unlock(&object->lock);
652
653 fscache_drop_object(object);
654}
655
656/*
657 * withdraw an object from active service at the behest of the cache
658 * - need to break the links to a cached object cookie
659 * - called under two situations:
660 * (1) recycler decides to reclaim an in-use object
661 * (2) a cache is unmounted
662 * - have to take care as the cookie can be being relinquished by the netfs
663 * simultaneously
664 * - the object is pinned by the caller holding a refcount on it
665 */
666void fscache_withdrawing_object(struct fscache_cache *cache,
667 struct fscache_object *object)
668{
669 bool enqueue = false;
670
671 _enter(",OBJ%x", object->debug_id);
672
673 spin_lock(&object->lock);
674 if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
675 object->state = FSCACHE_OBJECT_WITHDRAWING;
676 enqueue = true;
677 }
678 spin_unlock(&object->lock);
679
680 if (enqueue)
681 fscache_enqueue_object(object);
682
683 _leave("");
684}
685
686/*
687 * allow the slow work item processor to get a ref on an object
688 */
689static int fscache_object_slow_work_get_ref(struct slow_work *work)
690{
691 struct fscache_object *object =
692 container_of(work, struct fscache_object, work);
693
694 return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
695}
696
697/*
698 * allow the slow work item processor to discard a ref on a work item
699 */
700static void fscache_object_slow_work_put_ref(struct slow_work *work)
701{
702 struct fscache_object *object =
703 container_of(work, struct fscache_object, work);
704
705 return object->cache->ops->put_object(object);
706}
707
708/*
709 * enqueue an object for metadata-type processing
710 */
711void fscache_enqueue_object(struct fscache_object *object)
712{
713 _enter("{OBJ%x}", object->debug_id);
714
715 slow_work_enqueue(&object->work);
716}
717
718/*
719 * enqueue the dependents of an object for metadata-type processing
720 * - the caller must hold the object's lock
721 * - this may cause an already locked object to wind up being processed again
722 */
723static void fscache_enqueue_dependents(struct fscache_object *object)
724{
725 struct fscache_object *dep;
726
727 _enter("{OBJ%x}", object->debug_id);
728
729 if (list_empty(&object->dependents))
730 return;
731
732 spin_lock(&object->lock);
733
734 while (!list_empty(&object->dependents)) {
735 dep = list_entry(object->dependents.next,
736 struct fscache_object, dep_link);
737 list_del_init(&dep->dep_link);
738
739
740 /* sort onto appropriate lists */
741 fscache_enqueue_object(dep);
742 dep->cache->ops->put_object(dep);
743
744 if (!list_empty(&object->dependents))
745 cond_resched_lock(&object->lock);
746 }
747
748 spin_unlock(&object->lock);
749}
750
751/*
752 * remove an object from whatever queue it's waiting on
753 * - the caller must hold object->lock
754 */
755void fscache_dequeue_object(struct fscache_object *object)
756{
757 _enter("{OBJ%x}", object->debug_id);
758
759 if (!list_empty(&object->dep_link)) {
760 spin_lock(&object->parent->lock);
761 list_del_init(&object->dep_link);
762 spin_unlock(&object->parent->lock);
763 }
764
765 _leave("");
766}
767
768/**
769 * fscache_check_aux - Ask the netfs whether an object on disk is still valid
770 * @object: The object to ask about
771 * @data: The auxiliary data for the object
772 * @datalen: The size of the auxiliary data
773 *
774 * This function consults the netfs about the coherency state of an object
775 */
776enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
777 const void *data, uint16_t datalen)
778{
779 enum fscache_checkaux result;
780
781 if (!object->cookie->def->check_aux) {
782 fscache_stat(&fscache_n_checkaux_none);
783 return FSCACHE_CHECKAUX_OKAY;
784 }
785
786 result = object->cookie->def->check_aux(object->cookie->netfs_data,
787 data, datalen);
788 switch (result) {
789 /* entry okay as is */
790 case FSCACHE_CHECKAUX_OKAY:
791 fscache_stat(&fscache_n_checkaux_okay);
792 break;
793
794 /* entry requires update */
795 case FSCACHE_CHECKAUX_NEEDS_UPDATE:
796 fscache_stat(&fscache_n_checkaux_update);
797 break;
798
799 /* entry requires deletion */
800 case FSCACHE_CHECKAUX_OBSOLETE:
801 fscache_stat(&fscache_n_checkaux_obsolete);
802 break;
803
804 default:
805 BUG();
806 }
807
808 return result;
809}
810EXPORT_SYMBOL(fscache_check_aux);
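/*
 * Netfs-side sketch (not part of the original commit): a check_aux()
 * implementation compares the auxiliary data recorded in the cache (here a
 * hypothetical change counter) against the current inode state.
 */
static enum fscache_checkaux examplefs_check_aux(void *cookie_netfs_data,
						 const void *data,
						 uint16_t datalen)
{
	struct examplefs_inode *ei = cookie_netfs_data;	/* hypothetical type */
	uint32_t stored;

	if (datalen != sizeof(stored))
		return FSCACHE_CHECKAUX_OBSOLETE;
	memcpy(&stored, data, sizeof(stored));
	return stored == ei->change_counter ?
		FSCACHE_CHECKAUX_OKAY : FSCACHE_CHECKAUX_OBSOLETE;
}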
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
new file mode 100644
index 000000000000..e7f8d53b8b6b
--- /dev/null
+++ b/fs/fscache/operation.c
@@ -0,0 +1,459 @@
1/* FS-Cache worker operation management routines
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * See Documentation/filesystems/caching/operations.txt
12 */
13
14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h>
16#include "internal.h"
17
18atomic_t fscache_op_debug_id;
19EXPORT_SYMBOL(fscache_op_debug_id);
20
21/**
22 * fscache_enqueue_operation - Enqueue an operation for processing
23 * @op: The operation to enqueue
24 *
25 * Enqueue an operation for processing by the FS-Cache thread pool.
26 *
27 * This will get its own ref on the object.
28 */
29void fscache_enqueue_operation(struct fscache_operation *op)
30{
31 _enter("{OBJ%x OP%x,%u}",
32 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
33
34 ASSERT(op->processor != NULL);
35 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
36 ASSERTCMP(atomic_read(&op->usage), >, 0);
37
38 if (list_empty(&op->pend_link)) {
39 switch (op->flags & FSCACHE_OP_TYPE) {
40 case FSCACHE_OP_FAST:
41 _debug("queue fast");
42 atomic_inc(&op->usage);
43 if (!schedule_work(&op->fast_work))
44 fscache_put_operation(op);
45 break;
46 case FSCACHE_OP_SLOW:
47 _debug("queue slow");
48 slow_work_enqueue(&op->slow_work);
49 break;
50 case FSCACHE_OP_MYTHREAD:
51 _debug("queue for caller's attention");
52 break;
53 default:
54 printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
55 op->flags);
56 BUG();
57 break;
58 }
59 fscache_stat(&fscache_n_op_enqueue);
60 }
61}
62EXPORT_SYMBOL(fscache_enqueue_operation);
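/*
 * Submission sketch (not part of the original commit), modelled on
 * __fscache_attr_changed() in page.c: an operation is initialised, given a
 * type and a processor, then submitted; the thread pool later runs it via
 * fscache_enqueue_operation().
 */
static int example_submit_slow_op(struct fscache_object *object,
				  struct fscache_operation *op,
				  void (*processor)(struct fscache_operation *))
{
	fscache_operation_init(op, NULL);
	fscache_operation_init_slow(op, processor);
	op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
	return fscache_submit_exclusive_op(object, op);
}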
63
64/*
65 * start an op running
66 */
67static void fscache_run_op(struct fscache_object *object,
68 struct fscache_operation *op)
69{
70 object->n_in_progress++;
71 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
72 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
73 if (op->processor)
74 fscache_enqueue_operation(op);
75 fscache_stat(&fscache_n_op_run);
76}
77
78/*
79 * submit an exclusive operation for an object
80 * - other ops are excluded from running simultaneously with this one
81 * - this gets any extra refs it needs on an op
82 */
83int fscache_submit_exclusive_op(struct fscache_object *object,
84 struct fscache_operation *op)
85{
86 int ret;
87
88 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
89
90 spin_lock(&object->lock);
91 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
92 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
93
94 ret = -ENOBUFS;
95 if (fscache_object_is_active(object)) {
96 op->object = object;
97 object->n_ops++;
98 object->n_exclusive++; /* reads and writes must wait */
99
100		if (object->n_in_progress > 0) {
101 atomic_inc(&op->usage);
102 list_add_tail(&op->pend_link, &object->pending_ops);
103 fscache_stat(&fscache_n_op_pend);
104 } else if (!list_empty(&object->pending_ops)) {
105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend);
108 fscache_start_operations(object);
109 } else {
110 ASSERTCMP(object->n_in_progress, ==, 0);
111 fscache_run_op(object, op);
112 }
113
114 /* need to issue a new write op after this */
115 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
116 ret = 0;
117 } else if (object->state == FSCACHE_OBJECT_CREATING) {
118 op->object = object;
119 object->n_ops++;
120 object->n_exclusive++; /* reads and writes must wait */
121 atomic_inc(&op->usage);
122 list_add_tail(&op->pend_link, &object->pending_ops);
123 fscache_stat(&fscache_n_op_pend);
124 ret = 0;
125 } else {
126 /* not allowed to submit ops in any other state */
127 BUG();
128 }
129
130 spin_unlock(&object->lock);
131 return ret;
132}
133
134/*
135 * report an unexpected submission
136 */
137static void fscache_report_unexpected_submission(struct fscache_object *object,
138 struct fscache_operation *op,
139 unsigned long ostate)
140{
141 static bool once_only;
142 struct fscache_operation *p;
143 unsigned n;
144
145 if (once_only)
146 return;
147 once_only = true;
148
149 kdebug("unexpected submission OP%x [OBJ%x %s]",
150 op->debug_id, object->debug_id,
151 fscache_object_states[object->state]);
152 kdebug("objstate=%s [%s]",
153 fscache_object_states[object->state],
154 fscache_object_states[ostate]);
155 kdebug("objflags=%lx", object->flags);
156 kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
157 kdebug("ops=%u inp=%u exc=%u",
158 object->n_ops, object->n_in_progress, object->n_exclusive);
159
160 if (!list_empty(&object->pending_ops)) {
161 n = 0;
162 list_for_each_entry(p, &object->pending_ops, pend_link) {
163 ASSERTCMP(p->object, ==, object);
164			kdebug("%p %p", p->processor, p->release);
165 n++;
166 }
167
168 kdebug("n=%u", n);
169 }
170
171 dump_stack();
172}
173
174/*
175 * submit an operation for an object
176 * - ops may be submitted only while the object is in the following states:
177 * - during object creation (write ops may be submitted)
178 * - whilst the object is active
179 * - after an I/O error incurred in one of the two above states (op rejected)
180 * - this gets any extra refs it needs on an op
181 */
182int fscache_submit_op(struct fscache_object *object,
183 struct fscache_operation *op)
184{
185 unsigned long ostate;
186 int ret;
187
188 _enter("{OBJ%x OP%x},{%u}",
189 object->debug_id, op->debug_id, atomic_read(&op->usage));
190
191 ASSERTCMP(atomic_read(&op->usage), >, 0);
192
193 spin_lock(&object->lock);
194 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
195 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
196
197 ostate = object->state;
198 smp_rmb();
199
200 if (fscache_object_is_active(object)) {
201 op->object = object;
202 object->n_ops++;
203
204 if (object->n_exclusive > 0) {
205 atomic_inc(&op->usage);
206 list_add_tail(&op->pend_link, &object->pending_ops);
207 fscache_stat(&fscache_n_op_pend);
208 } else if (!list_empty(&object->pending_ops)) {
209 atomic_inc(&op->usage);
210 list_add_tail(&op->pend_link, &object->pending_ops);
211 fscache_stat(&fscache_n_op_pend);
212 fscache_start_operations(object);
213 } else {
214 ASSERTCMP(object->n_exclusive, ==, 0);
215 fscache_run_op(object, op);
216 }
217 ret = 0;
218 } else if (object->state == FSCACHE_OBJECT_CREATING) {
219 op->object = object;
220 object->n_ops++;
221 atomic_inc(&op->usage);
222 list_add_tail(&op->pend_link, &object->pending_ops);
223 fscache_stat(&fscache_n_op_pend);
224 ret = 0;
225 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
226 fscache_report_unexpected_submission(object, op, ostate);
227 ASSERT(!fscache_object_is_active(object));
228 ret = -ENOBUFS;
229 } else {
230 ret = -ENOBUFS;
231 }
232
233 spin_unlock(&object->lock);
234 return ret;
235}
236
237/*
238 * queue an object for withdrawal on error, aborting all following asynchronous
239 * operations
240 */
241void fscache_abort_object(struct fscache_object *object)
242{
243 _enter("{OBJ%x}", object->debug_id);
244
245 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
246}
247
248/*
249 * jump start the operation processing on an object
250 * - caller must hold object->lock
251 */
252void fscache_start_operations(struct fscache_object *object)
253{
254 struct fscache_operation *op;
255 bool stop = false;
256
257 while (!list_empty(&object->pending_ops) && !stop) {
258 op = list_entry(object->pending_ops.next,
259 struct fscache_operation, pend_link);
260
261 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
262 if (object->n_in_progress > 0)
263 break;
264 stop = true;
265 }
266 list_del_init(&op->pend_link);
267 object->n_in_progress++;
268
269 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
270 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
271 if (op->processor)
272 fscache_enqueue_operation(op);
273
274 /* the pending queue was holding a ref on the object */
275 fscache_put_operation(op);
276 }
277
278 ASSERTCMP(object->n_in_progress, <=, object->n_ops);
279
280 _debug("woke %d ops on OBJ%x",
281 object->n_in_progress, object->debug_id);
282}
283
284/*
285 * release an operation
286 * - queues pending ops if this is the last in-progress op
287 */
288void fscache_put_operation(struct fscache_operation *op)
289{
290 struct fscache_object *object;
291 struct fscache_cache *cache;
292
293 _enter("{OBJ%x OP%x,%d}",
294 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
295
296 ASSERTCMP(atomic_read(&op->usage), >, 0);
297
298 if (!atomic_dec_and_test(&op->usage))
299 return;
300
301 _debug("PUT OP");
302 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
303 BUG();
304
305 fscache_stat(&fscache_n_op_release);
306
307 if (op->release) {
308 op->release(op);
309 op->release = NULL;
310 }
311
312 object = op->object;
313
314 /* now... we may get called with the object spinlock held, so we
315 * complete the cleanup here only if we can immediately acquire the
316 * lock, and defer it otherwise */
317 if (!spin_trylock(&object->lock)) {
318 _debug("defer put");
319 fscache_stat(&fscache_n_op_deferred_release);
320
321 cache = object->cache;
322 spin_lock(&cache->op_gc_list_lock);
323 list_add_tail(&op->pend_link, &cache->op_gc_list);
324 spin_unlock(&cache->op_gc_list_lock);
325 schedule_work(&cache->op_gc);
326 _leave(" [defer]");
327 return;
328 }
329
330 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
331 ASSERTCMP(object->n_exclusive, >, 0);
332 object->n_exclusive--;
333 }
334
335 ASSERTCMP(object->n_in_progress, >, 0);
336 object->n_in_progress--;
337 if (object->n_in_progress == 0)
338 fscache_start_operations(object);
339
340 ASSERTCMP(object->n_ops, >, 0);
341 object->n_ops--;
342 if (object->n_ops == 0)
343 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
344
345 spin_unlock(&object->lock);
346
347 kfree(op);
348 _leave(" [done]");
349}
350EXPORT_SYMBOL(fscache_put_operation);
351
352/*
353 * garbage collect operations that have had their release deferred
354 */
355void fscache_operation_gc(struct work_struct *work)
356{
357 struct fscache_operation *op;
358 struct fscache_object *object;
359 struct fscache_cache *cache =
360 container_of(work, struct fscache_cache, op_gc);
361 int count = 0;
362
363 _enter("");
364
365 do {
366 spin_lock(&cache->op_gc_list_lock);
367 if (list_empty(&cache->op_gc_list)) {
368 spin_unlock(&cache->op_gc_list_lock);
369 break;
370 }
371
372 op = list_entry(cache->op_gc_list.next,
373 struct fscache_operation, pend_link);
374 list_del(&op->pend_link);
375 spin_unlock(&cache->op_gc_list_lock);
376
377 object = op->object;
378
379 _debug("GC DEFERRED REL OBJ%x OP%x",
380 object->debug_id, op->debug_id);
381 fscache_stat(&fscache_n_op_gc);
382
383 ASSERTCMP(atomic_read(&op->usage), ==, 0);
384
385 spin_lock(&object->lock);
386 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
387 ASSERTCMP(object->n_exclusive, >, 0);
388 object->n_exclusive--;
389 }
390
391 ASSERTCMP(object->n_in_progress, >, 0);
392 object->n_in_progress--;
393 if (object->n_in_progress == 0)
394 fscache_start_operations(object);
395
396 ASSERTCMP(object->n_ops, >, 0);
397 object->n_ops--;
398 if (object->n_ops == 0)
399 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
400
401 spin_unlock(&object->lock);
402
403 } while (count++ < 20);
404
405 if (!list_empty(&cache->op_gc_list))
406 schedule_work(&cache->op_gc);
407
408 _leave("");
409}
410
411/*
412 * allow the slow work item processor to get a ref on an operation
413 */
414static int fscache_op_get_ref(struct slow_work *work)
415{
416 struct fscache_operation *op =
417 container_of(work, struct fscache_operation, slow_work);
418
419 atomic_inc(&op->usage);
420 return 0;
421}
422
423/*
424 * allow the slow work item processor to discard a ref on an operation
425 */
426static void fscache_op_put_ref(struct slow_work *work)
427{
428 struct fscache_operation *op =
429 container_of(work, struct fscache_operation, slow_work);
430
431 fscache_put_operation(op);
432}
433
434/*
435 * execute an operation using the slow thread pool to provide processing context
436 * - the caller holds a ref to this object, so we don't need to hold one
437 */
438static void fscache_op_execute(struct slow_work *work)
439{
440 struct fscache_operation *op =
441 container_of(work, struct fscache_operation, slow_work);
442 unsigned long start;
443
444 _enter("{OBJ%x OP%x,%d}",
445 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
446
447 ASSERT(op->processor != NULL);
448 start = jiffies;
449 op->processor(op);
450 fscache_hist(fscache_ops_histogram, start);
451
452 _leave("");
453}
454
455const struct slow_work_ops fscache_op_slow_work_ops = {
456 .get_ref = fscache_op_get_ref,
457 .put_ref = fscache_op_put_ref,
458 .execute = fscache_op_execute,
459};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
new file mode 100644
index 000000000000..2568e0eb644f
--- /dev/null
+++ b/fs/fscache/page.c
@@ -0,0 +1,816 @@
1/* Cache page management and data I/O routines
2 *
3 * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL PAGE
13#include <linux/module.h>
14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h>
16#include <linux/pagevec.h>
17#include "internal.h"
18
19/*
20 * check to see if a page is being written to the cache
21 */
22bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page)
23{
24 void *val;
25
26 rcu_read_lock();
27 val = radix_tree_lookup(&cookie->stores, page->index);
28 rcu_read_unlock();
29
30 return val != NULL;
31}
32EXPORT_SYMBOL(__fscache_check_page_write);
33
34/*
35 * wait for a page to finish being written to the cache
36 */
37void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page)
38{
39 wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
40
41 wait_event(*wq, !__fscache_check_page_write(cookie, page));
42}
43EXPORT_SYMBOL(__fscache_wait_on_page_write);
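/*
 * Netfs usage sketch (not part of the original commit): a releasepage or
 * invalidatepage path must not free a page the cache is still writing out,
 * so it pairs the two calls above via their linux/fscache.h wrappers:
 */
static void examplefs_wait_for_store(struct fscache_cookie *cookie,
				     struct page *page)
{
	if (fscache_check_page_write(cookie, page))
		fscache_wait_on_page_write(cookie, page);
}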
44
45/*
46 * note that a page has finished being written to the cache
47 */
48static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
49{
50 struct page *xpage;
51
52 spin_lock(&cookie->lock);
53 xpage = radix_tree_delete(&cookie->stores, page->index);
54 spin_unlock(&cookie->lock);
55 ASSERT(xpage != NULL);
56
57 wake_up_bit(&cookie->flags, 0);
58}
59
60/*
61 * actually apply the changed attributes to a cache object
62 */
63static void fscache_attr_changed_op(struct fscache_operation *op)
64{
65 struct fscache_object *object = op->object;
66
67 _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
68
69 fscache_stat(&fscache_n_attr_changed_calls);
70
71 if (fscache_object_is_active(object) &&
72 object->cache->ops->attr_changed(object) < 0)
73 fscache_abort_object(object);
74
75 _leave("");
76}
77
78/*
79 * notification that the attributes on an object have changed
80 */
81int __fscache_attr_changed(struct fscache_cookie *cookie)
82{
83 struct fscache_operation *op;
84 struct fscache_object *object;
85
86 _enter("%p", cookie);
87
88 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
89
90 fscache_stat(&fscache_n_attr_changed);
91
92 op = kzalloc(sizeof(*op), GFP_KERNEL);
93 if (!op) {
94 fscache_stat(&fscache_n_attr_changed_nomem);
95 _leave(" = -ENOMEM");
96 return -ENOMEM;
97 }
98
99 fscache_operation_init(op, NULL);
100 fscache_operation_init_slow(op, fscache_attr_changed_op);
101 op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
102
103 spin_lock(&cookie->lock);
104
105 if (hlist_empty(&cookie->backing_objects))
106 goto nobufs;
107 object = hlist_entry(cookie->backing_objects.first,
108 struct fscache_object, cookie_link);
109
110 if (fscache_submit_exclusive_op(object, op) < 0)
111 goto nobufs;
112 spin_unlock(&cookie->lock);
113 fscache_stat(&fscache_n_attr_changed_ok);
114 fscache_put_operation(op);
115 _leave(" = 0");
116 return 0;
117
118nobufs:
119 spin_unlock(&cookie->lock);
120 kfree(op);
121 fscache_stat(&fscache_n_attr_changed_nobufs);
122 _leave(" = %d", -ENOBUFS);
123 return -ENOBUFS;
124}
125EXPORT_SYMBOL(__fscache_attr_changed);
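/*
 * Netfs usage sketch (not part of the original commit): after changing an
 * attribute that affects the backing file (typically i_size), the netfs
 * calls the fscache_attr_changed() wrapper from linux/fscache.h.
 */
static void examplefs_note_size_change(struct fscache_cookie *cookie)
{
	/* caching is best-effort: -ENOMEM/-ENOBUFS here just mean the
	 * change couldn't be noted, so the error is usually ignorable */
	fscache_attr_changed(cookie);
}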
126
127/*
128 * handle secondary execution given to a retrieval op on behalf of the
129 * cache
130 */
131static void fscache_retrieval_work(struct work_struct *work)
132{
133 struct fscache_retrieval *op =
134 container_of(work, struct fscache_retrieval, op.fast_work);
135 unsigned long start;
136
137 _enter("{OP%x}", op->op.debug_id);
138
139 start = jiffies;
140 op->op.processor(&op->op);
141 fscache_hist(fscache_ops_histogram, start);
142 fscache_put_operation(&op->op);
143}
144
145/*
146 * release a retrieval op reference
147 */
148static void fscache_release_retrieval_op(struct fscache_operation *_op)
149{
150 struct fscache_retrieval *op =
151 container_of(_op, struct fscache_retrieval, op);
152
153 _enter("{OP%x}", op->op.debug_id);
154
155 fscache_hist(fscache_retrieval_histogram, op->start_time);
156 if (op->context)
157 fscache_put_context(op->op.object->cookie, op->context);
158
159 _leave("");
160}
161
162/*
163 * allocate a retrieval op
164 */
165static struct fscache_retrieval *fscache_alloc_retrieval(
166 struct address_space *mapping,
167 fscache_rw_complete_t end_io_func,
168 void *context)
169{
170 struct fscache_retrieval *op;
171
172 /* allocate a retrieval operation and attempt to submit it */
173 op = kzalloc(sizeof(*op), GFP_NOIO);
174 if (!op) {
175 fscache_stat(&fscache_n_retrievals_nomem);
176 return NULL;
177 }
178
179 fscache_operation_init(&op->op, fscache_release_retrieval_op);
180 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
181 op->mapping = mapping;
182 op->end_io_func = end_io_func;
183 op->context = context;
184 op->start_time = jiffies;
185 INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
186 INIT_LIST_HEAD(&op->to_do);
187 return op;
188}
189
190/*
191 * wait for a deferred lookup to complete
192 */
193static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
194{
195 unsigned long jif;
196
197 _enter("");
198
199 if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
200 _leave(" = 0 [imm]");
201 return 0;
202 }
203
204 fscache_stat(&fscache_n_retrievals_wait);
205
206 jif = jiffies;
207 if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
208 fscache_wait_bit_interruptible,
209 TASK_INTERRUPTIBLE) != 0) {
210 fscache_stat(&fscache_n_retrievals_intr);
211 _leave(" = -ERESTARTSYS");
212 return -ERESTARTSYS;
213 }
214
215 ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
216
217 smp_rmb();
218 fscache_hist(fscache_retrieval_delay_histogram, jif);
219 _leave(" = 0 [dly]");
220 return 0;
221}
222
223/*
224 * read a page from the cache or allocate a block in which to store it
225 * - we return:
226 * -ENOMEM - out of memory, nothing done
227 * -ERESTARTSYS - interrupted
228 * -ENOBUFS - no backing object available in which to cache the block
229 * -ENODATA - no data available in the backing object for this block
230 * 0 - dispatched a read - it'll call end_io_func() when finished
231 */
int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
				 struct page *page,
				 fscache_rw_complete_t end_io_func,
				 void *context,
				 gfp_t gfp)
{
	struct fscache_retrieval *op;
	struct fscache_object *object;
	int ret;

	_enter("%p,%p,,,", cookie, page);

	fscache_stat(&fscache_n_retrievals);

	if (hlist_empty(&cookie->backing_objects))
		goto nobufs;

	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
	ASSERTCMP(page, !=, NULL);

	if (fscache_wait_for_deferred_lookup(cookie) < 0)
		return -ERESTARTSYS;

	op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
	if (!op) {
		_leave(" = -ENOMEM");
		return -ENOMEM;
	}

	spin_lock(&cookie->lock);

	if (hlist_empty(&cookie->backing_objects))
		goto nobufs_unlock;
	object = hlist_entry(cookie->backing_objects.first,
			     struct fscache_object, cookie_link);

	ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);

	if (fscache_submit_op(object, &op->op) < 0)
		goto nobufs_unlock;
	spin_unlock(&cookie->lock);

	fscache_stat(&fscache_n_retrieval_ops);

	/* pin the netfs read context in case we need to do the actual netfs
	 * read because we've encountered a cache read failure */
	fscache_get_context(object->cookie, op->context);

	/* we wait for the operation to become active, and then process it
	 * *here*, in this thread, and not in the thread pool */
	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
		_debug(">>> WT");
		fscache_stat(&fscache_n_retrieval_op_waits);
		wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
		_debug("<<< GO");
	}

	/* ask the cache to honour the operation */
	if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
		ret = object->cache->ops->allocate_page(op, page, gfp);
		if (ret == 0)
			ret = -ENODATA;
	} else {
		ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
	}

	if (ret == -ENOMEM)
		fscache_stat(&fscache_n_retrievals_nomem);
	else if (ret == -ERESTARTSYS)
		fscache_stat(&fscache_n_retrievals_intr);
	else if (ret == -ENODATA)
		fscache_stat(&fscache_n_retrievals_nodata);
	else if (ret < 0)
		fscache_stat(&fscache_n_retrievals_nobufs);
	else
		fscache_stat(&fscache_n_retrievals_ok);

	fscache_put_retrieval(op);
	_leave(" = %d", ret);
	return ret;

nobufs_unlock:
	spin_unlock(&cookie->lock);
	kfree(op);
nobufs:
	fscache_stat(&fscache_n_retrievals_nobufs);
	_leave(" = -ENOBUFS");
	return -ENOBUFS;
}
EXPORT_SYMBOL(__fscache_read_or_alloc_page);

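/*
 * Example (sketch): a netfs ->readpage() would normally reach this through
 * the fscache_read_or_alloc_page() wrapper and fall back to fetching from
 * the server when the cache cannot help.  The helpers mynetfs_end_io() and
 * mynetfs_read_from_server() are hypothetical, purely for illustration:
 *
 *	ret = fscache_read_or_alloc_page(cookie, page, mynetfs_end_io,
 *					 ctx, GFP_KERNEL);
 *	if (ret == 0)
 *		return ret;
 *	if (ret == -ENODATA || ret == -ENOBUFS)
 *		return mynetfs_read_from_server(page);
 *	return ret;
 *
 * On 0, a read was dispatched and mynetfs_end_io() finishes the page; on
 * -ENODATA a block was allocated but holds no data yet, so the netfs must
 * read from the server (and may then write the result into the cache).
 */
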
/*
 * read a list of pages from the cache or allocate blocks in which to store
 * them
 * - we return:
 *   -ENOMEM	- out of memory, some pages may be being read
 *   -ERESTARTSYS	- interrupted, some pages may be being read
 *   -ENOBUFS	- no backing object or space available in which to cache any
 *		  pages not being read
 *   -ENODATA	- no data available in the backing object for some or all of
 *		  the pages
 *   0		- dispatched a read on all pages
 *
 * end_io_func() will be called for each page read from the cache as it
 * finishes being read
 *
 * any pages for which a read is dispatched will be removed from pages and
 * nr_pages
 */
int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
				  struct address_space *mapping,
				  struct list_head *pages,
				  unsigned *nr_pages,
				  fscache_rw_complete_t end_io_func,
				  void *context,
				  gfp_t gfp)
{
	fscache_pages_retrieval_func_t func;
	struct fscache_retrieval *op;
	struct fscache_object *object;
	int ret;

	_enter("%p,,%d,,,", cookie, *nr_pages);

	fscache_stat(&fscache_n_retrievals);

	if (hlist_empty(&cookie->backing_objects))
		goto nobufs;

	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
	ASSERTCMP(*nr_pages, >, 0);
	ASSERT(!list_empty(pages));

	if (fscache_wait_for_deferred_lookup(cookie) < 0)
		return -ERESTARTSYS;

	op = fscache_alloc_retrieval(mapping, end_io_func, context);
	if (!op)
		return -ENOMEM;

	spin_lock(&cookie->lock);

	if (hlist_empty(&cookie->backing_objects))
		goto nobufs_unlock;
	object = hlist_entry(cookie->backing_objects.first,
			     struct fscache_object, cookie_link);

	if (fscache_submit_op(object, &op->op) < 0)
		goto nobufs_unlock;
	spin_unlock(&cookie->lock);

	fscache_stat(&fscache_n_retrieval_ops);

	/* pin the netfs read context in case we need to do the actual netfs
	 * read because we've encountered a cache read failure */
	fscache_get_context(object->cookie, op->context);

	/* we wait for the operation to become active, and then process it
	 * *here*, in this thread, and not in the thread pool */
	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
		_debug(">>> WT");
		fscache_stat(&fscache_n_retrieval_op_waits);
		wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
		_debug("<<< GO");
	}

	/* ask the cache to honour the operation */
	if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
		func = object->cache->ops->allocate_pages;
	else
		func = object->cache->ops->read_or_alloc_pages;
	ret = func(op, pages, nr_pages, gfp);

	if (ret == -ENOMEM)
		fscache_stat(&fscache_n_retrievals_nomem);
	else if (ret == -ERESTARTSYS)
		fscache_stat(&fscache_n_retrievals_intr);
	else if (ret == -ENODATA)
		fscache_stat(&fscache_n_retrievals_nodata);
	else if (ret < 0)
		fscache_stat(&fscache_n_retrievals_nobufs);
	else
		fscache_stat(&fscache_n_retrievals_ok);

	fscache_put_retrieval(op);
	_leave(" = %d", ret);
	return ret;

nobufs_unlock:
	spin_unlock(&cookie->lock);
	kfree(op);
nobufs:
	fscache_stat(&fscache_n_retrievals_nobufs);
	_leave(" = -ENOBUFS");
	return -ENOBUFS;
}
EXPORT_SYMBOL(__fscache_read_or_alloc_pages);

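/*
 * Example (sketch): for a ->readpages() path the cache removes any page it
 * takes responsibility for from the list and decreases *nr_pages, so a
 * netfs might (hypothetically) finish with something like:
 *
 *	ret = fscache_read_or_alloc_pages(cookie, mapping, pages, &nr_pages,
 *					  mynetfs_end_io, ctx, GFP_KERNEL);
 *	if (ret == 0 && nr_pages == 0)
 *		return 0;
 *	return mynetfs_read_pages_from_server(mapping, pages, nr_pages);
 *
 * mynetfs_read_pages_from_server() is a hypothetical helper that fetches
 * whatever the cache left on the list.
 */
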
/*
 * allocate a block in the cache on which to store a page
 * - we return:
 *   -ENOMEM	- out of memory, nothing done
 *   -ERESTARTSYS	- interrupted
 *   -ENOBUFS	- no backing object available in which to cache the block
 *   0		- block allocated
 */
int __fscache_alloc_page(struct fscache_cookie *cookie,
			 struct page *page,
			 gfp_t gfp)
{
	struct fscache_retrieval *op;
	struct fscache_object *object;
	int ret;

	_enter("%p,%p,,,", cookie, page);

	fscache_stat(&fscache_n_allocs);

	if (hlist_empty(&cookie->backing_objects))
		goto nobufs;

	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
	ASSERTCMP(page, !=, NULL);

	if (fscache_wait_for_deferred_lookup(cookie) < 0)
		return -ERESTARTSYS;

	op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
	if (!op)
		return -ENOMEM;

	spin_lock(&cookie->lock);

	if (hlist_empty(&cookie->backing_objects))
		goto nobufs_unlock;
	object = hlist_entry(cookie->backing_objects.first,
			     struct fscache_object, cookie_link);

	if (fscache_submit_op(object, &op->op) < 0)
		goto nobufs_unlock;
	spin_unlock(&cookie->lock);

	fscache_stat(&fscache_n_alloc_ops);

	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
		_debug(">>> WT");
		fscache_stat(&fscache_n_alloc_op_waits);
		wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
		_debug("<<< GO");
	}

	/* ask the cache to honour the operation */
	ret = object->cache->ops->allocate_page(op, page, gfp);

	if (ret < 0)
		fscache_stat(&fscache_n_allocs_nobufs);
	else
		fscache_stat(&fscache_n_allocs_ok);

	fscache_put_retrieval(op);
	_leave(" = %d", ret);
	return ret;

nobufs_unlock:
	spin_unlock(&cookie->lock);
	kfree(op);
nobufs:
	fscache_stat(&fscache_n_allocs_nobufs);
	_leave(" = -ENOBUFS");
	return -ENOBUFS;
}
EXPORT_SYMBOL(__fscache_alloc_page);

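/*
 * Example (sketch, assuming the usual netfs-API pairing): a netfs that has
 * generated a page locally, rather than read it, might reserve cache space
 * for it and then schedule the store:
 *
 *	ret = fscache_alloc_page(cookie, page, GFP_KERNEL);
 *	if (ret == 0)
 *		ret = fscache_write_page(cookie, page, GFP_KERNEL);
 *
 * -ENOBUFS from either call simply means the page won't be cached.
 */
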
/*
 * release a write op reference
 */
static void fscache_release_write_op(struct fscache_operation *_op)
{
	_enter("{OP%x}", _op->debug_id);
}

/*
 * perform the background storage of a page into the cache
 */
static void fscache_write_op(struct fscache_operation *_op)
{
	struct fscache_storage *op =
		container_of(_op, struct fscache_storage, op);
	struct fscache_object *object = op->op.object;
	struct fscache_cookie *cookie = object->cookie;
	struct page *page;
	unsigned n;
	void *results[1];
	int ret;

	_enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));

	spin_lock(&cookie->lock);
	spin_lock(&object->lock);

	if (!fscache_object_is_active(object)) {
		spin_unlock(&object->lock);
		spin_unlock(&cookie->lock);
		_leave("");
		return;
	}

	fscache_stat(&fscache_n_store_calls);

	/* find a page to store */
	page = NULL;
	n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
				       FSCACHE_COOKIE_PENDING_TAG);
	if (n != 1)
		goto superseded;
	page = results[0];
	_debug("gang %d [%lx]", n, page->index);
	if (page->index > op->store_limit)
		goto superseded;

	radix_tree_tag_clear(&cookie->stores, page->index,
			     FSCACHE_COOKIE_PENDING_TAG);

	spin_unlock(&object->lock);
	spin_unlock(&cookie->lock);

	if (page) {
		ret = object->cache->ops->write_page(op, page);
		fscache_end_page_write(cookie, page);
		page_cache_release(page);
		if (ret < 0)
			fscache_abort_object(object);
		else
			fscache_enqueue_operation(&op->op);
	}

	_leave("");
	return;

superseded:
	/* this writer is going away and there aren't any more things to
	 * write */
	_debug("cease");
	clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
	spin_unlock(&object->lock);
	spin_unlock(&cookie->lock);
	_leave("");
}

/*
 * request a page be stored in the cache
 * - returns:
 *   -ENOMEM	- out of memory, nothing done
 *   -ENOBUFS	- no backing object available in which to cache the page
 *   0		- dispatched a write - it'll call end_io_func() when finished
 *
 * if the cookie still has a backing object at this point, that object can be
 * in one of a few states with respect to storage processing:
 *
 *  (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
 *      set)
 *
 *	(a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
 *	    fill op)
 *
 *	(b) writes deferred till post-creation (mark page for writing and
 *	    return immediately)
 *
 *  (2) negative lookup, object created, initial fill being made from netfs
 *      (FSCACHE_COOKIE_INITIAL_FILL is set)
 *
 *	(a) fill point not yet reached this page (mark page for writing and
 *	    return)
 *
 *	(b) fill point passed this page (queue op to store this page)
 *
 *  (3) object extant (queue op to store this page)
 *
 * any other state is invalid
 */
int __fscache_write_page(struct fscache_cookie *cookie,
			 struct page *page,
			 gfp_t gfp)
{
	struct fscache_storage *op;
	struct fscache_object *object;
	int ret;

	_enter("%p,%x,", cookie, (u32) page->flags);

	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
	ASSERT(PageFsCache(page));

	fscache_stat(&fscache_n_stores);

	op = kzalloc(sizeof(*op), GFP_NOIO);
	if (!op)
		goto nomem;

	fscache_operation_init(&op->op, fscache_release_write_op);
	fscache_operation_init_slow(&op->op, fscache_write_op);
	op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);

	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
	if (ret < 0)
		goto nomem_free;

	ret = -ENOBUFS;
	spin_lock(&cookie->lock);

	if (hlist_empty(&cookie->backing_objects))
		goto nobufs;
	object = hlist_entry(cookie->backing_objects.first,
			     struct fscache_object, cookie_link);
	if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
		goto nobufs;

	/* add the page to the pending-storage radix tree on the backing
	 * object */
	spin_lock(&object->lock);

	_debug("store limit %llx", (unsigned long long) object->store_limit);

	ret = radix_tree_insert(&cookie->stores, page->index, page);
	if (ret < 0) {
		if (ret == -EEXIST)
			goto already_queued;
		_debug("insert failed %d", ret);
		goto nobufs_unlock_obj;
	}

	radix_tree_tag_set(&cookie->stores, page->index,
			   FSCACHE_COOKIE_PENDING_TAG);
	page_cache_get(page);

	/* we only want one writer at a time, but we do need to queue new
	 * writers after exclusive ops */
	if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
		goto already_pending;

	spin_unlock(&object->lock);

	op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
	op->store_limit = object->store_limit;

	if (fscache_submit_op(object, &op->op) < 0)
		goto submit_failed;

	spin_unlock(&cookie->lock);
	radix_tree_preload_end();
	fscache_stat(&fscache_n_store_ops);
	fscache_stat(&fscache_n_stores_ok);

	/* the slow work queue now carries its own ref on the object */
	fscache_put_operation(&op->op);
	_leave(" = 0");
	return 0;

already_queued:
	fscache_stat(&fscache_n_stores_again);
already_pending:
	spin_unlock(&object->lock);
	spin_unlock(&cookie->lock);
	radix_tree_preload_end();
	kfree(op);
	fscache_stat(&fscache_n_stores_ok);
	_leave(" = 0");
	return 0;

submit_failed:
	radix_tree_delete(&cookie->stores, page->index);
	page_cache_release(page);
	ret = -ENOBUFS;
	goto nobufs;

nobufs_unlock_obj:
	spin_unlock(&object->lock);
nobufs:
	spin_unlock(&cookie->lock);
	radix_tree_preload_end();
	kfree(op);
	fscache_stat(&fscache_n_stores_nobufs);
	_leave(" = -ENOBUFS");
	return -ENOBUFS;

nomem_free:
	kfree(op);
nomem:
	fscache_stat(&fscache_n_stores_oom);
	_leave(" = -ENOMEM");
	return -ENOMEM;
}
EXPORT_SYMBOL(__fscache_write_page);

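/*
 * Example (sketch): after filling a page from the server, a netfs would
 * typically offer it to the cache and uncache it again if the store can't
 * be queued, so the page doesn't linger with PG_fscache set:
 *
 *	if (PageFsCache(page) &&
 *	    fscache_write_page(cookie, page, GFP_KERNEL) != 0)
 *		fscache_uncache_page(cookie, page);
 */
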
/*
 * remove a page from the cache
 */
void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
{
	struct fscache_object *object;

	_enter(",%p", page);

	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
	ASSERTCMP(page, !=, NULL);

	fscache_stat(&fscache_n_uncaches);

	/* cache withdrawal may beat us to it */
	if (!PageFsCache(page))
		goto done;

	/* get the object */
	spin_lock(&cookie->lock);

	if (hlist_empty(&cookie->backing_objects)) {
		ClearPageFsCache(page);
		goto done_unlock;
	}

	object = hlist_entry(cookie->backing_objects.first,
			     struct fscache_object, cookie_link);

	/* there might now be stuff on disk we could read */
	clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);

	/* only invoke the cache backend if we managed to mark the page
	 * uncached here; this deals with synchronisation vs withdrawal */
	if (TestClearPageFsCache(page) &&
	    object->cache->ops->uncache_page) {
		/* the cache backend releases the cookie lock */
		object->cache->ops->uncache_page(object, page);
		goto done;
	}

done_unlock:
	spin_unlock(&cookie->lock);
done:
	_leave("");
}
EXPORT_SYMBOL(__fscache_uncache_page);
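
/*
 * Example (sketch): a netfs ->releasepage() typically drops the mark before
 * letting the page go; mynetfs_page_cookie() is a hypothetical way for the
 * netfs to get back to the cookie that covers this page:
 *
 *	static int mynetfs_releasepage(struct page *page, gfp_t gfp)
 *	{
 *		if (PageFsCache(page))
 *			fscache_uncache_page(mynetfs_page_cookie(page), page);
 *		return 1;
 *	}
 */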

/**
 * fscache_mark_pages_cached - Mark pages as being cached
 * @op: The retrieval op pages are being marked for
 * @pagevec: The pages to be marked
 *
 * Mark a bunch of netfs pages as being cached.  After this is called,
 * the netfs must call fscache_uncache_page() to remove the mark.
 */
void fscache_mark_pages_cached(struct fscache_retrieval *op,
			       struct pagevec *pagevec)
{
	struct fscache_cookie *cookie = op->op.object->cookie;
	unsigned long loop;

#ifdef CONFIG_FSCACHE_STATS
	atomic_add(pagevec->nr, &fscache_n_marks);
#endif

	for (loop = 0; loop < pagevec->nr; loop++) {
		struct page *page = pagevec->pages[loop];

		_debug("- mark %p{%lx}", page, page->index);
		if (TestSetPageFsCache(page)) {
			static bool once_only;
			if (!once_only) {
				once_only = true;
				printk(KERN_WARNING "FS-Cache:"
				       " Cookie type %s marked page %lx"
				       " multiple times\n",
				       cookie->def->name, page->index);
			}
		}
	}

	if (cookie->def->mark_pages_cached)
		cookie->def->mark_pages_cached(cookie->netfs_data,
					       op->mapping, pagevec);
	pagevec_reinit(pagevec);
}
EXPORT_SYMBOL(fscache_mark_pages_cached);
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
new file mode 100644
index 000000000000..beeab44bc31a
--- /dev/null
+++ b/fs/fscache/proc.c
@@ -0,0 +1,68 @@
/* FS-Cache statistics viewing interface
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#define FSCACHE_DEBUG_LEVEL OPERATION
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "internal.h"

/*
 * initialise the /proc/fs/fscache/ directory
 */
int __init fscache_proc_init(void)
{
	_enter("");

	if (!proc_mkdir("fs/fscache", NULL))
		goto error_dir;

#ifdef CONFIG_FSCACHE_STATS
	if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
			 &fscache_stats_fops))
		goto error_stats;
#endif

#ifdef CONFIG_FSCACHE_HISTOGRAM
	if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
			 &fscache_histogram_fops))
		goto error_histogram;
#endif

	_leave(" = 0");
	return 0;

#ifdef CONFIG_FSCACHE_HISTOGRAM
error_histogram:
#endif
#ifdef CONFIG_FSCACHE_STATS
	remove_proc_entry("fs/fscache/stats", NULL);
error_stats:
#endif
	remove_proc_entry("fs/fscache", NULL);
error_dir:
	_leave(" = -ENOMEM");
	return -ENOMEM;
}

/*
 * clean up the /proc/fs/fscache/ directory
 */
void fscache_proc_cleanup(void)
{
#ifdef CONFIG_FSCACHE_HISTOGRAM
	remove_proc_entry("fs/fscache/histogram", NULL);
#endif
#ifdef CONFIG_FSCACHE_STATS
	remove_proc_entry("fs/fscache/stats", NULL);
#endif
	remove_proc_entry("fs/fscache", NULL);
}
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
new file mode 100644
index 000000000000..65deb99e756b
--- /dev/null
+++ b/fs/fscache/stats.c
@@ -0,0 +1,212 @@
/* FS-Cache statistics
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#define FSCACHE_DEBUG_LEVEL THREAD
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "internal.h"

/*
 * operation counters
 */
atomic_t fscache_n_op_pend;
atomic_t fscache_n_op_run;
atomic_t fscache_n_op_enqueue;
atomic_t fscache_n_op_requeue;
atomic_t fscache_n_op_deferred_release;
atomic_t fscache_n_op_release;
atomic_t fscache_n_op_gc;

atomic_t fscache_n_attr_changed;
atomic_t fscache_n_attr_changed_ok;
atomic_t fscache_n_attr_changed_nobufs;
atomic_t fscache_n_attr_changed_nomem;
atomic_t fscache_n_attr_changed_calls;

atomic_t fscache_n_allocs;
atomic_t fscache_n_allocs_ok;
atomic_t fscache_n_allocs_wait;
atomic_t fscache_n_allocs_nobufs;
atomic_t fscache_n_alloc_ops;
atomic_t fscache_n_alloc_op_waits;

atomic_t fscache_n_retrievals;
atomic_t fscache_n_retrievals_ok;
atomic_t fscache_n_retrievals_wait;
atomic_t fscache_n_retrievals_nodata;
atomic_t fscache_n_retrievals_nobufs;
atomic_t fscache_n_retrievals_intr;
atomic_t fscache_n_retrievals_nomem;
atomic_t fscache_n_retrieval_ops;
atomic_t fscache_n_retrieval_op_waits;

atomic_t fscache_n_stores;
atomic_t fscache_n_stores_ok;
atomic_t fscache_n_stores_again;
atomic_t fscache_n_stores_nobufs;
atomic_t fscache_n_stores_oom;
atomic_t fscache_n_store_ops;
atomic_t fscache_n_store_calls;

atomic_t fscache_n_marks;
atomic_t fscache_n_uncaches;

atomic_t fscache_n_acquires;
atomic_t fscache_n_acquires_null;
atomic_t fscache_n_acquires_no_cache;
atomic_t fscache_n_acquires_ok;
atomic_t fscache_n_acquires_nobufs;
atomic_t fscache_n_acquires_oom;

atomic_t fscache_n_updates;
atomic_t fscache_n_updates_null;
atomic_t fscache_n_updates_run;

atomic_t fscache_n_relinquishes;
atomic_t fscache_n_relinquishes_null;
atomic_t fscache_n_relinquishes_waitcrt;

atomic_t fscache_n_cookie_index;
atomic_t fscache_n_cookie_data;
atomic_t fscache_n_cookie_special;

atomic_t fscache_n_object_alloc;
atomic_t fscache_n_object_no_alloc;
atomic_t fscache_n_object_lookups;
atomic_t fscache_n_object_lookups_negative;
atomic_t fscache_n_object_lookups_positive;
atomic_t fscache_n_object_created;
atomic_t fscache_n_object_avail;
atomic_t fscache_n_object_dead;

atomic_t fscache_n_checkaux_none;
atomic_t fscache_n_checkaux_okay;
atomic_t fscache_n_checkaux_update;
atomic_t fscache_n_checkaux_obsolete;

/*
 * display the general statistics
 */
static int fscache_stats_show(struct seq_file *m, void *v)
{
	seq_puts(m, "FS-Cache statistics\n");

	seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n",
		   atomic_read(&fscache_n_cookie_index),
		   atomic_read(&fscache_n_cookie_data),
		   atomic_read(&fscache_n_cookie_special));

	seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n",
		   atomic_read(&fscache_n_object_alloc),
		   atomic_read(&fscache_n_object_no_alloc),
		   atomic_read(&fscache_n_object_avail),
		   atomic_read(&fscache_n_object_dead));
	seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n",
		   atomic_read(&fscache_n_checkaux_none),
		   atomic_read(&fscache_n_checkaux_okay),
		   atomic_read(&fscache_n_checkaux_update),
		   atomic_read(&fscache_n_checkaux_obsolete));

	seq_printf(m, "Pages  : mrk=%u unc=%u\n",
		   atomic_read(&fscache_n_marks),
		   atomic_read(&fscache_n_uncaches));

	seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u"
		   " oom=%u\n",
		   atomic_read(&fscache_n_acquires),
		   atomic_read(&fscache_n_acquires_null),
		   atomic_read(&fscache_n_acquires_no_cache),
		   atomic_read(&fscache_n_acquires_ok),
		   atomic_read(&fscache_n_acquires_nobufs),
		   atomic_read(&fscache_n_acquires_oom));

	seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
		   atomic_read(&fscache_n_object_lookups),
		   atomic_read(&fscache_n_object_lookups_negative),
		   atomic_read(&fscache_n_object_lookups_positive),
		   atomic_read(&fscache_n_object_created));

	seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
		   atomic_read(&fscache_n_updates),
		   atomic_read(&fscache_n_updates_null),
		   atomic_read(&fscache_n_updates_run));

	seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
		   atomic_read(&fscache_n_relinquishes),
		   atomic_read(&fscache_n_relinquishes_null),
		   atomic_read(&fscache_n_relinquishes_waitcrt));

	seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
		   atomic_read(&fscache_n_attr_changed),
		   atomic_read(&fscache_n_attr_changed_ok),
		   atomic_read(&fscache_n_attr_changed_nobufs),
		   atomic_read(&fscache_n_attr_changed_nomem),
		   atomic_read(&fscache_n_attr_changed_calls));

	seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
		   atomic_read(&fscache_n_allocs),
		   atomic_read(&fscache_n_allocs_ok),
		   atomic_read(&fscache_n_allocs_wait),
		   atomic_read(&fscache_n_allocs_nobufs));
	seq_printf(m, "Allocs : ops=%u owt=%u\n",
		   atomic_read(&fscache_n_alloc_ops),
		   atomic_read(&fscache_n_alloc_op_waits));

	seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
		   " int=%u oom=%u\n",
		   atomic_read(&fscache_n_retrievals),
		   atomic_read(&fscache_n_retrievals_ok),
		   atomic_read(&fscache_n_retrievals_wait),
		   atomic_read(&fscache_n_retrievals_nodata),
		   atomic_read(&fscache_n_retrievals_nobufs),
		   atomic_read(&fscache_n_retrievals_intr),
		   atomic_read(&fscache_n_retrievals_nomem));
	seq_printf(m, "Retrvls: ops=%u owt=%u\n",
		   atomic_read(&fscache_n_retrieval_ops),
		   atomic_read(&fscache_n_retrieval_op_waits));

	seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
		   atomic_read(&fscache_n_stores),
		   atomic_read(&fscache_n_stores_ok),
		   atomic_read(&fscache_n_stores_again),
		   atomic_read(&fscache_n_stores_nobufs),
		   atomic_read(&fscache_n_stores_oom));
	seq_printf(m, "Stores : ops=%u run=%u\n",
		   atomic_read(&fscache_n_store_ops),
		   atomic_read(&fscache_n_store_calls));

	seq_printf(m, "Ops    : pend=%u run=%u enq=%u\n",
		   atomic_read(&fscache_n_op_pend),
		   atomic_read(&fscache_n_op_run),
		   atomic_read(&fscache_n_op_enqueue));
	seq_printf(m, "Ops    : dfr=%u rel=%u gc=%u\n",
		   atomic_read(&fscache_n_op_deferred_release),
		   atomic_read(&fscache_n_op_release),
		   atomic_read(&fscache_n_op_gc));
	return 0;
}

/*
 * open "/proc/fs/fscache/stats" to provide a statistical summary
 */
static int fscache_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, fscache_stats_show, NULL);
}

const struct file_operations fscache_stats_fops = {
	.owner		= THIS_MODULE,
	.open		= fscache_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index fdff346e96fd..8b8eebc5614b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -224,7 +224,7 @@ static int invalid_nodeid(u64 nodeid)
 	return !nodeid || nodeid == FUSE_ROOT_ID;
 }
 
-struct dentry_operations fuse_dentry_operations = {
+const struct dentry_operations fuse_dentry_operations = {
 	.d_revalidate	= fuse_dentry_revalidate,
 };
 
@@ -1032,6 +1032,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 		fuse_put_request(fc, req);
 		return -ENOMEM;
 	}
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d9fdb7cec538..2b25133524a3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -386,7 +386,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_read_in);
 	req->in.args[0].value = inarg;
-	req->out.argpages = 1;
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = count;
@@ -453,6 +452,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 	attr_ver = fuse_get_attr_version(fc);
 
 	req->out.page_zeroing = 1;
+	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	num_read = fuse_send_read(req, file, inode, pos, count, NULL);
@@ -510,6 +510,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	loff_t pos = page_offset(req->pages[0]);
 	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+
+	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
 	fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
 	req->misc.read.attr_ver = fuse_get_attr_version(fc);
@@ -621,7 +623,6 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
 	inarg->flags = file ? file->f_flags : 0;
 	req->in.h.opcode = FUSE_WRITE;
 	req->in.h.nodeid = get_node_id(inode);
-	req->in.argpages = 1;
 	req->in.numargs = 2;
 	if (fc->minor < 9)
 		req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
@@ -695,6 +696,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	req->page_offset = offset;
@@ -771,6 +773,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 	size_t count = 0;
 	int err;
 
+	req->in.argpages = 1;
 	req->page_offset = offset;
 
 	do {
@@ -935,21 +938,28 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 }
 
 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
-			       unsigned nbytes, int write)
+			       unsigned *nbytesp, int write)
 {
+	unsigned nbytes = *nbytesp;
 	unsigned long user_addr = (unsigned long) buf;
 	unsigned offset = user_addr & ~PAGE_MASK;
 	int npages;
 
-	/* This doesn't work with nfsd */
-	if (!current->mm)
-		return -EPERM;
+	/* Special case for kernel I/O: can copy directly into the buffer */
+	if (segment_eq(get_fs(), KERNEL_DS)) {
+		if (write)
+			req->in.args[1].value = (void *) user_addr;
+		else
+			req->out.args[0].value = (void *) user_addr;
+
+		return 0;
+	}
 
 	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
 	down_read(&current->mm->mmap_sem);
-	npages = get_user_pages(current, current->mm, user_addr, npages, write,
+	npages = get_user_pages(current, current->mm, user_addr, npages, !write,
 				0, req->pages, NULL);
 	up_read(&current->mm->mmap_sem);
 	if (npages < 0)
@@ -957,6 +967,15 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 
 	req->num_pages = npages;
 	req->page_offset = offset;
+
+	if (write)
+		req->in.argpages = 1;
+	else
+		req->out.argpages = 1;
+
+	nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+	*nbytesp = min(*nbytesp, nbytes);
+
 	return 0;
 }
 
@@ -979,15 +998,13 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 
 	while (count) {
 		size_t nres;
-		size_t nbytes_limit = min(count, nmax);
-		size_t nbytes;
-		int err = fuse_get_user_pages(req, buf, nbytes_limit, !write);
+		size_t nbytes = min(count, nmax);
+		int err = fuse_get_user_pages(req, buf, &nbytes, write);
 		if (err) {
 			res = err;
 			break;
 		}
-		nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
-		nbytes = min(nbytes_limit, nbytes);
+
 		if (write)
 			nres = fuse_send_write(req, file, inode, pos, nbytes,
 					       current->files);
@@ -1163,6 +1180,7 @@ static int fuse_writepage_locked(struct page *page)
 	fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1);
 
 	copy_highpage(tmp_page, page);
+	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = tmp_page;
 	req->page_offset = 0;
@@ -1234,8 +1252,9 @@ static void fuse_vma_close(struct vm_area_struct *vma)
  * - sync(2)
  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
  */
-static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	/*
 	 * Don't use page->mapping as it may become NULL from a
 	 * concurrent truncate.
@@ -1273,6 +1292,15 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* Can't provide the coherency needed for MAP_SHARED */
+	if (vma->vm_flags & VM_MAYSHARE)
+		return -ENODEV;
+
+	return generic_file_mmap(file, vma);
+}
+
 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
 				  struct file_lock *fl)
 {
@@ -1465,7 +1493,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
 	case SEEK_END:
 		retval = fuse_update_attributes(inode, NULL, file, NULL);
 		if (retval)
-			return retval;
+			goto exit;
 		offset += i_size_read(inode);
 		break;
 	case SEEK_CUR:
@@ -1479,6 +1507,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
 		}
 		retval = offset;
 	}
+exit:
 	mutex_unlock(&inode->i_mutex);
 	return retval;
 }
@@ -1906,6 +1935,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= fuse_direct_read,
 	.write		= fuse_direct_write,
+	.mmap		= fuse_direct_mmap,
 	.open		= fuse_open,
 	.flush		= fuse_flush,
 	.release	= fuse_release,
@@ -1915,7 +1945,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
-	/* no mmap and splice_read */
+	/* no splice_read */
 };
 
 static const struct address_space_operations fuse_file_aops = {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 5e64b815a5a1..6fc5aedaa0d5 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -493,7 +493,7 @@ static inline u64 get_node_id(struct inode *inode)
 /** Device operations */
 extern const struct file_operations fuse_dev_operations;
 
-extern struct dentry_operations fuse_dentry_operations;
+extern const struct dentry_operations fuse_dentry_operations;
 
 /**
  * Get a filled in inode
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 995d63b2e747..e0b53aa7bbec 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
 	mode_t mode = inode->i_mode;
 	int error;
 
-	inode->i_mode = mode & ~current->fs->umask;
+	inode->i_mode = mode & ~current_umask();
 	if (!S_ISLNK(inode->i_mode))
 		acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
 	if (acl) {
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index e563a6449811..3a981b7f64ca 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,10 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
 	depends on EXPERIMENTAL && (64BIT || LBD)
+	select DLM if GFS2_FS_LOCKING_DLM
+	select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
+	select SYSFS if GFS2_FS_LOCKING_DLM
+	select IP_SCTP if DLM_SCTP
 	select FS_POSIX_ACL
 	select CRC32
 	help
@@ -18,17 +22,16 @@ config GFS2_FS
 	  the locking module below. Documentation and utilities for GFS2 can
 	  be found here: http://sources.redhat.com/cluster
 
-	  The "nolock" lock module is now built in to GFS2 by default.
+	  The "nolock" lock module is now built in to GFS2 by default. If
+	  you want to use the DLM, be sure to enable HOTPLUG and IPv4/6
+	  networking.
 
 config GFS2_FS_LOCKING_DLM
-	tristate "GFS2 DLM locking module"
-	depends on GFS2_FS && SYSFS && NET && INET && (IPV6 || IPV6=n)
-	select IP_SCTP if DLM_SCTP
-	select CONFIGFS_FS
-	select DLM
+	bool "GFS2 DLM locking"
+	depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && HOTPLUG
 	help
 	  Multiple node locking module for GFS2
 
-	  Most users of GFS2 will require this module. It provides the locking
+	  Most users of GFS2 will require this. It provides the locking
 	  interface between GFS2 and the DLM, which is required to use GFS2
 	  in a cluster environment.
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index c1b4ec6a9650..a851ea4bdf70 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,9 +1,9 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
-	glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
+	glops.o inode.o log.o lops.o main.o meta_io.o \
 	mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
 	ops_fstype.o ops_inode.o ops_super.o quota.o \
 	recovery.o rgrp.o super.o sys.o trans.o util.o
 
-obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
+gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
 
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index e335dceb6a4f..fa881bdc3d85 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -15,7 +15,6 @@
 #include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -216,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	if (error)
 		return error;
 	if (!acl) {
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		if (mode != ip->i_inode.i_mode)
 			error = munge_mode(ip, mode);
 		return error;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 11ffc56f1f81..3a5d3f883e10 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b7c8e5c70791..aef4d0c06748 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -60,7 +60,6 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/vmalloc.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index f114ba2b3557..dee9b03e5b37 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -14,7 +14,6 @@
 #include <linux/capability.h>
 #include <linux/xattr.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 0d1c76d906ae..899763aed217 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/xattr.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6b983aef785d..3984e47d1d33 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -10,7 +10,6 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/delay.h>
 #include <linux/sort.h>
@@ -18,7 +17,6 @@
 #include <linux/kallsyms.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/list.h>
-#include <linux/lm_interface.h>
 #include <linux/wait.h>
 #include <linux/module.h>
 #include <linux/rwsem.h>
@@ -155,13 +153,10 @@ static void glock_free(struct gfs2_glock *gl)
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct inode *aspace = gl->gl_aspace;
 
-	if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
-		sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
-
 	if (aspace)
 		gfs2_aspace_put(aspace);
 
-	kmem_cache_free(gfs2_glock_cachep, gl);
+	sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
 }
 
 /**
@@ -172,6 +167,7 @@ static void glock_free(struct gfs2_glock *gl)
 
 static void gfs2_glock_hold(struct gfs2_glock *gl)
 {
+	GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
 	atomic_inc(&gl->gl_ref);
 }
 
@@ -211,17 +207,15 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 			atomic_dec(&lru_count);
 		}
 		spin_unlock(&lru_lock);
-		GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
-		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
 		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
 		glock_free(gl);
 		rv = 1;
 		goto out;
 	}
-	write_unlock(gl_lock_addr(gl->gl_hash));
 	/* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
 	if (atomic_read(&gl->gl_ref) == 2)
 		gfs2_glock_schedule_for_reclaim(gl);
+	write_unlock(gl_lock_addr(gl->gl_hash));
 out:
 	return rv;
 }
@@ -256,27 +250,6 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
 }
 
 /**
- * gfs2_glock_find() - Find glock by lock number
- * @sdp: The GFS2 superblock
- * @name: The lock name
- *
- * Returns: NULL, or the struct gfs2_glock with the requested number
- */
-
-static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
-					  const struct lm_lockname *name)
-{
-	unsigned int hash = gl_hash(sdp, name);
-	struct gfs2_glock *gl;
-
-	read_lock(gl_lock_addr(hash));
-	gl = search_bucket(hash, sdp, name);
-	read_unlock(gl_lock_addr(hash));
-
-	return gl;
-}
-
-/**
  * may_grant - check if its ok to grant a new lock
  * @gl: The glock
  * @gh: The lock request which we wish to grant
@@ -523,7 +496,7 @@ out_locked:
 }
 
 static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-				 unsigned int cur_state, unsigned int req_state,
+				 unsigned int req_state,
 				 unsigned int flags)
 {
 	int ret = LM_OUT_ERROR;
@@ -532,7 +505,7 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
 		return req_state == LM_ST_UNLOCKED ? 0 : req_state;
 
 	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+		ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
 							 req_state, flags);
 	return ret;
 }
@@ -575,7 +548,7 @@ __acquires(&gl->gl_spin)
 	    gl->gl_state == LM_ST_DEFERRED) &&
 	    !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
 		lck_flags |= LM_FLAG_TRY_1CB;
-	ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);
+	ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
 
 	if (!(ret & LM_OUT_ASYNC)) {
 		finish_xmote(gl, ret);
@@ -624,10 +597,11 @@ __acquires(&gl->gl_spin)
 
 	GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
 
+	down_read(&gfs2_umount_flush_sem);
 	if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_demote_state != gl->gl_state) {
 		if (find_first_holder(gl))
-			goto out;
+			goto out_unlock;
 		if (nonblock)
 			goto out_sched;
 		set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
@@ -638,23 +612,26 @@ __acquires(&gl->gl_spin)
 			gfs2_demote_wake(gl);
 		ret = do_promote(gl);
 		if (ret == 0)
-			goto out;
+			goto out_unlock;
 		if (ret == 2)
-			return;
+			goto out_sem;
 		gh = find_first_waiter(gl);
 		gl->gl_target = gh->gh_state;
 		if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
 			do_error(gl, 0); /* Fail queued try locks */
 	}
 	do_xmote(gl, gh, gl->gl_target);
+out_sem:
+	up_read(&gfs2_umount_flush_sem);
 	return;
 
 out_sched:
 	gfs2_glock_hold(gl);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 		gfs2_glock_put(gl);
-out:
+out_unlock:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
+	goto out_sem;
 }
 
 static void glock_work_func(struct work_struct *work)
@@ -681,18 +658,6 @@ static void glock_work_func(struct work_struct *work)
 		gfs2_glock_put(gl);
 }
 
-static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-			    void **lockp)
-{
-	int error = -EIO;
-	if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
-		return 0;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
-				sdp->sd_lockstruct.ls_lockspace, name, lockp);
-	return error;
-}
-
 /**
  * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
  * @sdp: The GFS2 superblock
@@ -719,10 +684,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl = search_bucket(hash, sdp, &name);
 	read_unlock(gl_lock_addr(hash));
 
-	if (gl || !create) {
-		*glp = gl;
+	*glp = gl;
+	if (gl)
 		return 0;
-	}
+	if (!create)
+		return -ENOENT;
 
 	gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
 	if (!gl)
@@ -736,7 +702,9 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_demote_state = LM_ST_EXCLUSIVE;
 	gl->gl_hash = hash;
 	gl->gl_ops = glops;
-	gl->gl_stamp = jiffies;
+	snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number);
+	memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
+	gl->gl_lksb.sb_lvbptr = gl->gl_lvb;
 	gl->gl_tchange = jiffies;
 	gl->gl_object = NULL;
 	gl->gl_sbd = sdp;
@@ -753,10 +721,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		}
 	}
 
-	error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
-	if (error)
-		goto fail_aspace;
-
 	write_lock(gl_lock_addr(hash));
 	tmp = search_bucket(hash, sdp, &name);
 	if (tmp) {
@@ -772,9 +736,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 
 	return 0;
 
-fail_aspace:
-	if (gl->gl_aspace)
-		gfs2_aspace_put(gl->gl_aspace);
 fail:
 	kmem_cache_free(gfs2_glock_cachep, gl);
 	return error;
@@ -966,7 +927,7 @@ do_cancel:
 	if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
 		spin_unlock(&gl->gl_spin);
 		if (sdp->sd_lockstruct.ls_ops->lm_cancel)
-			sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
+			sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
 		spin_lock(&gl->gl_spin);
 	}
 	return;
@@ -1051,7 +1012,6 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 		spin_lock(&gl->gl_spin);
 		clear_bit(GLF_LOCK, &gl->gl_flags);
 	}
-	gl->gl_stamp = jiffies;
 	if (list_empty(&gl->gl_holders) &&
 	    !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
@@ -1240,70 +1200,13 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
 		gfs2_glock_dq_uninit(&ghs[x]);
 }
 
-static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
-{
-	int error = -EIO;
-	if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
-		return 0;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
-	return error;
-}
-
-/**
- * gfs2_lvb_hold - attach a LVB from a glock
- * @gl: The glock in question
- *
- */
-
-int gfs2_lvb_hold(struct gfs2_glock *gl)
-{
-	int error;
-
-	if (!atomic_read(&gl->gl_lvb_count)) {
-		error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
-		if (error)
-			return error;
-		gfs2_glock_hold(gl);
-	}
-	atomic_inc(&gl->gl_lvb_count);
-
-	return 0;
-}
-
-/**
- * gfs2_lvb_unhold - detach a LVB from a glock
- * @gl: The glock in question
- *
- */
-
-void gfs2_lvb_unhold(struct gfs2_glock *gl)
-{
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-
-	gfs2_glock_hold(gl);
-	gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
-	if (atomic_dec_and_test(&gl->gl_lvb_count)) {
-		if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
-			sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
-		gl->gl_lvb = NULL;
-		gfs2_glock_put(gl);
-	}
-	gfs2_glock_put(gl);
-}
-
-static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
-			unsigned int state)
+void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 {
-	struct gfs2_glock *gl;
 	unsigned long delay = 0;
 	unsigned long holdtime;
 	unsigned long now = jiffies;
 
-	gl = gfs2_glock_find(sdp, name);
-	if (!gl)
-		return;
-
+	gfs2_glock_hold(gl);
 	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
 	if (time_before(now, holdtime))
 		delay = holdtime - now;
@@ -1317,74 +1220,33 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
 	gfs2_glock_put(gl);
 }
 
-static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
-{
-	struct gfs2_jdesc *jd;
-
-	spin_lock(&sdp->sd_jindex_spin);
-	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-		if (jd->jd_jid != jid)
-			continue;
-		jd->jd_dirty = 1;
-		break;
-	}
-	spin_unlock(&sdp->sd_jindex_spin);
-}
-
 /**
- * gfs2_glock_cb - Callback used by locking module
- * @sdp: Pointer to the superblock
- * @type: Type of callback
- * @data: Type dependent data pointer
+ * gfs2_glock_complete - Callback used by locking
+ * @gl: Pointer to the glock
+ * @ret: The return value from the dlm
  *
- * Called by the locking module when it wants to tell us something.
- * Either we need to drop a lock, one of our ASYNC requests completed, or
- * a journal from another client needs to be recovered.
 */
 
-void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
+void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
-	struct gfs2_sbd *sdp = cb_data;
-
-	switch (type) {
-	case LM_CB_NEED_E:
-		blocking_cb(sdp, data, LM_ST_UNLOCKED);
-		return;
-
-	case LM_CB_NEED_D:
-		blocking_cb(sdp, data, LM_ST_DEFERRED);
-		return;
-
-	case LM_CB_NEED_S:
-		blocking_cb(sdp, data, LM_ST_SHARED);
-		return;
-
-	case LM_CB_ASYNC: {
-		struct lm_async_cb *async = data;
-		struct gfs2_glock *gl;
-
-		down_read(&gfs2_umount_flush_sem);
-		gl = gfs2_glock_find(sdp, &async->lc_name);
-		if (gfs2_assert_warn(sdp, gl))
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	gl->gl_reply = ret;
+	if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
+		struct gfs2_holder *gh;
+		spin_lock(&gl->gl_spin);
+		gh = find_first_waiter(gl);
+		if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) &&
+		     (gl->gl_target != LM_ST_UNLOCKED)) ||
+		    ((ret & ~LM_OUT_ST_MASK) != 0))
+			set_bit(GLF_FROZEN, &gl->gl_flags);
+		spin_unlock(&gl->gl_spin);
+		if (test_bit(GLF_FROZEN, &gl->gl_flags))
 			return;
-		gl->gl_reply = async->lc_ret;
-		set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
-		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-			gfs2_glock_put(gl);
-		up_read(&gfs2_umount_flush_sem);
-		return;
-	}
-
-	case LM_CB_NEED_RECOVERY:
-		gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
-		if (sdp->sd_recoverd_process)
-			wake_up_process(sdp->sd_recoverd_process);
-		return;
-
-	default:
-		gfs2_assert_warn(sdp, 0);
-		return;
 	}
+	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+	gfs2_glock_hold(gl);
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+		gfs2_glock_put(gl);
 }
 
 /**
@@ -1515,6 +1377,25 @@ out:
 	return has_entries;
 }
 
+
+/**
+ * thaw_glock - thaw out a glock which has an unprocessed reply waiting
+ * @gl: The glock to thaw
+ *
+ * N.B. When we freeze a glock, we leave a ref to the glock outstanding,
+ * so this has to result in the ref count being dropped by one.
+ */
+
+static void thaw_glock(struct gfs2_glock *gl)
+{
+	if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+		return;
+	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+	gfs2_glock_hold(gl);
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+		gfs2_glock_put(gl);
+}
+
 /**
  * clear_glock - look at a glock and see if we can free it from glock cache
  * @gl: the glock to look at
@@ -1540,6 +1421,20 @@ static void clear_glock(struct gfs2_glock *gl)
1540} 1421}
1541 1422
1542/** 1423/**
1424 * gfs2_glock_thaw - Thaw any frozen glocks
1425 * @sdp: The super block
1426 *
1427 */
1428
1429void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1430{
1431 unsigned x;
1432
1433 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1434 examine_bucket(thaw_glock, sdp, x);
1435}
1436
1437/**
1543 * gfs2_gl_hash_clear - Empty out the glock hash table 1438 * gfs2_gl_hash_clear - Empty out the glock hash table
1544 * @sdp: the filesystem 1439 * @sdp: the filesystem
1545 * @wait: wait until it's all gone 1440 * @wait: wait until it's all gone
@@ -1619,7 +1514,7 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1619 if (flags & LM_FLAG_NOEXP) 1514 if (flags & LM_FLAG_NOEXP)
1620 *p++ = 'e'; 1515 *p++ = 'e';
1621 if (flags & LM_FLAG_ANY) 1516 if (flags & LM_FLAG_ANY)
1622 *p++ = 'a'; 1517 *p++ = 'A';
1623 if (flags & LM_FLAG_PRIORITY) 1518 if (flags & LM_FLAG_PRIORITY)
1624 *p++ = 'p'; 1519 *p++ = 'p';
1625 if (flags & GL_ASYNC) 1520 if (flags & GL_ASYNC)
@@ -1683,6 +1578,10 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1683 *p++ = 'i'; 1578 *p++ = 'i';
1684 if (test_bit(GLF_REPLY_PENDING, gflags)) 1579 if (test_bit(GLF_REPLY_PENDING, gflags))
1685 *p++ = 'r'; 1580 *p++ = 'r';
1581 if (test_bit(GLF_INITIAL, gflags))
1582 *p++ = 'I';
1583 if (test_bit(GLF_FROZEN, gflags))
1584 *p++ = 'F';
1686 *p = 0; 1585 *p = 0;
1687 return buf; 1586 return buf;
1688} 1587}
@@ -1717,14 +1616,13 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1717 dtime *= 1000000/HZ; /* demote time in uSec */ 1616 dtime *= 1000000/HZ; /* demote time in uSec */
1718 if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) 1617 if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
1719 dtime = 0; 1618 dtime = 0;
1720 gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n", 1619 gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n",
1721 state2str(gl->gl_state), 1620 state2str(gl->gl_state),
1722 gl->gl_name.ln_type, 1621 gl->gl_name.ln_type,
1723 (unsigned long long)gl->gl_name.ln_number, 1622 (unsigned long long)gl->gl_name.ln_number,
1724 gflags2str(gflags_buf, &gl->gl_flags), 1623 gflags2str(gflags_buf, &gl->gl_flags),
1725 state2str(gl->gl_target), 1624 state2str(gl->gl_target),
1726 state2str(gl->gl_demote_state), dtime, 1625 state2str(gl->gl_demote_state), dtime,
1727 atomic_read(&gl->gl_lvb_count),
1728 atomic_read(&gl->gl_ail_count), 1626 atomic_read(&gl->gl_ail_count),
1729 atomic_read(&gl->gl_ref)); 1627 atomic_read(&gl->gl_ref));
1730 1628
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 543ec7ecfbda..a602a28f6f08 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -11,15 +11,130 @@
11#define __GLOCK_DOT_H__ 11#define __GLOCK_DOT_H__
12 12
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/parser.h>
14#include "incore.h" 15#include "incore.h"
15 16
16/* Flags for lock requests; used in gfs2_holder gh_flag field. 17/* Options for hostdata parser */
17 From lm_interface.h: 18
19enum {
20 Opt_jid,
21 Opt_id,
22 Opt_first,
23 Opt_nodir,
24 Opt_err,
25};
26
27/*
28 * lm_lockname types
29 */
30
31#define LM_TYPE_RESERVED 0x00
32#define LM_TYPE_NONDISK 0x01
33#define LM_TYPE_INODE 0x02
34#define LM_TYPE_RGRP 0x03
35#define LM_TYPE_META 0x04
36#define LM_TYPE_IOPEN 0x05
37#define LM_TYPE_FLOCK 0x06
38#define LM_TYPE_PLOCK 0x07
39#define LM_TYPE_QUOTA 0x08
40#define LM_TYPE_JOURNAL 0x09
41
42/*
43 * lm_lock() states
44 *
45 * SHARED is compatible with SHARED, not with DEFERRED or EX.
46 * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
47 */
48
49#define LM_ST_UNLOCKED 0
50#define LM_ST_EXCLUSIVE 1
51#define LM_ST_DEFERRED 2
52#define LM_ST_SHARED 3
53
54/*
55 * lm_lock() flags
56 *
57 * LM_FLAG_TRY
58 * Don't wait to acquire the lock if it can't be granted immediately.
59 *
60 * LM_FLAG_TRY_1CB
61 * Send one blocking callback if TRY is set and the lock is not granted.
62 *
63 * LM_FLAG_NOEXP
64 * GFS sets this flag on lock requests it makes while doing journal recovery.
 65 * These special requests should not be blocked by the recovery the way
 66 * ordinary locks would be.
67 *
68 * LM_FLAG_ANY
69 * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
70 * also be granted in SHARED. The preferred state is whichever is compatible
71 * with other granted locks, or the specified state if no other locks exist.
72 *
73 * LM_FLAG_PRIORITY
74 * Override fairness considerations. Suppose a lock is held in a shared state
75 * and there is a pending request for the deferred state. A shared lock
76 * request with the priority flag would be allowed to bypass the deferred
77 * request and directly join the other shared lock. A shared lock request
78 * without the priority flag might be forced to wait until the deferred
 79 * request had acquired and released the lock.
80 */
81
18#define LM_FLAG_TRY 0x00000001 82#define LM_FLAG_TRY 0x00000001
19#define LM_FLAG_TRY_1CB 0x00000002 83#define LM_FLAG_TRY_1CB 0x00000002
20#define LM_FLAG_NOEXP 0x00000004 84#define LM_FLAG_NOEXP 0x00000004
21#define LM_FLAG_ANY 0x00000008 85#define LM_FLAG_ANY 0x00000008
22#define LM_FLAG_PRIORITY 0x00000010 */ 86#define LM_FLAG_PRIORITY 0x00000010
87#define GL_ASYNC 0x00000040
88#define GL_EXACT 0x00000080
89#define GL_SKIP 0x00000100
90#define GL_ATIME 0x00000200
91#define GL_NOCACHE 0x00000400
92
93/*
94 * lm_lock() and lm_async_cb return flags
95 *
96 * LM_OUT_ST_MASK
97 * Masks the lower two bits of lock state in the returned value.
98 *
99 * LM_OUT_CANCELED
100 * The lock request was canceled.
101 *
102 * LM_OUT_ASYNC
103 * The result of the request will be returned in an LM_CB_ASYNC callback.
104 *
105 */
106
107#define LM_OUT_ST_MASK 0x00000003
108#define LM_OUT_CANCELED 0x00000008
109#define LM_OUT_ASYNC 0x00000080
110#define LM_OUT_ERROR 0x00000100
111
112/*
113 * lm_recovery_done() messages
114 */
115
116#define LM_RD_GAVEUP 308
117#define LM_RD_SUCCESS 309
118
119#define GLR_TRYFAILED 13
120
121struct lm_lockops {
122 const char *lm_proto_name;
123 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
124 void (*lm_unmount) (struct gfs2_sbd *sdp);
125 void (*lm_withdraw) (struct gfs2_sbd *sdp);
126 void (*lm_put_lock) (struct kmem_cache *cachep, void *gl);
127 unsigned int (*lm_lock) (struct gfs2_glock *gl,
128 unsigned int req_state, unsigned int flags);
129 void (*lm_cancel) (struct gfs2_glock *gl);
130 const match_table_t *lm_tokens;
131};
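For orientation, a minimal hypothetical backend implementing this interface: single node, so every request is granted exactly as asked. That the remaining hooks may be left NULL is an assumption of this sketch, not something this patch guarantees.

	static unsigned int example_lock(struct gfs2_glock *gl,
					 unsigned int req_state, unsigned int flags)
	{
		/* no other nodes to wait for: complete immediately */
		gfs2_glock_complete(gl, req_state);
		return LM_OUT_ASYNC;	/* result arrives via the completion */
	}

	static void example_put_lock(struct kmem_cache *cachep, void *gl)
	{
		kmem_cache_free(cachep, gl);	/* nothing held externally */
	}

	static const struct lm_lockops example_ops = {
		.lm_proto_name	= "lock_example",
		.lm_lock	= example_lock,
		.lm_put_lock	= example_put_lock,
	};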
132
133#define LM_FLAG_TRY 0x00000001
134#define LM_FLAG_TRY_1CB 0x00000002
135#define LM_FLAG_NOEXP 0x00000004
136#define LM_FLAG_ANY 0x00000008
137#define LM_FLAG_PRIORITY 0x00000010
23 138
24#define GL_ASYNC 0x00000040 139#define GL_ASYNC 0x00000040
25#define GL_EXACT 0x00000080 140#define GL_EXACT 0x00000080
@@ -128,10 +243,12 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
128int gfs2_lvb_hold(struct gfs2_glock *gl); 243int gfs2_lvb_hold(struct gfs2_glock *gl);
129void gfs2_lvb_unhold(struct gfs2_glock *gl); 244void gfs2_lvb_unhold(struct gfs2_glock *gl);
130 245
131void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); 246void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
247void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
132void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 248void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
133void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 249void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
134void gfs2_glock_finish_truncate(struct gfs2_inode *ip); 250void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
251void gfs2_glock_thaw(struct gfs2_sbd *sdp);
135 252
136int __init gfs2_glock_init(void); 253int __init gfs2_glock_init(void);
137void gfs2_glock_exit(void); 254void gfs2_glock_exit(void);
@@ -141,4 +258,6 @@ void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
141int gfs2_register_debugfs(void); 258int gfs2_register_debugfs(void);
142void gfs2_unregister_debugfs(void); 259void gfs2_unregister_debugfs(void);
143 260
261extern const struct lm_lockops gfs2_dlm_ops;
262
144#endif /* __GLOCK_DOT_H__ */ 263#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 8522d3aa64fc..bf23a62aa925 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/lm_interface.h>
16#include <linux/bio.h> 15#include <linux/bio.h>
17 16
18#include "gfs2.h" 17#include "gfs2.h"
@@ -38,20 +37,25 @@
38static void gfs2_ail_empty_gl(struct gfs2_glock *gl) 37static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
39{ 38{
40 struct gfs2_sbd *sdp = gl->gl_sbd; 39 struct gfs2_sbd *sdp = gl->gl_sbd;
41 unsigned int blocks;
42 struct list_head *head = &gl->gl_ail_list; 40 struct list_head *head = &gl->gl_ail_list;
43 struct gfs2_bufdata *bd; 41 struct gfs2_bufdata *bd;
44 struct buffer_head *bh; 42 struct buffer_head *bh;
45 int error; 43 struct gfs2_trans tr;
46 44
47 blocks = atomic_read(&gl->gl_ail_count); 45 memset(&tr, 0, sizeof(tr));
48 if (!blocks) 46 tr.tr_revokes = atomic_read(&gl->gl_ail_count);
49 return;
50 47
51 error = gfs2_trans_begin(sdp, 0, blocks); 48 if (!tr.tr_revokes)
52 if (gfs2_assert_withdraw(sdp, !error))
53 return; 49 return;
54 50
51 /* A shortened, inline version of gfs2_trans_begin() */
52 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
53 tr.tr_ip = (unsigned long)__builtin_return_address(0);
54 INIT_LIST_HEAD(&tr.tr_list_buf);
55 gfs2_log_reserve(sdp, tr.tr_reserved);
56 BUG_ON(current->journal_info);
57 current->journal_info = &tr;
58
55 gfs2_log_lock(sdp); 59 gfs2_log_lock(sdp);
56 while (!list_empty(head)) { 60 while (!list_empty(head)) {
57 bd = list_entry(head->next, struct gfs2_bufdata, 61 bd = list_entry(head->next, struct gfs2_bufdata,
@@ -72,29 +76,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
72} 76}
73 77
74/** 78/**
75 * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock 79 * rgrp_go_sync - sync out the metadata for this glock
76 * @gl: the glock
77 *
78 */
79
80static void gfs2_pte_inval(struct gfs2_glock *gl)
81{
82 struct gfs2_inode *ip;
83 struct inode *inode;
84
85 ip = gl->gl_object;
86 inode = &ip->i_inode;
87 if (!ip || !S_ISREG(inode->i_mode))
88 return;
89
90 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
91 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
92 set_bit(GLF_DIRTY, &gl->gl_flags);
93
94}
95
96/**
97 * meta_go_sync - sync out the metadata for this glock
98 * @gl: the glock 80 * @gl: the glock
99 * 81 *
100 * Called when demoting or unlocking an EX glock. We must flush 82 * Called when demoting or unlocking an EX glock. We must flush
@@ -102,36 +84,42 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
102 * not return to caller to demote/unlock the glock until I/O is complete. 84 * not return to caller to demote/unlock the glock until I/O is complete.
103 */ 85 */
104 86
105static void meta_go_sync(struct gfs2_glock *gl) 87static void rgrp_go_sync(struct gfs2_glock *gl)
106{ 88{
107 if (gl->gl_state != LM_ST_EXCLUSIVE) 89 struct address_space *metamapping = gl->gl_aspace->i_mapping;
90 int error;
91
92 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
108 return; 93 return;
94 BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE);
109 95
110 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) { 96 gfs2_log_flush(gl->gl_sbd, gl);
111 gfs2_log_flush(gl->gl_sbd, gl); 97 filemap_fdatawrite(metamapping);
112 gfs2_meta_sync(gl); 98 error = filemap_fdatawait(metamapping);
113 gfs2_ail_empty_gl(gl); 99 mapping_set_error(metamapping, error);
114 } 100 gfs2_ail_empty_gl(gl);
115} 101}
116 102
117/** 103/**
118 * meta_go_inval - invalidate the metadata for this glock 104 * rgrp_go_inval - invalidate the metadata for this glock
119 * @gl: the glock 105 * @gl: the glock
120 * @flags: 106 * @flags:
121 * 107 *
 108 * We never use LM_ST_DEFERRED with resource groups, so we
109 * should always see the metadata flag set here.
110 *
122 */ 111 */
123 112
124static void meta_go_inval(struct gfs2_glock *gl, int flags) 113static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
125{ 114{
126 if (!(flags & DIO_METADATA)) 115 struct address_space *mapping = gl->gl_aspace->i_mapping;
127 return;
128 116
129 gfs2_meta_inval(gl); 117 BUG_ON(!(flags & DIO_METADATA));
130 if (gl->gl_object == GFS2_I(gl->gl_sbd->sd_rindex)) 118 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
131 gl->gl_sbd->sd_rindex_uptodate = 0; 119 truncate_inode_pages(mapping, 0);
132 else if (gl->gl_ops == &gfs2_rgrp_glops && gl->gl_object) {
133 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
134 120
121 if (gl->gl_object) {
122 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
135 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 123 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
136 } 124 }
137} 125}
@@ -148,48 +136,54 @@ static void inode_go_sync(struct gfs2_glock *gl)
148 struct address_space *metamapping = gl->gl_aspace->i_mapping; 136 struct address_space *metamapping = gl->gl_aspace->i_mapping;
149 int error; 137 int error;
150 138
151 if (gl->gl_state != LM_ST_UNLOCKED)
152 gfs2_pte_inval(gl);
153 if (gl->gl_state != LM_ST_EXCLUSIVE)
154 return;
155
156 if (ip && !S_ISREG(ip->i_inode.i_mode)) 139 if (ip && !S_ISREG(ip->i_inode.i_mode))
157 ip = NULL; 140 ip = NULL;
141 if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
142 unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
143 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
144 return;
158 145
159 if (test_bit(GLF_DIRTY, &gl->gl_flags)) { 146 BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE);
160 gfs2_log_flush(gl->gl_sbd, gl); 147
161 filemap_fdatawrite(metamapping); 148 gfs2_log_flush(gl->gl_sbd, gl);
162 if (ip) { 149 filemap_fdatawrite(metamapping);
163 struct address_space *mapping = ip->i_inode.i_mapping; 150 if (ip) {
164 filemap_fdatawrite(mapping); 151 struct address_space *mapping = ip->i_inode.i_mapping;
165 error = filemap_fdatawait(mapping); 152 filemap_fdatawrite(mapping);
166 mapping_set_error(mapping, error); 153 error = filemap_fdatawait(mapping);
167 } 154 mapping_set_error(mapping, error);
168 error = filemap_fdatawait(metamapping);
169 mapping_set_error(metamapping, error);
170 clear_bit(GLF_DIRTY, &gl->gl_flags);
171 gfs2_ail_empty_gl(gl);
172 } 155 }
156 error = filemap_fdatawait(metamapping);
157 mapping_set_error(metamapping, error);
158 gfs2_ail_empty_gl(gl);
173} 159}
174 160
175/** 161/**
176 * inode_go_inval - prepare a inode glock to be released 162 * inode_go_inval - prepare a inode glock to be released
177 * @gl: the glock 163 * @gl: the glock
178 * @flags: 164 * @flags:
165 *
 166 * Normally we invalidate everything, but if we are moving into
167 * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we
168 * can keep hold of the metadata, since it won't have changed.
179 * 169 *
180 */ 170 */
181 171
182static void inode_go_inval(struct gfs2_glock *gl, int flags) 172static void inode_go_inval(struct gfs2_glock *gl, int flags)
183{ 173{
184 struct gfs2_inode *ip = gl->gl_object; 174 struct gfs2_inode *ip = gl->gl_object;
185 int meta = (flags & DIO_METADATA);
186 175
187 if (meta) { 176 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
188 gfs2_meta_inval(gl); 177
178 if (flags & DIO_METADATA) {
179 struct address_space *mapping = gl->gl_aspace->i_mapping;
180 truncate_inode_pages(mapping, 0);
189 if (ip) 181 if (ip)
190 set_bit(GIF_INVALID, &ip->i_flags); 182 set_bit(GIF_INVALID, &ip->i_flags);
191 } 183 }
192 184
185 if (ip == GFS2_I(gl->gl_sbd->sd_rindex))
186 gl->gl_sbd->sd_rindex_uptodate = 0;
193 if (ip && S_ISREG(ip->i_inode.i_mode)) 187 if (ip && S_ISREG(ip->i_inode.i_mode))
194 truncate_inode_pages(ip->i_inode.i_mapping, 0); 188 truncate_inode_pages(ip->i_inode.i_mapping, 0);
195} 189}
@@ -390,20 +384,7 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
390 return 0; 384 return 0;
391} 385}
392 386
393/**
394 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
395 * @gl: the glock
396 *
397 * Returns: 1 if it's ok
398 */
399
400static int quota_go_demote_ok(const struct gfs2_glock *gl)
401{
402 return !atomic_read(&gl->gl_lvb_count);
403}
404
405const struct gfs2_glock_operations gfs2_meta_glops = { 387const struct gfs2_glock_operations gfs2_meta_glops = {
406 .go_xmote_th = meta_go_sync,
407 .go_type = LM_TYPE_META, 388 .go_type = LM_TYPE_META,
408}; 389};
409 390
@@ -418,8 +399,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
418}; 399};
419 400
420const struct gfs2_glock_operations gfs2_rgrp_glops = { 401const struct gfs2_glock_operations gfs2_rgrp_glops = {
421 .go_xmote_th = meta_go_sync, 402 .go_xmote_th = rgrp_go_sync,
422 .go_inval = meta_go_inval, 403 .go_inval = rgrp_go_inval,
423 .go_demote_ok = rgrp_go_demote_ok, 404 .go_demote_ok = rgrp_go_demote_ok,
424 .go_lock = rgrp_go_lock, 405 .go_lock = rgrp_go_lock,
425 .go_unlock = rgrp_go_unlock, 406 .go_unlock = rgrp_go_unlock,
@@ -448,7 +429,6 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = {
448}; 429};
449 430
450const struct gfs2_glock_operations gfs2_quota_glops = { 431const struct gfs2_glock_operations gfs2_quota_glops = {
451 .go_demote_ok = quota_go_demote_ok,
452 .go_type = LM_TYPE_QUOTA, 432 .go_type = LM_TYPE_QUOTA,
453}; 433};
454 434
@@ -456,3 +436,15 @@ const struct gfs2_glock_operations gfs2_journal_glops = {
456 .go_type = LM_TYPE_JOURNAL, 436 .go_type = LM_TYPE_JOURNAL,
457}; 437};
458 438
439const struct gfs2_glock_operations *gfs2_glops_list[] = {
440 [LM_TYPE_META] = &gfs2_meta_glops,
441 [LM_TYPE_INODE] = &gfs2_inode_glops,
442 [LM_TYPE_RGRP] = &gfs2_rgrp_glops,
443 [LM_TYPE_NONDISK] = &gfs2_trans_glops,
444 [LM_TYPE_IOPEN] = &gfs2_iopen_glops,
445 [LM_TYPE_FLOCK] = &gfs2_flock_glops,
446 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
447 [LM_TYPE_QUOTA] = &gfs2_quota_glops,
448 [LM_TYPE_JOURNAL] = &gfs2_journal_glops,
449};
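Two notes on this table: C designated initializers let a later entry override an earlier one for the same index, so [LM_TYPE_NONDISK] ends up pointing at gfs2_nondisk_glops here; and the array gives glock creation a direct type-to-ops mapping, e.g. (bounds checking assumed to be the caller's job):

	const struct gfs2_glock_operations *glops = gfs2_glops_list[LM_TYPE_INODE];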
450
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index a1d9b5b024e6..b3aa2e3210fd 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -21,5 +21,6 @@ extern const struct gfs2_glock_operations gfs2_flock_glops;
21extern const struct gfs2_glock_operations gfs2_nondisk_glops; 21extern const struct gfs2_glock_operations gfs2_nondisk_glops;
22extern const struct gfs2_glock_operations gfs2_quota_glops; 22extern const struct gfs2_glock_operations gfs2_quota_glops;
23extern const struct gfs2_glock_operations gfs2_journal_glops; 23extern const struct gfs2_glock_operations gfs2_journal_glops;
24extern const struct gfs2_glock_operations *gfs2_glops_list[];
24 25
25#endif /* __GLOPS_DOT_H__ */ 26#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 608849d00021..399d1b978049 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,6 +12,8 @@
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/workqueue.h> 14#include <linux/workqueue.h>
15#include <linux/dlm.h>
16#include <linux/buffer_head.h>
15 17
16#define DIO_WAIT 0x00000010 18#define DIO_WAIT 0x00000010
17#define DIO_METADATA 0x00000020 19#define DIO_METADATA 0x00000020
@@ -26,6 +28,7 @@ struct gfs2_trans;
26struct gfs2_ail; 28struct gfs2_ail;
27struct gfs2_jdesc; 29struct gfs2_jdesc;
28struct gfs2_sbd; 30struct gfs2_sbd;
31struct lm_lockops;
29 32
30typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); 33typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
31 34
@@ -121,6 +124,28 @@ struct gfs2_bufdata {
121 struct list_head bd_ail_gl_list; 124 struct list_head bd_ail_gl_list;
122}; 125};
123 126
127/*
128 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
129 * prefix of lock_dlm_ gets awkward.
130 */
131
132#define GDLM_STRNAME_BYTES 25
133#define GDLM_LVB_SIZE 32
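/* Note: GDLM_STRNAME_BYTES is sized for the "%8x%16llx" resource-name
 * format (see make_strname() in the removed lock.c below): 8 hex digits
 * of lock type + 16 of lock number + a trailing NUL = 25 bytes. */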
134
135enum {
136 DFL_BLOCK_LOCKS = 0,
137};
138
139struct lm_lockname {
140 u64 ln_number;
141 unsigned int ln_type;
142};
143
144#define lm_name_equal(name1, name2) \
145 (((name1)->ln_number == (name2)->ln_number) && \
146 ((name1)->ln_type == (name2)->ln_type))
147
148
124struct gfs2_glock_operations { 149struct gfs2_glock_operations {
125 void (*go_xmote_th) (struct gfs2_glock *gl); 150 void (*go_xmote_th) (struct gfs2_glock *gl);
126 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); 151 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
@@ -162,6 +187,8 @@ enum {
162 GLF_LFLUSH = 7, 187 GLF_LFLUSH = 7,
163 GLF_INVALIDATE_IN_PROGRESS = 8, 188 GLF_INVALIDATE_IN_PROGRESS = 8,
164 GLF_REPLY_PENDING = 9, 189 GLF_REPLY_PENDING = 9,
190 GLF_INITIAL = 10,
191 GLF_FROZEN = 11,
165}; 192};
166 193
167struct gfs2_glock { 194struct gfs2_glock {
@@ -176,16 +203,15 @@ struct gfs2_glock {
176 unsigned int gl_target; 203 unsigned int gl_target;
177 unsigned int gl_reply; 204 unsigned int gl_reply;
178 unsigned int gl_hash; 205 unsigned int gl_hash;
206 unsigned int gl_req;
179 unsigned int gl_demote_state; /* state requested by remote node */ 207 unsigned int gl_demote_state; /* state requested by remote node */
180 unsigned long gl_demote_time; /* time of first demote request */ 208 unsigned long gl_demote_time; /* time of first demote request */
181 struct list_head gl_holders; 209 struct list_head gl_holders;
182 210
183 const struct gfs2_glock_operations *gl_ops; 211 const struct gfs2_glock_operations *gl_ops;
184 void *gl_lock; 212 char gl_strname[GDLM_STRNAME_BYTES];
185 char *gl_lvb; 213 struct dlm_lksb gl_lksb;
186 atomic_t gl_lvb_count; 214 char gl_lvb[32];
187
188 unsigned long gl_stamp;
189 unsigned long gl_tchange; 215 unsigned long gl_tchange;
190 void *gl_object; 216 void *gl_object;
191 217
@@ -283,7 +309,9 @@ enum {
283 309
284struct gfs2_quota_data { 310struct gfs2_quota_data {
285 struct list_head qd_list; 311 struct list_head qd_list;
286 unsigned int qd_count; 312 struct list_head qd_reclaim;
313
314 atomic_t qd_count;
287 315
288 u32 qd_id; 316 u32 qd_id;
289 unsigned long qd_flags; /* QDF_... */ 317 unsigned long qd_flags; /* QDF_... */
@@ -303,7 +331,6 @@ struct gfs2_quota_data {
303 331
304 u64 qd_sync_gen; 332 u64 qd_sync_gen;
305 unsigned long qd_last_warn; 333 unsigned long qd_last_warn;
306 unsigned long qd_last_touched;
307}; 334};
308 335
309struct gfs2_trans { 336struct gfs2_trans {
@@ -390,7 +417,7 @@ struct gfs2_args {
390 unsigned int ar_suiddir:1; /* suiddir support */ 417 unsigned int ar_suiddir:1; /* suiddir support */
391 unsigned int ar_data:2; /* ordered/writeback */ 418 unsigned int ar_data:2; /* ordered/writeback */
392 unsigned int ar_meta:1; /* mount metafs */ 419 unsigned int ar_meta:1; /* mount metafs */
393 unsigned int ar_num_glockd; /* Number of glockd threads */ 420 unsigned int ar_discard:1; /* discard requests */
394}; 421};
395 422
396struct gfs2_tune { 423struct gfs2_tune {
@@ -406,7 +433,6 @@ struct gfs2_tune {
406 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ 433 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
407 unsigned int gt_quota_scale_num; /* Numerator */ 434 unsigned int gt_quota_scale_num; /* Numerator */
408 unsigned int gt_quota_scale_den; /* Denominator */ 435 unsigned int gt_quota_scale_den; /* Denominator */
409 unsigned int gt_quota_cache_secs;
410 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 436 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
411 unsigned int gt_new_files_jdata; 437 unsigned int gt_new_files_jdata;
412 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 438 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
@@ -445,6 +471,31 @@ struct gfs2_sb_host {
445 471
446 char sb_lockproto[GFS2_LOCKNAME_LEN]; 472 char sb_lockproto[GFS2_LOCKNAME_LEN];
447 char sb_locktable[GFS2_LOCKNAME_LEN]; 473 char sb_locktable[GFS2_LOCKNAME_LEN];
474 u8 sb_uuid[16];
475};
476
477/*
478 * lm_mount() return values
479 *
480 * ls_jid - the journal ID this node should use
481 * ls_first - this node is the first to mount the file system
 482 * ls_dlm - the dlm lockspace for this file system
483 * ls_ops - lock module's functions
484 */
485
486struct lm_lockstruct {
487 u32 ls_id;
488 unsigned int ls_jid;
489 unsigned int ls_first;
490 unsigned int ls_first_done;
491 unsigned int ls_nodir;
492 const struct lm_lockops *ls_ops;
493 unsigned long ls_flags;
494 dlm_lockspace_t *ls_dlm;
495
496 int ls_recover_jid;
497 int ls_recover_jid_done;
498 int ls_recover_jid_status;
448}; 499};
449 500
450struct gfs2_sbd { 501struct gfs2_sbd {
@@ -520,7 +571,6 @@ struct gfs2_sbd {
520 spinlock_t sd_jindex_spin; 571 spinlock_t sd_jindex_spin;
521 struct mutex sd_jindex_mutex; 572 struct mutex sd_jindex_mutex;
522 unsigned int sd_journals; 573 unsigned int sd_journals;
523 unsigned long sd_jindex_refresh_time;
524 574
525 struct gfs2_jdesc *sd_jdesc; 575 struct gfs2_jdesc *sd_jdesc;
526 struct gfs2_holder sd_journal_gh; 576 struct gfs2_holder sd_journal_gh;
@@ -540,7 +590,6 @@ struct gfs2_sbd {
540 590
541 struct list_head sd_quota_list; 591 struct list_head sd_quota_list;
542 atomic_t sd_quota_count; 592 atomic_t sd_quota_count;
543 spinlock_t sd_quota_spin;
544 struct mutex sd_quota_mutex; 593 struct mutex sd_quota_mutex;
545 wait_queue_head_t sd_quota_wait; 594 wait_queue_head_t sd_quota_wait;
546 struct list_head sd_trunc_list; 595 struct list_head sd_trunc_list;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 3b87c188da41..7b277d449155 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -16,7 +16,6 @@
16#include <linux/sort.h> 16#include <linux/sort.h>
17#include <linux/gfs2_ondisk.h> 17#include <linux/gfs2_ondisk.h>
18#include <linux/crc32.h> 18#include <linux/crc32.h>
19#include <linux/lm_interface.h>
20#include <linux/security.h> 19#include <linux/security.h>
21#include <linux/time.h> 20#include <linux/time.h>
22 21
@@ -137,16 +136,16 @@ void gfs2_set_iop(struct inode *inode)
137 136
138 if (S_ISREG(mode)) { 137 if (S_ISREG(mode)) {
139 inode->i_op = &gfs2_file_iops; 138 inode->i_op = &gfs2_file_iops;
140 if (sdp->sd_args.ar_localflocks) 139 if (gfs2_localflocks(sdp))
141 inode->i_fop = &gfs2_file_fops_nolock; 140 inode->i_fop = gfs2_file_fops_nolock;
142 else 141 else
143 inode->i_fop = &gfs2_file_fops; 142 inode->i_fop = gfs2_file_fops;
144 } else if (S_ISDIR(mode)) { 143 } else if (S_ISDIR(mode)) {
145 inode->i_op = &gfs2_dir_iops; 144 inode->i_op = &gfs2_dir_iops;
146 if (sdp->sd_args.ar_localflocks) 145 if (gfs2_localflocks(sdp))
147 inode->i_fop = &gfs2_dir_fops_nolock; 146 inode->i_fop = gfs2_dir_fops_nolock;
148 else 147 else
149 inode->i_fop = &gfs2_dir_fops; 148 inode->i_fop = gfs2_dir_fops;
150 } else if (S_ISLNK(mode)) { 149 } else if (S_ISLNK(mode)) {
151 inode->i_op = &gfs2_symlink_iops; 150 inode->i_op = &gfs2_symlink_iops;
152 } else { 151 } else {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index d5329364cdff..dca4fee3078b 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -101,12 +101,26 @@ void gfs2_dinode_print(const struct gfs2_inode *ip);
101extern const struct inode_operations gfs2_file_iops; 101extern const struct inode_operations gfs2_file_iops;
102extern const struct inode_operations gfs2_dir_iops; 102extern const struct inode_operations gfs2_dir_iops;
103extern const struct inode_operations gfs2_symlink_iops; 103extern const struct inode_operations gfs2_symlink_iops;
104extern const struct file_operations gfs2_file_fops; 104extern const struct file_operations *gfs2_file_fops_nolock;
105extern const struct file_operations gfs2_dir_fops; 105extern const struct file_operations *gfs2_dir_fops_nolock;
106extern const struct file_operations gfs2_file_fops_nolock;
107extern const struct file_operations gfs2_dir_fops_nolock;
108 106
109extern void gfs2_set_inode_flags(struct inode *inode); 107extern void gfs2_set_inode_flags(struct inode *inode);
108
109#ifdef CONFIG_GFS2_FS_LOCKING_DLM
110extern const struct file_operations *gfs2_file_fops;
111extern const struct file_operations *gfs2_dir_fops;
112static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
113{
114 return sdp->sd_args.ar_localflocks;
115}
116#else /* Single node only */
117#define gfs2_file_fops NULL
118#define gfs2_dir_fops NULL
119static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
120{
121 return 1;
122}
123#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
110 124
111#endif /* __INODE_DOT_H__ */ 125#endif /* __INODE_DOT_H__ */
112 126
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
new file mode 100644
index 000000000000..46df988323bc
--- /dev/null
+++ b/fs/gfs2/lock_dlm.c
@@ -0,0 +1,241 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/fs.h>
11#include <linux/dlm.h>
12#include <linux/types.h>
13#include <linux/gfs2_ondisk.h>
14
15#include "incore.h"
16#include "glock.h"
17#include "util.h"
18
19
20static void gdlm_ast(void *arg)
21{
22 struct gfs2_glock *gl = arg;
23 unsigned ret = gl->gl_state;
24
25 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
26
27 if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID)
28 memset(gl->gl_lvb, 0, GDLM_LVB_SIZE);
29
30 switch (gl->gl_lksb.sb_status) {
31 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
32 kmem_cache_free(gfs2_glock_cachep, gl);
33 return;
34 case -DLM_ECANCEL: /* Cancel while getting lock */
35 ret |= LM_OUT_CANCELED;
36 goto out;
37 case -EAGAIN: /* Try lock fails */
38 goto out;
39 case -EINVAL: /* Invalid */
40 case -ENOMEM: /* Out of memory */
41 ret |= LM_OUT_ERROR;
42 goto out;
43 case 0: /* Success */
44 break;
45 default: /* Something unexpected */
46 BUG();
47 }
48
49 ret = gl->gl_req;
50 if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) {
51 if (gl->gl_req == LM_ST_SHARED)
52 ret = LM_ST_DEFERRED;
53 else if (gl->gl_req == LM_ST_DEFERRED)
54 ret = LM_ST_SHARED;
55 else
56 BUG();
57 }
58
59 set_bit(GLF_INITIAL, &gl->gl_flags);
60 gfs2_glock_complete(gl, ret);
61 return;
62out:
63 if (!test_bit(GLF_INITIAL, &gl->gl_flags))
64 gl->gl_lksb.sb_lkid = 0;
65 gfs2_glock_complete(gl, ret);
66}
67
68static void gdlm_bast(void *arg, int mode)
69{
70 struct gfs2_glock *gl = arg;
71
72 switch (mode) {
73 case DLM_LOCK_EX:
74 gfs2_glock_cb(gl, LM_ST_UNLOCKED);
75 break;
76 case DLM_LOCK_CW:
77 gfs2_glock_cb(gl, LM_ST_DEFERRED);
78 break;
79 case DLM_LOCK_PR:
80 gfs2_glock_cb(gl, LM_ST_SHARED);
81 break;
82 default:
 83 printk(KERN_ERR "unknown bast mode %d\n", mode);
84 BUG();
85 }
86}
87
88/* convert gfs lock-state to dlm lock-mode */
89
90static int make_mode(const unsigned int lmstate)
91{
92 switch (lmstate) {
93 case LM_ST_UNLOCKED:
94 return DLM_LOCK_NL;
95 case LM_ST_EXCLUSIVE:
96 return DLM_LOCK_EX;
97 case LM_ST_DEFERRED:
98 return DLM_LOCK_CW;
99 case LM_ST_SHARED:
100 return DLM_LOCK_PR;
101 }
 102 printk(KERN_ERR "unknown LM state %d\n", lmstate);
103 BUG();
104 return -1;
105}
106
107static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
108 const int req)
109{
110 u32 lkf = 0;
111
112 if (gfs_flags & LM_FLAG_TRY)
113 lkf |= DLM_LKF_NOQUEUE;
114
115 if (gfs_flags & LM_FLAG_TRY_1CB) {
116 lkf |= DLM_LKF_NOQUEUE;
117 lkf |= DLM_LKF_NOQUEUEBAST;
118 }
119
120 if (gfs_flags & LM_FLAG_PRIORITY) {
121 lkf |= DLM_LKF_NOORDER;
122 lkf |= DLM_LKF_HEADQUE;
123 }
124
125 if (gfs_flags & LM_FLAG_ANY) {
126 if (req == DLM_LOCK_PR)
127 lkf |= DLM_LKF_ALTCW;
128 else if (req == DLM_LOCK_CW)
129 lkf |= DLM_LKF_ALTPR;
130 else
131 BUG();
132 }
133
134 if (lkid != 0)
135 lkf |= DLM_LKF_CONVERT;
136
137 lkf |= DLM_LKF_VALBLK;
138
139 return lkf;
140}
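/*
 * Worked example of the translation above, assuming conversion of an
 * existing lock (non-zero lkid):
 *
 *   make_flags(lkid, LM_FLAG_TRY_1CB | LM_FLAG_ANY, DLM_LOCK_PR)
 *     == DLM_LKF_NOQUEUE | DLM_LKF_NOQUEUEBAST | DLM_LKF_ALTCW |
 *        DLM_LKF_CONVERT | DLM_LKF_VALBLK
 */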
141
142static unsigned int gdlm_lock(struct gfs2_glock *gl,
143 unsigned int req_state, unsigned int flags)
144{
145 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
146 int error;
147 int req;
148 u32 lkf;
149
150 gl->gl_req = req_state;
151 req = make_mode(req_state);
152 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
153
154 /*
155 * Submit the actual lock request.
156 */
157
158 error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
159 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
160 if (error == -EAGAIN)
161 return 0;
162 if (error)
163 return LM_OUT_ERROR;
164 return LM_OUT_ASYNC;
165}
166
167static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr)
168{
169 struct gfs2_glock *gl = ptr;
170 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
171 int error;
172
173 if (gl->gl_lksb.sb_lkid == 0) {
174 kmem_cache_free(cachep, gl);
175 return;
176 }
177
178 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
179 NULL, gl);
180 if (error) {
181 printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n",
182 gl->gl_name.ln_type,
183 (unsigned long long)gl->gl_name.ln_number, error);
184 return;
185 }
186}
187
188static void gdlm_cancel(struct gfs2_glock *gl)
189{
190 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
191 dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
192}
193
194static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname)
195{
196 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
197 int error;
198
199 if (fsname == NULL) {
200 fs_info(sdp, "no fsname found\n");
201 return -EINVAL;
202 }
203
204 error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm,
205 DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
206 (ls->ls_nodir ? DLM_LSFL_NODIR : 0),
207 GDLM_LVB_SIZE);
208 if (error)
 209 printk(KERN_ERR "dlm_new_lockspace error %d\n", error);
210
211 return error;
212}
213
214static void gdlm_unmount(struct gfs2_sbd *sdp)
215{
216 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
217
218 if (ls->ls_dlm) {
219 dlm_release_lockspace(ls->ls_dlm, 2);
220 ls->ls_dlm = NULL;
221 }
222}
223
224static const match_table_t dlm_tokens = {
225 { Opt_jid, "jid=%d"},
226 { Opt_id, "id=%d"},
227 { Opt_first, "first=%d"},
228 { Opt_nodir, "nodir=%d"},
229 { Opt_err, NULL },
230};
231
232const struct lm_lockops gfs2_dlm_ops = {
233 .lm_proto_name = "lock_dlm",
234 .lm_mount = gdlm_mount,
235 .lm_unmount = gdlm_unmount,
236 .lm_put_lock = gdlm_put_lock,
237 .lm_lock = gdlm_lock,
238 .lm_cancel = gdlm_cancel,
239 .lm_tokens = &dlm_tokens,
240};
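The mount path elsewhere in this series is expected to point sd_lockstruct at this table when the "lock_dlm" protocol is selected; a hedged sketch, since that code is outside this hunk:

	/* hypothetical: protocol selection at mount time */
	if (!strcmp(sdp->sd_sb.sb_lockproto, "lock_dlm"))
		sdp->sd_lockstruct.ls_ops = &gfs2_dlm_ops;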
241
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
deleted file mode 100644
index 523243a13a21..000000000000
--- a/fs/gfs2/locking.c
+++ /dev/null
@@ -1,232 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/string.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/sched.h>
16#include <linux/kmod.h>
17#include <linux/fs.h>
18#include <linux/delay.h>
19#include <linux/lm_interface.h>
20
21struct lmh_wrapper {
22 struct list_head lw_list;
23 const struct lm_lockops *lw_ops;
24};
25
26static int nolock_mount(char *table_name, char *host_data,
27 lm_callback_t cb, void *cb_data,
28 unsigned int min_lvb_size, int flags,
29 struct lm_lockstruct *lockstruct,
30 struct kobject *fskobj);
31
32/* List of registered low-level locking protocols. A file system selects one
33 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
34
35static const struct lm_lockops nolock_ops = {
36 .lm_proto_name = "lock_nolock",
37 .lm_mount = nolock_mount,
38};
39
40static struct lmh_wrapper nolock_proto = {
41 .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
42 .lw_ops = &nolock_ops,
43};
44
45static LIST_HEAD(lmh_list);
46static DEFINE_MUTEX(lmh_lock);
47
48static int nolock_mount(char *table_name, char *host_data,
49 lm_callback_t cb, void *cb_data,
50 unsigned int min_lvb_size, int flags,
51 struct lm_lockstruct *lockstruct,
52 struct kobject *fskobj)
53{
54 char *c;
55 unsigned int jid;
56
57 c = strstr(host_data, "jid=");
58 if (!c)
59 jid = 0;
60 else {
61 c += 4;
62 sscanf(c, "%u", &jid);
63 }
64
65 lockstruct->ls_jid = jid;
66 lockstruct->ls_first = 1;
67 lockstruct->ls_lvb_size = min_lvb_size;
68 lockstruct->ls_ops = &nolock_ops;
69 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
70
71 return 0;
72}
73
74/**
75 * gfs2_register_lockproto - Register a low-level locking protocol
76 * @proto: the protocol definition
77 *
78 * Returns: 0 on success, -EXXX on failure
79 */
80
81int gfs2_register_lockproto(const struct lm_lockops *proto)
82{
83 struct lmh_wrapper *lw;
84
85 mutex_lock(&lmh_lock);
86
87 list_for_each_entry(lw, &lmh_list, lw_list) {
88 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
89 mutex_unlock(&lmh_lock);
90 printk(KERN_INFO "GFS2: protocol %s already exists\n",
91 proto->lm_proto_name);
92 return -EEXIST;
93 }
94 }
95
96 lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
97 if (!lw) {
98 mutex_unlock(&lmh_lock);
99 return -ENOMEM;
100 }
101
102 lw->lw_ops = proto;
103 list_add(&lw->lw_list, &lmh_list);
104
105 mutex_unlock(&lmh_lock);
106
107 return 0;
108}
109
110/**
111 * gfs2_unregister_lockproto - Unregister a low-level locking protocol
112 * @proto: the protocol definition
113 *
114 */
115
116void gfs2_unregister_lockproto(const struct lm_lockops *proto)
117{
118 struct lmh_wrapper *lw;
119
120 mutex_lock(&lmh_lock);
121
122 list_for_each_entry(lw, &lmh_list, lw_list) {
123 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
124 list_del(&lw->lw_list);
125 mutex_unlock(&lmh_lock);
126 kfree(lw);
127 return;
128 }
129 }
130
131 mutex_unlock(&lmh_lock);
132
133 printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
134 proto->lm_proto_name);
135}
136
137/**
138 * gfs2_mount_lockproto - Mount a lock protocol
139 * @proto_name - the name of the protocol
140 * @table_name - the name of the lock space
141 * @host_data - data specific to this host
142 * @cb - the callback to the code using the lock module
143 * @sdp - The GFS2 superblock
 144 * @min_lvb_size - the minimum LVB size that the caller can deal with
145 * @flags - LM_MFLAG_*
146 * @lockstruct - a structure returned describing the mount
147 *
148 * Returns: 0 on success, -EXXX on failure
149 */
150
151int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
152 lm_callback_t cb, void *cb_data,
153 unsigned int min_lvb_size, int flags,
154 struct lm_lockstruct *lockstruct,
155 struct kobject *fskobj)
156{
157 struct lmh_wrapper *lw = NULL;
158 int try = 0;
159 int error, found;
160
161
162retry:
163 mutex_lock(&lmh_lock);
164
165 if (list_empty(&nolock_proto.lw_list))
166 list_add(&nolock_proto.lw_list, &lmh_list);
167
168 found = 0;
169 list_for_each_entry(lw, &lmh_list, lw_list) {
170 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
171 found = 1;
172 break;
173 }
174 }
175
176 if (!found) {
177 if (!try && capable(CAP_SYS_MODULE)) {
178 try = 1;
179 mutex_unlock(&lmh_lock);
180 request_module(proto_name);
181 goto retry;
182 }
183 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
184 error = -ENOENT;
185 goto out;
186 }
187
188 if (lw->lw_ops->lm_owner &&
189 !try_module_get(lw->lw_ops->lm_owner)) {
190 try = 0;
191 mutex_unlock(&lmh_lock);
192 msleep(1000);
193 goto retry;
194 }
195
196 error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data,
197 min_lvb_size, flags, lockstruct, fskobj);
198 if (error)
199 module_put(lw->lw_ops->lm_owner);
200out:
201 mutex_unlock(&lmh_lock);
202 return error;
203}
204
205void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
206{
207 mutex_lock(&lmh_lock);
208 if (lockstruct->ls_ops->lm_unmount)
209 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
210 if (lockstruct->ls_ops->lm_owner)
211 module_put(lockstruct->ls_ops->lm_owner);
212 mutex_unlock(&lmh_lock);
213}
214
215/**
216 * gfs2_withdraw_lockproto - abnormally unmount a lock module
217 * @lockstruct: the lockstruct passed into mount
218 *
219 */
220
221void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
222{
223 mutex_lock(&lmh_lock);
224 lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
225 if (lockstruct->ls_ops->lm_owner)
226 module_put(lockstruct->ls_ops->lm_owner);
227 mutex_unlock(&lmh_lock);
228}
229
230EXPORT_SYMBOL_GPL(gfs2_register_lockproto);
231EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto);
232
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
deleted file mode 100644
index 2609bb6cd013..000000000000
--- a/fs/gfs2/locking/dlm/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
1obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o
3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
deleted file mode 100644
index 2482c9047505..000000000000
--- a/fs/gfs2/locking/dlm/lock.c
+++ /dev/null
@@ -1,708 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12static char junk_lvb[GDLM_LVB_SIZE];
13
14
15/* convert dlm lock-mode to gfs lock-state */
16
17static s16 gdlm_make_lmstate(s16 dlmmode)
18{
19 switch (dlmmode) {
20 case DLM_LOCK_IV:
21 case DLM_LOCK_NL:
22 return LM_ST_UNLOCKED;
23 case DLM_LOCK_EX:
24 return LM_ST_EXCLUSIVE;
25 case DLM_LOCK_CW:
26 return LM_ST_DEFERRED;
27 case DLM_LOCK_PR:
28 return LM_ST_SHARED;
29 }
30 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
31 return -1;
32}
33
34/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
35 thread gets to it. */
36
37static void queue_submit(struct gdlm_lock *lp)
38{
39 struct gdlm_ls *ls = lp->ls;
40
41 spin_lock(&ls->async_lock);
42 list_add_tail(&lp->delay_list, &ls->submit);
43 spin_unlock(&ls->async_lock);
44 wake_up(&ls->thread_wait);
45}
46
47static void wake_up_ast(struct gdlm_lock *lp)
48{
49 clear_bit(LFL_AST_WAIT, &lp->flags);
50 smp_mb__after_clear_bit();
51 wake_up_bit(&lp->flags, LFL_AST_WAIT);
52}
53
54static void gdlm_delete_lp(struct gdlm_lock *lp)
55{
56 struct gdlm_ls *ls = lp->ls;
57
58 spin_lock(&ls->async_lock);
59 if (!list_empty(&lp->delay_list))
60 list_del_init(&lp->delay_list);
61 ls->all_locks_count--;
62 spin_unlock(&ls->async_lock);
63
64 kfree(lp);
65}
66
67static void gdlm_queue_delayed(struct gdlm_lock *lp)
68{
69 struct gdlm_ls *ls = lp->ls;
70
71 spin_lock(&ls->async_lock);
72 list_add_tail(&lp->delay_list, &ls->delayed);
73 spin_unlock(&ls->async_lock);
74}
75
76static void process_complete(struct gdlm_lock *lp)
77{
78 struct gdlm_ls *ls = lp->ls;
79 struct lm_async_cb acb;
80
81 memset(&acb, 0, sizeof(acb));
82
83 if (lp->lksb.sb_status == -DLM_ECANCEL) {
84 log_info("complete dlm cancel %x,%llx flags %lx",
85 lp->lockname.ln_type,
86 (unsigned long long)lp->lockname.ln_number,
87 lp->flags);
88
89 lp->req = lp->cur;
90 acb.lc_ret |= LM_OUT_CANCELED;
91 if (lp->cur == DLM_LOCK_IV)
92 lp->lksb.sb_lkid = 0;
93 goto out;
94 }
95
96 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
97 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
98 log_info("unlock sb_status %d %x,%llx flags %lx",
99 lp->lksb.sb_status, lp->lockname.ln_type,
100 (unsigned long long)lp->lockname.ln_number,
101 lp->flags);
102 return;
103 }
104
105 lp->cur = DLM_LOCK_IV;
106 lp->req = DLM_LOCK_IV;
107 lp->lksb.sb_lkid = 0;
108
109 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
110 gdlm_delete_lp(lp);
111 return;
112 }
113 goto out;
114 }
115
116 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
117 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
118
119 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
120 if (lp->req == DLM_LOCK_PR)
121 lp->req = DLM_LOCK_CW;
122 else if (lp->req == DLM_LOCK_CW)
123 lp->req = DLM_LOCK_PR;
124 }
125
126 /*
127 * A canceled lock request. The lock was just taken off the delayed
128 * list and was never even submitted to dlm.
129 */
130
131 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
132 log_info("complete internal cancel %x,%llx",
133 lp->lockname.ln_type,
134 (unsigned long long)lp->lockname.ln_number);
135 lp->req = lp->cur;
136 acb.lc_ret |= LM_OUT_CANCELED;
137 goto out;
138 }
139
140 /*
 141 * An error occurred.
142 */
143
144 if (lp->lksb.sb_status) {
145 /* a "normal" error */
146 if ((lp->lksb.sb_status == -EAGAIN) &&
147 (lp->lkf & DLM_LKF_NOQUEUE)) {
148 lp->req = lp->cur;
149 if (lp->cur == DLM_LOCK_IV)
150 lp->lksb.sb_lkid = 0;
151 goto out;
152 }
153
154 /* this could only happen with cancels I think */
155 log_info("ast sb_status %d %x,%llx flags %lx",
156 lp->lksb.sb_status, lp->lockname.ln_type,
157 (unsigned long long)lp->lockname.ln_number,
158 lp->flags);
159 return;
160 }
161
162 /*
163 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
164 */
165
166 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
167 wake_up_ast(lp);
168 return;
169 }
170
171 /*
172 * A lock has been demoted to NL because it initially completed during
173 * BLOCK_LOCKS. Now it must be requested in the originally requested
174 * mode.
175 */
176
177 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
178 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
179 lp->lockname.ln_type,
180 (unsigned long long)lp->lockname.ln_number);
181 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
182 lp->lockname.ln_type,
183 (unsigned long long)lp->lockname.ln_number);
184
185 lp->cur = DLM_LOCK_NL;
186 lp->req = lp->prev_req;
187 lp->prev_req = DLM_LOCK_IV;
188 lp->lkf &= ~DLM_LKF_CONVDEADLK;
189
190 set_bit(LFL_NOCACHE, &lp->flags);
191
192 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
193 !test_bit(LFL_NOBLOCK, &lp->flags))
194 gdlm_queue_delayed(lp);
195 else
196 queue_submit(lp);
197 return;
198 }
199
200 /*
201 * A request is granted during dlm recovery. It may be granted
202 * because the locks of a failed node were cleared. In that case,
203 * there may be inconsistent data beneath this lock and we must wait
204 * for recovery to complete to use it. When gfs recovery is done this
205 * granted lock will be converted to NL and then reacquired in this
206 * granted state.
207 */
208
209 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
210 !test_bit(LFL_NOBLOCK, &lp->flags) &&
211 lp->req != DLM_LOCK_NL) {
212
213 lp->cur = lp->req;
214 lp->prev_req = lp->req;
215 lp->req = DLM_LOCK_NL;
216 lp->lkf |= DLM_LKF_CONVERT;
217 lp->lkf &= ~DLM_LKF_CONVDEADLK;
218
219 log_debug("rereq %x,%llx id %x %d,%d",
220 lp->lockname.ln_type,
221 (unsigned long long)lp->lockname.ln_number,
222 lp->lksb.sb_lkid, lp->cur, lp->req);
223
224 set_bit(LFL_REREQUEST, &lp->flags);
225 queue_submit(lp);
226 return;
227 }
228
229 /*
230 * DLM demoted the lock to NL before it was granted so GFS must be
231 * told it cannot cache data for this lock.
232 */
233
234 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
235 set_bit(LFL_NOCACHE, &lp->flags);
236
237out:
238 /*
239 * This is an internal lock_dlm lock
240 */
241
242 if (test_bit(LFL_INLOCK, &lp->flags)) {
243 clear_bit(LFL_NOBLOCK, &lp->flags);
244 lp->cur = lp->req;
245 wake_up_ast(lp);
246 return;
247 }
248
249 /*
250 * Normal completion of a lock request. Tell GFS it now has the lock.
251 */
252
253 clear_bit(LFL_NOBLOCK, &lp->flags);
254 lp->cur = lp->req;
255
256 acb.lc_name = lp->lockname;
257 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
258
259 ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
260}
261
262static void gdlm_ast(void *astarg)
263{
264 struct gdlm_lock *lp = astarg;
265 clear_bit(LFL_ACTIVE, &lp->flags);
266 process_complete(lp);
267}
268
269static void process_blocking(struct gdlm_lock *lp, int bast_mode)
270{
271 struct gdlm_ls *ls = lp->ls;
272 unsigned int cb = 0;
273
274 switch (gdlm_make_lmstate(bast_mode)) {
275 case LM_ST_EXCLUSIVE:
276 cb = LM_CB_NEED_E;
277 break;
278 case LM_ST_DEFERRED:
279 cb = LM_CB_NEED_D;
280 break;
281 case LM_ST_SHARED:
282 cb = LM_CB_NEED_S;
283 break;
284 default:
285 gdlm_assert(0, "unknown bast mode %u", bast_mode);
286 }
287
288 ls->fscb(ls->sdp, cb, &lp->lockname);
289}
290
291
292static void gdlm_bast(void *astarg, int mode)
293{
294 struct gdlm_lock *lp = astarg;
295
296 if (!mode) {
297 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
298 lp->lockname.ln_type,
299 (unsigned long long)lp->lockname.ln_number);
300 return;
301 }
302
303 process_blocking(lp, mode);
304}
305
306/* convert gfs lock-state to dlm lock-mode */
307
308static s16 make_mode(s16 lmstate)
309{
310 switch (lmstate) {
311 case LM_ST_UNLOCKED:
312 return DLM_LOCK_NL;
313 case LM_ST_EXCLUSIVE:
314 return DLM_LOCK_EX;
315 case LM_ST_DEFERRED:
316 return DLM_LOCK_CW;
317 case LM_ST_SHARED:
318 return DLM_LOCK_PR;
319 }
320 gdlm_assert(0, "unknown LM state %d", lmstate);
321 return -1;
322}
323
324
325/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
326 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
327
328static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
329{
330 s16 cur = make_mode(cur_state);
331 if (lp->cur != DLM_LOCK_IV)
332 gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
333}
334
335static inline unsigned int make_flags(struct gdlm_lock *lp,
336 unsigned int gfs_flags,
337 s16 cur, s16 req)
338{
339 unsigned int lkf = 0;
340
341 if (gfs_flags & LM_FLAG_TRY)
342 lkf |= DLM_LKF_NOQUEUE;
343
344 if (gfs_flags & LM_FLAG_TRY_1CB) {
345 lkf |= DLM_LKF_NOQUEUE;
346 lkf |= DLM_LKF_NOQUEUEBAST;
347 }
348
349 if (gfs_flags & LM_FLAG_PRIORITY) {
350 lkf |= DLM_LKF_NOORDER;
351 lkf |= DLM_LKF_HEADQUE;
352 }
353
354 if (gfs_flags & LM_FLAG_ANY) {
355 if (req == DLM_LOCK_PR)
356 lkf |= DLM_LKF_ALTCW;
357 else if (req == DLM_LOCK_CW)
358 lkf |= DLM_LKF_ALTPR;
359 }
360
361 if (lp->lksb.sb_lkid != 0) {
362 lkf |= DLM_LKF_CONVERT;
363 }
364
365 if (lp->lvb)
366 lkf |= DLM_LKF_VALBLK;
367
368 return lkf;
369}
370
371/* make_strname - convert GFS lock numbers to a string */
372
373static inline void make_strname(const struct lm_lockname *lockname,
374 struct gdlm_strname *str)
375{
376 sprintf(str->name, "%8x%16llx", lockname->ln_type,
377 (unsigned long long)lockname->ln_number);
378 str->namelen = GDLM_STRNAME_BYTES;
379}
380
381static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
382 struct gdlm_lock **lpp)
383{
384 struct gdlm_lock *lp;
385
386 lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS);
387 if (!lp)
388 return -ENOMEM;
389
390 lp->lockname = *name;
391 make_strname(name, &lp->strname);
392 lp->ls = ls;
393 lp->cur = DLM_LOCK_IV;
394 INIT_LIST_HEAD(&lp->delay_list);
395
396 spin_lock(&ls->async_lock);
397 ls->all_locks_count++;
398 spin_unlock(&ls->async_lock);
399
400 *lpp = lp;
401 return 0;
402}
403
404int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
405 void **lockp)
406{
407 struct gdlm_lock *lp;
408 int error;
409
410 error = gdlm_create_lp(lockspace, name, &lp);
411
412 *lockp = lp;
413 return error;
414}
415
416void gdlm_put_lock(void *lock)
417{
418 gdlm_delete_lp(lock);
419}
420
421unsigned int gdlm_do_lock(struct gdlm_lock *lp)
422{
423 struct gdlm_ls *ls = lp->ls;
424 int error, bast = 1;
425
426 /*
 427 * When recovery is in progress, delay lock requests so they are
 428 * submitted once recovery is done. Requests for recovery (NOEXP) and unlocks
429 * can pass.
430 */
431
432 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
433 !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
434 gdlm_queue_delayed(lp);
435 return LM_OUT_ASYNC;
436 }
437
438 /*
439 * Submit the actual lock request.
440 */
441
442 if (test_bit(LFL_NOBAST, &lp->flags))
443 bast = 0;
444
445 set_bit(LFL_ACTIVE, &lp->flags);
446
447 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
448 (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
449 lp->cur, lp->req, lp->lkf);
450
451 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
452 lp->strname.name, lp->strname.namelen, 0, gdlm_ast,
453 lp, bast ? gdlm_bast : NULL);
454
455 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
456 lp->lksb.sb_status = -EAGAIN;
457 gdlm_ast(lp);
458 error = 0;
459 }
460
461 if (error) {
462 log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
463 "flags=%lx", ls->fsname, lp->lockname.ln_type,
464 (unsigned long long)lp->lockname.ln_number, error,
465 lp->cur, lp->req, lp->lkf, lp->flags);
466 return LM_OUT_ERROR;
467 }
468 return LM_OUT_ASYNC;
469}
470
471static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
472{
473 struct gdlm_ls *ls = lp->ls;
474 unsigned int lkf = 0;
475 int error;
476
477 set_bit(LFL_DLM_UNLOCK, &lp->flags);
478 set_bit(LFL_ACTIVE, &lp->flags);
479
480 if (lp->lvb)
481 lkf = DLM_LKF_VALBLK;
482
483 log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
484 (unsigned long long)lp->lockname.ln_number,
485 lp->lksb.sb_lkid, lp->cur, lkf);
486
487 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
488
489 if (error) {
490 log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
491 "flags=%lx", ls->fsname, lp->lockname.ln_type,
492 (unsigned long long)lp->lockname.ln_number, error,
493 lp->cur, lp->req, lp->lkf, lp->flags);
494 return LM_OUT_ERROR;
495 }
496 return LM_OUT_ASYNC;
497}
498
499unsigned int gdlm_lock(void *lock, unsigned int cur_state,
500 unsigned int req_state, unsigned int flags)
501{
502 struct gdlm_lock *lp = lock;
503
504 if (req_state == LM_ST_UNLOCKED)
505 return gdlm_unlock(lock, cur_state);
506
507 if (req_state == LM_ST_UNLOCKED)
508 return gdlm_unlock(lock, cur_state);
509
510 clear_bit(LFL_DLM_CANCEL, &lp->flags);
511 if (flags & LM_FLAG_NOEXP)
512 set_bit(LFL_NOBLOCK, &lp->flags);
513
514 check_cur_state(lp, cur_state);
515 lp->req = make_mode(req_state);
516 lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
517
518 return gdlm_do_lock(lp);
519}
520
521unsigned int gdlm_unlock(void *lock, unsigned int cur_state)
522{
523 struct gdlm_lock *lp = lock;
524
525 clear_bit(LFL_DLM_CANCEL, &lp->flags);
526 if (lp->cur == DLM_LOCK_IV)
527 return 0;
528 return gdlm_do_unlock(lp);
529}
530
531void gdlm_cancel(void *lock)
532{
533 struct gdlm_lock *lp = lock;
534 struct gdlm_ls *ls = lp->ls;
535 int error, delay_list = 0;
536
537 if (test_bit(LFL_DLM_CANCEL, &lp->flags))
538 return;
539
540 log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
541 (unsigned long long)lp->lockname.ln_number, lp->flags);
542
543 spin_lock(&ls->async_lock);
544 if (!list_empty(&lp->delay_list)) {
545 list_del_init(&lp->delay_list);
546 delay_list = 1;
547 }
548 spin_unlock(&ls->async_lock);
549
550 if (delay_list) {
551 set_bit(LFL_CANCEL, &lp->flags);
552 set_bit(LFL_ACTIVE, &lp->flags);
553 gdlm_ast(lp);
554 return;
555 }
556
557 if (!test_bit(LFL_ACTIVE, &lp->flags) ||
558 test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
559 log_info("gdlm_cancel skip %x,%llx flags %lx",
560 lp->lockname.ln_type,
561 (unsigned long long)lp->lockname.ln_number, lp->flags);
562 return;
563 }
564
565 /* the lock is blocked in the dlm */
566
567 set_bit(LFL_DLM_CANCEL, &lp->flags);
568 set_bit(LFL_ACTIVE, &lp->flags);
569
570 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
571 NULL, lp);
572
573 log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
574 lp->lockname.ln_type,
575 (unsigned long long)lp->lockname.ln_number, lp->flags);
576
577 if (error == -EBUSY)
578 clear_bit(LFL_DLM_CANCEL, &lp->flags);
579}
580
581static int gdlm_add_lvb(struct gdlm_lock *lp)
582{
583 char *lvb;
584
585 lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
586 if (!lvb)
587 return -ENOMEM;
588
589 lp->lksb.sb_lvbptr = lvb;
590 lp->lvb = lvb;
591 return 0;
592}
593
594static void gdlm_del_lvb(struct gdlm_lock *lp)
595{
596 kfree(lp->lvb);
597 lp->lvb = NULL;
598 lp->lksb.sb_lvbptr = NULL;
599}
600
601static int gdlm_ast_wait(void *word)
602{
603 schedule();
604 return 0;
605}
606
607/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
608 the completion) because gfs won't call hold_lvb() during a callback (from
609 the context of a lock_dlm thread). */
610
611static int hold_null_lock(struct gdlm_lock *lp)
612{
613 struct gdlm_lock *lpn = NULL;
614 int error;
615
616 if (lp->hold_null) {
617 printk(KERN_INFO "lock_dlm: lvb already held\n");
618 return 0;
619 }
620
621 error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
622 if (error)
623 goto out;
624
625 lpn->lksb.sb_lvbptr = junk_lvb;
626 lpn->lvb = junk_lvb;
627
628 lpn->req = DLM_LOCK_NL;
629 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
630 set_bit(LFL_NOBAST, &lpn->flags);
631 set_bit(LFL_INLOCK, &lpn->flags);
632 set_bit(LFL_AST_WAIT, &lpn->flags);
633
634 gdlm_do_lock(lpn);
635 wait_on_bit(&lpn->flags, LFL_AST_WAIT, gdlm_ast_wait, TASK_UNINTERRUPTIBLE);
636 error = lpn->lksb.sb_status;
637 if (error) {
638 printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
639 error);
640 gdlm_delete_lp(lpn);
641 lpn = NULL;
642 }
643out:
644 lp->hold_null = lpn;
645 return error;
646}
647
648/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
649 the completion) because gfs may call unhold_lvb() during a callback (from
650 the context of a lock_dlm thread) which could cause a deadlock since the
651 other lock_dlm thread could be engaged in recovery. */
652
653static void unhold_null_lock(struct gdlm_lock *lp)
654{
655 struct gdlm_lock *lpn = lp->hold_null;
656
657 gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
658 (unsigned long long)lp->lockname.ln_number);
659 lpn->lksb.sb_lvbptr = NULL;
660 lpn->lvb = NULL;
661 set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
662 gdlm_do_unlock(lpn);
663 lp->hold_null = NULL;
664}
665
666/* Acquire a NL lock because gfs requires the value block to remain
667 intact on the resource while the lvb is "held" even if it's holding no locks
668 on the resource. */
669
670int gdlm_hold_lvb(void *lock, char **lvbp)
671{
672 struct gdlm_lock *lp = lock;
673 int error;
674
675 error = gdlm_add_lvb(lp);
676 if (error)
677 return error;
678
679 *lvbp = lp->lvb;
680
681 error = hold_null_lock(lp);
682 if (error)
683 gdlm_del_lvb(lp);
684
685 return error;
686}
687
688void gdlm_unhold_lvb(void *lock, char *lvb)
689{
690 struct gdlm_lock *lp = lock;
691
692 unhold_null_lock(lp);
693 gdlm_del_lvb(lp);
694}
695
696void gdlm_submit_delayed(struct gdlm_ls *ls)
697{
698 struct gdlm_lock *lp, *safe;
699
700 spin_lock(&ls->async_lock);
701 list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
702 list_del_init(&lp->delay_list);
703 list_add_tail(&lp->delay_list, &ls->submit);
704 }
705 spin_unlock(&ls->async_lock);
706 wake_up(&ls->thread_wait);
707}
708
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
deleted file mode 100644
index 3c98e7c6f93b..000000000000
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ /dev/null
@@ -1,166 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef LOCK_DLM_DOT_H
11#define LOCK_DLM_DOT_H
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
16#include <linux/types.h>
17#include <linux/string.h>
18#include <linux/list.h>
19#include <linux/socket.h>
20#include <linux/delay.h>
21#include <linux/kthread.h>
22#include <linux/kobject.h>
23#include <linux/fcntl.h>
24#include <linux/wait.h>
25#include <net/sock.h>
26
27#include <linux/dlm.h>
28#include <linux/dlm_plock.h>
29#include <linux/lm_interface.h>
30
31/*
32 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
33 * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
34 * as "lock_dlm".
35 */
36
37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 0
40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128
42
43/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
44 We sprintf these numbers into a 24 byte string of hex values to make them
45 human-readable (to make debugging simpler.) */
46
47struct gdlm_strname {
48 unsigned char name[GDLM_STRNAME_BYTES];
49 unsigned short namelen;
50};
51
52enum {
53 DFL_BLOCK_LOCKS = 0,
54 DFL_SPECTATOR = 1,
55 DFL_WITHDRAW = 2,
56};
57
58struct gdlm_ls {
59 u32 id;
60 int jid;
61 int first;
62 int first_done;
63 unsigned long flags;
64 struct kobject kobj;
65 char clustername[GDLM_NAME_LEN];
66 char fsname[GDLM_NAME_LEN];
67 int fsflags;
68 dlm_lockspace_t *dlm_lockspace;
69 lm_callback_t fscb;
70 struct gfs2_sbd *sdp;
71 int recover_jid;
72 int recover_jid_done;
73 int recover_jid_status;
74 spinlock_t async_lock;
75 struct list_head delayed;
76 struct list_head submit;
77 u32 all_locks_count;
78 wait_queue_head_t wait_control;
79 struct task_struct *thread;
80 wait_queue_head_t thread_wait;
81};
82
83enum {
84 LFL_NOBLOCK = 0,
85 LFL_NOCACHE = 1,
86 LFL_DLM_UNLOCK = 2,
87 LFL_DLM_CANCEL = 3,
88 LFL_SYNC_LVB = 4,
89 LFL_FORCE_PROMOTE = 5,
90 LFL_REREQUEST = 6,
91 LFL_ACTIVE = 7,
92 LFL_INLOCK = 8,
93 LFL_CANCEL = 9,
94 LFL_NOBAST = 10,
95 LFL_HEADQUE = 11,
96 LFL_UNLOCK_DELETE = 12,
97 LFL_AST_WAIT = 13,
98};
99
100struct gdlm_lock {
101 struct gdlm_ls *ls;
102 struct lm_lockname lockname;
103 struct gdlm_strname strname;
104 char *lvb;
105 struct dlm_lksb lksb;
106
107 s16 cur;
108 s16 req;
109 s16 prev_req;
110 u32 lkf; /* dlm flags DLM_LKF_ */
111 unsigned long flags; /* lock_dlm flags LFL_ */
112
113 struct list_head delay_list; /* delayed */
114 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
115};
116
117#define gdlm_assert(assertion, fmt, args...) \
118do { \
119 if (unlikely(!(assertion))) { \
120 printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
121 "lock_dlm: " fmt "\n", \
122 #assertion, ##args); \
123 BUG(); \
124 } \
125} while (0)
126
127#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
128#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
129#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
130#ifdef LOCK_DLM_LOG_DEBUG
131#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
132#else
133#define log_debug(fmt, arg...)
134#endif
135
136/* sysfs.c */
137
138int gdlm_sysfs_init(void);
139void gdlm_sysfs_exit(void);
140int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
141void gdlm_kobject_release(struct gdlm_ls *);
142
143/* thread.c */
144
145int gdlm_init_threads(struct gdlm_ls *);
146void gdlm_release_threads(struct gdlm_ls *);
147
148/* lock.c */
149
150void gdlm_submit_delayed(struct gdlm_ls *);
151unsigned int gdlm_do_lock(struct gdlm_lock *);
152
153int gdlm_get_lock(void *, struct lm_lockname *, void **);
154void gdlm_put_lock(void *);
155unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int);
156unsigned int gdlm_unlock(void *, unsigned int);
157void gdlm_cancel(void *);
158int gdlm_hold_lvb(void *, char **);
159void gdlm_unhold_lvb(void *, char *);
160
161/* mount.c */
162
163extern const struct lm_lockops gdlm_ops;
164
165#endif
166
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
deleted file mode 100644
index b9a03a7ff801..000000000000
--- a/fs/gfs2/locking/dlm/main.c
+++ /dev/null
@@ -1,48 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/init.h>
11
12#include "lock_dlm.h"
13
14static int __init init_lock_dlm(void)
15{
16 int error;
17
18 error = gfs2_register_lockproto(&gdlm_ops);
19 if (error) {
20 printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n",
21 error);
22 return error;
23 }
24
25 error = gdlm_sysfs_init();
26 if (error) {
27 gfs2_unregister_lockproto(&gdlm_ops);
28 return error;
29 }
30
31 printk(KERN_INFO
32 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
33 return 0;
34}
35
36static void __exit exit_lock_dlm(void)
37{
38 gdlm_sysfs_exit();
39 gfs2_unregister_lockproto(&gdlm_ops);
40}
41
42module_init(init_lock_dlm);
43module_exit(exit_lock_dlm);
44
45MODULE_DESCRIPTION("GFS DLM Locking Module");
46MODULE_AUTHOR("Red Hat, Inc.");
47MODULE_LICENSE("GPL");
48
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
deleted file mode 100644
index 1aa7eb6a0226..000000000000
--- a/fs/gfs2/locking/dlm/mount.c
+++ /dev/null
@@ -1,276 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12const struct lm_lockops gdlm_ops;
13
14
15static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
16 int flags, char *table_name)
17{
18 struct gdlm_ls *ls;
19 char buf[256], *p;
20
21 ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
22 if (!ls)
23 return NULL;
24
25 ls->fscb = cb;
26 ls->sdp = sdp;
27 ls->fsflags = flags;
28 spin_lock_init(&ls->async_lock);
29 INIT_LIST_HEAD(&ls->delayed);
30 INIT_LIST_HEAD(&ls->submit);
31 init_waitqueue_head(&ls->thread_wait);
32 init_waitqueue_head(&ls->wait_control);
33 ls->jid = -1;
34
35 strncpy(buf, table_name, 256);
36 buf[255] = '\0';
37
38 p = strchr(buf, ':');
39 if (!p) {
40 log_info("invalid table_name \"%s\"", table_name);
41 kfree(ls);
42 return NULL;
43 }
44 *p = '\0';
45 p++;
46
47 strncpy(ls->clustername, buf, GDLM_NAME_LEN);
48 strncpy(ls->fsname, p, GDLM_NAME_LEN);
49
50 return ls;
51}
52
53static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
54{
55 char data[256];
56 char *options, *x, *y;
57 int error = 0;
58
59 memset(data, 0, 256);
60 strncpy(data, data_arg, 255);
61
62 if (!strlen(data)) {
63 log_error("no mount options, (u)mount helpers not installed");
64 return -EINVAL;
65 }
66
67 for (options = data; (x = strsep(&options, ":")); ) {
68 if (!*x)
69 continue;
70
71 y = strchr(x, '=');
72 if (y)
73 *y++ = 0;
74
75 if (!strcmp(x, "jid")) {
76 if (!y) {
77 log_error("need argument to jid");
78 error = -EINVAL;
79 break;
80 }
81 sscanf(y, "%u", &ls->jid);
82
83 } else if (!strcmp(x, "first")) {
84 if (!y) {
85 log_error("need argument to first");
86 error = -EINVAL;
87 break;
88 }
89 sscanf(y, "%u", &ls->first);
90
91 } else if (!strcmp(x, "id")) {
92 if (!y) {
93 log_error("need argument to id");
94 error = -EINVAL;
95 break;
96 }
97 sscanf(y, "%u", &ls->id);
98
99 } else if (!strcmp(x, "nodir")) {
100 if (!y) {
101 log_error("need argument to nodir");
102 error = -EINVAL;
103 break;
104 }
105 sscanf(y, "%u", nodir);
106
107 } else {
108 log_error("unkonwn option: %s", x);
109 error = -EINVAL;
110 break;
111 }
112 }
113
114 return error;
115}
116
117static int gdlm_mount(char *table_name, char *host_data,
118 lm_callback_t cb, void *cb_data,
119 unsigned int min_lvb_size, int flags,
120 struct lm_lockstruct *lockstruct,
121 struct kobject *fskobj)
122{
123 struct gdlm_ls *ls;
124 int error = -ENOMEM, nodir = 0;
125
126 if (min_lvb_size > GDLM_LVB_SIZE)
127 goto out;
128
129 ls = init_gdlm(cb, cb_data, flags, table_name);
130 if (!ls)
131 goto out;
132
133 error = make_args(ls, host_data, &nodir);
134 if (error)
135 goto out;
136
137 error = gdlm_init_threads(ls);
138 if (error)
139 goto out_free;
140
141 error = gdlm_kobject_setup(ls, fskobj);
142 if (error)
143 goto out_thread;
144
145 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
146 &ls->dlm_lockspace,
147 DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
148 (nodir ? DLM_LSFL_NODIR : 0),
149 GDLM_LVB_SIZE);
150 if (error) {
151 log_error("dlm_new_lockspace error %d", error);
152 goto out_kobj;
153 }
154
155 lockstruct->ls_jid = ls->jid;
156 lockstruct->ls_first = ls->first;
157 lockstruct->ls_lockspace = ls;
158 lockstruct->ls_ops = &gdlm_ops;
159 lockstruct->ls_flags = 0;
160 lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
161 return 0;
162
163out_kobj:
164 gdlm_kobject_release(ls);
165out_thread:
166 gdlm_release_threads(ls);
167out_free:
168 kfree(ls);
169out:
170 return error;
171}
172
173static void gdlm_unmount(void *lockspace)
174{
175 struct gdlm_ls *ls = lockspace;
176
177 log_debug("unmount flags %lx", ls->flags);
178
179 /* FIXME: serialize unmount and withdraw in case they
180 happen at once. Also, if unmount follows withdraw,
181 wait for withdraw to finish. */
182
183 if (test_bit(DFL_WITHDRAW, &ls->flags))
184 goto out;
185
186 gdlm_kobject_release(ls);
187 dlm_release_lockspace(ls->dlm_lockspace, 2);
188 gdlm_release_threads(ls);
189 BUG_ON(ls->all_locks_count);
190out:
191 kfree(ls);
192}
193
194static void gdlm_recovery_done(void *lockspace, unsigned int jid,
195 unsigned int message)
196{
197 char env_jid[20];
198 char env_status[20];
199 char *envp[] = { env_jid, env_status, NULL };
200 struct gdlm_ls *ls = lockspace;
201 ls->recover_jid_done = jid;
202 ls->recover_jid_status = message;
203 sprintf(env_jid, "JID=%d", jid);
204 sprintf(env_status, "RECOVERY=%s",
205 message == LM_RD_SUCCESS ? "Done" : "Failed");
206 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
207}
208
209static void gdlm_others_may_mount(void *lockspace)
210{
211 char *message = "FIRSTMOUNT=Done";
212 char *envp[] = { message, NULL };
213 struct gdlm_ls *ls = lockspace;
214 ls->first_done = 1;
215 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
216}
217
218/* Userspace gets the offline uevent, blocks new gfs locks on
219 other mounters, and lets us know (sets WITHDRAW flag). Then,
220 userspace leaves the mount group while we leave the lockspace. */
221
222static void gdlm_withdraw(void *lockspace)
223{
224 struct gdlm_ls *ls = lockspace;
225
226 kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
227
228 wait_event_interruptible(ls->wait_control,
229 test_bit(DFL_WITHDRAW, &ls->flags));
230
231 dlm_release_lockspace(ls->dlm_lockspace, 2);
232 gdlm_release_threads(ls);
233 gdlm_kobject_release(ls);
234}
235
236static int gdlm_plock(void *lockspace, struct lm_lockname *name,
237 struct file *file, int cmd, struct file_lock *fl)
238{
239 struct gdlm_ls *ls = lockspace;
240 return dlm_posix_lock(ls->dlm_lockspace, name->ln_number, file, cmd, fl);
241}
242
243static int gdlm_punlock(void *lockspace, struct lm_lockname *name,
244 struct file *file, struct file_lock *fl)
245{
246 struct gdlm_ls *ls = lockspace;
247 return dlm_posix_unlock(ls->dlm_lockspace, name->ln_number, file, fl);
248}
249
250static int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
251 struct file *file, struct file_lock *fl)
252{
253 struct gdlm_ls *ls = lockspace;
254 return dlm_posix_get(ls->dlm_lockspace, name->ln_number, file, fl);
255}
256
257const struct lm_lockops gdlm_ops = {
258 .lm_proto_name = "lock_dlm",
259 .lm_mount = gdlm_mount,
260 .lm_others_may_mount = gdlm_others_may_mount,
261 .lm_unmount = gdlm_unmount,
262 .lm_withdraw = gdlm_withdraw,
263 .lm_get_lock = gdlm_get_lock,
264 .lm_put_lock = gdlm_put_lock,
265 .lm_lock = gdlm_lock,
266 .lm_unlock = gdlm_unlock,
267 .lm_plock = gdlm_plock,
268 .lm_punlock = gdlm_punlock,
269 .lm_plock_get = gdlm_plock_get,
270 .lm_cancel = gdlm_cancel,
271 .lm_hold_lvb = gdlm_hold_lvb,
272 .lm_unhold_lvb = gdlm_unhold_lvb,
273 .lm_recovery_done = gdlm_recovery_done,
274 .lm_owner = THIS_MODULE,
275};
276
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
deleted file mode 100644
index 9b7edcf7bd49..000000000000
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ /dev/null
@@ -1,226 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/ctype.h>
11#include <linux/stat.h>
12
13#include "lock_dlm.h"
14
15static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
16{
17 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
18}
19
20static ssize_t block_show(struct gdlm_ls *ls, char *buf)
21{
22 ssize_t ret;
23 int val = 0;
24
25 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
26 val = 1;
27 ret = sprintf(buf, "%d\n", val);
28 return ret;
29}
30
31static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
32{
33 ssize_t ret = len;
34 int val;
35
36 val = simple_strtol(buf, NULL, 0);
37
38 if (val == 1)
39 set_bit(DFL_BLOCK_LOCKS, &ls->flags);
40 else if (val == 0) {
41 clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
42 gdlm_submit_delayed(ls);
43 } else {
44 ret = -EINVAL;
45 }
46 return ret;
47}
48
49static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
50{
51 ssize_t ret;
52 int val = 0;
53
54 if (test_bit(DFL_WITHDRAW, &ls->flags))
55 val = 1;
56 ret = sprintf(buf, "%d\n", val);
57 return ret;
58}
59
60static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
61{
62 ssize_t ret = len;
63 int val;
64
65 val = simple_strtol(buf, NULL, 0);
66
67 if (val == 1)
68 set_bit(DFL_WITHDRAW, &ls->flags);
69 else
70 ret = -EINVAL;
71 wake_up(&ls->wait_control);
72 return ret;
73}
74
75static ssize_t id_show(struct gdlm_ls *ls, char *buf)
76{
77 return sprintf(buf, "%u\n", ls->id);
78}
79
80static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
81{
82 return sprintf(buf, "%d\n", ls->jid);
83}
84
85static ssize_t first_show(struct gdlm_ls *ls, char *buf)
86{
87 return sprintf(buf, "%d\n", ls->first);
88}
89
90static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
91{
92 return sprintf(buf, "%d\n", ls->first_done);
93}
94
95static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
96{
97 return sprintf(buf, "%d\n", ls->recover_jid);
98}
99
100static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
101{
102 ls->recover_jid = simple_strtol(buf, NULL, 0);
103 ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid);
104 return len;
105}
106
107static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
108{
109 return sprintf(buf, "%d\n", ls->recover_jid_done);
110}
111
112static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
113{
114 return sprintf(buf, "%d\n", ls->recover_jid_status);
115}
116
117struct gdlm_attr {
118 struct attribute attr;
119 ssize_t (*show)(struct gdlm_ls *, char *);
120 ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
121};
122
123#define GDLM_ATTR(_name,_mode,_show,_store) \
124static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
125
126GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
127GDLM_ATTR(block, 0644, block_show, block_store);
128GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
129GDLM_ATTR(id, 0444, id_show, NULL);
130GDLM_ATTR(jid, 0444, jid_show, NULL);
131GDLM_ATTR(first, 0444, first_show, NULL);
132GDLM_ATTR(first_done, 0444, first_done_show, NULL);
133GDLM_ATTR(recover, 0644, recover_show, recover_store);
134GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
135GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
136
137static struct attribute *gdlm_attrs[] = {
138 &gdlm_attr_proto_name.attr,
139 &gdlm_attr_block.attr,
140 &gdlm_attr_withdraw.attr,
141 &gdlm_attr_id.attr,
142 &gdlm_attr_jid.attr,
143 &gdlm_attr_first.attr,
144 &gdlm_attr_first_done.attr,
145 &gdlm_attr_recover.attr,
146 &gdlm_attr_recover_done.attr,
147 &gdlm_attr_recover_status.attr,
148 NULL,
149};
150
151static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
152 char *buf)
153{
154 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
155 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
156 return a->show ? a->show(ls, buf) : 0;
157}
158
159static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
160 const char *buf, size_t len)
161{
162 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
163 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
164 return a->store ? a->store(ls, buf, len) : len;
165}
166
167static struct sysfs_ops gdlm_attr_ops = {
168 .show = gdlm_attr_show,
169 .store = gdlm_attr_store,
170};
171
172static struct kobj_type gdlm_ktype = {
173 .default_attrs = gdlm_attrs,
174 .sysfs_ops = &gdlm_attr_ops,
175};
176
177static struct kset *gdlm_kset;
178
179int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
180{
181 int error;
182
183 ls->kobj.kset = gdlm_kset;
184 error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj,
185 "lock_module");
186 if (error)
187 log_error("can't register kobj %d", error);
188 kobject_uevent(&ls->kobj, KOBJ_ADD);
189
190 return error;
191}
192
193void gdlm_kobject_release(struct gdlm_ls *ls)
194{
195 kobject_put(&ls->kobj);
196}
197
198static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
199 struct kobj_uevent_env *env)
200{
201 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
202 add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
203 add_uevent_var(env, "LOCKPROTO=lock_dlm");
204 return 0;
205}
206
207static struct kset_uevent_ops gdlm_uevent_ops = {
208 .uevent = gdlm_uevent,
209};
210
211
212int gdlm_sysfs_init(void)
213{
214 gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
215 if (!gdlm_kset) {
216 printk(KERN_WARNING "%s: can not create kset\n", __func__);
217 return -ENOMEM;
218 }
219 return 0;
220}
221
222void gdlm_sysfs_exit(void)
223{
224 kset_unregister(gdlm_kset);
225}
226
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
deleted file mode 100644
index 38823efd698c..000000000000
--- a/fs/gfs2/locking/dlm/thread.c
+++ /dev/null
@@ -1,68 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12static inline int no_work(struct gdlm_ls *ls)
13{
14 int ret;
15
16 spin_lock(&ls->async_lock);
17 ret = list_empty(&ls->submit);
18 spin_unlock(&ls->async_lock);
19
20 return ret;
21}
22
23static int gdlm_thread(void *data)
24{
25 struct gdlm_ls *ls = (struct gdlm_ls *) data;
26 struct gdlm_lock *lp = NULL;
27
28 while (!kthread_should_stop()) {
29 wait_event_interruptible(ls->thread_wait,
30 !no_work(ls) || kthread_should_stop());
31
32 spin_lock(&ls->async_lock);
33
34 if (!list_empty(&ls->submit)) {
35 lp = list_entry(ls->submit.next, struct gdlm_lock,
36 delay_list);
37 list_del_init(&lp->delay_list);
38 spin_unlock(&ls->async_lock);
39 gdlm_do_lock(lp);
40 spin_lock(&ls->async_lock);
41 }
42 spin_unlock(&ls->async_lock);
43 }
44
45 return 0;
46}
47
48int gdlm_init_threads(struct gdlm_ls *ls)
49{
50 struct task_struct *p;
51 int error;
52
53 p = kthread_run(gdlm_thread, ls, "lock_dlm");
54 error = IS_ERR(p);
55 if (error) {
56 log_error("can't start lock_dlm thread %d", error);
57 return error;
58 }
59 ls->thread = p;
60
61 return 0;
62}
63
64void gdlm_release_threads(struct gdlm_ls *ls)
65{
66 kthread_stop(ls->thread);
67}
68
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ad305854bdc6..98918a756410 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -14,7 +14,6 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h> 16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18#include <linux/delay.h> 17#include <linux/delay.h>
19#include <linux/kthread.h> 18#include <linux/kthread.h>
20#include <linux/freezer.h> 19#include <linux/freezer.h>
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 4390f6f4047d..80e4f5f898bb 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -13,7 +13,6 @@
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17 16
18#include "gfs2.h" 17#include "gfs2.h"
19#include "incore.h" 18#include "incore.h"
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 7cacfde32194..a6892ed0840a 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -14,7 +14,6 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/atomic.h> 17#include <asm/atomic.h>
19 18
20#include "gfs2.h" 19#include "gfs2.h"
@@ -23,6 +22,12 @@
23#include "sys.h" 22#include "sys.h"
24#include "util.h" 23#include "util.h"
25#include "glock.h" 24#include "glock.h"
25#include "quota.h"
26
27static struct shrinker qd_shrinker = {
28 .shrink = gfs2_shrink_qd_memory,
29 .seeks = DEFAULT_SEEKS,
30};
26 31
27static void gfs2_init_inode_once(void *foo) 32static void gfs2_init_inode_once(void *foo)
28{ 33{
@@ -41,8 +46,6 @@ static void gfs2_init_glock_once(void *foo)
41 INIT_HLIST_NODE(&gl->gl_list); 46 INIT_HLIST_NODE(&gl->gl_list);
42 spin_lock_init(&gl->gl_spin); 47 spin_lock_init(&gl->gl_spin);
43 INIT_LIST_HEAD(&gl->gl_holders); 48 INIT_LIST_HEAD(&gl->gl_holders);
44 gl->gl_lvb = NULL;
45 atomic_set(&gl->gl_lvb_count, 0);
46 INIT_LIST_HEAD(&gl->gl_lru); 49 INIT_LIST_HEAD(&gl->gl_lru);
47 INIT_LIST_HEAD(&gl->gl_ail_list); 50 INIT_LIST_HEAD(&gl->gl_ail_list);
48 atomic_set(&gl->gl_ail_count, 0); 51 atomic_set(&gl->gl_ail_count, 0);
@@ -100,6 +103,8 @@ static int __init init_gfs2_fs(void)
100 if (!gfs2_quotad_cachep) 103 if (!gfs2_quotad_cachep)
101 goto fail; 104 goto fail;
102 105
106 register_shrinker(&qd_shrinker);
107
103 error = register_filesystem(&gfs2_fs_type); 108 error = register_filesystem(&gfs2_fs_type);
104 if (error) 109 if (error)
105 goto fail; 110 goto fail;
@@ -117,6 +122,7 @@ static int __init init_gfs2_fs(void)
117fail_unregister: 122fail_unregister:
118 unregister_filesystem(&gfs2_fs_type); 123 unregister_filesystem(&gfs2_fs_type);
119fail: 124fail:
125 unregister_shrinker(&qd_shrinker);
120 gfs2_glock_exit(); 126 gfs2_glock_exit();
121 127
122 if (gfs2_quotad_cachep) 128 if (gfs2_quotad_cachep)
@@ -145,6 +151,7 @@ fail:
145 151
146static void __exit exit_gfs2_fs(void) 152static void __exit exit_gfs2_fs(void)
147{ 153{
154 unregister_shrinker(&qd_shrinker);
148 gfs2_glock_exit(); 155 gfs2_glock_exit();
149 gfs2_unregister_debugfs(); 156 gfs2_unregister_debugfs();
150 unregister_filesystem(&gfs2_fs_type); 157 unregister_filesystem(&gfs2_fs_type);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 09853620c951..8d6f13256b26 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -19,7 +19,6 @@
19#include <linux/delay.h> 19#include <linux/delay.h>
20#include <linux/bio.h> 20#include <linux/bio.h>
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/lm_interface.h>
23 22
24#include "gfs2.h" 23#include "gfs2.h"
25#include "incore.h" 24#include "incore.h"
@@ -90,27 +89,6 @@ void gfs2_aspace_put(struct inode *aspace)
90} 89}
91 90
92/** 91/**
93 * gfs2_meta_inval - Invalidate all buffers associated with a glock
94 * @gl: the glock
95 *
96 */
97
98void gfs2_meta_inval(struct gfs2_glock *gl)
99{
100 struct gfs2_sbd *sdp = gl->gl_sbd;
101 struct inode *aspace = gl->gl_aspace;
102 struct address_space *mapping = gl->gl_aspace->i_mapping;
103
104 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
105
106 atomic_inc(&aspace->i_writecount);
107 truncate_inode_pages(mapping, 0);
108 atomic_dec(&aspace->i_writecount);
109
110 gfs2_assert_withdraw(sdp, !mapping->nrpages);
111}
112
113/**
114 * gfs2_meta_sync - Sync all buffers associated with a glock 92 * gfs2_meta_sync - Sync all buffers associated with a glock
115 * @gl: The glock 93 * @gl: The glock
116 * 94 *
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index b1a5f3674d43..de270c2f9b63 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -40,7 +40,6 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); 40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
41void gfs2_aspace_put(struct inode *aspace); 41void gfs2_aspace_put(struct inode *aspace);
42 42
43void gfs2_meta_inval(struct gfs2_glock *gl);
44void gfs2_meta_sync(struct gfs2_glock *gl); 43void gfs2_meta_sync(struct gfs2_glock *gl);
45 44
46struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); 45struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 3cb0a44ba023..f7e8527a21e0 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -12,12 +12,11 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/lm_interface.h>
16#include <linux/parser.h> 15#include <linux/parser.h>
17 16
18#include "gfs2.h" 17#include "gfs2.h"
19#include "incore.h" 18#include "incore.h"
20#include "mount.h" 19#include "super.h"
21#include "sys.h" 20#include "sys.h"
22#include "util.h" 21#include "util.h"
23 22
@@ -37,11 +36,15 @@ enum {
37 Opt_quota_off, 36 Opt_quota_off,
38 Opt_quota_account, 37 Opt_quota_account,
39 Opt_quota_on, 38 Opt_quota_on,
39 Opt_quota,
40 Opt_noquota,
40 Opt_suiddir, 41 Opt_suiddir,
41 Opt_nosuiddir, 42 Opt_nosuiddir,
42 Opt_data_writeback, 43 Opt_data_writeback,
43 Opt_data_ordered, 44 Opt_data_ordered,
44 Opt_meta, 45 Opt_meta,
46 Opt_discard,
47 Opt_nodiscard,
45 Opt_err, 48 Opt_err,
46}; 49};
47 50
@@ -61,11 +64,15 @@ static const match_table_t tokens = {
61 {Opt_quota_off, "quota=off"}, 64 {Opt_quota_off, "quota=off"},
62 {Opt_quota_account, "quota=account"}, 65 {Opt_quota_account, "quota=account"},
63 {Opt_quota_on, "quota=on"}, 66 {Opt_quota_on, "quota=on"},
67 {Opt_quota, "quota"},
68 {Opt_noquota, "noquota"},
64 {Opt_suiddir, "suiddir"}, 69 {Opt_suiddir, "suiddir"},
65 {Opt_nosuiddir, "nosuiddir"}, 70 {Opt_nosuiddir, "nosuiddir"},
66 {Opt_data_writeback, "data=writeback"}, 71 {Opt_data_writeback, "data=writeback"},
67 {Opt_data_ordered, "data=ordered"}, 72 {Opt_data_ordered, "data=ordered"},
68 {Opt_meta, "meta"}, 73 {Opt_meta, "meta"},
74 {Opt_discard, "discard"},
75 {Opt_nodiscard, "nodiscard"},
69 {Opt_err, NULL} 76 {Opt_err, NULL}
70}; 77};
71 78
@@ -77,101 +84,46 @@ static const match_table_t tokens = {
77 * Return: errno 84 * Return: errno
78 */ 85 */
79 86
80int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) 87int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
81{ 88{
82 struct gfs2_args *args = &sdp->sd_args; 89 char *o;
83 char *data = data_arg; 90 int token;
84 char *options, *o, *v; 91 substring_t tmp[MAX_OPT_ARGS];
85 int error = 0;
86
87 if (!remount) {
88 /* Set some defaults */
89 args->ar_quota = GFS2_QUOTA_DEFAULT;
90 args->ar_data = GFS2_DATA_DEFAULT;
91 }
92 92
93 /* Split the options into tokens with the "," character and 93 /* Split the options into tokens with the "," character and
94 process them */ 94 process them */
95 95
96 for (options = data; (o = strsep(&options, ",")); ) { 96 while (1) {
97 int token; 97 o = strsep(&options, ",");
98 substring_t tmp[MAX_OPT_ARGS]; 98 if (o == NULL)
99 99 break;
100 if (!*o) 100 if (*o == '\0')
101 continue; 101 continue;
102 102
103 token = match_token(o, tokens, tmp); 103 token = match_token(o, tokens, tmp);
104 switch (token) { 104 switch (token) {
105 case Opt_lockproto: 105 case Opt_lockproto:
106 v = match_strdup(&tmp[0]); 106 match_strlcpy(args->ar_lockproto, &tmp[0],
107 if (!v) { 107 GFS2_LOCKNAME_LEN);
108 fs_info(sdp, "no memory for lockproto\n");
109 error = -ENOMEM;
110 goto out_error;
111 }
112
113 if (remount && strcmp(v, args->ar_lockproto)) {
114 kfree(v);
115 goto cant_remount;
116 }
117
118 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
119 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
120 kfree(v);
121 break; 108 break;
122 case Opt_locktable: 109 case Opt_locktable:
123 v = match_strdup(&tmp[0]); 110 match_strlcpy(args->ar_locktable, &tmp[0],
124 if (!v) { 111 GFS2_LOCKNAME_LEN);
125 fs_info(sdp, "no memory for locktable\n");
126 error = -ENOMEM;
127 goto out_error;
128 }
129
130 if (remount && strcmp(v, args->ar_locktable)) {
131 kfree(v);
132 goto cant_remount;
133 }
134
135 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
136 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
137 kfree(v);
138 break; 112 break;
139 case Opt_hostdata: 113 case Opt_hostdata:
140 v = match_strdup(&tmp[0]); 114 match_strlcpy(args->ar_hostdata, &tmp[0],
141 if (!v) { 115 GFS2_LOCKNAME_LEN);
142 fs_info(sdp, "no memory for hostdata\n");
143 error = -ENOMEM;
144 goto out_error;
145 }
146
147 if (remount && strcmp(v, args->ar_hostdata)) {
148 kfree(v);
149 goto cant_remount;
150 }
151
152 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
153 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
154 kfree(v);
155 break; 116 break;
156 case Opt_spectator: 117 case Opt_spectator:
157 if (remount && !args->ar_spectator)
158 goto cant_remount;
159 args->ar_spectator = 1; 118 args->ar_spectator = 1;
160 sdp->sd_vfs->s_flags |= MS_RDONLY;
161 break; 119 break;
162 case Opt_ignore_local_fs: 120 case Opt_ignore_local_fs:
163 if (remount && !args->ar_ignore_local_fs)
164 goto cant_remount;
165 args->ar_ignore_local_fs = 1; 121 args->ar_ignore_local_fs = 1;
166 break; 122 break;
167 case Opt_localflocks: 123 case Opt_localflocks:
168 if (remount && !args->ar_localflocks)
169 goto cant_remount;
170 args->ar_localflocks = 1; 124 args->ar_localflocks = 1;
171 break; 125 break;
172 case Opt_localcaching: 126 case Opt_localcaching:
173 if (remount && !args->ar_localcaching)
174 goto cant_remount;
175 args->ar_localcaching = 1; 127 args->ar_localcaching = 1;
176 break; 128 break;
177 case Opt_debug: 129 case Opt_debug:
@@ -181,25 +133,23 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
181 args->ar_debug = 0; 133 args->ar_debug = 0;
182 break; 134 break;
183 case Opt_upgrade: 135 case Opt_upgrade:
184 if (remount && !args->ar_upgrade)
185 goto cant_remount;
186 args->ar_upgrade = 1; 136 args->ar_upgrade = 1;
187 break; 137 break;
188 case Opt_acl: 138 case Opt_acl:
189 args->ar_posix_acl = 1; 139 args->ar_posix_acl = 1;
190 sdp->sd_vfs->s_flags |= MS_POSIXACL;
191 break; 140 break;
192 case Opt_noacl: 141 case Opt_noacl:
193 args->ar_posix_acl = 0; 142 args->ar_posix_acl = 0;
194 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
195 break; 143 break;
196 case Opt_quota_off: 144 case Opt_quota_off:
145 case Opt_noquota:
197 args->ar_quota = GFS2_QUOTA_OFF; 146 args->ar_quota = GFS2_QUOTA_OFF;
198 break; 147 break;
199 case Opt_quota_account: 148 case Opt_quota_account:
200 args->ar_quota = GFS2_QUOTA_ACCOUNT; 149 args->ar_quota = GFS2_QUOTA_ACCOUNT;
201 break; 150 break;
202 case Opt_quota_on: 151 case Opt_quota_on:
152 case Opt_quota:
203 args->ar_quota = GFS2_QUOTA_ON; 153 args->ar_quota = GFS2_QUOTA_ON;
204 break; 154 break;
205 case Opt_suiddir: 155 case Opt_suiddir:
@@ -215,29 +165,21 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
215 args->ar_data = GFS2_DATA_ORDERED; 165 args->ar_data = GFS2_DATA_ORDERED;
216 break; 166 break;
217 case Opt_meta: 167 case Opt_meta:
218 if (remount && args->ar_meta != 1)
219 goto cant_remount;
220 args->ar_meta = 1; 168 args->ar_meta = 1;
221 break; 169 break;
170 case Opt_discard:
171 args->ar_discard = 1;
172 break;
173 case Opt_nodiscard:
174 args->ar_discard = 0;
175 break;
222 case Opt_err: 176 case Opt_err:
223 default: 177 default:
224 fs_info(sdp, "unknown option: %s\n", o); 178 fs_info(sdp, "invalid mount option: %s\n", o);
225 error = -EINVAL; 179 return -EINVAL;
226 goto out_error;
227 } 180 }
228 } 181 }
229 182
230out_error: 183 return 0;
231 if (error)
232 fs_info(sdp, "invalid mount option(s)\n");
233
234 if (data != data_arg)
235 kfree(data);
236
237 return error;
238
239cant_remount:
240 fs_info(sdp, "can't remount with option %s\n", o);
241 return -EINVAL;
242} 184}
243 185
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
deleted file mode 100644
index 401288acfdf3..000000000000
--- a/fs/gfs2/mount.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __MOUNT_DOT_H__
11#define __MOUNT_DOT_H__
12
13struct gfs2_sbd;
14
15int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
16
17#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 4ddab67867eb..a6dde1751e17 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -19,7 +19,6 @@
19#include <linux/writeback.h> 19#include <linux/writeback.h>
20#include <linux/swap.h> 20#include <linux/swap.h>
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/lm_interface.h>
23#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
24 23
25#include "gfs2.h" 24#include "gfs2.h"
@@ -442,6 +441,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
442 */ 441 */
443 if (unlikely(page->index)) { 442 if (unlikely(page->index)) {
444 zero_user(page, 0, PAGE_CACHE_SIZE); 443 zero_user(page, 0, PAGE_CACHE_SIZE);
444 SetPageUptodate(page);
445 return 0; 445 return 0;
446 } 446 }
447 447
@@ -1096,6 +1096,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
1096 .releasepage = gfs2_releasepage, 1096 .releasepage = gfs2_releasepage,
1097 .direct_IO = gfs2_direct_IO, 1097 .direct_IO = gfs2_direct_IO,
1098 .migratepage = buffer_migrate_page, 1098 .migratepage = buffer_migrate_page,
1099 .is_partially_uptodate = block_is_partially_uptodate,
1099}; 1100};
1100 1101
1101static const struct address_space_operations gfs2_ordered_aops = { 1102static const struct address_space_operations gfs2_ordered_aops = {
@@ -1111,6 +1112,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
1111 .releasepage = gfs2_releasepage, 1112 .releasepage = gfs2_releasepage,
1112 .direct_IO = gfs2_direct_IO, 1113 .direct_IO = gfs2_direct_IO,
1113 .migratepage = buffer_migrate_page, 1114 .migratepage = buffer_migrate_page,
1115 .is_partially_uptodate = block_is_partially_uptodate,
1114}; 1116};
1115 1117
1116static const struct address_space_operations gfs2_jdata_aops = { 1118static const struct address_space_operations gfs2_jdata_aops = {
@@ -1125,6 +1127,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
1125 .bmap = gfs2_bmap, 1127 .bmap = gfs2_bmap,
1126 .invalidatepage = gfs2_invalidatepage, 1128 .invalidatepage = gfs2_invalidatepage,
1127 .releasepage = gfs2_releasepage, 1129 .releasepage = gfs2_releasepage,
1130 .is_partially_uptodate = block_is_partially_uptodate,
1128}; 1131};
1129 1132
1130void gfs2_set_aops(struct inode *inode) 1133void gfs2_set_aops(struct inode *inode)
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index c2ad36330ca3..022c66cd5606 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -13,7 +13,6 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/lm_interface.h>
17 16
18#include "gfs2.h" 17#include "gfs2.h"
19#include "incore.h" 18#include "incore.h"
@@ -108,7 +107,7 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
108 return 0; 107 return 0;
109} 108}
110 109
111struct dentry_operations gfs2_dops = { 110const struct dentry_operations gfs2_dops = {
112 .d_revalidate = gfs2_drevalidate, 111 .d_revalidate = gfs2_drevalidate,
113 .d_hash = gfs2_dhash, 112 .d_hash = gfs2_dhash,
114}; 113};
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 7fdeb14ddd1a..9200ef221716 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -14,7 +14,6 @@
14#include <linux/exportfs.h> 14#include <linux/exportfs.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h> 16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
20#include "incore.h" 19#include "incore.h"
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 93fe41b67f97..70b9b8548945 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -20,9 +20,10 @@
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/ext2_fs.h> 21#include <linux/ext2_fs.h>
22#include <linux/crc32.h> 22#include <linux/crc32.h>
23#include <linux/lm_interface.h>
24#include <linux/writeback.h> 23#include <linux/writeback.h>
25#include <asm/uaccess.h> 24#include <asm/uaccess.h>
25#include <linux/dlm.h>
26#include <linux/dlm_plock.h>
26 27
27#include "gfs2.h" 28#include "gfs2.h"
28#include "incore.h" 29#include "incore.h"
@@ -336,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page)
336 * blocks allocated on disk to back that page. 337 * blocks allocated on disk to back that page.
337 */ 338 */
338 339
339static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) 340static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
340{ 341{
342 struct page *page = vmf->page;
341 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 343 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
342 struct gfs2_inode *ip = GFS2_I(inode); 344 struct gfs2_inode *ip = GFS2_I(inode);
343 struct gfs2_sbd *sdp = GFS2_SB(inode); 345 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -354,7 +356,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
354 if (ret) 356 if (ret)
355 goto out; 357 goto out;
356 358
359 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
357 set_bit(GIF_SW_PAGED, &ip->i_flags); 360 set_bit(GIF_SW_PAGED, &ip->i_flags);
361
358 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); 362 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
359 if (ret || !alloc_required) 363 if (ret || !alloc_required)
360 goto out_unlock; 364 goto out_unlock;
@@ -409,6 +413,8 @@ out_unlock:
409 gfs2_glock_dq(&gh); 413 gfs2_glock_dq(&gh);
410out: 414out:
411 gfs2_holder_uninit(&gh); 415 gfs2_holder_uninit(&gh);
416 if (ret)
417 ret = VM_FAULT_SIGBUS;
412 return ret; 418 return ret;
413} 419}
414 420
@@ -560,57 +566,24 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
560 return ret; 566 return ret;
561} 567}
562 568
569#ifdef CONFIG_GFS2_FS_LOCKING_DLM
570
563/** 571/**
564 * gfs2_setlease - acquire/release a file lease 572 * gfs2_setlease - acquire/release a file lease
565 * @file: the file pointer 573 * @file: the file pointer
566 * @arg: lease type 574 * @arg: lease type
567 * @fl: file lock 575 * @fl: file lock
568 * 576 *
577 * We don't currently have a way to enforce a lease across the whole
578 * cluster; until we do, disable leases (by just returning -EINVAL),
579 * unless the administrator has requested purely local locking.
580 *
569 * Returns: errno 581 * Returns: errno
570 */ 582 */
571 583
572static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) 584static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
573{ 585{
574 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); 586 return -EINVAL;
575
576 /*
577 * We don't currently have a way to enforce a lease across the whole
578 * cluster; until we do, disable leases (by just returning -EINVAL),
579 * unless the administrator has requested purely local locking.
580 */
581 if (!sdp->sd_args.ar_localflocks)
582 return -EINVAL;
583 return generic_setlease(file, arg, fl);
584}
585
586static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
587 struct file *file, struct file_lock *fl)
588{
589 int error = -EIO;
590 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
591 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
592 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
593 return error;
594}
595
596static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
597 struct file *file, int cmd, struct file_lock *fl)
598{
599 int error = -EIO;
600 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
601 error = sdp->sd_lockstruct.ls_ops->lm_plock(
602 sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
603 return error;
604}
605
606static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
607 struct file *file, struct file_lock *fl)
608{
609 int error = -EIO;
610 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
611 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
612 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
613 return error;
614} 587}
615 588
616/** 589/**
@@ -626,9 +599,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
626{ 599{
627 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 600 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
628 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); 601 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
629 struct lm_lockname name = 602 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
630 { .ln_number = ip->i_no_addr,
631 .ln_type = LM_TYPE_PLOCK };
632 603
633 if (!(fl->fl_flags & FL_POSIX)) 604 if (!(fl->fl_flags & FL_POSIX))
634 return -ENOLCK; 605 return -ENOLCK;
@@ -640,12 +611,14 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
640 cmd = F_SETLK; 611 cmd = F_SETLK;
641 fl->fl_type = F_UNLCK; 612 fl->fl_type = F_UNLCK;
642 } 613 }
614 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
615 return -EIO;
643 if (IS_GETLK(cmd)) 616 if (IS_GETLK(cmd))
644 return gfs2_lm_plock_get(sdp, &name, file, fl); 617 return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
645 else if (fl->fl_type == F_UNLCK) 618 else if (fl->fl_type == F_UNLCK)
646 return gfs2_lm_punlock(sdp, &name, file, fl); 619 return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
647 else 620 else
648 return gfs2_lm_plock(sdp, &name, file, cmd, fl); 621 return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
649} 622}
650 623
651static int do_flock(struct file *file, int cmd, struct file_lock *fl) 624static int do_flock(struct file *file, int cmd, struct file_lock *fl)
@@ -732,7 +705,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
732 } 705 }
733} 706}
734 707
735const struct file_operations gfs2_file_fops = { 708const struct file_operations *gfs2_file_fops = &(const struct file_operations){
736 .llseek = gfs2_llseek, 709 .llseek = gfs2_llseek,
737 .read = do_sync_read, 710 .read = do_sync_read,
738 .aio_read = generic_file_aio_read, 711 .aio_read = generic_file_aio_read,
@@ -750,7 +723,7 @@ const struct file_operations gfs2_file_fops = {
750 .setlease = gfs2_setlease, 723 .setlease = gfs2_setlease,
751}; 724};
752 725
753const struct file_operations gfs2_dir_fops = { 726const struct file_operations *gfs2_dir_fops = &(const struct file_operations){
754 .readdir = gfs2_readdir, 727 .readdir = gfs2_readdir,
755 .unlocked_ioctl = gfs2_ioctl, 728 .unlocked_ioctl = gfs2_ioctl,
756 .open = gfs2_open, 729 .open = gfs2_open,
@@ -760,7 +733,9 @@ const struct file_operations gfs2_dir_fops = {
760 .flock = gfs2_flock, 733 .flock = gfs2_flock,
761}; 734};
762 735
763const struct file_operations gfs2_file_fops_nolock = { 736#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
737
738const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){
764 .llseek = gfs2_llseek, 739 .llseek = gfs2_llseek,
765 .read = do_sync_read, 740 .read = do_sync_read,
766 .aio_read = generic_file_aio_read, 741 .aio_read = generic_file_aio_read,
@@ -773,10 +748,10 @@ const struct file_operations gfs2_file_fops_nolock = {
773 .fsync = gfs2_fsync, 748 .fsync = gfs2_fsync,
774 .splice_read = generic_file_splice_read, 749 .splice_read = generic_file_splice_read,
775 .splice_write = generic_file_splice_write, 750 .splice_write = generic_file_splice_write,
776 .setlease = gfs2_setlease, 751 .setlease = generic_setlease,
777}; 752};
778 753
779const struct file_operations gfs2_dir_fops_nolock = { 754const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){
780 .readdir = gfs2_readdir, 755 .readdir = gfs2_readdir,
781 .unlocked_ioctl = gfs2_ioctl, 756 .unlocked_ioctl = gfs2_ioctl,
782 .open = gfs2_open, 757 .open = gfs2_open,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index f91eebdde581..51883b3ad89c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,7 +17,6 @@
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/lm_interface.h>
21 20
22#include "gfs2.h" 21#include "gfs2.h"
23#include "incore.h" 22#include "incore.h"
@@ -25,7 +24,6 @@
25#include "glock.h" 24#include "glock.h"
26#include "glops.h" 25#include "glops.h"
27#include "inode.h" 26#include "inode.h"
28#include "mount.h"
29#include "recovery.h" 27#include "recovery.h"
30#include "rgrp.h" 28#include "rgrp.h"
31#include "super.h" 29#include "super.h"
@@ -64,7 +62,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
64 gt->gt_quota_warn_period = 10; 62 gt->gt_quota_warn_period = 10;
65 gt->gt_quota_scale_num = 1; 63 gt->gt_quota_scale_num = 1;
66 gt->gt_quota_scale_den = 1; 64 gt->gt_quota_scale_den = 1;
67 gt->gt_quota_cache_secs = 300;
68 gt->gt_quota_quantum = 60; 65 gt->gt_quota_quantum = 60;
69 gt->gt_new_files_jdata = 0; 66 gt->gt_new_files_jdata = 0;
70 gt->gt_max_readahead = 1 << 18; 67 gt->gt_max_readahead = 1 << 18;
@@ -100,7 +97,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
100 mutex_init(&sdp->sd_jindex_mutex); 97 mutex_init(&sdp->sd_jindex_mutex);
101 98
102 INIT_LIST_HEAD(&sdp->sd_quota_list); 99 INIT_LIST_HEAD(&sdp->sd_quota_list);
103 spin_lock_init(&sdp->sd_quota_spin);
104 mutex_init(&sdp->sd_quota_mutex); 100 mutex_init(&sdp->sd_quota_mutex);
105 init_waitqueue_head(&sdp->sd_quota_wait); 101 init_waitqueue_head(&sdp->sd_quota_wait);
106 INIT_LIST_HEAD(&sdp->sd_trunc_list); 102 INIT_LIST_HEAD(&sdp->sd_trunc_list);
@@ -238,6 +234,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
238 234
239 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); 235 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
240 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); 236 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
237 memcpy(sb->sb_uuid, str->sb_uuid, 16);
241} 238}
242 239
243/** 240/**
@@ -299,15 +296,15 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
299 __free_page(page); 296 __free_page(page);
300 return 0; 297 return 0;
301} 298}
299
302/** 300/**
303 * gfs2_read_sb - Read super block 301 * gfs2_read_sb - Read super block
304 * @sdp: The GFS2 superblock 302 * @sdp: The GFS2 superblock
305 * @gl: the glock for the superblock (assumed to be held)
306 * @silent: Don't print message if mount fails 303 * @silent: Don't print message if mount fails
307 * 304 *
308 */ 305 */
309 306
310static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) 307static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
311{ 308{
312 u32 hash_blocks, ind_blocks, leaf_blocks; 309 u32 hash_blocks, ind_blocks, leaf_blocks;
313 u32 tmp_blocks; 310 u32 tmp_blocks;
@@ -527,7 +524,7 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
527 return ret; 524 return ret;
528 } 525 }
529 526
530 ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent); 527 ret = gfs2_read_sb(sdp, silent);
531 if (ret) { 528 if (ret) {
532 fs_err(sdp, "can't read superblock: %d\n", ret); 529 fs_err(sdp, "can't read superblock: %d\n", ret);
533 goto out; 530 goto out;
@@ -630,13 +627,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
630 return rc; 627 return rc;
631} 628}
632 629
633static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) 630static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
634{ 631{
635 if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount) 632 char *message = "FIRSTMOUNT=Done";
636 return; 633 char *envp[] = { message, NULL };
637 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 634 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
638 sdp->sd_lockstruct.ls_ops->lm_others_may_mount( 635 ls->ls_first_done = 1;
639 sdp->sd_lockstruct.ls_lockspace); 636 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
640} 637}
641 638
642/** 639/**
@@ -796,7 +793,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
796 } 793 }
797 } 794 }
798 795
799 gfs2_lm_others_may_mount(sdp); 796 gfs2_others_may_mount(sdp);
800 } else if (!sdp->sd_args.ar_spectator) { 797 } else if (!sdp->sd_args.ar_spectator) {
801 error = gfs2_recover_journal(sdp->sd_jdesc); 798 error = gfs2_recover_journal(sdp->sd_jdesc);
802 if (error) { 799 if (error) {
@@ -1005,7 +1002,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
1005 goto fail_quotad; 1002 goto fail_quotad;
1006 1003
1007 sdp->sd_log_flush_time = jiffies; 1004 sdp->sd_log_flush_time = jiffies;
1008 sdp->sd_jindex_refresh_time = jiffies;
1009 1005
1010 p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); 1006 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
1011 error = IS_ERR(p); 1007 error = IS_ERR(p);
@@ -1033,6 +1029,17 @@ fail:
1033 return error; 1029 return error;
1034} 1030}
1035 1031
1032static const match_table_t nolock_tokens = {
1033 { Opt_jid, "jid=%d\n", },
1034 { Opt_err, NULL },
1035};
1036
1037static const struct lm_lockops nolock_ops = {
1038 .lm_proto_name = "lock_nolock",
1039 .lm_put_lock = kmem_cache_free,
1040 .lm_tokens = &nolock_tokens,
1041};
1042
1036/** 1043/**
1037 * gfs2_lm_mount - mount a locking protocol 1044 * gfs2_lm_mount - mount a locking protocol
1038 * @sdp: the filesystem 1045 * @sdp: the filesystem
@@ -1044,31 +1051,73 @@ fail:
1044 1051
1045static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) 1052static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1046{ 1053{
1047 char *proto = sdp->sd_proto_name; 1054 const struct lm_lockops *lm;
1048 char *table = sdp->sd_table_name; 1055 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
1049 int flags = LM_MFLAG_CONV_NODROP; 1056 struct gfs2_args *args = &sdp->sd_args;
1050 int error; 1057 const char *proto = sdp->sd_proto_name;
1058 const char *table = sdp->sd_table_name;
1059 const char *fsname;
1060 char *o, *options;
1061 int ret;
1051 1062
1052 if (sdp->sd_args.ar_spectator) 1063 if (!strcmp("lock_nolock", proto)) {
1053 flags |= LM_MFLAG_SPECTATOR; 1064 lm = &nolock_ops;
1065 sdp->sd_args.ar_localflocks = 1;
1066 sdp->sd_args.ar_localcaching = 1;
1067#ifdef CONFIG_GFS2_FS_LOCKING_DLM
1068 } else if (!strcmp("lock_dlm", proto)) {
1069 lm = &gfs2_dlm_ops;
1070#endif
1071 } else {
1072 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto);
1073 return -ENOENT;
1074 }
1054 1075
1055 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); 1076 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
1056 1077
1057 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata, 1078 ls->ls_ops = lm;
1058 gfs2_glock_cb, sdp, 1079 ls->ls_first = 1;
1059 GFS2_MIN_LVB_SIZE, flags, 1080 ls->ls_id = 0;
1060 &sdp->sd_lockstruct, &sdp->sd_kobj);
1061 if (error) {
1062 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
1063 proto, table, sdp->sd_args.ar_hostdata);
1064 goto out;
1065 }
1066 1081
1067 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || 1082 for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
1068 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= 1083 substring_t tmp[MAX_OPT_ARGS];
1069 GFS2_MIN_LVB_SIZE)) { 1084 int token, option;
1070 gfs2_unmount_lockproto(&sdp->sd_lockstruct); 1085
1071 goto out; 1086 if (!o || !*o)
1087 continue;
1088
1089 token = match_token(o, *lm->lm_tokens, tmp);
1090 switch (token) {
1091 case Opt_jid:
1092 ret = match_int(&tmp[0], &option);
1093 if (ret || option < 0)
1094 goto hostdata_error;
1095 ls->ls_jid = option;
1096 break;
1097 case Opt_id:
1098 ret = match_int(&tmp[0], &option);
1099 if (ret)
1100 goto hostdata_error;
1101 ls->ls_id = option;
1102 break;
1103 case Opt_first:
1104 ret = match_int(&tmp[0], &option);
1105 if (ret || (option != 0 && option != 1))
1106 goto hostdata_error;
1107 ls->ls_first = option;
1108 break;
1109 case Opt_nodir:
1110 ret = match_int(&tmp[0], &option);
1111 if (ret || (option != 0 && option != 1))
1112 goto hostdata_error;
1113 ls->ls_nodir = option;
1114 break;
1115 case Opt_err:
1116 default:
1117hostdata_error:
1118 fs_info(sdp, "unknown hostdata (%s)\n", o);
1119 return -EINVAL;
1120 }
1072 } 1121 }
1073 1122
1074 if (sdp->sd_args.ar_spectator) 1123 if (sdp->sd_args.ar_spectator)
@@ -1077,22 +1126,25 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1077 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, 1126 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
1078 sdp->sd_lockstruct.ls_jid); 1127 sdp->sd_lockstruct.ls_jid);
1079 1128
1080 fs_info(sdp, "Joined cluster. Now mounting FS...\n"); 1129 fsname = strchr(table, ':');
1081 1130 if (fsname)
1082 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) && 1131 fsname++;
1083 !sdp->sd_args.ar_ignore_local_fs) { 1132 if (lm->lm_mount == NULL) {
1084 sdp->sd_args.ar_localflocks = 1; 1133 fs_info(sdp, "Now mounting FS...\n");
1085 sdp->sd_args.ar_localcaching = 1; 1134 return 0;
1086 } 1135 }
1087 1136 ret = lm->lm_mount(sdp, fsname);
1088out: 1137 if (ret == 0)
1089 return error; 1138 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
1139 return ret;
1090} 1140}
1091 1141
1092void gfs2_lm_unmount(struct gfs2_sbd *sdp) 1142void gfs2_lm_unmount(struct gfs2_sbd *sdp)
1093{ 1143{
1094 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 1144 const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops;
1095 gfs2_unmount_lockproto(&sdp->sd_lockstruct); 1145 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
1146 lm->lm_unmount)
1147 lm->lm_unmount(sdp);
1096} 1148}
1097 1149
1098/** 1150/**
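The hostdata loop above is driven entirely by lm_tokens, the per-protocol match_table_t, so lock_nolock and lock_dlm can each accept their own option set. As a minimal userspace sketch of the same split-and-parse shape, strsep() plus sscanf() stand in for the kernel's match_token()/match_int() here; the sample hostdata string is made up:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	char hostdata[] = "jid=0:id=3282:first=1";
	char *options = hostdata, *o;
	int jid = -1, id = 0, first = 0;

	while ((o = strsep(&options, ":")) != NULL) {
		if (!*o)
			continue;	/* skip empty fields, as the kernel loop does */
		if (sscanf(o, "jid=%d", &jid) == 1 ||
		    sscanf(o, "id=%d", &id) == 1 ||
		    sscanf(o, "first=%d", &first) == 1)
			continue;
		fprintf(stderr, "unknown hostdata (%s)\n", o);
		return 1;
	}
	printf("jid=%d id=%d first=%d\n", jid, id, first);
	return 0;
}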
@@ -1116,12 +1168,20 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1116 return -ENOMEM; 1168 return -ENOMEM;
1117 } 1169 }
1118 1170
1119 error = gfs2_mount_args(sdp, (char *)data, 0); 1171 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
1172 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
1173
1174 error = gfs2_mount_args(sdp, &sdp->sd_args, data);
1120 if (error) { 1175 if (error) {
1121 printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); 1176 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
1122 goto fail; 1177 goto fail;
1123 } 1178 }
1124 1179
1180 if (sdp->sd_args.ar_spectator)
1181 sb->s_flags |= MS_RDONLY;
1182 if (sdp->sd_args.ar_posix_acl)
1183 sb->s_flags |= MS_POSIXACL;
1184
1125 sb->s_magic = GFS2_MAGIC; 1185 sb->s_magic = GFS2_MAGIC;
1126 sb->s_op = &gfs2_super_ops; 1186 sb->s_op = &gfs2_super_ops;
1127 sb->s_export_op = &gfs2_export_ops; 1187 sb->s_export_op = &gfs2_export_ops;
@@ -1199,6 +1259,8 @@ fail_sb:
1199 dput(sdp->sd_root_dir); 1259 dput(sdp->sd_root_dir);
1200 if (sdp->sd_master_dir) 1260 if (sdp->sd_master_dir)
1201 dput(sdp->sd_master_dir); 1261 dput(sdp->sd_master_dir);
1262 if (sb->s_root)
1263 dput(sb->s_root);
1202 sb->s_root = NULL; 1264 sb->s_root = NULL;
1203fail_locking: 1265fail_locking:
1204 init_locking(sdp, &mount_gh, UNDO); 1266 init_locking(sdp, &mount_gh, UNDO);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 49877546beb9..abd5429ae285 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,7 +18,6 @@
18#include <linux/posix_acl.h> 18#include <linux/posix_acl.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/crc32.h> 20#include <linux/crc32.h>
21#include <linux/lm_interface.h>
22#include <linux/fiemap.h> 21#include <linux/fiemap.h>
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
24 23
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 320323d03479..458019569dcb 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -19,7 +19,6 @@
19#include <linux/delay.h> 19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/lm_interface.h>
23#include <linux/time.h> 22#include <linux/time.h>
24 23
25#include "gfs2.h" 24#include "gfs2.h"
@@ -27,7 +26,6 @@
27#include "glock.h" 26#include "glock.h"
28#include "inode.h" 27#include "inode.h"
29#include "log.h" 28#include "log.h"
30#include "mount.h"
31#include "quota.h" 29#include "quota.h"
32#include "recovery.h" 30#include "recovery.h"
33#include "rgrp.h" 31#include "rgrp.h"
@@ -40,6 +38,8 @@
40#include "bmap.h" 38#include "bmap.h"
41#include "meta_io.h" 39#include "meta_io.h"
42 40
41#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
42
43/** 43/**
44 * gfs2_write_inode - Make sure the inode is stable on the disk 44 * gfs2_write_inode - Make sure the inode is stable on the disk
45 * @inode: The inode 45 * @inode: The inode
@@ -435,25 +435,45 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
435static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) 435static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
436{ 436{
437 struct gfs2_sbd *sdp = sb->s_fs_info; 437 struct gfs2_sbd *sdp = sb->s_fs_info;
438 struct gfs2_args args = sdp->sd_args; /* Default to current settings */
438 int error; 439 int error;
439 440
440 error = gfs2_mount_args(sdp, data, 1); 441 error = gfs2_mount_args(sdp, &args, data);
441 if (error) 442 if (error)
442 return error; 443 return error;
443 444
445 /* Not allowed to change locking details */
446 if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
447 strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
448 strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
449 return -EINVAL;
450
451 /* Some flags must not be changed */
452 if (args_neq(&args, &sdp->sd_args, spectator) ||
453 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
454 args_neq(&args, &sdp->sd_args, localflocks) ||
455 args_neq(&args, &sdp->sd_args, localcaching) ||
456 args_neq(&args, &sdp->sd_args, meta))
457 return -EINVAL;
458
444 if (sdp->sd_args.ar_spectator) 459 if (sdp->sd_args.ar_spectator)
445 *flags |= MS_RDONLY; 460 *flags |= MS_RDONLY;
446 else { 461
447 if (*flags & MS_RDONLY) { 462 if ((sb->s_flags ^ *flags) & MS_RDONLY) {
448 if (!(sb->s_flags & MS_RDONLY)) 463 if (*flags & MS_RDONLY)
449 error = gfs2_make_fs_ro(sdp); 464 error = gfs2_make_fs_ro(sdp);
450 } else if (!(*flags & MS_RDONLY) && 465 else
451 (sb->s_flags & MS_RDONLY)) {
452 error = gfs2_make_fs_rw(sdp); 466 error = gfs2_make_fs_rw(sdp);
453 } 467 if (error)
468 return error;
454 } 469 }
455 470
456 return error; 471 sdp->sd_args = args;
472 if (sdp->sd_args.ar_posix_acl)
473 sb->s_flags |= MS_POSIXACL;
474 else
475 sb->s_flags &= ~MS_POSIXACL;
476 return 0;
457} 477}
458 478
459/** 479/**
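The remount path now tests (sb->s_flags ^ *flags) & MS_RDONLY, so a single branch catches a read-only transition in either direction. A tiny standalone illustration of that XOR test (MS_RDONLY matches the kernel's value of 1):

#include <stdio.h>

#define MS_RDONLY 1

static void check(unsigned int old_flags, unsigned int new_flags)
{
	if ((old_flags ^ new_flags) & MS_RDONLY)
		printf("transition: going %s\n",
		       (new_flags & MS_RDONLY) ? "read-only" : "read-write");
	else
		printf("no ro/rw change\n");
}

int main(void)
{
	check(0, MS_RDONLY);		/* rw -> ro: transition */
	check(MS_RDONLY, MS_RDONLY);	/* ro -> ro: no change */
	return 0;
}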
@@ -588,6 +608,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
588 } 608 }
589 seq_printf(s, ",data=%s", state); 609 seq_printf(s, ",data=%s", state);
590 } 610 }
611 if (args->ar_discard)
612 seq_printf(s, ",discard");
591 613
592 return 0; 614 return 0;
593} 615}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index b08d09696b3e..8d53f66b5bcc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -45,7 +45,6 @@
45#include <linux/fs.h> 45#include <linux/fs.h>
46#include <linux/bio.h> 46#include <linux/bio.h>
47#include <linux/gfs2_ondisk.h> 47#include <linux/gfs2_ondisk.h>
48#include <linux/lm_interface.h>
49#include <linux/kthread.h> 48#include <linux/kthread.h>
50#include <linux/freezer.h> 49#include <linux/freezer.h>
51 50
@@ -80,6 +79,51 @@ struct gfs2_quota_change_host {
80 u32 qc_id; 79 u32 qc_id;
81}; 80};
82 81
82static LIST_HEAD(qd_lru_list);
83static atomic_t qd_lru_count = ATOMIC_INIT(0);
84static spinlock_t qd_lru_lock = SPIN_LOCK_UNLOCKED;
85
86int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
87{
88 struct gfs2_quota_data *qd;
89 struct gfs2_sbd *sdp;
90
91 if (nr == 0)
92 goto out;
93
94 if (!(gfp_mask & __GFP_FS))
95 return -1;
96
97 spin_lock(&qd_lru_lock);
98 while (nr && !list_empty(&qd_lru_list)) {
99 qd = list_entry(qd_lru_list.next,
100 struct gfs2_quota_data, qd_reclaim);
101 sdp = qd->qd_gl->gl_sbd;
102
103 /* Free from the filesystem-specific list */
104 list_del(&qd->qd_list);
105
106 gfs2_assert_warn(sdp, !qd->qd_change);
107 gfs2_assert_warn(sdp, !qd->qd_slot_count);
108 gfs2_assert_warn(sdp, !qd->qd_bh_count);
109
110 gfs2_glock_put(qd->qd_gl);
111 atomic_dec(&sdp->sd_quota_count);
112
113 /* Delete it from the common reclaim list */
114 list_del_init(&qd->qd_reclaim);
115 atomic_dec(&qd_lru_count);
116 spin_unlock(&qd_lru_lock);
117 kmem_cache_free(gfs2_quotad_cachep, qd);
118 spin_lock(&qd_lru_lock);
119 nr--;
120 }
121 spin_unlock(&qd_lru_lock);
122
123out:
124 return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100;
125}
126
83static u64 qd2offset(struct gfs2_quota_data *qd) 127static u64 qd2offset(struct gfs2_quota_data *qd)
84{ 128{
85 u64 offset; 129 u64 offset;
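gfs2_shrink_qd_memory() follows the shrinker contract of this kernel era: free up to nr objects when nr is non-zero, refuse non-__GFP_FS contexts, and always return an estimate of the remaining cache size scaled by sysctl_vfs_cache_pressure. A kernel-side sketch of how such a callback is typically wired up (the actual registration site is not part of this hunk):

static struct shrinker qd_shrinker = {
	.shrink = gfs2_shrink_qd_memory,
	.seeks = DEFAULT_SEEKS,
};

	register_shrinker(&qd_shrinker);	/* at init */
	unregister_shrinker(&qd_shrinker);	/* at exit */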
@@ -100,22 +144,18 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
100 if (!qd) 144 if (!qd)
101 return -ENOMEM; 145 return -ENOMEM;
102 146
103 qd->qd_count = 1; 147 atomic_set(&qd->qd_count, 1);
104 qd->qd_id = id; 148 qd->qd_id = id;
105 if (user) 149 if (user)
106 set_bit(QDF_USER, &qd->qd_flags); 150 set_bit(QDF_USER, &qd->qd_flags);
107 qd->qd_slot = -1; 151 qd->qd_slot = -1;
152 INIT_LIST_HEAD(&qd->qd_reclaim);
108 153
109 error = gfs2_glock_get(sdp, 2 * (u64)id + !user, 154 error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
110 &gfs2_quota_glops, CREATE, &qd->qd_gl); 155 &gfs2_quota_glops, CREATE, &qd->qd_gl);
111 if (error) 156 if (error)
112 goto fail; 157 goto fail;
113 158
114 error = gfs2_lvb_hold(qd->qd_gl);
115 gfs2_glock_put(qd->qd_gl);
116 if (error)
117 goto fail;
118
119 *qdp = qd; 159 *qdp = qd;
120 160
121 return 0; 161 return 0;
@@ -135,11 +175,17 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
135 175
136 for (;;) { 176 for (;;) {
137 found = 0; 177 found = 0;
138 spin_lock(&sdp->sd_quota_spin); 178 spin_lock(&qd_lru_lock);
139 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { 179 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
140 if (qd->qd_id == id && 180 if (qd->qd_id == id &&
141 !test_bit(QDF_USER, &qd->qd_flags) == !user) { 181 !test_bit(QDF_USER, &qd->qd_flags) == !user) {
142 qd->qd_count++; 182 if (!atomic_read(&qd->qd_count) &&
183 !list_empty(&qd->qd_reclaim)) {
184 /* Remove it from reclaim list */
185 list_del_init(&qd->qd_reclaim);
186 atomic_dec(&qd_lru_count);
187 }
188 atomic_inc(&qd->qd_count);
143 found = 1; 189 found = 1;
144 break; 190 break;
145 } 191 }
@@ -155,11 +201,11 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
155 new_qd = NULL; 201 new_qd = NULL;
156 } 202 }
157 203
158 spin_unlock(&sdp->sd_quota_spin); 204 spin_unlock(&qd_lru_lock);
159 205
160 if (qd || !create) { 206 if (qd || !create) {
161 if (new_qd) { 207 if (new_qd) {
162 gfs2_lvb_unhold(new_qd->qd_gl); 208 gfs2_glock_put(new_qd->qd_gl);
163 kmem_cache_free(gfs2_quotad_cachep, new_qd); 209 kmem_cache_free(gfs2_quotad_cachep, new_qd);
164 } 210 }
165 *qdp = qd; 211 *qdp = qd;
@@ -175,21 +221,18 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
175static void qd_hold(struct gfs2_quota_data *qd) 221static void qd_hold(struct gfs2_quota_data *qd)
176{ 222{
177 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 223 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
178 224 gfs2_assert(sdp, atomic_read(&qd->qd_count));
179 spin_lock(&sdp->sd_quota_spin); 225 atomic_inc(&qd->qd_count);
180 gfs2_assert(sdp, qd->qd_count);
181 qd->qd_count++;
182 spin_unlock(&sdp->sd_quota_spin);
183} 226}
184 227
185static void qd_put(struct gfs2_quota_data *qd) 228static void qd_put(struct gfs2_quota_data *qd)
186{ 229{
187 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 230 if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) {
188 spin_lock(&sdp->sd_quota_spin); 231 /* Add to the reclaim list */
189 gfs2_assert(sdp, qd->qd_count); 232 list_add_tail(&qd->qd_reclaim, &qd_lru_list);
190 if (!--qd->qd_count) 233 atomic_inc(&qd_lru_count);
191 qd->qd_last_touched = jiffies; 234 spin_unlock(&qd_lru_lock);
192 spin_unlock(&sdp->sd_quota_spin); 235 }
193} 236}
194 237
195static int slot_get(struct gfs2_quota_data *qd) 238static int slot_get(struct gfs2_quota_data *qd)
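qd_put() above leans on atomic_dec_and_lock(): the spinlock is taken only when the counter is about to reach zero, so the common-case put is a single atomic operation. Its behaviour is roughly this open-coded equivalent, shown purely for illustration (it mirrors the shape of the generic lib/ implementation):

static int dec_and_lock_equivalent(atomic_t *cnt, spinlock_t *lock)
{
	/* Fast path: decrement unless the counter is currently 1. */
	if (atomic_add_unless(cnt, -1, 1))
		return 0;
	/* Slow path: we may be dropping the last reference. */
	spin_lock(lock);
	if (atomic_dec_and_test(cnt))
		return 1;	/* reached zero; return with lock held */
	spin_unlock(lock);
	return 0;
}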
@@ -198,10 +241,10 @@ static int slot_get(struct gfs2_quota_data *qd)
198 unsigned int c, o = 0, b; 241 unsigned int c, o = 0, b;
199 unsigned char byte = 0; 242 unsigned char byte = 0;
200 243
201 spin_lock(&sdp->sd_quota_spin); 244 spin_lock(&qd_lru_lock);
202 245
203 if (qd->qd_slot_count++) { 246 if (qd->qd_slot_count++) {
204 spin_unlock(&sdp->sd_quota_spin); 247 spin_unlock(&qd_lru_lock);
205 return 0; 248 return 0;
206 } 249 }
207 250
@@ -225,13 +268,13 @@ found:
225 268
226 sdp->sd_quota_bitmap[c][o] |= 1 << b; 269 sdp->sd_quota_bitmap[c][o] |= 1 << b;
227 270
228 spin_unlock(&sdp->sd_quota_spin); 271 spin_unlock(&qd_lru_lock);
229 272
230 return 0; 273 return 0;
231 274
232fail: 275fail:
233 qd->qd_slot_count--; 276 qd->qd_slot_count--;
234 spin_unlock(&sdp->sd_quota_spin); 277 spin_unlock(&qd_lru_lock);
235 return -ENOSPC; 278 return -ENOSPC;
236} 279}
237 280
@@ -239,23 +282,23 @@ static void slot_hold(struct gfs2_quota_data *qd)
239{ 282{
240 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 283 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
241 284
242 spin_lock(&sdp->sd_quota_spin); 285 spin_lock(&qd_lru_lock);
243 gfs2_assert(sdp, qd->qd_slot_count); 286 gfs2_assert(sdp, qd->qd_slot_count);
244 qd->qd_slot_count++; 287 qd->qd_slot_count++;
245 spin_unlock(&sdp->sd_quota_spin); 288 spin_unlock(&qd_lru_lock);
246} 289}
247 290
248static void slot_put(struct gfs2_quota_data *qd) 291static void slot_put(struct gfs2_quota_data *qd)
249{ 292{
250 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 293 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
251 294
252 spin_lock(&sdp->sd_quota_spin); 295 spin_lock(&qd_lru_lock);
253 gfs2_assert(sdp, qd->qd_slot_count); 296 gfs2_assert(sdp, qd->qd_slot_count);
254 if (!--qd->qd_slot_count) { 297 if (!--qd->qd_slot_count) {
255 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); 298 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
256 qd->qd_slot = -1; 299 qd->qd_slot = -1;
257 } 300 }
258 spin_unlock(&sdp->sd_quota_spin); 301 spin_unlock(&qd_lru_lock);
259} 302}
260 303
261static int bh_get(struct gfs2_quota_data *qd) 304static int bh_get(struct gfs2_quota_data *qd)
@@ -330,7 +373,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
330 if (sdp->sd_vfs->s_flags & MS_RDONLY) 373 if (sdp->sd_vfs->s_flags & MS_RDONLY)
331 return 0; 374 return 0;
332 375
333 spin_lock(&sdp->sd_quota_spin); 376 spin_lock(&qd_lru_lock);
334 377
335 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { 378 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
336 if (test_bit(QDF_LOCKED, &qd->qd_flags) || 379 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
@@ -341,8 +384,8 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
341 list_move_tail(&qd->qd_list, &sdp->sd_quota_list); 384 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
342 385
343 set_bit(QDF_LOCKED, &qd->qd_flags); 386 set_bit(QDF_LOCKED, &qd->qd_flags);
344 gfs2_assert_warn(sdp, qd->qd_count); 387 gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
345 qd->qd_count++; 388 atomic_inc(&qd->qd_count);
346 qd->qd_change_sync = qd->qd_change; 389 qd->qd_change_sync = qd->qd_change;
347 gfs2_assert_warn(sdp, qd->qd_slot_count); 390 gfs2_assert_warn(sdp, qd->qd_slot_count);
348 qd->qd_slot_count++; 391 qd->qd_slot_count++;
@@ -354,7 +397,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
354 if (!found) 397 if (!found)
355 qd = NULL; 398 qd = NULL;
356 399
357 spin_unlock(&sdp->sd_quota_spin); 400 spin_unlock(&qd_lru_lock);
358 401
359 if (qd) { 402 if (qd) {
360 gfs2_assert_warn(sdp, qd->qd_change_sync); 403 gfs2_assert_warn(sdp, qd->qd_change_sync);
@@ -379,24 +422,24 @@ static int qd_trylock(struct gfs2_quota_data *qd)
379 if (sdp->sd_vfs->s_flags & MS_RDONLY) 422 if (sdp->sd_vfs->s_flags & MS_RDONLY)
380 return 0; 423 return 0;
381 424
382 spin_lock(&sdp->sd_quota_spin); 425 spin_lock(&qd_lru_lock);
383 426
384 if (test_bit(QDF_LOCKED, &qd->qd_flags) || 427 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
385 !test_bit(QDF_CHANGE, &qd->qd_flags)) { 428 !test_bit(QDF_CHANGE, &qd->qd_flags)) {
386 spin_unlock(&sdp->sd_quota_spin); 429 spin_unlock(&qd_lru_lock);
387 return 0; 430 return 0;
388 } 431 }
389 432
390 list_move_tail(&qd->qd_list, &sdp->sd_quota_list); 433 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
391 434
392 set_bit(QDF_LOCKED, &qd->qd_flags); 435 set_bit(QDF_LOCKED, &qd->qd_flags);
393 gfs2_assert_warn(sdp, qd->qd_count); 436 gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
394 qd->qd_count++; 437 atomic_inc(&qd->qd_count);
395 qd->qd_change_sync = qd->qd_change; 438 qd->qd_change_sync = qd->qd_change;
396 gfs2_assert_warn(sdp, qd->qd_slot_count); 439 gfs2_assert_warn(sdp, qd->qd_slot_count);
397 qd->qd_slot_count++; 440 qd->qd_slot_count++;
398 441
399 spin_unlock(&sdp->sd_quota_spin); 442 spin_unlock(&qd_lru_lock);
400 443
401 gfs2_assert_warn(sdp, qd->qd_change_sync); 444 gfs2_assert_warn(sdp, qd->qd_change_sync);
402 if (bh_get(qd)) { 445 if (bh_get(qd)) {
@@ -556,9 +599,9 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
556 x = be64_to_cpu(qc->qc_change) + change; 599 x = be64_to_cpu(qc->qc_change) + change;
557 qc->qc_change = cpu_to_be64(x); 600 qc->qc_change = cpu_to_be64(x);
558 601
559 spin_lock(&sdp->sd_quota_spin); 602 spin_lock(&qd_lru_lock);
560 qd->qd_change = x; 603 qd->qd_change = x;
561 spin_unlock(&sdp->sd_quota_spin); 604 spin_unlock(&qd_lru_lock);
562 605
563 if (!x) { 606 if (!x) {
564 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); 607 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
@@ -802,8 +845,8 @@ restart:
802 loff_t pos; 845 loff_t pos;
803 gfs2_glock_dq_uninit(q_gh); 846 gfs2_glock_dq_uninit(q_gh);
804 error = gfs2_glock_nq_init(qd->qd_gl, 847 error = gfs2_glock_nq_init(qd->qd_gl,
805 LM_ST_EXCLUSIVE, GL_NOCACHE, 848 LM_ST_EXCLUSIVE, GL_NOCACHE,
806 q_gh); 849 q_gh);
807 if (error) 850 if (error)
808 return error; 851 return error;
809 852
@@ -820,7 +863,6 @@ restart:
820 863
821 gfs2_glock_dq_uninit(&i_gh); 864 gfs2_glock_dq_uninit(&i_gh);
822 865
823
824 gfs2_quota_in(&q, buf); 866 gfs2_quota_in(&q, buf);
825 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 867 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
826 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); 868 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
@@ -890,9 +932,9 @@ static int need_sync(struct gfs2_quota_data *qd)
890 if (!qd->qd_qb.qb_limit) 932 if (!qd->qd_qb.qb_limit)
891 return 0; 933 return 0;
892 934
893 spin_lock(&sdp->sd_quota_spin); 935 spin_lock(&qd_lru_lock);
894 value = qd->qd_change; 936 value = qd->qd_change;
895 spin_unlock(&sdp->sd_quota_spin); 937 spin_unlock(&qd_lru_lock);
896 938
897 spin_lock(&gt->gt_spin); 939 spin_lock(&gt->gt_spin);
898 num = gt->gt_quota_scale_num; 940 num = gt->gt_quota_scale_num;
@@ -985,9 +1027,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
985 continue; 1027 continue;
986 1028
987 value = (s64)be64_to_cpu(qd->qd_qb.qb_value); 1029 value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
988 spin_lock(&sdp->sd_quota_spin); 1030 spin_lock(&qd_lru_lock);
989 value += qd->qd_change; 1031 value += qd->qd_change;
990 spin_unlock(&sdp->sd_quota_spin); 1032 spin_unlock(&qd_lru_lock);
991 1033
992 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { 1034 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
993 print_message(qd, "exceeded"); 1035 print_message(qd, "exceeded");
@@ -1171,13 +1213,12 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1171 qd->qd_change = qc.qc_change; 1213 qd->qd_change = qc.qc_change;
1172 qd->qd_slot = slot; 1214 qd->qd_slot = slot;
1173 qd->qd_slot_count = 1; 1215 qd->qd_slot_count = 1;
1174 qd->qd_last_touched = jiffies;
1175 1216
1176 spin_lock(&sdp->sd_quota_spin); 1217 spin_lock(&qd_lru_lock);
1177 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); 1218 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
1178 list_add(&qd->qd_list, &sdp->sd_quota_list); 1219 list_add(&qd->qd_list, &sdp->sd_quota_list);
1179 atomic_inc(&sdp->sd_quota_count); 1220 atomic_inc(&sdp->sd_quota_count);
1180 spin_unlock(&sdp->sd_quota_spin); 1221 spin_unlock(&qd_lru_lock);
1181 1222
1182 found++; 1223 found++;
1183 } 1224 }
@@ -1197,73 +1238,48 @@ fail:
1197 return error; 1238 return error;
1198} 1239}
1199 1240
1200static void gfs2_quota_scan(struct gfs2_sbd *sdp)
1201{
1202 struct gfs2_quota_data *qd, *safe;
1203 LIST_HEAD(dead);
1204
1205 spin_lock(&sdp->sd_quota_spin);
1206 list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
1207 if (!qd->qd_count &&
1208 time_after_eq(jiffies, qd->qd_last_touched +
1209 gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
1210 list_move(&qd->qd_list, &dead);
1211 gfs2_assert_warn(sdp,
1212 atomic_read(&sdp->sd_quota_count) > 0);
1213 atomic_dec(&sdp->sd_quota_count);
1214 }
1215 }
1216 spin_unlock(&sdp->sd_quota_spin);
1217
1218 while (!list_empty(&dead)) {
1219 qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
1220 list_del(&qd->qd_list);
1221
1222 gfs2_assert_warn(sdp, !qd->qd_change);
1223 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1224 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1225
1226 gfs2_lvb_unhold(qd->qd_gl);
1227 kmem_cache_free(gfs2_quotad_cachep, qd);
1228 }
1229}
1230
1231void gfs2_quota_cleanup(struct gfs2_sbd *sdp) 1241void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1232{ 1242{
1233 struct list_head *head = &sdp->sd_quota_list; 1243 struct list_head *head = &sdp->sd_quota_list;
1234 struct gfs2_quota_data *qd; 1244 struct gfs2_quota_data *qd;
1235 unsigned int x; 1245 unsigned int x;
1236 1246
1237 spin_lock(&sdp->sd_quota_spin); 1247 spin_lock(&qd_lru_lock);
1238 while (!list_empty(head)) { 1248 while (!list_empty(head)) {
1239 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); 1249 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1240 1250
1241 if (qd->qd_count > 1 || 1251 if (atomic_read(&qd->qd_count) > 1 ||
1242 (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) { 1252 (atomic_read(&qd->qd_count) &&
1253 !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1243 list_move(&qd->qd_list, head); 1254 list_move(&qd->qd_list, head);
1244 spin_unlock(&sdp->sd_quota_spin); 1255 spin_unlock(&qd_lru_lock);
1245 schedule(); 1256 schedule();
1246 spin_lock(&sdp->sd_quota_spin); 1257 spin_lock(&qd_lru_lock);
1247 continue; 1258 continue;
1248 } 1259 }
1249 1260
1250 list_del(&qd->qd_list); 1261 list_del(&qd->qd_list);
1262 /* Also remove if this qd exists in the reclaim list */
1263 if (!list_empty(&qd->qd_reclaim)) {
1264 list_del_init(&qd->qd_reclaim);
1265 atomic_dec(&qd_lru_count);
1266 }
1251 atomic_dec(&sdp->sd_quota_count); 1267 atomic_dec(&sdp->sd_quota_count);
1252 spin_unlock(&sdp->sd_quota_spin); 1268 spin_unlock(&qd_lru_lock);
1253 1269
1254 if (!qd->qd_count) { 1270 if (!atomic_read(&qd->qd_count)) {
1255 gfs2_assert_warn(sdp, !qd->qd_change); 1271 gfs2_assert_warn(sdp, !qd->qd_change);
1256 gfs2_assert_warn(sdp, !qd->qd_slot_count); 1272 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1257 } else 1273 } else
1258 gfs2_assert_warn(sdp, qd->qd_slot_count == 1); 1274 gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
1259 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1275 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1260 1276
1261 gfs2_lvb_unhold(qd->qd_gl); 1277 gfs2_glock_put(qd->qd_gl);
1262 kmem_cache_free(gfs2_quotad_cachep, qd); 1278 kmem_cache_free(gfs2_quotad_cachep, qd);
1263 1279
1264 spin_lock(&sdp->sd_quota_spin); 1280 spin_lock(&qd_lru_lock);
1265 } 1281 }
1266 spin_unlock(&sdp->sd_quota_spin); 1282 spin_unlock(&qd_lru_lock);
1267 1283
1268 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); 1284 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1269 1285
@@ -1341,9 +1357,6 @@ int gfs2_quotad(void *data)
1341 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, 1357 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
1342 &quotad_timeo, &tune->gt_quota_quantum); 1358 &quotad_timeo, &tune->gt_quota_quantum);
1343 1359
1344 /* FIXME: This should be turned into a shrinker */
1345 gfs2_quota_scan(sdp);
1346
1347 /* Check for & recover partially truncated inodes */ 1360 /* Check for & recover partially truncated inodes */
1348 quotad_check_trunc_list(sdp); 1361 quotad_check_trunc_list(sdp);
1349 1362
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index cec9032be97d..0fa5fa63d0e8 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -49,4 +49,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
49 return ret; 49 return ret;
50} 50}
51 51
52extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
53
52#endif /* __QUOTA_DOT_H__ */ 54#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index efd09c3d2b26..247e8f7d6b3d 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -13,7 +13,6 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/lm_interface.h>
17#include <linux/kthread.h> 16#include <linux/kthread.h>
18#include <linux/freezer.h> 17#include <linux/freezer.h>
19 18
@@ -427,20 +426,23 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
427} 426}
428 427
429 428
430static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, 429static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
431 unsigned int message) 430 unsigned int message)
432{ 431{
433 if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done) 432 char env_jid[20];
434 return; 433 char env_status[20];
435 434 char *envp[] = { env_jid, env_status, NULL };
436 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 435 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
437 sdp->sd_lockstruct.ls_ops->lm_recovery_done( 436 ls->ls_recover_jid_done = jid;
438 sdp->sd_lockstruct.ls_lockspace, jid, message); 437 ls->ls_recover_jid_status = message;
438 sprintf(env_jid, "JID=%d", jid);
439 sprintf(env_status, "RECOVERY=%s",
440 message == LM_RD_SUCCESS ? "Done" : "Failed");
441 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
439} 442}
440 443
441
442/** 444/**
443 * gfs2_recover_journal - recovery a given journal 445 * gfs2_recover_journal - recover a given journal
444 * @jd: the struct gfs2_jdesc describing the journal 446 * @jd: the struct gfs2_jdesc describing the journal
445 * 447 *
446 * Acquire the journal's lock, check to see if the journal is clean, and 448 * Acquire the journal's lock, check to see if the journal is clean, and
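gfs2_recovery_done() replaces the old lock-module callback with a uevent: two fixed stack buffers plus a NULL terminator form the environment array handed to kobject_uevent_env(). The same construction in plain userspace C, for illustration:

#include <stdio.h>

int main(void)
{
	char env_jid[20];
	char env_status[20];
	char *envp[] = { env_jid, env_status, NULL };
	int jid = 3, success = 1;

	sprintf(env_jid, "JID=%d", jid);
	sprintf(env_status, "RECOVERY=%s", success ? "Done" : "Failed");
	for (char **p = envp; *p; p++)
		puts(*p);	/* prints JID=3, then RECOVERY=Done */
	return 0;
}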
@@ -561,7 +563,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
561 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) 563 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
562 gfs2_glock_dq_uninit(&ji_gh); 564 gfs2_glock_dq_uninit(&ji_gh);
563 565
564 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); 566 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
565 567
566 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) 568 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
567 gfs2_glock_dq_uninit(&j_gh); 569 gfs2_glock_dq_uninit(&j_gh);
@@ -581,7 +583,7 @@ fail_gunlock_j:
581 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); 583 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
582 584
583fail: 585fail:
584 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); 586 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
585 return error; 587 return error;
586} 588}
587 589
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8b01c635d925..f03d024038ea 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -13,8 +13,8 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17#include <linux/prefetch.h> 16#include <linux/prefetch.h>
17#include <linux/blkdev.h>
18 18
19#include "gfs2.h" 19#include "gfs2.h"
20#include "incore.h" 20#include "incore.h"
@@ -132,81 +132,90 @@ static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
132} 132}
133 133
134/** 134/**
135 * gfs2_bit_search
136 * @ptr: Pointer to bitmap data
137 * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
138 * @state: The state we are searching for
139 *
140 * We xor the bitmap data with a pattern which is the bitwise opposite
141 * of what we are looking for; this gives rise to a pattern of ones
142 * wherever there is a match. Since we have two bits per entry, we
143 * take this pattern, shift it down by one place and then AND it with
144 * the original. All the even bit positions (0,2,4, etc) then represent
145 * successful matches, so we mask with 0x55555..... to remove the unwanted
146 * odd bit positions.
147 *
148 * This allows searching of a whole u64 at once (32 blocks) with a
149 * single test (on 64 bit arches).
150 */
151
152static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
153{
154 u64 tmp;
155 static const u64 search[] = {
156 [0] = 0xffffffffffffffffULL,
157 [1] = 0xaaaaaaaaaaaaaaaaULL,
158 [2] = 0x5555555555555555ULL,
159 [3] = 0x0000000000000000ULL,
160 };
161 tmp = le64_to_cpu(*ptr) ^ search[state];
162 tmp &= (tmp >> 1);
163 tmp &= mask;
164 return tmp;
165}
166
167/**
135 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing 168 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
136 * a block in a given allocation state. 169 * a block in a given allocation state.
137 * @buffer: the buffer that holds the bitmaps 170 * @buffer: the buffer that holds the bitmaps
138 * @buflen: the length (in bytes) of the buffer 171 * @len: the length (in bytes) of the buffer
139 * @goal: start search at this block's bit-pair (within @buffer) 172 * @goal: start search at this block's bit-pair (within @buffer)
140 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for. 173 * @state: GFS2_BLKST_XXX the state of the block we're looking for.
141 * 174 *
142 * Scope of @goal and returned block number is only within this bitmap buffer, 175 * Scope of @goal and returned block number is only within this bitmap buffer,
143 * not entire rgrp or filesystem. @buffer will be offset from the actual 176 * not entire rgrp or filesystem. @buffer will be offset from the actual
144 * beginning of a bitmap block buffer, skipping any header structures. 177 * beginning of a bitmap block buffer, skipping any header structures, but
178 * headers are always a multiple of 64 bits long so that the buffer is
179 * always aligned to a 64 bit boundary.
180 *
181 * The size of the buffer is in bytes, but is it assumed that it is
182 * always ok to to read a complete multiple of 64 bits at the end
183 * of the block in case the end is no aligned to a natural boundary.
145 * 184 *
146 * Return: the block number (bitmap buffer scope) that was found 185 * Return: the block number (bitmap buffer scope) that was found
147 */ 186 */
148 187
149static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal, 188static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
150 u8 old_state) 189 u32 goal, u8 state)
151{ 190{
152 const u8 *byte, *start, *end; 191 u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
153 int bit, startbit; 192 const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
154 u32 g1, g2, misaligned; 193 const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
155 unsigned long *plong; 194 u64 tmp;
156 unsigned long lskipval; 195 u64 mask = 0x5555555555555555ULL;
157 196 u32 bit;
158 lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55; 197
159 g1 = (goal / GFS2_NBBY); 198 BUG_ON(state > 3);
160 start = buffer + g1; 199
161 byte = start; 200 /* Mask off bits we don't care about at the start of the search */
162 end = buffer + buflen; 201 mask <<= spoint;
163 g2 = ALIGN(g1, sizeof(unsigned long)); 202 tmp = gfs2_bit_search(ptr, mask, state);
164 plong = (unsigned long *)(buffer + g2); 203 ptr++;
165 startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; 204 while(tmp == 0 && ptr < end) {
166 misaligned = g2 - g1; 205 tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
167 if (!misaligned) 206 ptr++;
168 goto ulong_aligned;
169/* parse the bitmap a byte at a time */
170misaligned:
171 while (byte < end) {
172 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
173 return goal +
174 (((byte - start) * GFS2_NBBY) +
175 ((bit - startbit) >> 1));
176 }
177 bit += GFS2_BIT_SIZE;
178 if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
179 bit = 0;
180 byte++;
181 misaligned--;
182 if (!misaligned) {
183 plong = (unsigned long *)byte;
184 goto ulong_aligned;
185 }
186 }
187 }
188 return BFITNOENT;
189
190/* parse the bitmap a unsigned long at a time */
191ulong_aligned:
192 /* Stop at "end - 1" or else prefetch can go past the end and segfault.
193 We could "if" it but we'd lose some of the performance gained.
194 This way will only slow down searching the very last 4/8 bytes
195 depending on architecture. I've experimented with several ways
196 of writing this section such as using an else before the goto
197 but this one seems to be the fastest. */
198 while ((unsigned char *)plong < end - sizeof(unsigned long)) {
199 prefetch(plong + 1);
200 if (((*plong) & LBITMASK) != lskipval)
201 break;
202 plong++;
203 }
204 if ((unsigned char *)plong < end) {
205 byte = (const u8 *)plong;
206 misaligned += sizeof(unsigned long) - 1;
207 goto misaligned;
208 } 207 }
209 return BFITNOENT; 208 /* Mask off any bits which are more than len bytes from the start */
209 if (ptr == end && (len & (sizeof(u64) - 1)))
210 tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
211 /* Didn't find anything, so return */
212 if (tmp == 0)
213 return BFITNOENT;
214 ptr--;
215 bit = fls64(tmp);
216 bit--; /* fls64 always adds one to the bit count */
217 bit /= 2; /* two bits per entry in the bitmap */
218 return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
210} 219}
211 220
212/** 221/**
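The XOR/shift/AND trick documented above tests 32 two-bit entries per u64 in a handful of instructions. A standalone userspace version of the search step, with one match decoded from the result mask (gfs2_bitfit() decodes with fls64(); __builtin_ctzll here simply reports the lowest match):

#include <stdint.h>
#include <stdio.h>

static uint64_t bit_search(uint64_t bits, uint64_t mask, unsigned int state)
{
	static const uint64_t search[] = {
		0xffffffffffffffffULL,	/* state 0 (00): 00 ^ 11 == 11 */
		0xaaaaaaaaaaaaaaaaULL,	/* state 1 (01) */
		0x5555555555555555ULL,	/* state 2 (10) */
		0x0000000000000000ULL,	/* state 3 (11) */
	};
	uint64_t tmp = bits ^ search[state];	/* matching pairs become 11 */
	tmp &= (tmp >> 1);			/* fold each pair onto its low bit */
	return tmp & mask;			/* keep the even bit positions */
}

int main(void)
{
	/* 32 entries per u64; set entry 3 to state 2 (binary 10), rest 0 */
	uint64_t bits = (uint64_t)0x2 << (3 * 2);
	uint64_t hit = bit_search(bits, 0x5555555555555555ULL, 2);

	printf("first match at entry %d\n", __builtin_ctzll(hit) / 2);
	return 0;	/* prints: first match at entry 3 */
}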
@@ -831,6 +840,58 @@ void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
831 spin_unlock(&sdp->sd_rindex_spin); 840 spin_unlock(&sdp->sd_rindex_spin);
832} 841}
833 842
843static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
844 const struct gfs2_bitmap *bi)
845{
846 struct super_block *sb = sdp->sd_vfs;
847 struct block_device *bdev = sb->s_bdev;
848 const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
849 bdev_hardsect_size(sb->s_bdev);
850 u64 blk;
851 sector_t start = 0;
852 sector_t nr_sects = 0;
853 int rv;
854 unsigned int x;
855
856 for (x = 0; x < bi->bi_len; x++) {
857 const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x;
858 const u8 *clone = bi->bi_clone + bi->bi_offset + x;
859 u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
860 diff &= 0x55;
861 if (diff == 0)
862 continue;
863 blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
864 blk *= sects_per_blk; /* convert to sectors */
865 while(diff) {
866 if (diff & 1) {
867 if (nr_sects == 0)
868 goto start_new_extent;
869 if ((start + nr_sects) != blk) {
870 rv = blkdev_issue_discard(bdev, start,
871 nr_sects, GFP_NOFS);
872 if (rv)
873 goto fail;
874 nr_sects = 0;
875start_new_extent:
876 start = blk;
877 }
878 nr_sects += sects_per_blk;
879 }
880 diff >>= 2;
881 blk += sects_per_blk;
882 }
883 }
884 if (nr_sects) {
885 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
886 if (rv)
887 goto fail;
888 }
889 return;
890fail:
891 fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv);
892 sdp->sd_args.ar_discard = 0;
893}
894
834void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) 895void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
835{ 896{
836 struct gfs2_sbd *sdp = rgd->rd_sbd; 897 struct gfs2_sbd *sdp = rgd->rd_sbd;
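The diff computation in gfs2_rgrp_send_discards() flags, per byte, every two-bit entry that is free in the real bitmap (*orig) but still shown allocated in the clone, i.e. blocks freed since the clone was taken and therefore candidates for discard. A small userspace check of that expression:

#include <stdio.h>

int main(void)
{
	unsigned char orig  = 0x10;	/* entries 3..0: 00 01 00 00 - entry 2 in use */
	unsigned char clone = 0x14;	/* entries 3..0: 00 01 01 00 - entries 2 and 1 in use */
	unsigned char diff;

	diff = ~(orig | (orig >> 1)) & (clone | (clone >> 1));
	diff &= 0x55;
	for (int n = 0; n < 4; n++)
		if (diff & (1 << (2 * n)))
			printf("entry %d newly freed\n", n);
	return 0;	/* prints: entry 1 newly freed */
}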
@@ -841,6 +902,8 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
841 struct gfs2_bitmap *bi = rgd->rd_bits + x; 902 struct gfs2_bitmap *bi = rgd->rd_bits + x;
842 if (!bi->bi_clone) 903 if (!bi->bi_clone)
843 continue; 904 continue;
905 if (sdp->sd_args.ar_discard)
906 gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
844 memcpy(bi->bi_clone + bi->bi_offset, 907 memcpy(bi->bi_clone + bi->bi_offset,
845 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); 908 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
846 } 909 }
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 141b781f2fcc..601913e0a482 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -15,7 +15,6 @@
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h> 17#include <linux/bio.h>
18#include <linux/lm_interface.h>
19 18
20#include "gfs2.h" 19#include "gfs2.h"
21#include "incore.h" 20#include "incore.h"
@@ -339,7 +338,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
339 struct gfs2_holder *t_gh) 338 struct gfs2_holder *t_gh)
340{ 339{
341 struct gfs2_inode *ip; 340 struct gfs2_inode *ip;
342 struct gfs2_holder ji_gh;
343 struct gfs2_jdesc *jd; 341 struct gfs2_jdesc *jd;
344 struct lfcc *lfcc; 342 struct lfcc *lfcc;
345 LIST_HEAD(list); 343 LIST_HEAD(list);
@@ -387,7 +385,6 @@ out:
387 gfs2_glock_dq_uninit(&lfcc->gh); 385 gfs2_glock_dq_uninit(&lfcc->gh);
388 kfree(lfcc); 386 kfree(lfcc);
389 } 387 }
390 gfs2_glock_dq_uninit(&ji_gh);
391 return error; 388 return error;
392} 389}
393 390
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index f6b8b00ad881..b56413e3e40d 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -14,7 +14,7 @@
14#include <linux/dcache.h> 14#include <linux/dcache.h>
15#include "incore.h" 15#include "incore.h"
16 16
17void gfs2_lm_unmount(struct gfs2_sbd *sdp); 17extern void gfs2_lm_unmount(struct gfs2_sbd *sdp);
18 18
19static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) 19static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
20{ 20{
@@ -27,27 +27,29 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
27 27
28void gfs2_jindex_free(struct gfs2_sbd *sdp); 28void gfs2_jindex_free(struct gfs2_sbd *sdp);
29 29
30struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); 30extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
31int gfs2_jdesc_check(struct gfs2_jdesc *jd);
32 31
33int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, 32extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
34 struct gfs2_inode **ipp); 33extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
35 34
36int gfs2_make_fs_rw(struct gfs2_sbd *sdp); 35extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
36 struct gfs2_inode **ipp);
37 37
38int gfs2_statfs_init(struct gfs2_sbd *sdp); 38extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
39void gfs2_statfs_change(struct gfs2_sbd *sdp,
40 s64 total, s64 free, s64 dinodes);
41int gfs2_statfs_sync(struct gfs2_sbd *sdp);
42 39
43int gfs2_freeze_fs(struct gfs2_sbd *sdp); 40extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
44void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); 41extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
42 s64 dinodes);
43extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
44
45extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
46extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
45 47
46extern struct file_system_type gfs2_fs_type; 48extern struct file_system_type gfs2_fs_type;
47extern struct file_system_type gfs2meta_fs_type; 49extern struct file_system_type gfs2meta_fs_type;
48extern const struct export_operations gfs2_export_ops; 50extern const struct export_operations gfs2_export_ops;
49extern const struct super_operations gfs2_super_ops; 51extern const struct super_operations gfs2_super_ops;
50extern struct dentry_operations gfs2_dops; 52extern const struct dentry_operations gfs2_dops;
51 53
52#endif /* __SUPER_DOT_H__ */ 54#endif /* __SUPER_DOT_H__ */
53 55
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 26c1fa777a95..7655f5025fec 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -14,9 +14,8 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/kobject.h> 16#include <linux/kobject.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include <linux/gfs2_ondisk.h>
20 19
21#include "gfs2.h" 20#include "gfs2.h"
22#include "incore.h" 21#include "incore.h"
@@ -25,6 +24,7 @@
25#include "glock.h" 24#include "glock.h"
26#include "quota.h" 25#include "quota.h"
27#include "util.h" 26#include "util.h"
27#include "glops.h"
28 28
29static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) 29static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
30{ 30{
@@ -37,6 +37,30 @@ static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
37 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); 37 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
38} 38}
39 39
40static int gfs2_uuid_valid(const u8 *uuid)
41{
42 int i;
43
44 for (i = 0; i < 16; i++) {
45 if (uuid[i])
46 return 1;
47 }
48 return 0;
49}
50
51static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
52{
53 const u8 *uuid = sdp->sd_sb.sb_uuid;
54 buf[0] = '\0';
55 if (!gfs2_uuid_valid(uuid))
56 return 0;
57 return snprintf(buf, PAGE_SIZE, "%02X%02X%02X%02X-%02X%02X-"
58 "%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
59 uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5],
60 uuid[6], uuid[7], uuid[8], uuid[9], uuid[10], uuid[11],
61 uuid[12], uuid[13], uuid[14], uuid[15]);
62}
63
40static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) 64static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
41{ 65{
42 unsigned int count; 66 unsigned int count;
@@ -148,6 +172,46 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
148 return len; 172 return len;
149} 173}
150 174
175static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
176{
177 struct gfs2_glock *gl;
178 const struct gfs2_glock_operations *glops;
179 unsigned int glmode;
180 unsigned int gltype;
181 unsigned long long glnum;
182 char mode[16];
183 int rv;
184
185 if (!capable(CAP_SYS_ADMIN))
186 return -EACCES;
187
188 rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum,
189 mode);
190 if (rv != 3)
191 return -EINVAL;
192
193 if (strcmp(mode, "EX") == 0)
194 glmode = LM_ST_UNLOCKED;
195 else if ((strcmp(mode, "CW") == 0) || (strcmp(mode, "DF") == 0))
196 glmode = LM_ST_DEFERRED;
197 else if ((strcmp(mode, "PR") == 0) || (strcmp(mode, "SH") == 0))
198 glmode = LM_ST_SHARED;
199 else
200 return -EINVAL;
201
202 if (gltype > LM_TYPE_JOURNAL)
203 return -EINVAL;
204 glops = gfs2_glops_list[gltype];
205 if (glops == NULL)
206 return -EINVAL;
207 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
208 if (rv)
209 return rv;
210 gfs2_glock_cb(gl, glmode);
211 gfs2_glock_put(gl);
212 return len;
213}
214
151struct gfs2_attr { 215struct gfs2_attr {
152 struct attribute attr; 216 struct attribute attr;
153 ssize_t (*show)(struct gfs2_sbd *, char *); 217 ssize_t (*show)(struct gfs2_sbd *, char *);
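demote_rq gives userspace (and test harnesses) a way to force a glock demotion. A hypothetical invocation, assuming the kobject is registered under /sys/fs/gfs2/<locktable>/ and using glock type 2 (inode) with a made-up glock number:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* Path and glock number are illustrative only. */
	static const char req[] = "2:8388608 EX";
	int fd = open("/sys/fs/gfs2/mycluster.myfs/demote_rq", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, req, sizeof(req) - 1);
	close(fd);
	return 0;
}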
@@ -159,22 +223,26 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
159 223
160GFS2_ATTR(id, 0444, id_show, NULL); 224GFS2_ATTR(id, 0444, id_show, NULL);
161GFS2_ATTR(fsname, 0444, fsname_show, NULL); 225GFS2_ATTR(fsname, 0444, fsname_show, NULL);
226GFS2_ATTR(uuid, 0444, uuid_show, NULL);
162GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); 227GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
163GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); 228GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
164GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); 229GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
165GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); 230GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
166GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store); 231GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
167GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); 232GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);
233GFS2_ATTR(demote_rq, 0200, NULL, demote_rq_store);
168 234
169static struct attribute *gfs2_attrs[] = { 235static struct attribute *gfs2_attrs[] = {
170 &gfs2_attr_id.attr, 236 &gfs2_attr_id.attr,
171 &gfs2_attr_fsname.attr, 237 &gfs2_attr_fsname.attr,
238 &gfs2_attr_uuid.attr,
172 &gfs2_attr_freeze.attr, 239 &gfs2_attr_freeze.attr,
173 &gfs2_attr_withdraw.attr, 240 &gfs2_attr_withdraw.attr,
174 &gfs2_attr_statfs_sync.attr, 241 &gfs2_attr_statfs_sync.attr,
175 &gfs2_attr_quota_sync.attr, 242 &gfs2_attr_quota_sync.attr,
176 &gfs2_attr_quota_refresh_user.attr, 243 &gfs2_attr_quota_refresh_user.attr,
177 &gfs2_attr_quota_refresh_group.attr, 244 &gfs2_attr_quota_refresh_group.attr,
245 &gfs2_attr_demote_rq.attr,
178 NULL, 246 NULL,
179}; 247};
180 248
@@ -224,14 +292,145 @@ static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
224 292
225LOCKSTRUCT_ATTR(jid, "%u\n"); 293LOCKSTRUCT_ATTR(jid, "%u\n");
226LOCKSTRUCT_ATTR(first, "%u\n"); 294LOCKSTRUCT_ATTR(first, "%u\n");
227LOCKSTRUCT_ATTR(lvb_size, "%u\n");
228LOCKSTRUCT_ATTR(flags, "%d\n");
229 295
230static struct attribute *lockstruct_attrs[] = { 296static struct attribute *lockstruct_attrs[] = {
231 &lockstruct_attr_jid.attr, 297 &lockstruct_attr_jid.attr,
232 &lockstruct_attr_first.attr, 298 &lockstruct_attr_first.attr,
233 &lockstruct_attr_lvb_size.attr, 299 NULL,
234 &lockstruct_attr_flags.attr, 300};
301
302/*
303 * lock_module. Originally from lock_dlm
304 */
305
306static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf)
307{
308 const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops;
309 return sprintf(buf, "%s\n", ops->lm_proto_name);
310}
311
312static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
313{
314 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
315 ssize_t ret;
316 int val = 0;
317
318 if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))
319 val = 1;
320 ret = sprintf(buf, "%d\n", val);
321 return ret;
322}
323
324static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
325{
326 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
327 ssize_t ret = len;
328 int val;
329
330 val = simple_strtol(buf, NULL, 0);
331
332 if (val == 1)
333 set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
334 else if (val == 0) {
335 clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
336 smp_mb__after_clear_bit();
337 gfs2_glock_thaw(sdp);
338 } else {
339 ret = -EINVAL;
340 }
341 return ret;
342}
343
344static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
345{
346 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
347 return sprintf(buf, "%u\n", ls->ls_id);
348}
349
350static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
351{
352 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
353 return sprintf(buf, "%d\n", ls->ls_first);
354}
355
356static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
357{
358 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
359 return sprintf(buf, "%d\n", ls->ls_first_done);
360}
361
362static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf)
363{
364 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
365 return sprintf(buf, "%d\n", ls->ls_recover_jid);
366}
367
368static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
369{
370 struct gfs2_jdesc *jd;
371
372 spin_lock(&sdp->sd_jindex_spin);
373 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
374 if (jd->jd_jid != jid)
375 continue;
376 jd->jd_dirty = 1;
377 break;
378 }
379 spin_unlock(&sdp->sd_jindex_spin);
380}
381
382static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
383{
384 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
385 ls->ls_recover_jid = simple_strtol(buf, NULL, 0);
386 gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid);
387 if (sdp->sd_recoverd_process)
388 wake_up_process(sdp->sd_recoverd_process);
389 return len;
390}
391
392static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf)
393{
394 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
395 return sprintf(buf, "%d\n", ls->ls_recover_jid_done);
396}
397
398static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
399{
400 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
401 return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
402}
403
404struct gdlm_attr {
405 struct attribute attr;
406 ssize_t (*show)(struct gfs2_sbd *sdp, char *);
407 ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t);
408};
409
410#define GDLM_ATTR(_name,_mode,_show,_store) \
411static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
412
413GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
414GDLM_ATTR(block, 0644, block_show, block_store);
415GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
416GDLM_ATTR(id, 0444, lkid_show, NULL);
417GDLM_ATTR(first, 0444, lkfirst_show, NULL);
418GDLM_ATTR(first_done, 0444, first_done_show, NULL);
419GDLM_ATTR(recover, 0644, recover_show, recover_store);
420GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
421GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
422
423static struct attribute *lock_module_attrs[] = {
424 &gdlm_attr_proto_name.attr,
425 &gdlm_attr_block.attr,
426 &gdlm_attr_withdraw.attr,
427 &gdlm_attr_id.attr,
428 &lockstruct_attr_jid.attr,
429 &gdlm_attr_first.attr,
430 &gdlm_attr_first_done.attr,
431 &gdlm_attr_recover.attr,
432 &gdlm_attr_recover_done.attr,
433 &gdlm_attr_recover_status.attr,
235 NULL, 434 NULL,
236}; 435};
237 436
@@ -373,7 +572,6 @@ TUNE_ATTR(complain_secs, 0);
373TUNE_ATTR(statfs_slow, 0); 572TUNE_ATTR(statfs_slow, 0);
374TUNE_ATTR(new_files_jdata, 0); 573TUNE_ATTR(new_files_jdata, 0);
375TUNE_ATTR(quota_simul_sync, 1); 574TUNE_ATTR(quota_simul_sync, 1);
376TUNE_ATTR(quota_cache_secs, 1);
377TUNE_ATTR(stall_secs, 1); 575TUNE_ATTR(stall_secs, 1);
378TUNE_ATTR(statfs_quantum, 1); 576TUNE_ATTR(statfs_quantum, 1);
379TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); 577TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
@@ -389,7 +587,6 @@ static struct attribute *tune_attrs[] = {
389 &tune_attr_complain_secs.attr, 587 &tune_attr_complain_secs.attr,
390 &tune_attr_statfs_slow.attr, 588 &tune_attr_statfs_slow.attr,
391 &tune_attr_quota_simul_sync.attr, 589 &tune_attr_quota_simul_sync.attr,
392 &tune_attr_quota_cache_secs.attr,
393 &tune_attr_stall_secs.attr, 590 &tune_attr_stall_secs.attr,
394 &tune_attr_statfs_quantum.attr, 591 &tune_attr_statfs_quantum.attr,
395 &tune_attr_recoverd_secs.attr, 592 &tune_attr_recoverd_secs.attr,
@@ -414,6 +611,11 @@ static struct attribute_group tune_group = {
414 .attrs = tune_attrs, 611 .attrs = tune_attrs,
415}; 612};
416 613
614static struct attribute_group lock_module_group = {
615 .name = "lock_module",
616 .attrs = lock_module_attrs,
617};
618
417int gfs2_sys_fs_add(struct gfs2_sbd *sdp) 619int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
418{ 620{
419 int error; 621 int error;
@@ -436,9 +638,15 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
436 if (error) 638 if (error)
437 goto fail_args; 639 goto fail_args;
438 640
641 error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group);
642 if (error)
643 goto fail_tune;
644
439 kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); 645 kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
440 return 0; 646 return 0;
441 647
648fail_tune:
649 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
442fail_args: 650fail_args:
443 sysfs_remove_group(&sdp->sd_kobj, &args_group); 651 sysfs_remove_group(&sdp->sd_kobj, &args_group);
444fail_lockstruct: 652fail_lockstruct:
@@ -455,15 +663,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
455 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 663 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
456 sysfs_remove_group(&sdp->sd_kobj, &args_group); 664 sysfs_remove_group(&sdp->sd_kobj, &args_group);
457 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 665 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
666 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
458 kobject_put(&sdp->sd_kobj); 667 kobject_put(&sdp->sd_kobj);
459} 668}
460 669
670
461static int gfs2_uevent(struct kset *kset, struct kobject *kobj, 671static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
462 struct kobj_uevent_env *env) 672 struct kobj_uevent_env *env)
463{ 673{
464 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); 674 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
675 const u8 *uuid = sdp->sd_sb.sb_uuid;
676
465 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); 677 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
466 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 678 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
679 if (gfs2_uuid_valid(uuid)) {
680 add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
681 "%02X%02X-%02X%02X%02X%02X%02X%02X",
682 uuid[0], uuid[1], uuid[2], uuid[3], uuid[4],
683 uuid[5], uuid[6], uuid[7], uuid[8], uuid[9],
684 uuid[10], uuid[11], uuid[12], uuid[13],
685 uuid[14], uuid[15]);
686 }
467 return 0; 687 return 0;
468} 688}
469 689
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index f677b8a83f0c..053752d4b27f 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -12,9 +12,8 @@
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
17#include <linux/lm_interface.h> 16#include <linux/gfs2_ondisk.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
20#include "incore.h" 19#include "incore.h"
@@ -88,9 +87,11 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
88 87
89 if (!tr->tr_touched) { 88 if (!tr->tr_touched) {
90 gfs2_log_release(sdp, tr->tr_reserved); 89 gfs2_log_release(sdp, tr->tr_reserved);
91 gfs2_glock_dq(&tr->tr_t_gh); 90 if (tr->tr_t_gh.gh_gl) {
92 gfs2_holder_uninit(&tr->tr_t_gh); 91 gfs2_glock_dq(&tr->tr_t_gh);
93 kfree(tr); 92 gfs2_holder_uninit(&tr->tr_t_gh);
93 kfree(tr);
94 }
94 return; 95 return;
95 } 96 }
96 97
@@ -106,9 +107,11 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
106 } 107 }
107 108
108 gfs2_log_commit(sdp, tr); 109 gfs2_log_commit(sdp, tr);
109 gfs2_glock_dq(&tr->tr_t_gh); 110 if (tr->tr_t_gh.gh_gl) {
110 gfs2_holder_uninit(&tr->tr_t_gh); 111 gfs2_glock_dq(&tr->tr_t_gh);
111 kfree(tr); 112 gfs2_holder_uninit(&tr->tr_t_gh);
113 kfree(tr);
114 }
112 115
113 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) 116 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
114 gfs2_log_flush(sdp, NULL); 117 gfs2_log_flush(sdp, NULL);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 374f50e95496..9d12b1118ba0 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -13,7 +13,6 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/crc32.h> 14#include <linux/crc32.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17#include <asm/uaccess.h> 16#include <asm/uaccess.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
@@ -35,6 +34,8 @@ void gfs2_assert_i(struct gfs2_sbd *sdp)
35 34
36int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) 35int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
37{ 36{
37 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
38 const struct lm_lockops *lm = ls->ls_ops;
38 va_list args; 39 va_list args;
39 40
40 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 41 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
@@ -47,8 +48,12 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
47 fs_err(sdp, "about to withdraw this file system\n"); 48 fs_err(sdp, "about to withdraw this file system\n");
48 BUG_ON(sdp->sd_args.ar_debug); 49 BUG_ON(sdp->sd_args.ar_debug);
49 50
50 fs_err(sdp, "telling LM to withdraw\n"); 51 kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
51 gfs2_withdraw_lockproto(&sdp->sd_lockstruct); 52
53 if (lm->lm_unmount) {
54 fs_err(sdp, "telling LM to unmount\n");
55 lm->lm_unmount(sdp);
56 }
52 fs_err(sdp, "withdrawn\n"); 57 fs_err(sdp, "withdrawn\n");
53 dump_stack(); 58 dump_stack();
54 59
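The rewritten withdraw path announces KOBJ_OFFLINE first and then calls the lock module's unmount hook only when the ops table provides one, rather than the old unconditional gfs2_withdraw_lockproto(). A minimal sketch of dispatching through an optional function pointer (names hypothetical):

#include <stdio.h>

/* Hypothetical miniature of struct lm_lockops: hooks may be NULL. */
struct lockops {
    void (*unmount)(void *sd);
};

static void dlm_unmount(void *sd) { (void)sd; printf("lock module unmounting\n"); }

static void withdraw(const struct lockops *lm, void *sd)
{
    printf("uevent: OFFLINE\n");      /* announce the event first */
    if (lm->unmount) {                /* call the hook only if present */
        printf("telling LM to unmount\n");
        lm->unmount(sd);
    }
    printf("withdrawn\n");
}

int main(void)
{
    struct lockops with = { dlm_unmount }, without = { NULL };

    withdraw(&with, NULL);
    withdraw(&without, NULL);
    return 0;
}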
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 9955232fdf8c..052387e11671 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -213,7 +213,7 @@ extern void hfs_mdb_put(struct super_block *);
213extern int hfs_part_find(struct super_block *, sector_t *, sector_t *); 213extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
214 214
215/* string.c */ 215/* string.c */
216extern struct dentry_operations hfs_dentry_operations; 216extern const struct dentry_operations hfs_dentry_operations;
217 217
218extern int hfs_hash_dentry(struct dentry *, struct qstr *); 218extern int hfs_hash_dentry(struct dentry *, struct qstr *);
219extern int hfs_strcmp(const unsigned char *, unsigned int, 219extern int hfs_strcmp(const unsigned char *, unsigned int,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index c8b5acf4b0b7..a36bb749926d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -82,6 +82,7 @@ static void hfs_put_super(struct super_block *sb)
82static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) 82static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
83{ 83{
84 struct super_block *sb = dentry->d_sb; 84 struct super_block *sb = dentry->d_sb;
85 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
85 86
86 buf->f_type = HFS_SUPER_MAGIC; 87 buf->f_type = HFS_SUPER_MAGIC;
87 buf->f_bsize = sb->s_blocksize; 88 buf->f_bsize = sb->s_blocksize;
@@ -90,6 +91,8 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
90 buf->f_bavail = buf->f_bfree; 91 buf->f_bavail = buf->f_bfree;
91 buf->f_files = HFS_SB(sb)->fs_ablocks; 92 buf->f_files = HFS_SB(sb)->fs_ablocks;
92 buf->f_ffree = HFS_SB(sb)->free_ablocks; 93 buf->f_ffree = HFS_SB(sb)->free_ablocks;
94 buf->f_fsid.val[0] = (u32)id;
95 buf->f_fsid.val[1] = (u32)(id >> 32);
93 buf->f_namelen = HFS_NAMELEN; 96 buf->f_namelen = HFS_NAMELEN;
94 97
95 return 0; 98 return 0;
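This statfs() change, repeated below for hfsplus, hpfs and isofs, derives f_fsid from the backing block device: huge_encode_dev() yields a 64-bit id whose low half fills val[0] and high half val[1]. A userspace sketch of the split; the encoder here is a stand-in (the real huge_encode_dev() packs major/minor differently — only the u64-to-two-u32 split is the point):

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the kernel's huge_encode_dev(). */
static uint64_t encode_dev(uint32_t major, uint32_t minor)
{
    return ((uint64_t)major << 32) | minor;
}

int main(void)
{
    uint64_t id = encode_dev(8, 1);   /* e.g. a /dev/sda1-like device */
    uint32_t fsid[2];

    fsid[0] = (uint32_t)id;           /* low 32 bits  -> f_fsid.val[0] */
    fsid[1] = (uint32_t)(id >> 32);   /* high 32 bits -> f_fsid.val[1] */
    printf("f_fsid = { %#x, %#x }\n", fsid[0], fsid[1]);
    return 0;
}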
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 5bf89ec01cd4..7478f5c219aa 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -31,7 +31,7 @@ static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
31 return 1; 31 return 1;
32} 32}
33 33
34struct dentry_operations hfs_dentry_operations = 34const struct dentry_operations hfs_dentry_operations =
35{ 35{
36 .d_revalidate = hfs_revalidate_dentry, 36 .d_revalidate = hfs_revalidate_dentry,
37 .d_hash = hfs_hash_dentry, 37 .d_hash = hfs_hash_dentry,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index f027a905225f..5c10d803d9df 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -327,7 +327,7 @@ void hfsplus_file_truncate(struct inode *);
327/* inode.c */ 327/* inode.c */
328extern const struct address_space_operations hfsplus_aops; 328extern const struct address_space_operations hfsplus_aops;
329extern const struct address_space_operations hfsplus_btree_aops; 329extern const struct address_space_operations hfsplus_btree_aops;
330extern struct dentry_operations hfsplus_dentry_operations; 330extern const struct dentry_operations hfsplus_dentry_operations;
331 331
332void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); 332void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
333void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); 333void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index f105ee9e1cc4..1bcf597c0562 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -137,7 +137,7 @@ const struct address_space_operations hfsplus_aops = {
137 .writepages = hfsplus_writepages, 137 .writepages = hfsplus_writepages,
138}; 138};
139 139
140struct dentry_operations hfsplus_dentry_operations = { 140const struct dentry_operations hfsplus_dentry_operations = {
141 .d_hash = hfsplus_hash_dentry, 141 .d_hash = hfsplus_hash_dentry,
142 .d_compare = hfsplus_compare_dentry, 142 .d_compare = hfsplus_compare_dentry,
143}; 143};
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bab7f8d1bdfa..3fcbb0e1f6fc 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
48 48
49 opts->creator = HFSPLUS_DEF_CR_TYPE; 49 opts->creator = HFSPLUS_DEF_CR_TYPE;
50 opts->type = HFSPLUS_DEF_CR_TYPE; 50 opts->type = HFSPLUS_DEF_CR_TYPE;
51 opts->umask = current->fs->umask; 51 opts->umask = current_umask();
52 opts->uid = current_uid(); 52 opts->uid = current_uid();
53 opts->gid = current_gid(); 53 opts->gid = current_gid();
54 opts->part = -1; 54 opts->part = -1;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index eb74531a0a8e..f2a64020f42e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -223,6 +223,7 @@ static void hfsplus_put_super(struct super_block *sb)
223static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) 223static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
224{ 224{
225 struct super_block *sb = dentry->d_sb; 225 struct super_block *sb = dentry->d_sb;
226 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
226 227
227 buf->f_type = HFSPLUS_SUPER_MAGIC; 228 buf->f_type = HFSPLUS_SUPER_MAGIC;
228 buf->f_bsize = sb->s_blocksize; 229 buf->f_bsize = sb->s_blocksize;
@@ -231,6 +232,8 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
231 buf->f_bavail = buf->f_bfree; 232 buf->f_bavail = buf->f_bfree;
232 buf->f_files = 0xFFFFFFFF; 233 buf->f_files = 0xFFFFFFFF;
233 buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; 234 buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
235 buf->f_fsid.val[0] = (u32)id;
236 buf->f_fsid.val[1] = (u32)(id >> 32);
234 buf->f_namelen = HFSPLUS_MAX_STRLEN; 237 buf->f_namelen = HFSPLUS_MAX_STRLEN;
235 238
236 return 0; 239 return 0;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 5c538e0ec14b..fe02ad4740e7 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -31,12 +31,12 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
31 31
32#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) 32#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
33 33
34int hostfs_d_delete(struct dentry *dentry) 34static int hostfs_d_delete(struct dentry *dentry)
35{ 35{
36 return 1; 36 return 1;
37} 37}
38 38
39struct dentry_operations hostfs_dentry_ops = { 39static const struct dentry_operations hostfs_dentry_ops = {
40 .d_delete = hostfs_d_delete, 40 .d_delete = hostfs_d_delete,
41}; 41};
42 42
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 08319126b2af..940d6d150bee 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -49,7 +49,7 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst
49 return 0; 49 return 0;
50} 50}
51 51
52static struct dentry_operations hpfs_dentry_operations = { 52static const struct dentry_operations hpfs_dentry_operations = {
53 .d_hash = hpfs_hash_dentry, 53 .d_hash = hpfs_hash_dentry,
54 .d_compare = hpfs_compare_dentry, 54 .d_compare = hpfs_compare_dentry,
55}; 55};
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 0d049b8919c4..fecf402d7b8a 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -136,6 +136,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
136{ 136{
137 struct super_block *s = dentry->d_sb; 137 struct super_block *s = dentry->d_sb;
138 struct hpfs_sb_info *sbi = hpfs_sb(s); 138 struct hpfs_sb_info *sbi = hpfs_sb(s);
139 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
139 lock_kernel(); 140 lock_kernel();
140 141
141 /*if (sbi->sb_n_free == -1) {*/ 142 /*if (sbi->sb_n_free == -1) {*/
@@ -149,6 +150,8 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
149 buf->f_bavail = sbi->sb_n_free; 150 buf->f_bavail = sbi->sb_n_free;
150 buf->f_files = sbi->sb_dirband_size / 4; 151 buf->f_files = sbi->sb_dirband_size / 4;
151 buf->f_ffree = sbi->sb_n_free_dnodes; 152 buf->f_ffree = sbi->sb_n_free_dnodes;
153 buf->f_fsid.val[0] = (u32)id;
154 buf->f_fsid.val[1] = (u32)(id >> 32);
152 buf->f_namelen = 254; 155 buf->f_namelen = 254;
153 156
154 unlock_kernel(); 157 unlock_kernel();
@@ -477,7 +480,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
477 480
478 uid = current_uid(); 481 uid = current_uid();
479 gid = current_gid(); 482 gid = current_gid();
480 umask = current->fs->umask; 483 umask = current_umask();
481 lowercase = 0; 484 lowercase = 0;
482 conv = CONV_BINARY; 485 conv = CONV_BINARY;
483 eas = 2; 486 eas = 2;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index b278f7f52024..a5089a6dd67a 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -280,7 +280,12 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
280 "errno = %d\n", err); 280 "errno = %d\n", err);
281 return err; 281 return err;
282 } 282 }
283 count = hppfs_read_file(hppfs->host_fd, buf, count); 283 err = hppfs_read_file(hppfs->host_fd, buf, count);
284 if (err < 0) {
285 printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
286 return err;
287 }
288 count = err;
284 if (count > 0) 289 if (count > 0)
285 *ppos += count; 290 *ppos += count;
286 } 291 }
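The fix stops assigning hppfs_read_file()'s return straight into the unsigned count, where a negative errno would masquerade as a huge byte count; the result is now checked for failure first. A standalone sketch of the pattern, with a hypothetical helper:

#include <stdio.h>

/* Hypothetical helper: returns bytes read, or a negative errno. */
static long read_file(char *buf, long want) { (void)buf; (void)want; return -5; }

static long do_read(char *buf, long want, long *ppos)
{
    long err = read_file(buf, want);

    /* Check for failure before using the result as a byte count; the old
     * code stored it straight into an unsigned count, so -EIO became a
     * huge "bytes read" value. */
    if (err < 0) {
        fprintf(stderr, "read failed: %ld\n", err);
        return err;
    }
    if (err > 0)
        *ppos += err;
    return err;
}

int main(void)
{
    char buf[16];
    long pos = 0;
    long ret = do_read(buf, sizeof(buf), &pos);

    printf("ret=%ld pos=%ld\n", ret, pos);
    return 0;
}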
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9b800d97a687..23a3c76711e0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -943,14 +943,13 @@ static struct vfsmount *hugetlbfs_vfsmount;
943 943
944static int can_do_hugetlb_shm(void) 944static int can_do_hugetlb_shm(void)
945{ 945{
946 return likely(capable(CAP_IPC_LOCK) || 946 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
947 in_group_p(sysctl_hugetlb_shm_group) ||
948 can_do_mlock());
949} 947}
950 948
951struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) 949struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
952{ 950{
953 int error = -ENOMEM; 951 int error = -ENOMEM;
952 int unlock_shm = 0;
954 struct file *file; 953 struct file *file;
955 struct inode *inode; 954 struct inode *inode;
956 struct dentry *dentry, *root; 955 struct dentry *dentry, *root;
@@ -960,11 +959,14 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
960 if (!hugetlbfs_vfsmount) 959 if (!hugetlbfs_vfsmount)
961 return ERR_PTR(-ENOENT); 960 return ERR_PTR(-ENOENT);
962 961
963 if (!can_do_hugetlb_shm()) 962 if (!can_do_hugetlb_shm()) {
964 return ERR_PTR(-EPERM); 963 if (user_shm_lock(size, user)) {
965 964 unlock_shm = 1;
966 if (!user_shm_lock(size, user)) 965 WARN_ONCE(1,
967 return ERR_PTR(-ENOMEM); 966 "Using mlock ulimits for SHM_HUGETLB deprecated\n");
967 } else
968 return ERR_PTR(-EPERM);
969 }
968 970
969 root = hugetlbfs_vfsmount->mnt_root; 971 root = hugetlbfs_vfsmount->mnt_root;
970 quick_string.name = name; 972 quick_string.name = name;
@@ -1004,7 +1006,8 @@ out_inode:
1004out_dentry: 1006out_dentry:
1005 dput(dentry); 1007 dput(dentry);
1006out_shm_unlock: 1008out_shm_unlock:
1007 user_shm_unlock(size, user); 1009 if (unlock_shm)
1010 user_shm_unlock(size, user);
1008 return ERR_PTR(error); 1011 return ERR_PTR(error);
1009} 1012}
1010 1013
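hugetlb_file_setup() now falls back to charging the mlock ulimit when the caller lacks the capability or group, and records in unlock_shm whether it did so, so the error path undoes only what was actually taken. A toy sketch of acquire-conditionally, release-conditionally (names hypothetical):

#include <stdio.h>

static int can_do_shm;                     /* stand-in for can_do_hugetlb_shm() */
static int shm_locked;

static int user_shm_lock(long size)    { (void)size; shm_locked = 1; return 1; }
static void user_shm_unlock(long size) { (void)size; shm_locked = 0; }

static int file_setup(long size)
{
    int unlock_shm = 0;

    if (!can_do_shm) {
        if (user_shm_lock(size))
            unlock_shm = 1;                /* remember we charged the ulimit */
        else
            return -1;                     /* -EPERM */
    }

    /* ... pretend the dentry/inode allocation below failed ... */
    if (unlock_shm)
        user_shm_unlock(size);             /* undo only what we did */
    return -12;                            /* -ENOMEM */
}

int main(void)
{
    int ret = file_setup(4096);

    printf("ret=%d locked=%d\n", ret, shm_locked);
    return 0;
}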
diff --git a/fs/inode.c b/fs/inode.c
index 913ab2d9a5d1..d06d6d268de9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -17,6 +17,7 @@
17#include <linux/hash.h> 17#include <linux/hash.h>
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/ima.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
21#include <linux/cdev.h> 22#include <linux/cdev.h>
22#include <linux/bootmem.h> 23#include <linux/bootmem.h>
@@ -147,13 +148,13 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
147 inode->i_cdev = NULL; 148 inode->i_cdev = NULL;
148 inode->i_rdev = 0; 149 inode->i_rdev = 0;
149 inode->dirtied_when = 0; 150 inode->dirtied_when = 0;
150 if (security_inode_alloc(inode)) { 151
151 if (inode->i_sb->s_op->destroy_inode) 152 if (security_inode_alloc(inode))
152 inode->i_sb->s_op->destroy_inode(inode); 153 goto out_free_inode;
153 else 154
154 kmem_cache_free(inode_cachep, (inode)); 155 /* allocate and initialize an i_integrity */
155 return NULL; 156 if (ima_inode_alloc(inode))
156 } 157 goto out_free_security;
157 158
158 spin_lock_init(&inode->i_lock); 159 spin_lock_init(&inode->i_lock);
159 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); 160 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -189,6 +190,15 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
189 inode->i_mapping = mapping; 190 inode->i_mapping = mapping;
190 191
191 return inode; 192 return inode;
193
194out_free_security:
195 security_inode_free(inode);
196out_free_inode:
197 if (inode->i_sb->s_op->destroy_inode)
198 inode->i_sb->s_op->destroy_inode(inode);
199 else
200 kmem_cache_free(inode_cachep, (inode));
201 return NULL;
192} 202}
193EXPORT_SYMBOL(inode_init_always); 203EXPORT_SYMBOL(inode_init_always);
194 204
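With the new ima_inode_alloc() call there are two allocations to undo, so inode_init_always() switches from open-coded cleanup to the kernel's usual goto ladder, where each label releases exactly what was acquired before the failing step. A generic standalone sketch of that ladder:

#include <stdio.h>
#include <stdlib.h>

/* Toggle these to exercise the two failure paths. */
static int step_a_ok = 1, step_b_ok = 0;

static void *init_object(void)
{
    void *obj, *a = NULL, *b = NULL;

    obj = malloc(64);
    if (!obj)
        return NULL;

    if (!step_a_ok || !(a = malloc(16)))   /* like security_inode_alloc() */
        goto out_free_obj;
    if (!step_b_ok || !(b = malloc(16)))   /* like ima_inode_alloc() */
        goto out_free_a;

    free(b);                /* a real caller would keep these; freed here */
    free(a);                /* only to keep the sketch leak-free */
    return obj;

out_free_a:                 /* unwind in reverse acquisition order */
    free(a);
out_free_obj:
    free(obj);
    return NULL;
}

int main(void)
{
    void *obj = init_object();

    printf("init_object -> %s\n", obj ? "ok" : "failed");
    free(obj);
    return 0;
}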
@@ -284,7 +294,7 @@ void clear_inode(struct inode *inode)
284 BUG_ON(!(inode->i_state & I_FREEING)); 294 BUG_ON(!(inode->i_state & I_FREEING));
285 BUG_ON(inode->i_state & I_CLEAR); 295 BUG_ON(inode->i_state & I_CLEAR);
286 inode_sync_wait(inode); 296 inode_sync_wait(inode);
287 DQUOT_DROP(inode); 297 vfs_dq_drop(inode);
288 if (inode->i_sb->s_op->clear_inode) 298 if (inode->i_sb->s_op->clear_inode)
289 inode->i_sb->s_op->clear_inode(inode); 299 inode->i_sb->s_op->clear_inode(inode);
290 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 300 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -356,9 +366,12 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
356 if (tmp == head) 366 if (tmp == head)
357 break; 367 break;
358 inode = list_entry(tmp, struct inode, i_sb_list); 368 inode = list_entry(tmp, struct inode, i_sb_list);
369 if (inode->i_state & I_NEW)
370 continue;
359 invalidate_inode_buffers(inode); 371 invalidate_inode_buffers(inode);
360 if (!atomic_read(&inode->i_count)) { 372 if (!atomic_read(&inode->i_count)) {
361 list_move(&inode->i_list, dispose); 373 list_move(&inode->i_list, dispose);
374 WARN_ON(inode->i_state & I_NEW);
362 inode->i_state |= I_FREEING; 375 inode->i_state |= I_FREEING;
363 count++; 376 count++;
364 continue; 377 continue;
@@ -460,6 +473,7 @@ static void prune_icache(int nr_to_scan)
460 continue; 473 continue;
461 } 474 }
462 list_move(&inode->i_list, &freeable); 475 list_move(&inode->i_list, &freeable);
476 WARN_ON(inode->i_state & I_NEW);
463 inode->i_state |= I_FREEING; 477 inode->i_state |= I_FREEING;
464 nr_pruned++; 478 nr_pruned++;
465 } 479 }
@@ -656,6 +670,7 @@ void unlock_new_inode(struct inode *inode)
656 * just created it (so there can be no old holders 670 * just created it (so there can be no old holders
657 * that haven't tested I_LOCK). 671 * that haven't tested I_LOCK).
658 */ 672 */
673 WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
659 inode->i_state &= ~(I_LOCK|I_NEW); 674 inode->i_state &= ~(I_LOCK|I_NEW);
660 wake_up_inode(inode); 675 wake_up_inode(inode);
661} 676}
@@ -1145,6 +1160,7 @@ void generic_delete_inode(struct inode *inode)
1145 1160
1146 list_del_init(&inode->i_list); 1161 list_del_init(&inode->i_list);
1147 list_del_init(&inode->i_sb_list); 1162 list_del_init(&inode->i_sb_list);
1163 WARN_ON(inode->i_state & I_NEW);
1148 inode->i_state |= I_FREEING; 1164 inode->i_state |= I_FREEING;
1149 inodes_stat.nr_inodes--; 1165 inodes_stat.nr_inodes--;
1150 spin_unlock(&inode_lock); 1166 spin_unlock(&inode_lock);
@@ -1154,7 +1170,7 @@ void generic_delete_inode(struct inode *inode)
1154 if (op->delete_inode) { 1170 if (op->delete_inode) {
1155 void (*delete)(struct inode *) = op->delete_inode; 1171 void (*delete)(struct inode *) = op->delete_inode;
1156 if (!is_bad_inode(inode)) 1172 if (!is_bad_inode(inode))
1157 DQUOT_INIT(inode); 1173 vfs_dq_init(inode);
1158 /* Filesystems implementing their own 1174 /* Filesystems implementing their own
1159 * s_op->delete_inode are required to call 1175 * s_op->delete_inode are required to call
1160 * truncate_inode_pages and clear_inode() 1176 * truncate_inode_pages and clear_inode()
@@ -1186,16 +1202,19 @@ static void generic_forget_inode(struct inode *inode)
1186 spin_unlock(&inode_lock); 1202 spin_unlock(&inode_lock);
1187 return; 1203 return;
1188 } 1204 }
1205 WARN_ON(inode->i_state & I_NEW);
1189 inode->i_state |= I_WILL_FREE; 1206 inode->i_state |= I_WILL_FREE;
1190 spin_unlock(&inode_lock); 1207 spin_unlock(&inode_lock);
1191 write_inode_now(inode, 1); 1208 write_inode_now(inode, 1);
1192 spin_lock(&inode_lock); 1209 spin_lock(&inode_lock);
1210 WARN_ON(inode->i_state & I_NEW);
1193 inode->i_state &= ~I_WILL_FREE; 1211 inode->i_state &= ~I_WILL_FREE;
1194 inodes_stat.nr_unused--; 1212 inodes_stat.nr_unused--;
1195 hlist_del_init(&inode->i_hash); 1213 hlist_del_init(&inode->i_hash);
1196 } 1214 }
1197 list_del_init(&inode->i_list); 1215 list_del_init(&inode->i_list);
1198 list_del_init(&inode->i_sb_list); 1216 list_del_init(&inode->i_sb_list);
1217 WARN_ON(inode->i_state & I_NEW);
1199 inode->i_state |= I_FREEING; 1218 inode->i_state |= I_FREEING;
1200 inodes_stat.nr_inodes--; 1219 inodes_stat.nr_inodes--;
1201 spin_unlock(&inode_lock); 1220 spin_unlock(&inode_lock);
@@ -1283,6 +1302,40 @@ sector_t bmap(struct inode * inode, sector_t block)
1283} 1302}
1284EXPORT_SYMBOL(bmap); 1303EXPORT_SYMBOL(bmap);
1285 1304
1305/*
1306 * With relative atime, only update atime if the previous atime is
1307 * earlier than either the ctime or mtime or if at least a day has
1308 * passed since the last atime update.
1309 */
1310static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1311 struct timespec now)
1312{
1313
1314 if (!(mnt->mnt_flags & MNT_RELATIME))
1315 return 1;
1316 /*
1317 * Is mtime younger than atime? If yes, update atime:
1318 */
1319 if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0)
1320 return 1;
1321 /*
1322 * Is ctime younger than atime? If yes, update atime:
1323 */
1324 if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0)
1325 return 1;
1326
1327 /*
1328 * Is the previous atime value older than a day? If yes,
1329 * update atime:
1330 */
1331 if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
1332 return 1;
1333 /*
1334 * Good, we can skip the atime update:
1335 */
1336 return 0;
1337}
1338
1286/** 1339/**
1287 * touch_atime - update the access time 1340 * touch_atime - update the access time
1288 * @mnt: mount the inode is accessed on 1341 * @mnt: mount the inode is accessed on
@@ -1310,17 +1363,12 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
1310 goto out; 1363 goto out;
1311 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) 1364 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1312 goto out; 1365 goto out;
1313 if (mnt->mnt_flags & MNT_RELATIME) {
1314 /*
1315 * With relative atime, only update atime if the previous
1316 * atime is earlier than either the ctime or mtime.
1317 */
1318 if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 &&
1319 timespec_compare(&inode->i_ctime, &inode->i_atime) < 0)
1320 goto out;
1321 }
1322 1366
1323 now = current_fs_time(inode->i_sb); 1367 now = current_fs_time(inode->i_sb);
1368
1369 if (!relatime_need_update(mnt, inode, now))
1370 goto out;
1371
1324 if (timespec_equal(&inode->i_atime, &now)) 1372 if (timespec_equal(&inode->i_atime, &now))
1325 goto out; 1373 goto out;
1326 1374
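The relatime test moves into relatime_need_update() and gains a third clause: even if neither ctime nor mtime is newer than atime, update atime anyway once it is more than a day stale, so tools that watch atime still see activity. A userspace sketch of the same decision on plain time_t values:

#include <stdio.h>
#include <time.h>

/* Mirror of relatime_need_update()'s decision, on scalar timestamps. */
static int relatime_need_update(time_t mtime, time_t ctime_,
                                time_t atime, time_t now)
{
    if (mtime >= atime)                 /* mtime younger than atime */
        return 1;
    if (ctime_ >= atime)                /* ctime younger than atime */
        return 1;
    if (now - atime >= 24 * 60 * 60)    /* atime more than a day old */
        return 1;
    return 0;                           /* safe to skip the update */
}

int main(void)
{
    time_t now = time(NULL);

    /* read an hour ago, modified two hours ago: skip the update */
    printf("%d\n", relatime_need_update(now - 7200, now - 7200,
                                        now - 3600, now));
    /* atime two days stale: force an update */
    printf("%d\n", relatime_need_update(now - 200000, now - 200000,
                                        now - 172801, now));
    return 0;
}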
diff --git a/fs/internal.h b/fs/internal.h
index 0d8ac497b3d5..b4dac4fb6b61 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
11 11
12struct super_block; 12struct super_block;
13struct linux_binprm; 13struct linux_binprm;
14struct path;
14 15
15/* 16/*
16 * block_dev.c 17 * block_dev.c
@@ -43,7 +44,7 @@ extern void __init chrdev_init(void);
43/* 44/*
44 * exec.c 45 * exec.c
45 */ 46 */
46extern void check_unsafe_exec(struct linux_binprm *, struct files_struct *); 47extern int check_unsafe_exec(struct linux_binprm *);
47 48
48/* 49/*
49 * namespace.c 50 * namespace.c
@@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *);
60extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); 61extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
61 62
62extern void __init mnt_init(void); 63extern void __init mnt_init(void);
64
65/*
66 * fs_struct.c
67 */
68extern void chroot_fs_refs(struct path *, struct path *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 240ec63984cb..ac2d47e43926 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -404,10 +404,12 @@ static int ioctl_fionbio(struct file *filp, int __user *argp)
404 if (O_NONBLOCK != O_NDELAY) 404 if (O_NONBLOCK != O_NDELAY)
405 flag |= O_NDELAY; 405 flag |= O_NDELAY;
406#endif 406#endif
407 spin_lock(&filp->f_lock);
407 if (on) 408 if (on)
408 filp->f_flags |= flag; 409 filp->f_flags |= flag;
409 else 410 else
410 filp->f_flags &= ~flag; 411 filp->f_flags &= ~flag;
412 spin_unlock(&filp->f_lock);
411 return error; 413 return error;
412} 414}
413 415
@@ -425,18 +427,12 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
425 /* Did FASYNC state change ? */ 427 /* Did FASYNC state change ? */
426 if ((flag ^ filp->f_flags) & FASYNC) { 428 if ((flag ^ filp->f_flags) & FASYNC) {
427 if (filp->f_op && filp->f_op->fasync) 429 if (filp->f_op && filp->f_op->fasync)
430 /* fasync() adjusts filp->f_flags */
428 error = filp->f_op->fasync(fd, filp, on); 431 error = filp->f_op->fasync(fd, filp, on);
429 else 432 else
430 error = -ENOTTY; 433 error = -ENOTTY;
431 } 434 }
432 if (error) 435 return error < 0 ? error : 0;
433 return error;
434
435 if (on)
436 filp->f_flags |= FASYNC;
437 else
438 filp->f_flags &= ~FASYNC;
439 return error;
440} 436}
441 437
442static int ioctl_fsfreeze(struct file *filp) 438static int ioctl_fsfreeze(struct file *filp)
@@ -499,17 +495,11 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
499 break; 495 break;
500 496
501 case FIONBIO: 497 case FIONBIO:
502 /* BKL needed to avoid races tweaking f_flags */
503 lock_kernel();
504 error = ioctl_fionbio(filp, argp); 498 error = ioctl_fionbio(filp, argp);
505 unlock_kernel();
506 break; 499 break;
507 500
508 case FIOASYNC: 501 case FIOASYNC:
509 /* BKL needed to avoid races tweaking f_flags */
510 lock_kernel();
511 error = ioctl_fioasync(fd, filp, argp); 502 error = ioctl_fioasync(fd, filp, argp);
512 unlock_kernel();
513 break; 503 break;
514 504
515 case FIOQSIZE: 505 case FIOQSIZE:
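Dropping the BKL works because the only state FIONBIO touches is f_flags, and its read-modify-write is now serialized by the new per-file filp->f_lock spinlock (FIOASYNC additionally delegates the FASYNC bit to ->fasync() itself). A pthread sketch of the same idea — protecting a flags RMW with a per-object lock (all names hypothetical):

#include <pthread.h>
#include <stdio.h>

#define O_NB 04000                         /* hypothetical flag bit */

struct ufile {
    pthread_mutex_t lock;                  /* plays the role of filp->f_lock */
    unsigned int flags;
};

static void set_nonblock(struct ufile *f, int on)
{
    /* |= and &= are read-modify-writes: without the per-file lock, two
     * concurrent callers could lose each other's updates. */
    pthread_mutex_lock(&f->lock);
    if (on)
        f->flags |= O_NB;
    else
        f->flags &= ~O_NB;
    pthread_mutex_unlock(&f->lock);
}

int main(void)
{
    struct ufile f = { PTHREAD_MUTEX_INITIALIZER, 0 };

    set_nonblock(&f, 1);
    printf("flags=%#o\n", f.flags);
    return 0;
}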
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6147ec3643a0..b4cbe9603c7d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -114,7 +114,7 @@ static const struct super_operations isofs_sops = {
114}; 114};
115 115
116 116
117static struct dentry_operations isofs_dentry_ops[] = { 117static const struct dentry_operations isofs_dentry_ops[] = {
118 { 118 {
119 .d_hash = isofs_hash, 119 .d_hash = isofs_hash,
120 .d_compare = isofs_dentry_cmp, 120 .d_compare = isofs_dentry_cmp,
@@ -923,6 +923,7 @@ out_freesbi:
923static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) 923static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
924{ 924{
925 struct super_block *sb = dentry->d_sb; 925 struct super_block *sb = dentry->d_sb;
926 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
926 927
927 buf->f_type = ISOFS_SUPER_MAGIC; 928 buf->f_type = ISOFS_SUPER_MAGIC;
928 buf->f_bsize = sb->s_blocksize; 929 buf->f_bsize = sb->s_blocksize;
@@ -932,6 +933,8 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
932 buf->f_bavail = 0; 933 buf->f_bavail = 0;
933 buf->f_files = ISOFS_SB(sb)->s_ninodes; 934 buf->f_files = ISOFS_SB(sb)->s_ninodes;
934 buf->f_ffree = 0; 935 buf->f_ffree = 0;
936 buf->f_fsid.val[0] = (u32)id;
937 buf->f_fsid.val[1] = (u32)(id >> 32);
935 buf->f_namelen = NAME_MAX; 938 buf->f_namelen = NAME_MAX;
936 return 0; 939 return 0;
937} 940}
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 3fbffb1ea714..f8077b9c8981 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/bio.h>
23 24
24/* 25/*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 26 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -171,14 +172,15 @@ static int journal_write_commit_record(journal_t *journal,
171 return (ret == -EIO); 172 return (ret == -EIO);
172} 173}
173 174
174static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 175static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
176 int write_op)
175{ 177{
176 int i; 178 int i;
177 179
178 for (i = 0; i < bufs; i++) { 180 for (i = 0; i < bufs; i++) {
179 wbuf[i]->b_end_io = end_buffer_write_sync; 181 wbuf[i]->b_end_io = end_buffer_write_sync;
180 /* We use-up our safety reference in submit_bh() */ 182 /* We use-up our safety reference in submit_bh() */
181 submit_bh(WRITE, wbuf[i]); 183 submit_bh(write_op, wbuf[i]);
182 } 184 }
183} 185}
184 186
@@ -186,7 +188,8 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
186 * Submit all the data buffers to disk 188 * Submit all the data buffers to disk
187 */ 189 */
188static int journal_submit_data_buffers(journal_t *journal, 190static int journal_submit_data_buffers(journal_t *journal,
189 transaction_t *commit_transaction) 191 transaction_t *commit_transaction,
192 int write_op)
190{ 193{
191 struct journal_head *jh; 194 struct journal_head *jh;
192 struct buffer_head *bh; 195 struct buffer_head *bh;
@@ -225,7 +228,7 @@ write_out_data:
225 BUFFER_TRACE(bh, "needs blocking lock"); 228 BUFFER_TRACE(bh, "needs blocking lock");
226 spin_unlock(&journal->j_list_lock); 229 spin_unlock(&journal->j_list_lock);
227 /* Write out all data to prevent deadlocks */ 230 /* Write out all data to prevent deadlocks */
228 journal_do_submit_data(wbuf, bufs); 231 journal_do_submit_data(wbuf, bufs, write_op);
229 bufs = 0; 232 bufs = 0;
230 lock_buffer(bh); 233 lock_buffer(bh);
231 spin_lock(&journal->j_list_lock); 234 spin_lock(&journal->j_list_lock);
@@ -256,7 +259,7 @@ write_out_data:
256 jbd_unlock_bh_state(bh); 259 jbd_unlock_bh_state(bh);
257 if (bufs == journal->j_wbufsize) { 260 if (bufs == journal->j_wbufsize) {
258 spin_unlock(&journal->j_list_lock); 261 spin_unlock(&journal->j_list_lock);
259 journal_do_submit_data(wbuf, bufs); 262 journal_do_submit_data(wbuf, bufs, write_op);
260 bufs = 0; 263 bufs = 0;
261 goto write_out_data; 264 goto write_out_data;
262 } 265 }
@@ -286,7 +289,7 @@ write_out_data:
286 } 289 }
287 } 290 }
288 spin_unlock(&journal->j_list_lock); 291 spin_unlock(&journal->j_list_lock);
289 journal_do_submit_data(wbuf, bufs); 292 journal_do_submit_data(wbuf, bufs, write_op);
290 293
291 return err; 294 return err;
292} 295}
@@ -315,6 +318,7 @@ void journal_commit_transaction(journal_t *journal)
315 int first_tag = 0; 318 int first_tag = 0;
316 int tag_flag; 319 int tag_flag;
317 int i; 320 int i;
321 int write_op = WRITE;
318 322
319 /* 323 /*
320 * First job: lock down the current transaction and wait for 324 * First job: lock down the current transaction and wait for
@@ -347,6 +351,8 @@ void journal_commit_transaction(journal_t *journal)
347 spin_lock(&journal->j_state_lock); 351 spin_lock(&journal->j_state_lock);
348 commit_transaction->t_state = T_LOCKED; 352 commit_transaction->t_state = T_LOCKED;
349 353
354 if (commit_transaction->t_synchronous_commit)
355 write_op = WRITE_SYNC;
350 spin_lock(&commit_transaction->t_handle_lock); 356 spin_lock(&commit_transaction->t_handle_lock);
351 while (commit_transaction->t_updates) { 357 while (commit_transaction->t_updates) {
352 DEFINE_WAIT(wait); 358 DEFINE_WAIT(wait);
@@ -431,7 +437,8 @@ void journal_commit_transaction(journal_t *journal)
431 * Now start flushing things to disk, in the order they appear 437 * Now start flushing things to disk, in the order they appear
432 * on the transaction lists. Data blocks go first. 438 * on the transaction lists. Data blocks go first.
433 */ 439 */
434 err = journal_submit_data_buffers(journal, commit_transaction); 440 err = journal_submit_data_buffers(journal, commit_transaction,
441 write_op);
435 442
436 /* 443 /*
437 * Wait for all previously submitted IO to complete. 444 * Wait for all previously submitted IO to complete.
@@ -660,7 +667,7 @@ start_journal_io:
660 clear_buffer_dirty(bh); 667 clear_buffer_dirty(bh);
661 set_buffer_uptodate(bh); 668 set_buffer_uptodate(bh);
662 bh->b_end_io = journal_end_buffer_io_sync; 669 bh->b_end_io = journal_end_buffer_io_sync;
663 submit_bh(WRITE, bh); 670 submit_bh(write_op, bh);
664 } 671 }
665 cond_resched(); 672 cond_resched();
666 673
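The commit path now decides once, from t_synchronous_commit, whether buffers go out as WRITE or WRITE_SYNC, and threads that choice through journal_do_submit_data() down to every submit_bh(). A sketch of the select-once, thread-through pattern (names hypothetical):

#include <stdio.h>

enum write_op { OP_WRITE, OP_WRITE_SYNC };   /* stand-ins for WRITE/WRITE_SYNC */

static void submit(enum write_op op, int block)
{
    printf("submit block %d as %s\n", block,
           op == OP_WRITE_SYNC ? "WRITE_SYNC" : "WRITE");
}

static void submit_all(enum write_op op, const int *blocks, int n)
{
    for (int i = 0; i < n; i++)
        submit(op, blocks[i]);               /* one decision, many I/Os */
}

int main(void)
{
    int blocks[] = { 10, 11, 12 };
    int synchronous_commit = 1;              /* like t_synchronous_commit */
    enum write_op op = synchronous_commit ? OP_WRITE_SYNC : OP_WRITE;

    submit_all(op, blocks, 3);
    return 0;
}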
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e79c07812afa..737f7246a4b5 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -637,6 +637,8 @@ struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
637 return NULL; 637 return NULL;
638 638
639 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 639 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
640 if (!bh)
641 return NULL;
640 lock_buffer(bh); 642 lock_buffer(bh);
641 memset(bh->b_data, 0, journal->j_blocksize); 643 memset(bh->b_data, 0, journal->j_blocksize);
642 set_buffer_uptodate(bh); 644 set_buffer_uptodate(bh);
@@ -733,9 +735,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
733 if (!journal->j_wbuf) { 735 if (!journal->j_wbuf) {
734 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", 736
735 __func__); 737 __func__);
736 kfree(journal); 738 goto out_err;
737 journal = NULL;
738 goto out;
739 } 739 }
740 journal->j_dev = bdev; 740 journal->j_dev = bdev;
741 journal->j_fs_dev = fs_dev; 741 journal->j_fs_dev = fs_dev;
@@ -743,11 +743,19 @@ journal_t * journal_init_dev(struct block_device *bdev,
743 journal->j_maxlen = len; 743 journal->j_maxlen = len;
744 744
745 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 745 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
746 J_ASSERT(bh != NULL); 746 if (!bh) {
747 printk(KERN_ERR
748 "%s: Cannot get buffer for journal superblock\n",
749 __func__);
750 goto out_err;
751 }
747 journal->j_sb_buffer = bh; 752 journal->j_sb_buffer = bh;
748 journal->j_superblock = (journal_superblock_t *)bh->b_data; 753 journal->j_superblock = (journal_superblock_t *)bh->b_data;
749out: 754
750 return journal; 755 return journal;
756out_err:
757 kfree(journal);
758 return NULL;
751} 759}
752 760
753/** 761/**
@@ -787,8 +795,7 @@ journal_t * journal_init_inode (struct inode *inode)
787 if (!journal->j_wbuf) { 795 if (!journal->j_wbuf) {
788 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", 796
789 __func__); 797 __func__);
790 kfree(journal); 798 goto out_err;
791 return NULL;
792 } 799 }
793 800
794 err = journal_bmap(journal, 0, &blocknr); 801 err = journal_bmap(journal, 0, &blocknr);
@@ -796,16 +803,23 @@ journal_t * journal_init_inode (struct inode *inode)
796 if (err) { 803 if (err) {
797 printk(KERN_ERR "%s: Cannot locate journal superblock\n", 804
798 __func__); 805 __func__);
799 kfree(journal); 806 goto out_err;
800 return NULL;
801 } 807 }
802 808
803 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 809 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
804 J_ASSERT(bh != NULL); 810 if (!bh) {
811 printk(KERN_ERR
812 "%s: Cannot get buffer for journal superblock\n",
813 __func__);
814 goto out_err;
815 }
805 journal->j_sb_buffer = bh; 816 journal->j_sb_buffer = bh;
806 journal->j_superblock = (journal_superblock_t *)bh->b_data; 817 journal->j_superblock = (journal_superblock_t *)bh->b_data;
807 818
808 return journal; 819 return journal;
820out_err:
821 kfree(journal);
822 return NULL;
809} 823}
810 824
811/* 825/*
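Both journal_init_dev() and journal_init_inode() stop asserting that __getblk() succeeded and instead fail through a shared out_err label that frees the half-built journal. A sketch of replacing an assertion with a graceful error exit (names hypothetical):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for __getblk(): may legitimately return NULL under pressure. */
static void *getblk(int blocknr) { (void)blocknr; return NULL; }

static void *journal_init(void)
{
    void *journal = malloc(128), *sb;

    if (!journal)
        return NULL;

    sb = getblk(0);
    if (!sb) {                    /* was J_ASSERT(bh != NULL) */
        fprintf(stderr, "cannot get buffer for journal superblock\n");
        goto out_err;
    }
    return journal;

out_err:                          /* one exit frees the partial journal */
    free(journal);
    return NULL;
}

int main(void)
{
    printf("journal_init -> %s\n", journal_init() ? "ok" : "failed");
    return 0;
}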
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e6a117431277..ed886e6db399 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1440,6 +1440,8 @@ int journal_stop(handle_t *handle)
1440 } 1440 }
1441 } 1441 }
1442 1442
1443 if (handle->h_sync)
1444 transaction->t_synchronous_commit = 1;
1443 current->journal_info = NULL; 1445 current->journal_info = NULL;
1444 spin_lock(&journal->j_state_lock); 1446 spin_lock(&journal->j_state_lock);
1445 spin_lock(&transaction->t_handle_lock); 1447 spin_lock(&transaction->t_handle_lock);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 62804e57a44c..4ea72377c7a2 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
367 int tag_bytes = journal_tag_bytes(journal); 367 int tag_bytes = journal_tag_bytes(journal);
368 struct buffer_head *cbh = NULL; /* For transactional checksums */ 368 struct buffer_head *cbh = NULL; /* For transactional checksums */
369 __u32 crc32_sum = ~0; 369 __u32 crc32_sum = ~0;
370 int write_op = WRITE;
370 371
371 /* 372 /*
372 * First job: lock down the current transaction and wait for 373 * First job: lock down the current transaction and wait for
@@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
401 spin_lock(&journal->j_state_lock); 402 spin_lock(&journal->j_state_lock);
402 commit_transaction->t_state = T_LOCKED; 403 commit_transaction->t_state = T_LOCKED;
403 404
405 if (commit_transaction->t_synchronous_commit)
406 write_op = WRITE_SYNC;
404 stats.u.run.rs_wait = commit_transaction->t_max_wait; 407 stats.u.run.rs_wait = commit_transaction->t_max_wait;
405 stats.u.run.rs_locked = jiffies; 408 stats.u.run.rs_locked = jiffies;
406 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 409 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -680,7 +683,7 @@ start_journal_io:
680 clear_buffer_dirty(bh); 683 clear_buffer_dirty(bh);
681 set_buffer_uptodate(bh); 684 set_buffer_uptodate(bh);
682 bh->b_end_io = journal_end_buffer_io_sync; 685 bh->b_end_io = journal_end_buffer_io_sync;
683 submit_bh(WRITE, bh); 686 submit_bh(write_op, bh);
684 } 687 }
685 cond_resched(); 688 cond_resched();
686 stats.u.run.rs_blocks_logged += bufs; 689 stats.u.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 257ff2625765..bbe6d592d8b3 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -55,6 +55,25 @@
55 * need do nothing. 55 * need do nothing.
56 * RevokeValid set, Revoked set: 56 * RevokeValid set, Revoked set:
57 * buffer has been revoked. 57 * buffer has been revoked.
58 *
59 * Locking rules:
60 * We keep two hash tables of revoke records. One hashtable belongs to the
61 * running transaction (is pointed to by journal->j_revoke), the other one
62 * belongs to the committing transaction. Accesses to the second hash table
63 * happen only from the kjournald and no other thread touches this table. Also
64 * journal_switch_revoke_table() which switches which hashtable belongs to the
65 * running and which to the committing transaction is called only from
66 * kjournald. Therefore we need no locks when accessing the hashtable belonging
67 * to the committing transaction.
68 *
69 * All users operating on the hash table belonging to the running transaction
70 * have a handle to the transaction. Therefore they are safe from kjournald
71 * switching hash tables under them. For operations on the lists of entries in
72 * the hash table j_revoke_lock is used.
73 *
74 * Finally, the replay code also uses the hash tables, but at this moment no one else
75 * can touch them (filesystem isn't mounted yet) and hence no locking is
76 * needed.
58 */ 77 */
59 78
60#ifndef __KERNEL__ 79#ifndef __KERNEL__
@@ -401,8 +420,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
401 * the second time we would still have a pending revoke to cancel. So, 420 * the second time we would still have a pending revoke to cancel. So,
402 * do not trust the Revoked bit on buffers unless RevokeValid is also 421 * do not trust the Revoked bit on buffers unless RevokeValid is also
403 * set. 422 * set.
404 *
405 * The caller must have the journal locked.
406 */ 423 */
407int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) 424int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
408{ 425{
@@ -480,10 +497,7 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
480/* 497/*
481 * Write revoke records to the journal for all entries in the current 498 * Write revoke records to the journal for all entries in the current
482 * revoke hash, deleting the entries as we go. 499 * revoke hash, deleting the entries as we go.
483 *
484 * Called with the journal lock held.
485 */ 500 */
486
487void jbd2_journal_write_revoke_records(journal_t *journal, 501void jbd2_journal_write_revoke_records(journal_t *journal,
488 transaction_t *transaction) 502 transaction_t *transaction)
489{ 503{
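The new comment explains why most accesses need no lock: there are two revoke hash tables, one owned by the running transaction and one by the committing transaction, and only kjournald ever swaps them, so the committing table is private to it afterwards. A toy sketch of the two-table swap (names hypothetical):

#include <stdio.h>

/* Two tables; j_revoke always points at the running transaction's one. */
static int table[2][8];
static int *j_revoke = table[0];
static int current_idx;

/* Called only from the commit thread; after the swap, nothing else ever
 * touches the committing table, so it needs no lock. */
static void switch_revoke_table(void)
{
    current_idx ^= 1;
    j_revoke = table[current_idx];
}

int main(void)
{
    j_revoke[0] = 42;              /* record a revoke in the running table */
    switch_revoke_table();         /* commit: the old table is now private */
    printf("committing table holds %d, running table holds %d\n",
           table[current_idx ^ 1][0], j_revoke[0]);
    return 0;
}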
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 28ce21d8598e..996ffda06bf3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle)
1315 } 1315 }
1316 } 1316 }
1317 1317
1318 if (handle->h_sync)
1319 transaction->t_synchronous_commit = 1;
1318 current->journal_info = NULL; 1320 current->journal_info = NULL;
1319 spin_lock(&journal->j_state_lock); 1321 spin_lock(&journal->j_state_lock);
1320 spin_lock(&transaction->t_handle_lock); 1322 spin_lock(&transaction->t_handle_lock);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index d98713777a1b..77ccf8cb0823 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
336 return PTR_ERR(acl); 336 return PTR_ERR(acl);
337 337
338 if (!acl) { 338 if (!acl) {
339 *i_mode &= ~current->fs->umask; 339 *i_mode &= ~current_umask();
340 } else { 340 } else {
341 if (S_ISDIR(*i_mode)) 341 if (S_ISDIR(*i_mode))
342 jffs2_iset_acl(inode, &f->i_acl_default, acl); 342 jffs2_iset_acl(inode, &f->i_acl_default, acl);
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
index 9ff619a6f9cc..57cef19951db 100644
--- a/fs/jfs/Kconfig
+++ b/fs/jfs/Kconfig
@@ -1,6 +1,7 @@
1config JFS_FS 1config JFS_FS
2 tristate "JFS filesystem support" 2 tristate "JFS filesystem support"
3 select NLS 3 select NLS
4 select CRC32
4 help 5 help
5 This is a port of IBM's Journaled Filesystem. More information is 6
6 available in the file <file:Documentation/filesystems/jfs.txt>. 7 available in the file <file:Documentation/filesystems/jfs.txt>.
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d3e5c33665de..06ca1b8d2054 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
182cleanup: 182cleanup:
183 posix_acl_release(acl); 183 posix_acl_release(acl);
184 } else 184 } else
185 inode->i_mode &= ~current->fs->umask; 185 inode->i_mode &= ~current_umask();
186 186
187 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | 187 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
188 inode->i_mode; 188 inode->i_mode;
@@ -233,7 +233,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
233 233
234 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 234 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
235 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 235 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
236 if (DQUOT_TRANSFER(inode, iattr)) 236 if (vfs_dq_transfer(inode, iattr))
237 return -EDQUOT; 237 return -EDQUOT;
238 } 238 }
239 239
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b00ee9f05a06..b2ae190a77ba 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -158,9 +158,9 @@ void jfs_delete_inode(struct inode *inode)
158 /* 158 /*
159 * Free the inode from the quota allocation. 159 * Free the inode from the quota allocation.
160 */ 160 */
161 DQUOT_INIT(inode); 161 vfs_dq_init(inode);
162 DQUOT_FREE_INODE(inode); 162 vfs_dq_free_inode(inode);
163 DQUOT_DROP(inode); 163 vfs_dq_drop(inode);
164 } 164 }
165 165
166 clear_inode(inode); 166 clear_inode(inode);
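This hunk, like the jfs_dtree.c, jfs_extent.c and fs/inode.c hunks around it, is part of a mechanical rename from the old DQUOT_* macros to the vfs_dq_*() quota interface; the mapping is one-to-one. Written out as illustrative shim macros — not real kernel code, just the correspondence visible in these hunks:

/* Illustrative only: each old macro call site now calls the function on
 * the right, with identical arguments and return-value conventions. */
#define DQUOT_INIT(inode)            vfs_dq_init(inode)
#define DQUOT_DROP(inode)            vfs_dq_drop(inode)
#define DQUOT_TRANSFER(inode, ia)    vfs_dq_transfer(inode, ia)
#define DQUOT_ALLOC_INODE(inode)     vfs_dq_alloc_inode(inode)
#define DQUOT_FREE_INODE(inode)      vfs_dq_free_inode(inode)
#define DQUOT_ALLOC_BLOCK(inode, n)  vfs_dq_alloc_block(inode, n)
#define DQUOT_FREE_BLOCK(inode, n)   vfs_dq_free_block(inode, n)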
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index 6a73de84bcef..dd824d9b0b1a 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -90,7 +90,6 @@ void jfs_proc_init(void)
90 90
91 if (!(base = proc_mkdir("fs/jfs", NULL))) 91 if (!(base = proc_mkdir("fs/jfs", NULL)))
92 return; 92 return;
93 base->owner = THIS_MODULE;
94 93
95 for (i = 0; i < NPROCENT; i++) 94 for (i = 0; i < NPROCENT; i++)
96 proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); 95 proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 4dcc05819998..925871e9887b 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -381,10 +381,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
381 * It's time to move the inline table to an external 381 * It's time to move the inline table to an external
382 * page and begin to build the xtree 382 * page and begin to build the xtree
383 */ 383 */
384 if (DQUOT_ALLOC_BLOCK(ip, sbi->nbperpage)) 384 if (vfs_dq_alloc_block(ip, sbi->nbperpage))
385 goto clean_up; 385 goto clean_up;
386 if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { 386 if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
387 DQUOT_FREE_BLOCK(ip, sbi->nbperpage); 387 vfs_dq_free_block(ip, sbi->nbperpage);
388 goto clean_up; 388 goto clean_up;
389 } 389 }
390 390
@@ -408,7 +408,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
408 memcpy(&jfs_ip->i_dirtable, temp_table, 408 memcpy(&jfs_ip->i_dirtable, temp_table,
409 sizeof (temp_table)); 409 sizeof (temp_table));
410 dbFree(ip, xaddr, sbi->nbperpage); 410 dbFree(ip, xaddr, sbi->nbperpage);
411 DQUOT_FREE_BLOCK(ip, sbi->nbperpage); 411 vfs_dq_free_block(ip, sbi->nbperpage);
412 goto clean_up; 412 goto clean_up;
413 } 413 }
414 ip->i_size = PSIZE; 414 ip->i_size = PSIZE;
@@ -1027,7 +1027,7 @@ static int dtSplitUp(tid_t tid,
1027 n = xlen; 1027 n = xlen;
1028 1028
1029 /* Allocate blocks to quota. */ 1029 /* Allocate blocks to quota. */
1030 if (DQUOT_ALLOC_BLOCK(ip, n)) { 1030 if (vfs_dq_alloc_block(ip, n)) {
1031 rc = -EDQUOT; 1031 rc = -EDQUOT;
1032 goto extendOut; 1032 goto extendOut;
1033 } 1033 }
@@ -1308,7 +1308,7 @@ static int dtSplitUp(tid_t tid,
1308 1308
1309 /* Rollback quota allocation */ 1309 /* Rollback quota allocation */
1310 if (rc && quota_allocation) 1310 if (rc && quota_allocation)
1311 DQUOT_FREE_BLOCK(ip, quota_allocation); 1311 vfs_dq_free_block(ip, quota_allocation);
1312 1312
1313 dtSplitUp_Exit: 1313 dtSplitUp_Exit:
1314 1314
@@ -1369,7 +1369,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1369 return -EIO; 1369 return -EIO;
1370 1370
1371 /* Allocate blocks to quota. */ 1371 /* Allocate blocks to quota. */
1372 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { 1372 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
1373 release_metapage(rmp); 1373 release_metapage(rmp);
1374 return -EDQUOT; 1374 return -EDQUOT;
1375 } 1375 }
@@ -1916,7 +1916,7 @@ static int dtSplitRoot(tid_t tid,
1916 rp = rmp->data; 1916 rp = rmp->data;
1917 1917
1918 /* Allocate blocks to quota. */ 1918 /* Allocate blocks to quota. */
1919 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { 1919 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
1920 release_metapage(rmp); 1920 release_metapage(rmp);
1921 return -EDQUOT; 1921 return -EDQUOT;
1922 } 1922 }
@@ -2287,7 +2287,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2287 xlen = lengthPXD(&fp->header.self); 2287 xlen = lengthPXD(&fp->header.self);
2288 2288
2289 /* Free quota allocation. */ 2289 /* Free quota allocation. */
2290 DQUOT_FREE_BLOCK(ip, xlen); 2290 vfs_dq_free_block(ip, xlen);
2291 2291
2292 /* free/invalidate its buffer page */ 2292 /* free/invalidate its buffer page */
2293 discard_metapage(fmp); 2293 discard_metapage(fmp);
@@ -2363,7 +2363,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2363 xlen = lengthPXD(&p->header.self); 2363 xlen = lengthPXD(&p->header.self);
2364 2364
2365 /* Free quota allocation */ 2365 /* Free quota allocation */
2366 DQUOT_FREE_BLOCK(ip, xlen); 2366 vfs_dq_free_block(ip, xlen);
2367 2367
2368 /* free/invalidate its buffer page */ 2368 /* free/invalidate its buffer page */
2369 discard_metapage(mp); 2369 discard_metapage(mp);
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 7ae1e3281de9..bbbd5f202e37 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -141,7 +141,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
141 } 141 }
142 142
143 /* Allocate blocks to quota. */ 143 /* Allocate blocks to quota. */
144 if (DQUOT_ALLOC_BLOCK(ip, nxlen)) { 144 if (vfs_dq_alloc_block(ip, nxlen)) {
145 dbFree(ip, nxaddr, (s64) nxlen); 145 dbFree(ip, nxaddr, (s64) nxlen);
146 mutex_unlock(&JFS_IP(ip)->commit_mutex); 146 mutex_unlock(&JFS_IP(ip)->commit_mutex);
147 return -EDQUOT; 147 return -EDQUOT;
@@ -164,7 +164,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
164 */ 164 */
165 if (rc) { 165 if (rc) {
166 dbFree(ip, nxaddr, nxlen); 166 dbFree(ip, nxaddr, nxlen);
167 DQUOT_FREE_BLOCK(ip, nxlen); 167 vfs_dq_free_block(ip, nxlen);
168 mutex_unlock(&JFS_IP(ip)->commit_mutex); 168 mutex_unlock(&JFS_IP(ip)->commit_mutex);
169 return (rc); 169 return (rc);
170 } 170 }
@@ -256,7 +256,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
256 goto exit; 256 goto exit;
257 257
258 /* Allocate blocks to quota. */ 258
259 if (DQUOT_ALLOC_BLOCK(ip, nxlen)) { 259 if (vfs_dq_alloc_block(ip, nxlen)) {
260 dbFree(ip, nxaddr, (s64) nxlen); 260 dbFree(ip, nxaddr, (s64) nxlen);
261 mutex_unlock(&JFS_IP(ip)->commit_mutex); 261 mutex_unlock(&JFS_IP(ip)->commit_mutex);
262 return -EDQUOT; 262 return -EDQUOT;
@@ -297,7 +297,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
297 /* extend the extent */ 297 /* extend the extent */
298 if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { 298 if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
299 dbFree(ip, xaddr + xlen, delta); 299 dbFree(ip, xaddr + xlen, delta);
300 DQUOT_FREE_BLOCK(ip, nxlen); 300 vfs_dq_free_block(ip, nxlen);
301 goto exit; 301 goto exit;
302 } 302 }
303 } else { 303 } else {
@@ -308,7 +308,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
308 */ 308 */
309 if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { 309 if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
310 dbFree(ip, nxaddr, nxlen); 310 dbFree(ip, nxaddr, nxlen);
311 DQUOT_FREE_BLOCK(ip, nxlen); 311 vfs_dq_free_block(ip, nxlen);
312 goto exit; 312 goto exit;
313 } 313 }
314 } 314 }
@@ -362,11 +362,12 @@ exit:
362int extHint(struct inode *ip, s64 offset, xad_t * xp) 362int extHint(struct inode *ip, s64 offset, xad_t * xp)
363{ 363{
364 struct super_block *sb = ip->i_sb; 364 struct super_block *sb = ip->i_sb;
365 struct xadlist xadl; 365 int nbperpage = JFS_SBI(sb)->nbperpage;
366 struct lxdlist lxdl;
367 lxd_t lxd;
368 s64 prev; 366 s64 prev;
369 int rc, nbperpage = JFS_SBI(sb)->nbperpage; 367 int rc = 0;
368 s64 xaddr;
369 int xlen;
370 int xflag;
370 371
371 /* init the hint as "no hint provided" */ 372 /* init the hint as "no hint provided" */
372 XADaddress(xp, 0); 373 XADaddress(xp, 0);
@@ -376,46 +377,30 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
376 */ 377 */
377 prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage; 378 prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage;
378 379
379 /* if the offsets in the first page of the file, 380 /* if the offset is in the first page of the file, no hint provided.
380 * no hint provided.
381 */ 381 */
382 if (prev < 0) 382 if (prev < 0)
383 return (0); 383 goto out;
384
385 /* prepare to lookup the previous page's extent info */
386 lxdl.maxnlxd = 1;
387 lxdl.nlxd = 1;
388 lxdl.lxd = &lxd;
389 LXDoffset(&lxd, prev)
390 LXDlength(&lxd, nbperpage);
391
392 xadl.maxnxad = 1;
393 xadl.nxad = 0;
394 xadl.xad = xp;
395
396 /* perform the lookup */
397 if ((rc = xtLookupList(ip, &lxdl, &xadl, 0)))
398 return (rc);
399
400 /* check if no extent exists for the previous page.
401 * this is possible for sparse files.
402 */
403 if (xadl.nxad == 0) {
404// assert(ISSPARSE(ip));
405 return (0);
406 }
407 384
408 /* only preserve the abnr flag within the xad flags 385 rc = xtLookup(ip, prev, nbperpage, &xflag, &xaddr, &xlen, 0);
409 * of the returned hint.
410 */
411 xp->flag &= XAD_NOTRECORDED;
412 386
413 if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { 387 if ((rc == 0) && xlen) {
414 jfs_error(ip->i_sb, "extHint: corrupt xtree"); 388 if (xlen != nbperpage) {
415 return -EIO; 389 jfs_error(ip->i_sb, "extHint: corrupt xtree");
416 } 390 rc = -EIO;
391 }
392 XADaddress(xp, xaddr);
393 XADlength(xp, xlen);
394 /*
395 * only preserve the abnr flag within the xad flags
396 * of the returned hint.
397 */
398 xp->flag = xflag & XAD_NOTRECORDED;
399 } else
400 rc = 0;
417 401
418 return (0); 402out:
403 return (rc);
419} 404}
420 405
421 406
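extHint() no longer builds one-element lxd/xad lists for xtLookupList(); it asks xtLookup() directly for the extent backing the previous page, keeps the hint only when that page is backed by a single full extent, and treats anything else as corruption. A hedged standalone sketch of the simplified flow — the lookup signature here is hypothetical:

#include <stdio.h>

/* Hypothetical single-extent lookup: returns 0 and fills addr/len when the
 * offset is mapped; len == 0 means a hole (sparse file). */
static int lookup(long off, long want, long *addr, long *len)
{
    *addr = 1000 + off;
    *len = want;                     /* pretend the page is fully backed */
    return 0;
}

static int ext_hint(long offset, long nbperpage, long *hint)
{
    long prev = offset / nbperpage * nbperpage - nbperpage;
    long addr, len;

    *hint = 0;                       /* default: no hint */
    if (prev < 0)                    /* first page of the file */
        return 0;

    int rc = lookup(prev, nbperpage, &addr, &len);
    if (rc == 0 && len) {
        if (len != nbperpage)        /* previous page must be one extent */
            return -5;               /* -EIO: corrupt tree */
        *hint = addr;
    }
    return 0;
}

int main(void)
{
    long hint;

    ext_hint(8192, 4096, &hint);
    printf("hint=%ld\n", hint);
    return 0;
}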
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0f94381ca6d0..346057218edc 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -57,12 +57,6 @@
57#include "jfs_debug.h" 57#include "jfs_debug.h"
58 58
59/* 59/*
60 * __mark_inode_dirty expects inodes to be hashed. Since we don't want
61 * special inodes in the fileset inode space, we make them appear hashed,
62 * but do not put on any lists.
63 */
64
65/*
66 * imap locks 60 * imap locks
67 */ 61 */
68/* iag free list lock */ 62/* iag free list lock */
@@ -497,7 +491,9 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
497 release_metapage(mp); 491 release_metapage(mp);
498 492
499 /* 493 /*
500 * that will look hashed, but won't be on any list; hlist_del() 494 * __mark_inode_dirty expects inodes to be hashed. Since we don't
495 * want special inodes in the fileset inode space, we make them
496 * appear hashed, but do not put on any lists. hlist_del()
501 * will work fine and require no locking. 497 * will work fine and require no locking.
502 */ 498 */
503 ip->i_hash.pprev = &ip->i_hash.next; 499 ip->i_hash.pprev = &ip->i_hash.next;
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index d4d142c2edd4..dc0e02159ac9 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -116,7 +116,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
116 /* 116 /*
117 * Allocate inode to quota. 117 * Allocate inode to quota.
118 */ 118 */
119 if (DQUOT_ALLOC_INODE(inode)) { 119 if (vfs_dq_alloc_inode(inode)) {
120 rc = -EDQUOT; 120 rc = -EDQUOT;
121 goto fail_drop; 121 goto fail_drop;
122 } 122 }
@@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
162 return inode; 162 return inode;
163 163
164fail_drop: 164fail_drop:
165 DQUOT_DROP(inode); 165 vfs_dq_drop(inode);
166 inode->i_flags |= S_NOQUOTA; 166 inode->i_flags |= S_NOQUOTA;
167fail_unlock: 167fail_unlock:
168 inode->i_nlink = 0; 168 inode->i_nlink = 0;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index adb2fafcc544..1eff7db34d63 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -47,5 +47,5 @@ extern const struct file_operations jfs_dir_operations;
47extern const struct inode_operations jfs_file_inode_operations; 47extern const struct inode_operations jfs_file_inode_operations;
48extern const struct file_operations jfs_file_operations; 48extern const struct file_operations jfs_file_operations;
49extern const struct inode_operations jfs_symlink_inode_operations; 49extern const struct inode_operations jfs_symlink_inode_operations;
50extern struct dentry_operations jfs_ci_dentry_operations; 50extern const struct dentry_operations jfs_ci_dentry_operations;
51#endif /* _H_JFS_INODE */ 51#endif /* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index c350057087dd..07b6c5dfb4b6 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -369,6 +369,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
369 unsigned long bio_bytes = 0; 369 unsigned long bio_bytes = 0;
370 unsigned long bio_offset = 0; 370 unsigned long bio_offset = 0;
371 int offset; 371 int offset;
372 int bad_blocks = 0;
372 373
373 page_start = (sector_t)page->index << 374 page_start = (sector_t)page->index <<
374 (PAGE_CACHE_SHIFT - inode->i_blkbits); 375 (PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -394,6 +395,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
394 } 395 }
395 396
396 clear_bit(META_dirty, &mp->flag); 397 clear_bit(META_dirty, &mp->flag);
398 set_bit(META_io, &mp->flag);
397 block_offset = offset >> inode->i_blkbits; 399 block_offset = offset >> inode->i_blkbits;
398 lblock = page_start + block_offset; 400 lblock = page_start + block_offset;
399 if (bio) { 401 if (bio) {
@@ -402,7 +404,6 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
402 len = min(xlen, blocks_per_mp); 404 len = min(xlen, blocks_per_mp);
403 xlen -= len; 405 xlen -= len;
404 bio_bytes += len << inode->i_blkbits; 406 bio_bytes += len << inode->i_blkbits;
405 set_bit(META_io, &mp->flag);
406 continue; 407 continue;
407 } 408 }
408 /* Not contiguous */ 409 /* Not contiguous */
@@ -424,12 +425,14 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
424 xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits; 425 xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits;
425 pblock = metapage_get_blocks(inode, lblock, &xlen); 426 pblock = metapage_get_blocks(inode, lblock, &xlen);
426 if (!pblock) { 427 if (!pblock) {
427 /* Need better error handling */
428 printk(KERN_ERR "JFS: metapage_get_blocks failed\n"); 428 printk(KERN_ERR "JFS: metapage_get_blocks failed\n");
429 dec_io(page, last_write_complete); 429 /*
430 * We already called inc_io(), but can't cancel it
431 * with dec_io() until we're done with the page
432 */
433 bad_blocks++;
430 continue; 434 continue;
431 } 435 }
432 set_bit(META_io, &mp->flag);
433 len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); 436 len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
434 437
435 bio = bio_alloc(GFP_NOFS, 1); 438 bio = bio_alloc(GFP_NOFS, 1);
@@ -459,6 +462,9 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
459 462
460 unlock_page(page); 463 unlock_page(page);
461 464
465 if (bad_blocks)
466 goto err_out;
467
462 if (nr_underway == 0) 468 if (nr_underway == 0)
463 end_page_writeback(page); 469 end_page_writeback(page);
464 470
@@ -474,7 +480,9 @@ skip:
474 bio_put(bio); 480 bio_put(bio);
475 unlock_page(page); 481 unlock_page(page);
476 dec_io(page, last_write_complete); 482 dec_io(page, last_write_complete);
477 483err_out:
484 while (bad_blocks--)
485 dec_io(page, last_write_complete);
478 return -EIO; 486 return -EIO;
479} 487}
480 488
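metapage_writepage() may not call dec_io() the moment metapage_get_blocks() fails, because the page's I/O counter has to stay balanced while the remaining blocks are still being set up; failures are tallied in bad_blocks and settled at err_out. A toy sketch of count-now, settle-later (names hypothetical):

#include <stdio.h>

static int io_count;                      /* like the page's I/O counter */

static void inc_io(void) { io_count++; }
static void dec_io(void) { io_count--; }

static int get_blocks(int i) { return i != 1; }   /* block 1 "fails" */

static int writepage(int nblocks)
{
    int bad_blocks = 0, had_bad = 0;

    for (int i = 0; i < nblocks; i++) {
        inc_io();
        if (!get_blocks(i)) {
            /* can't dec_io() yet: the counter must stay balanced while
             * the rest of the page is still being submitted */
            bad_blocks++;
            had_bad = 1;
            continue;
        }
        dec_io();                         /* pretend this write completed */
    }

    while (bad_blocks--)                  /* settle the failures at the end */
        dec_io();
    return had_bad ? -5 : 0;              /* -EIO if any block failed */
}

int main(void)
{
    int ret = writepage(3);

    printf("writepage -> %d (io_count=%d)\n", ret, io_count);
    return 0;
}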
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 649f9817accd..43ea3713c083 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -58,35 +58,6 @@ struct timestruc_t {
58#define ONES 0xffffffffu /* all bit on */ 58#define ONES 0xffffffffu /* all bit on */
59 59
60/* 60/*
61 * logical xd (lxd)
62 */
63typedef struct {
64 unsigned len:24;
65 unsigned off1:8;
66 u32 off2;
67} lxd_t;
68
69/* lxd_t field construction */
70#define LXDlength(lxd, length32) ( (lxd)->len = length32 )
71#define LXDoffset(lxd, offset64)\
72{\
73 (lxd)->off1 = ((s64)offset64) >> 32;\
74 (lxd)->off2 = (offset64) & 0xffffffff;\
75}
76
77/* lxd_t field extraction */
78#define lengthLXD(lxd) ( (lxd)->len )
79#define offsetLXD(lxd)\
80 ( ((s64)((lxd)->off1)) << 32 | (lxd)->off2 )
81
82/* lxd list */
83struct lxdlist {
84 s16 maxnlxd;
85 s16 nlxd;
86 lxd_t *lxd;
87};
88
89/*
90 * physical xd (pxd) 61 * physical xd (pxd)
91 */ 62 */
92typedef struct { 63typedef struct {
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index ae3acafb447b..d654a6458648 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -164,11 +164,8 @@ int xtLookup(struct inode *ip, s64 lstart,
164 /* is lookup offset beyond eof ? */ 164 /* is lookup offset beyond eof ? */
165 size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> 165 size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
166 JFS_SBI(ip->i_sb)->l2bsize; 166 JFS_SBI(ip->i_sb)->l2bsize;
167 if (lstart >= size) { 167 if (lstart >= size)
168 jfs_err("xtLookup: lstart (0x%lx) >= size (0x%lx)",
169 (ulong) lstart, (ulong) size);
170 return 0; 168 return 0;
171 }
172 } 169 }
173 170
174 /* 171 /*
@@ -220,264 +217,6 @@ int xtLookup(struct inode *ip, s64 lstart,
220 return rc; 217 return rc;
221} 218}
222 219
223
224/*
225 * xtLookupList()
226 *
227 * function: map a single logical extent into a list of physical extent;
228 *
229 * parameter:
230 * struct inode *ip,
231 * struct lxdlist *lxdlist, lxd list (in)
232 * struct xadlist *xadlist, xad list (in/out)
233 * int flag)
234 *
235 * coverage of lxd by xad under assumption of
236 * . lxd's are ordered and disjoint.
237 * . xad's are ordered and disjoint.
238 *
239 * return:
240 * 0: success
241 *
242 * note: a page being written (even a single byte) is backed fully,
243 * except the last page which is only backed with blocks
244 * required to cover the last byte;
245 * the extent backing a page is fully contained within an xad;
246 */
247int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
248 struct xadlist * xadlist, int flag)
249{
250 int rc = 0;
251 struct btstack btstack;
252 int cmp;
253 s64 bn;
254 struct metapage *mp;
255 xtpage_t *p;
256 int index;
257 lxd_t *lxd;
258 xad_t *xad, *pxd;
259 s64 size, lstart, lend, xstart, xend, pstart;
260 s64 llen, xlen, plen;
261 s64 xaddr, paddr;
262 int nlxd, npxd, maxnpxd;
263
264 npxd = xadlist->nxad = 0;
265 maxnpxd = xadlist->maxnxad;
266 pxd = xadlist->xad;
267
268 nlxd = lxdlist->nlxd;
269 lxd = lxdlist->lxd;
270
271 lstart = offsetLXD(lxd);
272 llen = lengthLXD(lxd);
273 lend = lstart + llen;
274
275 size = (ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
276 JFS_SBI(ip->i_sb)->l2bsize;
277
278 /*
279 * search for the xad entry covering the logical extent
280 */
281 search:
282 if (lstart >= size)
283 return 0;
284
285 if ((rc = xtSearch(ip, lstart, NULL, &cmp, &btstack, 0)))
286 return rc;
287
288 /*
289 * compute the physical extent covering logical extent
290 *
291 * N.B. search may have failed (e.g., hole in sparse file),
292 * and returned the index of the next entry.
293 */
294//map:
295 /* retrieve search result */
296 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
297
298 /* is xad on the next sibling page ? */
299 if (index == le16_to_cpu(p->header.nextindex)) {
300 if (p->header.flag & BT_ROOT)
301 goto mapend;
302
303 if ((bn = le64_to_cpu(p->header.next)) == 0)
304 goto mapend;
305
306 XT_PUTPAGE(mp);
307
308 /* get next sibling page */
309 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
310 if (rc)
311 return rc;
312
313 index = XTENTRYSTART;
314 }
315
316 xad = &p->xad[index];
317
318 /*
319 * is lxd covered by xad ?
320 */
321 compare:
322 xstart = offsetXAD(xad);
323 xlen = lengthXAD(xad);
324 xend = xstart + xlen;
325 xaddr = addressXAD(xad);
326
327 compare1:
328 if (xstart < lstart)
329 goto compare2;
330
331 /* (lstart <= xstart) */
332
333 /* lxd is NOT covered by xad */
334 if (lend <= xstart) {
335 /*
336 * get next lxd
337 */
338 if (--nlxd == 0)
339 goto mapend;
340 lxd++;
341
342 lstart = offsetLXD(lxd);
343 llen = lengthLXD(lxd);
344 lend = lstart + llen;
345 if (lstart >= size)
346 goto mapend;
347
348 /* compare with the current xad */
349 goto compare1;
350 }
351 /* lxd is covered by xad */
352 else { /* (xstart < lend) */
353
354 /* initialize new pxd */
355 pstart = xstart;
356 plen = min(lend - xstart, xlen);
357 paddr = xaddr;
358
359 goto cover;
360 }
361
362 /* (xstart < lstart) */
363 compare2:
364 /* lxd is covered by xad */
365 if (lstart < xend) {
366 /* initialize new pxd */
367 pstart = lstart;
368 plen = min(xend - lstart, llen);
369 paddr = xaddr + (lstart - xstart);
370
371 goto cover;
372 }
373 /* lxd is NOT covered by xad */
374 else { /* (xend <= lstart) */
375
376 /*
377 * get next xad
378 *
379 * linear search next xad covering lxd on
380 * the current xad page, and then tree search
381 */
382 if (index == le16_to_cpu(p->header.nextindex) - 1) {
383 if (p->header.flag & BT_ROOT)
384 goto mapend;
385
386 XT_PUTPAGE(mp);
387 goto search;
388 } else {
389 index++;
390 xad++;
391
392 /* compare with new xad */
393 goto compare;
394 }
395 }
396
397 /*
398 * lxd is covered by xad and a new pxd has been initialized
399 * (lstart <= xstart < lend) or (xstart < lstart < xend)
400 */
401 cover:
402 /* finalize pxd corresponding to current xad */
403 XT_PUTENTRY(pxd, xad->flag, pstart, plen, paddr);
404
405 if (++npxd >= maxnpxd)
406 goto mapend;
407 pxd++;
408
409 /*
410 * lxd is fully covered by xad
411 */
412 if (lend <= xend) {
413 /*
414 * get next lxd
415 */
416 if (--nlxd == 0)
417 goto mapend;
418 lxd++;
419
420 lstart = offsetLXD(lxd);
421 llen = lengthLXD(lxd);
422 lend = lstart + llen;
423 if (lstart >= size)
424 goto mapend;
425
426 /*
427 * test for old xad covering new lxd
428 * (old xstart < new lstart)
429 */
430 goto compare2;
431 }
432 /*
433 * lxd is partially covered by xad
434 */
435 else { /* (xend < lend) */
436
437 /*
438 * get next xad
439 *
440 * linear search next xad covering lxd on
441 * the current xad page, and then next xad page search
442 */
443 if (index == le16_to_cpu(p->header.nextindex) - 1) {
444 if (p->header.flag & BT_ROOT)
445 goto mapend;
446
447 if ((bn = le64_to_cpu(p->header.next)) == 0)
448 goto mapend;
449
450 XT_PUTPAGE(mp);
451
452 /* get next sibling page */
453 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
454 if (rc)
455 return rc;
456
457 index = XTENTRYSTART;
458 xad = &p->xad[index];
459 } else {
460 index++;
461 xad++;
462 }
463
464 /*
465 * test for new xad covering old lxd
466 * (old lstart < new xstart)
467 */
468 goto compare;
469 }
470
471 mapend:
472 xadlist->nxad = npxd;
473
474//out:
475 XT_PUTPAGE(mp);
476
477 return rc;
478}
479
480
481/* 220/*
482 * xtSearch() 221 * xtSearch()
483 * 222 *
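xtLookupList(), deleted above together with its lxd_t plumbing, walked two ordered, disjoint extent lists and emitted, for each logical extent, the physical extents covering it, clipped at both ends. Stripped of the btree paging and goto control flow, the core is a two-pointer interval intersection; a sketch with simplified extent types (not the real JFS structures):

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for JFS lxd (logical) and xad (physical) extents. */
struct ext { int64_t off; int64_t len; int64_t addr; };

/* Intersect ordered, disjoint logical extents with ordered, disjoint
 * physical extents, as xtLookupList() did with gotos and btree paging. */
static int cover(const struct ext *lxd, int nlxd,
		 const struct ext *xad, int nxad,
		 struct ext *out, int maxout)
{
	int i = 0, j = 0, n = 0;

	while (i < nlxd && j < nxad && n < maxout) {
		int64_t lstart = lxd[i].off, lend = lstart + lxd[i].len;
		int64_t xstart = xad[j].off, xend = xstart + xad[j].len;
		int64_t start = lstart > xstart ? lstart : xstart;
		int64_t end = lend < xend ? lend : xend;

		if (start < end) {	/* overlap: emit a clipped pxd */
			out[n].off = start;
			out[n].len = end - start;
			out[n].addr = xad[j].addr + (start - xstart);
			n++;
		}
		/* advance whichever extent finishes first */
		if (lend <= xend)
			i++;
		else
			j++;
	}
	return n;
}

int main(void)
{
	struct ext lxd[] = { { 0, 4, 0 }, { 10, 6, 0 } };
	struct ext xad[] = { { 0, 8, 100 }, { 12, 8, 200 } };
	struct ext out[8];
	int n = cover(lxd, 2, xad, 2, out, 8), k;

	for (k = 0; k < n; k++)
		printf("pxd: off=%lld len=%lld addr=%lld\n",
		       (long long)out[k].off, (long long)out[k].len,
		       (long long)out[k].addr);
	return 0;
}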
@@ -846,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */
846 hint = addressXAD(xad) + lengthXAD(xad) - 1; 585 hint = addressXAD(xad) + lengthXAD(xad) - 1;
847 } else 586 } else
848 hint = 0; 587 hint = 0;
849 if ((rc = DQUOT_ALLOC_BLOCK(ip, xlen))) 588 if ((rc = vfs_dq_alloc_block(ip, xlen)))
850 goto out; 589 goto out;
851 if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { 590 if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
852 DQUOT_FREE_BLOCK(ip, xlen); 591 vfs_dq_free_block(ip, xlen);
853 goto out; 592 goto out;
854 } 593 }
855 } 594 }
@@ -878,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */
878 /* undo data extent allocation */ 617 /* undo data extent allocation */
879 if (*xaddrp == 0) { 618 if (*xaddrp == 0) {
880 dbFree(ip, xaddr, (s64) xlen); 619 dbFree(ip, xaddr, (s64) xlen);
881 DQUOT_FREE_BLOCK(ip, xlen); 620 vfs_dq_free_block(ip, xlen);
882 } 621 }
883 return rc; 622 return rc;
884 } 623 }
@@ -1246,7 +985,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1246 rbn = addressPXD(pxd); 985 rbn = addressPXD(pxd);
1247 986
1248 /* Allocate blocks to quota. */ 987 /* Allocate blocks to quota. */
1249 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { 988 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
1250 rc = -EDQUOT; 989 rc = -EDQUOT;
1251 goto clean_up; 990 goto clean_up;
1252 } 991 }
@@ -1456,7 +1195,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1456 1195
1457 /* Rollback quota allocation. */ 1196 /* Rollback quota allocation. */
1458 if (quota_allocation) 1197 if (quota_allocation)
1459 DQUOT_FREE_BLOCK(ip, quota_allocation); 1198 vfs_dq_free_block(ip, quota_allocation);
1460 1199
1461 return (rc); 1200 return (rc);
1462} 1201}
@@ -1513,7 +1252,7 @@ xtSplitRoot(tid_t tid,
1513 return -EIO; 1252 return -EIO;
1514 1253
1515 /* Allocate blocks to quota. */ 1254 /* Allocate blocks to quota. */
1516 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { 1255 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
1517 release_metapage(rmp); 1256 release_metapage(rmp);
1518 return -EDQUOT; 1257 return -EDQUOT;
1519 } 1258 }
@@ -3941,7 +3680,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3941 ip->i_size = newsize; 3680 ip->i_size = newsize;
3942 3681
3943 /* update quota allocation to reflect freed blocks */ 3682 /* update quota allocation to reflect freed blocks */
3944 DQUOT_FREE_BLOCK(ip, nfreed); 3683 vfs_dq_free_block(ip, nfreed);
3945 3684
3946 /* 3685 /*
3947 * free tlock of invalidated pages 3686 * free tlock of invalidated pages
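The jfs_xtree.c hunks are a mechanical rename of DQUOT_ALLOC_BLOCK()/DQUOT_FREE_BLOCK() to vfs_dq_alloc_block()/vfs_dq_free_block(); the pattern around them is unchanged: charge quota first, then allocate disk blocks, and roll the quota charge back if the allocation fails. A sketch of that two-phase pattern with hypothetical stand-ins:

#include <stdio.h>
#include <errno.h>

/* Hypothetical stand-ins for vfs_dq_alloc_block()/vfs_dq_free_block()
 * and dbAlloc(): charge quota up front, roll back on allocation failure. */
static long quota_used, quota_limit = 100, blocks_free = 50;

static int dq_alloc_block(long n)
{
	if (quota_used + n > quota_limit)
		return -EDQUOT;
	quota_used += n;
	return 0;
}

static void dq_free_block(long n)
{
	quota_used -= n;
}

static int db_alloc(long n)
{
	if (n > blocks_free)
		return -ENOSPC;
	blocks_free -= n;
	return 0;
}

static int alloc_extent(long xlen)
{
	int rc;

	rc = dq_alloc_block(xlen);	/* charge quota first */
	if (rc)
		return rc;
	rc = db_alloc(xlen);
	if (rc)
		dq_free_block(xlen);	/* undo the quota charge */
	return rc;
}

int main(void)
{
	printf("alloc 30: %d\n", alloc_extent(30));	/* ok */
	printf("alloc 40: %d\n", alloc_extent(40));	/* -ENOSPC, rolled back */
	printf("quota_used=%ld blocks_free=%ld\n", quota_used, blocks_free);
	return 0;
}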
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 70815c8a3d6a..08c0c749b986 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -110,8 +110,6 @@ typedef union {
110 */ 110 */
111extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, 111extern int xtLookup(struct inode *ip, s64 lstart, s64 llen,
112 int *pflag, s64 * paddr, int *plen, int flag); 112 int *pflag, s64 * paddr, int *plen, int flag);
113extern int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
114 struct xadlist * xadlist, int flag);
115extern void xtInitRoot(tid_t tid, struct inode *ip); 113extern void xtInitRoot(tid_t tid, struct inode *ip);
116extern int xtInsert(tid_t tid, struct inode *ip, 114extern int xtInsert(tid_t tid, struct inode *ip,
117 int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag); 115 int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index b4de56b851e4..514ee2edb92a 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -35,7 +35,7 @@
35/* 35/*
36 * forward references 36 * forward references
37 */ 37 */
38struct dentry_operations jfs_ci_dentry_operations; 38const struct dentry_operations jfs_ci_dentry_operations;
39 39
40static s64 commitZeroLink(tid_t, struct inode *); 40static s64 commitZeroLink(tid_t, struct inode *);
41 41
@@ -356,7 +356,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
356 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); 356 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
357 357
358 /* Init inode for quota operations. */ 358 /* Init inode for quota operations. */
359 DQUOT_INIT(ip); 359 vfs_dq_init(ip);
360 360
361 /* directory must be empty to be removed */ 361 /* directory must be empty to be removed */
362 if (!dtEmpty(ip)) { 362 if (!dtEmpty(ip)) {
@@ -483,7 +483,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
483 jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); 483 jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
484 484
485 /* Init inode for quota operations. */ 485 /* Init inode for quota operations. */
486 DQUOT_INIT(ip); 486 vfs_dq_init(ip);
487 487
488 if ((rc = get_UCSname(&dname, dentry))) 488 if ((rc = get_UCSname(&dname, dentry)))
489 goto out; 489 goto out;
@@ -1136,7 +1136,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1136 } else if (new_ip) { 1136 } else if (new_ip) {
1137 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); 1137 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
1138 /* Init inode for quota operations. */ 1138 /* Init inode for quota operations. */
1139 DQUOT_INIT(new_ip); 1139 vfs_dq_init(new_ip);
1140 } 1140 }
1141 1141
1142 /* 1142 /*
@@ -1595,7 +1595,7 @@ out:
1595 return result; 1595 return result;
1596} 1596}
1597 1597
1598struct dentry_operations jfs_ci_dentry_operations = 1598const struct dentry_operations jfs_ci_dentry_operations =
1599{ 1599{
1600 .d_hash = jfs_ci_hash, 1600 .d_hash = jfs_ci_hash,
1601 .d_compare = jfs_ci_compare, 1601 .d_compare = jfs_ci_compare,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b37d1f78b854..6f21adf9479a 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -29,6 +29,7 @@
29#include <linux/posix_acl.h> 29#include <linux/posix_acl.h>
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 31#include <linux/exportfs.h>
32#include <linux/crc32.h>
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <linux/seq_file.h> 34#include <linux/seq_file.h>
34 35
@@ -168,6 +169,9 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
168 buf->f_files = maxinodes; 169 buf->f_files = maxinodes;
169 buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) - 170 buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) -
170 atomic_read(&imap->im_numfree)); 171 atomic_read(&imap->im_numfree));
172 buf->f_fsid.val[0] = (u32)crc32_le(0, sbi->uuid, sizeof(sbi->uuid)/2);
173 buf->f_fsid.val[1] = (u32)crc32_le(0, sbi->uuid + sizeof(sbi->uuid)/2,
174 sizeof(sbi->uuid)/2);
171 175
172 buf->f_namelen = JFS_NAME_MAX; 176 buf->f_namelen = JFS_NAME_MAX;
173 return 0; 177 return 0;
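jfs_statfs() now derives f_fsid by CRC32-ing each half of the superblock uuid into a u32. A userspace sketch using a bitwise reflected CRC-32 (polynomial 0xEDB88320, intended to mirror crc32_le() semantics for a zero seed) over a made-up 16-byte uuid:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Bitwise reflected CRC-32 (poly 0xEDB88320); slow but dependency-free,
 * matching the update rule behind the kernel's crc32_le(). */
static uint32_t crc32_le(uint32_t crc, const uint8_t *p, size_t len)
{
	int bit;

	while (len--) {
		crc ^= *p++;
		for (bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
	}
	return crc;
}

int main(void)
{
	uint8_t uuid[16] = { 0xde, 0xad, 0xbe, 0xef, 1, 2, 3, 4,
			     5, 6, 7, 8, 0xca, 0xfe, 0xba, 0xbe };
	uint32_t fsid[2];

	/* As in jfs_statfs(): one CRC per uuid half. */
	fsid[0] = crc32_le(0, uuid, sizeof(uuid) / 2);
	fsid[1] = crc32_le(0, uuid + sizeof(uuid) / 2, sizeof(uuid) / 2);
	printf("f_fsid = %08x:%08x\n", fsid[0], fsid[1]);
	return 0;
}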
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 9b7f2cdaae0a..61dfa8173ebc 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -260,14 +260,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
260 nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; 260 nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
261 261
262 /* Allocate new blocks to quota. */ 262 /* Allocate new blocks to quota. */
263 if (DQUOT_ALLOC_BLOCK(ip, nblocks)) { 263 if (vfs_dq_alloc_block(ip, nblocks)) {
264 return -EDQUOT; 264 return -EDQUOT;
265 } 265 }
266 266
267 rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); 267 rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
268 if (rc) { 268 if (rc) {
269 /*Rollback quota allocation. */ 269 /*Rollback quota allocation. */
270 DQUOT_FREE_BLOCK(ip, nblocks); 270 vfs_dq_free_block(ip, nblocks);
271 return rc; 271 return rc;
272 } 272 }
273 273
@@ -332,7 +332,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
332 332
333 failed: 333 failed:
334 /* Rollback quota allocation. */ 334 /* Rollback quota allocation. */
335 DQUOT_FREE_BLOCK(ip, nblocks); 335 vfs_dq_free_block(ip, nblocks);
336 336
337 dbFree(ip, blkno, nblocks); 337 dbFree(ip, blkno, nblocks);
338 return rc; 338 return rc;
@@ -538,7 +538,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
538 538
539 if (blocks_needed > current_blocks) { 539 if (blocks_needed > current_blocks) {
540 /* Allocate new blocks to quota. */ 540 /* Allocate new blocks to quota. */
541 if (DQUOT_ALLOC_BLOCK(inode, blocks_needed)) 541 if (vfs_dq_alloc_block(inode, blocks_needed))
542 return -EDQUOT; 542 return -EDQUOT;
543 543
544 quota_allocation = blocks_needed; 544 quota_allocation = blocks_needed;
@@ -602,7 +602,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
602 clean_up: 602 clean_up:
603 /* Rollback quota allocation */ 603 /* Rollback quota allocation */
604 if (quota_allocation) 604 if (quota_allocation)
605 DQUOT_FREE_BLOCK(inode, quota_allocation); 605 vfs_dq_free_block(inode, quota_allocation);
606 606
607 return (rc); 607 return (rc);
608} 608}
@@ -677,7 +677,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
677 677
678 /* If old blocks exist, they must be removed from quota allocation. */ 678 /* If old blocks exist, they must be removed from quota allocation. */
679 if (old_blocks) 679 if (old_blocks)
680 DQUOT_FREE_BLOCK(inode, old_blocks); 680 vfs_dq_free_block(inode, old_blocks);
681 681
682 inode->i_ctime = CURRENT_TIME; 682 inode->i_ctime = CURRENT_TIME;
683 683
diff --git a/fs/libfs.c b/fs/libfs.c
index 49b44099dabb..4910a36f516e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -44,7 +44,7 @@ static int simple_delete_dentry(struct dentry *dentry)
44 */ 44 */
45struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 45struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
46{ 46{
47 static struct dentry_operations simple_dentry_operations = { 47 static const struct dentry_operations simple_dentry_operations = {
48 .d_delete = simple_delete_dentry, 48 .d_delete = simple_delete_dentry,
49 }; 49 };
50 50
@@ -242,7 +242,8 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
242 d_instantiate(dentry, root); 242 d_instantiate(dentry, root);
243 s->s_root = dentry; 243 s->s_root = dentry;
244 s->s_flags |= MS_ACTIVE; 244 s->s_flags |= MS_ACTIVE;
245 return simple_set_mnt(mnt, s); 245 simple_set_mnt(mnt, s);
246 return 0;
246 247
247Enomem: 248Enomem:
248 up_write(&s->s_umount); 249 up_write(&s->s_umount);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 5e2c4d5ac827..6d5d4a4169e5 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -16,6 +16,8 @@
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18 18
19#include <asm/unaligned.h>
20
19#define NLMDBG_FACILITY NLMDBG_MONITOR 21#define NLMDBG_FACILITY NLMDBG_MONITOR
20#define NSM_PROGRAM 100024 22#define NSM_PROGRAM 100024
21#define NSM_VERSION 1 23#define NSM_VERSION 1
@@ -274,10 +276,12 @@ static void nsm_init_private(struct nsm_handle *nsm)
274{ 276{
275 u64 *p = (u64 *)&nsm->sm_priv.data; 277 u64 *p = (u64 *)&nsm->sm_priv.data;
276 struct timespec ts; 278 struct timespec ts;
279 s64 ns;
277 280
278 ktime_get_ts(&ts); 281 ktime_get_ts(&ts);
279 *p++ = timespec_to_ns(&ts); 282 ns = timespec_to_ns(&ts);
280 *p = (unsigned long)nsm; 283 put_unaligned(ns, p);
284 put_unaligned((unsigned long)nsm, p + 1);
281} 285}
282 286
283static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, 287static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
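nsm_init_private() switches to put_unaligned() because sm_priv.data is a plain byte array with no alignment guarantee, and a direct 64-bit store through a cast pointer can fault on strict-alignment architectures. In portable C the equivalent is memcpy(), which compilers lower to an alignment-safe store; a sketch:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define SM_PRIV_SIZE 16		/* like the NSM cookie: an opaque byte array */

/* Portable stand-in for put_unaligned(): memcpy() never assumes
 * alignment, so storing a u64 at any byte offset is safe. */
static void put_unaligned_u64(uint64_t v, void *p)
{
	memcpy(p, &v, sizeof(v));
}

int main(void)
{
	char priv[SM_PRIV_SIZE];
	uint64_t ns = 1234567890123ull;	/* e.g. a timestamp in ns */
	uint64_t tag = 0xdeadbeefull;	/* e.g. a pointer-derived tag */

	/* priv has char alignment only; *(uint64_t *)priv = ns would be
	 * undefined behaviour and can trap on strict-alignment CPUs. */
	put_unaligned_u64(ns, priv);
	put_unaligned_u64(tag, priv + 8);

	printf("cookie bytes: ");
	for (size_t i = 0; i < sizeof(priv); i++)
		printf("%02x", (unsigned char)priv[i]);
	printf("\n");
	return 0;
}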
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 64f1c31b5853..abf83881f68a 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -53,17 +53,6 @@ static struct svc_rqst *nlmsvc_rqst;
53unsigned long nlmsvc_timeout; 53unsigned long nlmsvc_timeout;
54 54
55/* 55/*
56 * If the kernel has IPv6 support available, always listen for
57 * both AF_INET and AF_INET6 requests.
58 */
59#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
60 defined(CONFIG_SUNRPC_REGISTER_V4)
61static const sa_family_t nlmsvc_family = AF_INET6;
62#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
63static const sa_family_t nlmsvc_family = AF_INET;
64#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
65
66/*
67 * These can be set at insmod time (useful for NFS as root filesystem), 56 * These can be set at insmod time (useful for NFS as root filesystem),
68 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 57 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
69 */ 58 */
@@ -204,19 +193,30 @@ lockd(void *vrqstp)
204 return 0; 193 return 0;
205} 194}
206 195
207static int create_lockd_listener(struct svc_serv *serv, char *name, 196static int create_lockd_listener(struct svc_serv *serv, const char *name,
208 unsigned short port) 197 const int family, const unsigned short port)
209{ 198{
210 struct svc_xprt *xprt; 199 struct svc_xprt *xprt;
211 200
212 xprt = svc_find_xprt(serv, name, 0, 0); 201 xprt = svc_find_xprt(serv, name, family, 0);
213 if (xprt == NULL) 202 if (xprt == NULL)
214 return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); 203 return svc_create_xprt(serv, name, family, port,
215 204 SVC_SOCK_DEFAULTS);
216 svc_xprt_put(xprt); 205 svc_xprt_put(xprt);
217 return 0; 206 return 0;
218} 207}
219 208
209static int create_lockd_family(struct svc_serv *serv, const int family)
210{
211 int err;
212
213 err = create_lockd_listener(serv, "udp", family, nlm_udpport);
214 if (err < 0)
215 return err;
216
217 return create_lockd_listener(serv, "tcp", family, nlm_tcpport);
218}
219
220/* 220/*
221 * Ensure there are active UDP and TCP listeners for lockd. 221 * Ensure there are active UDP and TCP listeners for lockd.
222 * 222 *
@@ -232,13 +232,15 @@ static int make_socks(struct svc_serv *serv)
232 static int warned; 232 static int warned;
233 int err; 233 int err;
234 234
235 err = create_lockd_listener(serv, "udp", nlm_udpport); 235 err = create_lockd_family(serv, PF_INET);
236 if (err < 0) 236 if (err < 0)
237 goto out_err; 237 goto out_err;
238 238
239 err = create_lockd_listener(serv, "tcp", nlm_tcpport); 239#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
240 if (err < 0) 240 err = create_lockd_family(serv, PF_INET6);
241 if (err < 0 && err != -EAFNOSUPPORT)
241 goto out_err; 242 goto out_err;
243#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
242 244
243 warned = 0; 245 warned = 0;
244 return 0; 246 return 0;
@@ -274,7 +276,7 @@ int lockd_up(void)
274 "lockd_up: no pid, %d users??\n", nlmsvc_users); 276 "lockd_up: no pid, %d users??\n", nlmsvc_users);
275 277
276 error = -ENOMEM; 278 error = -ENOMEM;
277 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); 279 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
278 if (!serv) { 280 if (!serv) {
279 printk(KERN_WARNING "lockd_up: create service failed\n"); 281 printk(KERN_WARNING "lockd_up: create service failed\n");
280 goto out; 282 goto out;
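With the hard-wired nlmsvc_family gone, make_socks() now creates PF_INET listeners unconditionally and PF_INET6 listeners only when IPv6 is configured, treating -EAFNOSUPPORT from the v6 attempt as non-fatal. A userspace sketch of the same tolerate-missing-IPv6 pattern over BSD sockets (the helper name is made up):

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>

/* Sketch of create_lockd_family(): one UDP and one TCP listener per
 * address family, mirroring the paired svc_create_xprt() calls. */
static int create_family(int family)
{
	static const int types[] = { SOCK_DGRAM, SOCK_STREAM };
	int i;

	for (i = 0; i < 2; i++) {
		int fd = socket(family, types[i], 0);

		if (fd < 0)
			return -errno;
		close(fd);	/* a real server would bind() and listen() */
	}
	return 0;
}

int main(void)
{
	int err = create_family(PF_INET);

	if (err < 0) {
		fprintf(stderr, "PF_INET: %s\n", strerror(-err));
		return 1;
	}
	/* IPv6 is optional: a host without it yields EAFNOSUPPORT,
	 * which make_socks() now deliberately ignores. */
	err = create_family(PF_INET6);
	if (err < 0 && err != -EAFNOSUPPORT) {
		fprintf(stderr, "PF_INET6: %s\n", strerror(-err));
		return 1;
	}
	printf("listeners created (ipv6 %s)\n",
	       err == -EAFNOSUPPORT ? "unsupported" : "ok");
	return 0;
}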
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index d1d1eb84679d..daad3c2740db 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * 5 *
6 * Copyright (C) 1996 Gertjan van Wingerde (gertjan@cs.vu.nl) 6 * Copyright (C) 1996 Gertjan van Wingerde
7 * Minix V2 fs support. 7 * Minix V2 fs support.
8 * 8 *
9 * Modified for 680x0 by Andreas Schwab 9 * Modified for 680x0 by Andreas Schwab
@@ -321,15 +321,20 @@ out:
321 321
322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) 322static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
323{ 323{
324 struct minix_sb_info *sbi = minix_sb(dentry->d_sb); 324 struct super_block *sb = dentry->d_sb;
325 buf->f_type = dentry->d_sb->s_magic; 325 struct minix_sb_info *sbi = minix_sb(sb);
326 buf->f_bsize = dentry->d_sb->s_blocksize; 326 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
327 buf->f_type = sb->s_magic;
328 buf->f_bsize = sb->s_blocksize;
327 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; 329 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
328 buf->f_bfree = minix_count_free_blocks(sbi); 330 buf->f_bfree = minix_count_free_blocks(sbi);
329 buf->f_bavail = buf->f_bfree; 331 buf->f_bavail = buf->f_bfree;
330 buf->f_files = sbi->s_ninodes; 332 buf->f_files = sbi->s_ninodes;
331 buf->f_ffree = minix_count_free_inodes(sbi); 333 buf->f_ffree = minix_count_free_inodes(sbi);
332 buf->f_namelen = sbi->s_namelen; 334 buf->f_namelen = sbi->s_namelen;
335 buf->f_fsid.val[0] = (u32)id;
336 buf->f_fsid.val[1] = (u32)(id >> 32);
337
333 return 0; 338 return 0;
334} 339}
335 340
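minix_statfs() now fills f_fsid from the backing device number: huge_encode_dev() packs major/minor into a u64, which is then split across the two u32 f_fsid words. A sketch of the packing, assuming the kernel's new_encode_dev() bit layout:

#include <stdio.h>
#include <stdint.h>

/* Kernel new_encode_dev()/huge_encode_dev() layout: the low 8 minor
 * bits, then a 12-bit major, then the remaining minor bits above. */
static uint64_t huge_encode_dev(unsigned int major, unsigned int minor)
{
	return (minor & 0xff) | (major << 8) |
	       ((uint64_t)(minor & ~0xffu) << 12);
}

int main(void)
{
	uint64_t id = huge_encode_dev(8, 1);	/* e.g. /dev/sda1 -> 0x801 */
	uint32_t fsid[2];

	/* As in minix_statfs(): split the 64-bit id across f_fsid. */
	fsid[0] = (uint32_t)id;
	fsid[1] = (uint32_t)(id >> 32);
	printf("f_fsid = %08x:%08x\n", fsid[0], fsid[1]);
	return 0;
}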
diff --git a/fs/mpage.c b/fs/mpage.c
index 16c3ef37eae3..680ba60863ff 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
82 bio_put(bio); 82 bio_put(bio);
83} 83}
84 84
85struct bio *mpage_bio_submit(int rw, struct bio *bio) 85static struct bio *mpage_bio_submit(int rw, struct bio *bio)
86{ 86{
87 bio->bi_end_io = mpage_end_io_read; 87 bio->bi_end_io = mpage_end_io_read;
88 if (rw == WRITE) 88 if (rw == WRITE)
@@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio)
90 submit_bio(rw, bio); 90 submit_bio(rw, bio);
91 return NULL; 91 return NULL;
92} 92}
93EXPORT_SYMBOL(mpage_bio_submit);
94 93
95static struct bio * 94static struct bio *
96mpage_alloc(struct block_device *bdev, 95mpage_alloc(struct block_device *bdev,
@@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage);
439 * just allocate full-size (16-page) BIOs. 438 * just allocate full-size (16-page) BIOs.
440 */ 439 */
441 440
442int __mpage_writepage(struct page *page, struct writeback_control *wbc, 441struct mpage_data {
442 struct bio *bio;
443 sector_t last_block_in_bio;
444 get_block_t *get_block;
445 unsigned use_writepage;
446};
447
448static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
443 void *data) 449 void *data)
444{ 450{
445 struct mpage_data *mpd = data; 451 struct mpage_data *mpd = data;
@@ -648,7 +654,6 @@ out:
648 mpd->bio = bio; 654 mpd->bio = bio;
649 return ret; 655 return ret;
650} 656}
651EXPORT_SYMBOL(__mpage_writepage);
652 657
653/** 658/**
654 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 659 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
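The mpage.c hunks drop both EXPORT_SYMBOL()s and make mpage_bio_submit() and __mpage_writepage() static, with struct mpage_data moving out of the shared header, so the writeback context is no longer visible outside mpage.c. A sketch of the idiom, file-local state and helpers behind a single public entry point (simplified, not the real bio code):

#include <stdio.h>

/* Sketch of the mpage.c pattern: the context struct and helper are
 * file-local ("static"); only the top-level entry point is visible
 * to other translation units. */
struct mpage_data {		/* was in a shared header; now private */
	int bio;
	long last_block_in_bio;
};

static int __mpage_writepage(struct mpage_data *mpd, int page)
{
	mpd->last_block_in_bio = page;	/* stand-in for real bio work */
	return 0;
}

/* The one symbol other files may link against. */
int mpage_writepages(int npages)
{
	struct mpage_data mpd = { 0, -1 };
	int i, ret = 0;

	for (i = 0; i < npages && !ret; i++)
		ret = __mpage_writepage(&mpd, i);
	return ret;
}

int main(void)
{
	printf("mpage_writepages: %d\n", mpage_writepages(4));
	return 0;
}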
diff --git a/fs/namei.c b/fs/namei.c
index bbc15c237558..b8433ebfae05 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -24,6 +24,7 @@
24#include <linux/fsnotify.h> 24#include <linux/fsnotify.h>
25#include <linux/personality.h> 25#include <linux/personality.h>
26#include <linux/security.h> 26#include <linux/security.h>
27#include <linux/ima.h>
27#include <linux/syscalls.h> 28#include <linux/syscalls.h>
28#include <linux/mount.h> 29#include <linux/mount.h>
29#include <linux/audit.h> 30#include <linux/audit.h>
@@ -31,6 +32,7 @@
31#include <linux/file.h> 32#include <linux/file.h>
32#include <linux/fcntl.h> 33#include <linux/fcntl.h>
33#include <linux/device_cgroup.h> 34#include <linux/device_cgroup.h>
35#include <linux/fs_struct.h>
34#include <asm/uaccess.h> 36#include <asm/uaccess.h>
35 37
36#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) 38#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
@@ -850,6 +852,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
850 if (err == -EAGAIN) 852 if (err == -EAGAIN)
851 err = inode_permission(nd->path.dentry->d_inode, 853 err = inode_permission(nd->path.dentry->d_inode,
852 MAY_EXEC); 854 MAY_EXEC);
855 if (!err)
856 err = ima_path_check(&nd->path, MAY_EXEC);
853 if (err) 857 if (err)
854 break; 858 break;
855 859
@@ -1470,7 +1474,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1470 error = security_inode_create(dir, dentry, mode); 1474 error = security_inode_create(dir, dentry, mode);
1471 if (error) 1475 if (error)
1472 return error; 1476 return error;
1473 DQUOT_INIT(dir); 1477 vfs_dq_init(dir);
1474 error = dir->i_op->create(dir, dentry, mode, nd); 1478 error = dir->i_op->create(dir, dentry, mode, nd);
1475 if (!error) 1479 if (!error)
1476 fsnotify_create(dir, dentry); 1480 fsnotify_create(dir, dentry);
@@ -1486,29 +1490,32 @@ int may_open(struct path *path, int acc_mode, int flag)
1486 if (!inode) 1490 if (!inode)
1487 return -ENOENT; 1491 return -ENOENT;
1488 1492
1489 if (S_ISLNK(inode->i_mode)) 1493 switch (inode->i_mode & S_IFMT) {
1494 case S_IFLNK:
1490 return -ELOOP; 1495 return -ELOOP;
1491 1496 case S_IFDIR:
1492 if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE)) 1497 if (acc_mode & MAY_WRITE)
1493 return -EISDIR; 1498 return -EISDIR;
1494 1499 break;
1495 /* 1500 case S_IFBLK:
1496 * FIFO's, sockets and device files are special: they don't 1501 case S_IFCHR:
1497 * actually live on the filesystem itself, and as such you
1498 * can write to them even if the filesystem is read-only.
1499 */
1500 if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
1501 flag &= ~O_TRUNC;
1502 } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
1503 if (path->mnt->mnt_flags & MNT_NODEV) 1502 if (path->mnt->mnt_flags & MNT_NODEV)
1504 return -EACCES; 1503 return -EACCES;
1505 1504 /*FALLTHRU*/
1505 case S_IFIFO:
1506 case S_IFSOCK:
1506 flag &= ~O_TRUNC; 1507 flag &= ~O_TRUNC;
1508 break;
1507 } 1509 }
1508 1510
1509 error = inode_permission(inode, acc_mode); 1511 error = inode_permission(inode, acc_mode);
1510 if (error) 1512 if (error)
1511 return error; 1513 return error;
1514
1515 error = ima_path_check(path,
1516 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
1517 if (error)
1518 return error;
1512 /* 1519 /*
1513 * An append-only file must be opened in append mode for writing. 1520 * An append-only file must be opened in append mode for writing.
1514 */ 1521 */
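The may_open() rewrite replaces a chain of S_IS*() tests with a switch on inode->i_mode & S_IFMT, using a fallthrough so block and character devices get the MNT_NODEV check plus the O_TRUNC stripping shared with FIFOs and sockets. A userspace sketch of the same classification:

#include <stdio.h>
#include <sys/stat.h>

/* Sketch of the reworked may_open() file-type switch: classify by
 * S_IFMT, with an intentional fallthrough from device nodes into the
 * "don't truncate special files" cases. */
static const char *classify(mode_t mode, int *clear_trunc)
{
	*clear_trunc = 0;
	switch (mode & S_IFMT) {
	case S_IFLNK:
		return "symlink: -ELOOP";
	case S_IFDIR:
		return "directory: no write access";
	case S_IFBLK:
	case S_IFCHR:
		/* the kernel also checks MNT_NODEV here */
		/* FALLTHRU */
	case S_IFIFO:
	case S_IFSOCK:
		*clear_trunc = 1;	/* strip O_TRUNC for special files */
		return "special file";
	}
	return "regular file";
}

int main(void)
{
	int clear;

	printf("%s\n", classify(S_IFCHR | 0600, &clear));
	printf("clear O_TRUNC: %d\n", clear);
	printf("%s\n", classify(S_IFREG | 0644, &clear));
	return 0;
}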
@@ -1544,7 +1551,7 @@ int may_open(struct path *path, int acc_mode, int flag)
1544 error = security_path_truncate(path, 0, 1551 error = security_path_truncate(path, 0,
1545 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); 1552 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1546 if (!error) { 1553 if (!error) {
1547 DQUOT_INIT(inode); 1554 vfs_dq_init(inode);
1548 1555
1549 error = do_truncate(dentry, 0, 1556 error = do_truncate(dentry, 0,
1550 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 1557 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
@@ -1555,7 +1562,7 @@ int may_open(struct path *path, int acc_mode, int flag)
1555 return error; 1562 return error;
1556 } else 1563 } else
1557 if (flag & FMODE_WRITE) 1564 if (flag & FMODE_WRITE)
1558 DQUOT_INIT(inode); 1565 vfs_dq_init(inode);
1559 1566
1560 return 0; 1567 return 0;
1561} 1568}
@@ -1572,7 +1579,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
1572 struct dentry *dir = nd->path.dentry; 1579 struct dentry *dir = nd->path.dentry;
1573 1580
1574 if (!IS_POSIXACL(dir->d_inode)) 1581 if (!IS_POSIXACL(dir->d_inode))
1575 mode &= ~current->fs->umask; 1582 mode &= ~current_umask();
1576 error = security_path_mknod(&nd->path, path->dentry, mode, 0); 1583 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1577 if (error) 1584 if (error)
1578 goto out_unlock; 1585 goto out_unlock;
@@ -1938,7 +1945,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1938 if (error) 1945 if (error)
1939 return error; 1946 return error;
1940 1947
1941 DQUOT_INIT(dir); 1948 vfs_dq_init(dir);
1942 error = dir->i_op->mknod(dir, dentry, mode, dev); 1949 error = dir->i_op->mknod(dir, dentry, mode, dev);
1943 if (!error) 1950 if (!error)
1944 fsnotify_create(dir, dentry); 1951 fsnotify_create(dir, dentry);
@@ -1983,7 +1990,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
1983 goto out_unlock; 1990 goto out_unlock;
1984 } 1991 }
1985 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 1992 if (!IS_POSIXACL(nd.path.dentry->d_inode))
1986 mode &= ~current->fs->umask; 1993 mode &= ~current_umask();
1987 error = may_mknod(mode); 1994 error = may_mknod(mode);
1988 if (error) 1995 if (error)
1989 goto out_dput; 1996 goto out_dput;
@@ -2037,7 +2044,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2037 if (error) 2044 if (error)
2038 return error; 2045 return error;
2039 2046
2040 DQUOT_INIT(dir); 2047 vfs_dq_init(dir);
2041 error = dir->i_op->mkdir(dir, dentry, mode); 2048 error = dir->i_op->mkdir(dir, dentry, mode);
2042 if (!error) 2049 if (!error)
2043 fsnotify_mkdir(dir, dentry); 2050 fsnotify_mkdir(dir, dentry);
@@ -2061,7 +2068,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
2061 goto out_unlock; 2068 goto out_unlock;
2062 2069
2063 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 2070 if (!IS_POSIXACL(nd.path.dentry->d_inode))
2064 mode &= ~current->fs->umask; 2071 mode &= ~current_umask();
2065 error = mnt_want_write(nd.path.mnt); 2072 error = mnt_want_write(nd.path.mnt);
2066 if (error) 2073 if (error)
2067 goto out_dput; 2074 goto out_dput;
@@ -2123,7 +2130,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2123 if (!dir->i_op->rmdir) 2130 if (!dir->i_op->rmdir)
2124 return -EPERM; 2131 return -EPERM;
2125 2132
2126 DQUOT_INIT(dir); 2133 vfs_dq_init(dir);
2127 2134
2128 mutex_lock(&dentry->d_inode->i_mutex); 2135 mutex_lock(&dentry->d_inode->i_mutex);
2129 dentry_unhash(dentry); 2136 dentry_unhash(dentry);
@@ -2210,7 +2217,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2210 if (!dir->i_op->unlink) 2217 if (!dir->i_op->unlink)
2211 return -EPERM; 2218 return -EPERM;
2212 2219
2213 DQUOT_INIT(dir); 2220 vfs_dq_init(dir);
2214 2221
2215 mutex_lock(&dentry->d_inode->i_mutex); 2222 mutex_lock(&dentry->d_inode->i_mutex);
2216 if (d_mountpoint(dentry)) 2223 if (d_mountpoint(dentry))
@@ -2321,7 +2328,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2321 if (error) 2328 if (error)
2322 return error; 2329 return error;
2323 2330
2324 DQUOT_INIT(dir); 2331 vfs_dq_init(dir);
2325 error = dir->i_op->symlink(dir, dentry, oldname); 2332 error = dir->i_op->symlink(dir, dentry, oldname);
2326 if (!error) 2333 if (!error)
2327 fsnotify_create(dir, dentry); 2334 fsnotify_create(dir, dentry);
@@ -2405,7 +2412,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
2405 return error; 2412 return error;
2406 2413
2407 mutex_lock(&inode->i_mutex); 2414 mutex_lock(&inode->i_mutex);
2408 DQUOT_INIT(dir); 2415 vfs_dq_init(dir);
2409 error = dir->i_op->link(old_dentry, dir, new_dentry); 2416 error = dir->i_op->link(old_dentry, dir, new_dentry);
2410 mutex_unlock(&inode->i_mutex); 2417 mutex_unlock(&inode->i_mutex);
2411 if (!error) 2418 if (!error)
@@ -2604,8 +2611,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2604 if (!old_dir->i_op->rename) 2611 if (!old_dir->i_op->rename)
2605 return -EPERM; 2612 return -EPERM;
2606 2613
2607 DQUOT_INIT(old_dir); 2614 vfs_dq_init(old_dir);
2608 DQUOT_INIT(new_dir); 2615 vfs_dq_init(new_dir);
2609 2616
2610 old_name = fsnotify_oldname_init(old_dentry->d_name.name); 2617 old_name = fsnotify_oldname_init(old_dentry->d_name.name);
2611 2618
@@ -2891,10 +2898,3 @@ EXPORT_SYMBOL(vfs_symlink);
2891EXPORT_SYMBOL(vfs_unlink); 2898EXPORT_SYMBOL(vfs_unlink);
2892EXPORT_SYMBOL(dentry_unhash); 2899EXPORT_SYMBOL(dentry_unhash);
2893EXPORT_SYMBOL(generic_readlink); 2900EXPORT_SYMBOL(generic_readlink);
2894
2895/* to be mentioned only in INIT_TASK */
2896struct fs_struct init_fs = {
2897 .count = ATOMIC_INIT(1),
2898 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
2899 .umask = 0022,
2900};
diff --git a/fs/namespace.c b/fs/namespace.c
index 06f8e63f6cb1..c6f54e4c4290 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
27#include <linux/ramfs.h> 27#include <linux/ramfs.h>
28#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/idr.h> 29#include <linux/idr.h>
30#include <linux/fs_struct.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/unistd.h> 32#include <asm/unistd.h>
32#include "pnode.h" 33#include "pnode.h"
@@ -397,11 +398,10 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
397 spin_unlock(&vfsmount_lock); 398 spin_unlock(&vfsmount_lock);
398} 399}
399 400
400int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 401void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
401{ 402{
402 mnt->mnt_sb = sb; 403 mnt->mnt_sb = sb;
403 mnt->mnt_root = dget(sb->s_root); 404 mnt->mnt_root = dget(sb->s_root);
404 return 0;
405} 405}
406 406
407EXPORT_SYMBOL(simple_set_mnt); 407EXPORT_SYMBOL(simple_set_mnt);
@@ -780,6 +780,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
780 { MNT_NOATIME, ",noatime" }, 780 { MNT_NOATIME, ",noatime" },
781 { MNT_NODIRATIME, ",nodiratime" }, 781 { MNT_NODIRATIME, ",nodiratime" },
782 { MNT_RELATIME, ",relatime" }, 782 { MNT_RELATIME, ",relatime" },
783 { MNT_STRICTATIME, ",strictatime" },
783 { 0, NULL } 784 { 0, NULL }
784 }; 785 };
785 const struct proc_fs_info *fs_infop; 786 const struct proc_fs_info *fs_infop;
@@ -1919,6 +1920,9 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1919 if (data_page) 1920 if (data_page)
1920 ((char *)data_page)[PAGE_SIZE - 1] = 0; 1921 ((char *)data_page)[PAGE_SIZE - 1] = 0;
1921 1922
1923 /* Default to relatime */
1924 mnt_flags |= MNT_RELATIME;
1925
1922 /* Separate the per-mountpoint flags */ 1926 /* Separate the per-mountpoint flags */
1923 if (flags & MS_NOSUID) 1927 if (flags & MS_NOSUID)
1924 mnt_flags |= MNT_NOSUID; 1928 mnt_flags |= MNT_NOSUID;
@@ -1930,13 +1934,14 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1930 mnt_flags |= MNT_NOATIME; 1934 mnt_flags |= MNT_NOATIME;
1931 if (flags & MS_NODIRATIME) 1935 if (flags & MS_NODIRATIME)
1932 mnt_flags |= MNT_NODIRATIME; 1936 mnt_flags |= MNT_NODIRATIME;
1933 if (flags & MS_RELATIME) 1937 if (flags & MS_STRICTATIME)
1934 mnt_flags |= MNT_RELATIME; 1938 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
1935 if (flags & MS_RDONLY) 1939 if (flags & MS_RDONLY)
1936 mnt_flags |= MNT_READONLY; 1940 mnt_flags |= MNT_READONLY;
1937 1941
1938 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | 1942 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
1939 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); 1943 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
1944 MS_STRICTATIME);
1940 1945
1941 /* ... and get the mountpoint */ 1946 /* ... and get the mountpoint */
1942 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); 1947 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
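do_mount() now starts every mount with MNT_RELATIME set and clears it only for MS_STRICTATIME, making relative atime updates the default policy. A bitmask sketch of the translation (flag values are made up for illustration):

#include <stdio.h>

/* Made-up flag values; the logic mirrors the do_mount() hunk above. */
#define MS_NOATIME	0x01
#define MS_STRICTATIME	0x02
#define MNT_NOATIME	0x10
#define MNT_RELATIME	0x20

static unsigned int atime_flags(unsigned int ms_flags)
{
	unsigned int mnt_flags = MNT_RELATIME;	/* new default */

	if (ms_flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (ms_flags & MS_STRICTATIME)		/* opt back into strict atime */
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	return mnt_flags;
}

int main(void)
{
	printf("default:     %#x\n", atime_flags(0));		/* relatime */
	printf("noatime:     %#x\n", atime_flags(MS_NOATIME));
	printf("strictatime: %#x\n", atime_flags(MS_STRICTATIME));	/* 0 */
	return 0;
}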
@@ -2089,66 +2094,6 @@ out1:
2089} 2094}
2090 2095
2091/* 2096/*
2092 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
2093 * It can block. Requires the big lock held.
2094 */
2095void set_fs_root(struct fs_struct *fs, struct path *path)
2096{
2097 struct path old_root;
2098
2099 write_lock(&fs->lock);
2100 old_root = fs->root;
2101 fs->root = *path;
2102 path_get(path);
2103 write_unlock(&fs->lock);
2104 if (old_root.dentry)
2105 path_put(&old_root);
2106}
2107
2108/*
2109 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
2110 * It can block. Requires the big lock held.
2111 */
2112void set_fs_pwd(struct fs_struct *fs, struct path *path)
2113{
2114 struct path old_pwd;
2115
2116 write_lock(&fs->lock);
2117 old_pwd = fs->pwd;
2118 fs->pwd = *path;
2119 path_get(path);
2120 write_unlock(&fs->lock);
2121
2122 if (old_pwd.dentry)
2123 path_put(&old_pwd);
2124}
2125
2126static void chroot_fs_refs(struct path *old_root, struct path *new_root)
2127{
2128 struct task_struct *g, *p;
2129 struct fs_struct *fs;
2130
2131 read_lock(&tasklist_lock);
2132 do_each_thread(g, p) {
2133 task_lock(p);
2134 fs = p->fs;
2135 if (fs) {
2136 atomic_inc(&fs->count);
2137 task_unlock(p);
2138 if (fs->root.dentry == old_root->dentry
2139 && fs->root.mnt == old_root->mnt)
2140 set_fs_root(fs, new_root);
2141 if (fs->pwd.dentry == old_root->dentry
2142 && fs->pwd.mnt == old_root->mnt)
2143 set_fs_pwd(fs, new_root);
2144 put_fs_struct(fs);
2145 } else
2146 task_unlock(p);
2147 } while_each_thread(g, p);
2148 read_unlock(&tasklist_lock);
2149}
2150
2151/*
2152 * pivot_root Semantics: 2097 * pivot_root Semantics:
2153 * Moves the root file system of the current process to the directory put_old, 2098 * Moves the root file system of the current process to the directory put_old,
2154 * makes new_root as the new root file system of the current process, and sets 2099 * makes new_root as the new root file system of the current process, and sets
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 07e9715b8658..9c590722d87e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -79,7 +79,7 @@ static int ncp_hash_dentry(struct dentry *, struct qstr *);
79static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *); 79static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *);
80static int ncp_delete_dentry(struct dentry *); 80static int ncp_delete_dentry(struct dentry *);
81 81
82static struct dentry_operations ncp_dentry_operations = 82static const struct dentry_operations ncp_dentry_operations =
83{ 83{
84 .d_revalidate = ncp_lookup_validate, 84 .d_revalidate = ncp_lookup_validate,
85 .d_hash = ncp_hash_dentry, 85 .d_hash = ncp_hash_dentry,
@@ -87,7 +87,7 @@ static struct dentry_operations ncp_dentry_operations =
87 .d_delete = ncp_delete_dentry, 87 .d_delete = ncp_delete_dentry,
88}; 88};
89 89
90struct dentry_operations ncp_root_dentry_operations = 90const struct dentry_operations ncp_root_dentry_operations =
91{ 91{
92 .d_hash = ncp_hash_dentry, 92 .d_hash = ncp_hash_dentry,
93 .d_compare = ncp_compare_dentry, 93 .d_compare = ncp_compare_dentry,
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 36fe20d6eba2..e67f3ec07736 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -84,3 +84,11 @@ config ROOT_NFS
84 <file:Documentation/filesystems/nfsroot.txt>. 84 <file:Documentation/filesystems/nfsroot.txt>.
85 85
86 Most people say N here. 86 Most people say N here.
87
88config NFS_FSCACHE
89 bool "Provide NFS client caching support (EXPERIMENTAL)"
90 depends on EXPERIMENTAL
91 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
92 help
93 Say Y here if you want NFS data to be cached locally on disc through
94 the general filesystem cache manager
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ac6170c594a3..845159814de2 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,3 +15,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 callback.o callback_xdr.o callback_proc.o \ 15 callback.o callback_xdr.o callback_proc.o \
16 nfs4namespace.o 16 nfs4namespace.o
17nfs-$(CONFIG_SYSCTL) += sysctl.o 17nfs-$(CONFIG_SYSCTL) += sysctl.o
18nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 3e634f2a1083..a886e692ddd0 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -38,19 +38,10 @@ static struct svc_program nfs4_callback_program;
38 38
39unsigned int nfs_callback_set_tcpport; 39unsigned int nfs_callback_set_tcpport;
40unsigned short nfs_callback_tcpport; 40unsigned short nfs_callback_tcpport;
41unsigned short nfs_callback_tcpport6;
41static const int nfs_set_port_min = 0; 42static const int nfs_set_port_min = 0;
42static const int nfs_set_port_max = 65535; 43static const int nfs_set_port_max = 65535;
43 44
44/*
45 * If the kernel has IPv6 support available, always listen for
46 * both AF_INET and AF_INET6 requests.
47 */
48#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49static const sa_family_t nfs_callback_family = AF_INET6;
50#else
51static const sa_family_t nfs_callback_family = AF_INET;
52#endif
53
54static int param_set_port(const char *val, struct kernel_param *kp) 45static int param_set_port(const char *val, struct kernel_param *kp)
55{ 46{
56 char *endp; 47 char *endp;
@@ -116,19 +107,29 @@ int nfs_callback_up(void)
116 mutex_lock(&nfs_callback_mutex); 107 mutex_lock(&nfs_callback_mutex);
117 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) 108 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
118 goto out; 109 goto out;
119 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, 110 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
120 nfs_callback_family, NULL);
121 ret = -ENOMEM; 111 ret = -ENOMEM;
122 if (!serv) 112 if (!serv)
123 goto out_err; 113 goto out_err;
124 114
125 ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, 115 ret = svc_create_xprt(serv, "tcp", PF_INET,
126 SVC_SOCK_ANONYMOUS); 116 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
127 if (ret <= 0) 117 if (ret <= 0)
128 goto out_err; 118 goto out_err;
129 nfs_callback_tcpport = ret; 119 nfs_callback_tcpport = ret;
130 dprintk("NFS: Callback listener port = %u (af %u)\n", 120 dprintk("NFS: Callback listener port = %u (af %u)\n",
131 nfs_callback_tcpport, nfs_callback_family); 121 nfs_callback_tcpport, PF_INET);
122
123#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
124 ret = svc_create_xprt(serv, "tcp", PF_INET6,
125 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
126 if (ret > 0) {
127 nfs_callback_tcpport6 = ret;
128 dprintk("NFS: Callback listener port = %u (af %u)\n",
129 nfs_callback_tcpport6, PF_INET6);
130 } else if (ret != -EAFNOSUPPORT)
131 goto out_err;
132#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
132 133
133 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); 134 nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
134 if (IS_ERR(nfs_callback_info.rqst)) { 135 if (IS_ERR(nfs_callback_info.rqst)) {
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index bb25d2135ff1..e110e286a262 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -72,5 +72,6 @@ extern void nfs_callback_down(void);
72 72
73extern unsigned int nfs_callback_set_tcpport; 73extern unsigned int nfs_callback_set_tcpport;
74extern unsigned short nfs_callback_tcpport; 74extern unsigned short nfs_callback_tcpport;
75extern unsigned short nfs_callback_tcpport6;
75 76
76#endif /* __LINUX_FS_NFS_CALLBACK_H */ 77#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9b728f3565a1..75c9cd2aa119 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -45,6 +45,7 @@
45#include "delegation.h" 45#include "delegation.h"
46#include "iostat.h" 46#include "iostat.h"
47#include "internal.h" 47#include "internal.h"
48#include "fscache.h"
48 49
49#define NFSDBG_FACILITY NFSDBG_CLIENT 50#define NFSDBG_FACILITY NFSDBG_CLIENT
50 51
@@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
154 if (!IS_ERR(cred)) 155 if (!IS_ERR(cred))
155 clp->cl_machine_cred = cred; 156 clp->cl_machine_cred = cred;
156 157
158 nfs_fscache_get_client_cookie(clp);
159
157 return clp; 160 return clp;
158 161
159error_3: 162error_3:
@@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp)
187 190
188 nfs4_shutdown_client(clp); 191 nfs4_shutdown_client(clp);
189 192
193 nfs_fscache_release_client_cookie(clp);
194
190 /* -EIO all pending I/O */ 195 /* -EIO all pending I/O */
191 if (!IS_ERR(clp->cl_rpcclient)) 196 if (!IS_ERR(clp->cl_rpcclient))
192 rpc_shutdown_client(clp->cl_rpcclient); 197 rpc_shutdown_client(clp->cl_rpcclient);
@@ -224,53 +229,110 @@ void nfs_put_client(struct nfs_client *clp)
224} 229}
225 230
226#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 231#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
227static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped) 232/*
233 * Test if two ip6 socket addresses refer to the same socket by
234 * comparing relevant fields. The padding bytes specifically, are not
235 * compared. sin6_flowinfo is not compared because it only affects QoS
236 * and sin6_scope_id is only compared if the address is "link local"
237 * because "link local" addresses need only be unique to a specific
238 * link. Conversely, ordinary unicast addresses might have different
239 * sin6_scope_id.
240 *
241 * The caller should ensure both socket addresses are AF_INET6.
242 */
243static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
244 const struct sockaddr *sa2)
228{ 245{
229 switch (sa->sa_family) { 246 const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
230 default: 247 const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
231 return NULL;
232 case AF_INET6:
233 return &((const struct sockaddr_in6 *)sa)->sin6_addr;
234 break;
235 case AF_INET:
236 ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
237 addr_mapped);
238 return addr_mapped;
239 }
240}
241 248
242static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, 249 if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
243 const struct sockaddr *sa2) 250 sin1->sin6_scope_id != sin2->sin6_scope_id)
251 return 0;
252
 253 return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
254}
255#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
256static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
257 const struct sockaddr *sa2)
244{ 258{
245 const struct in6_addr *addr1;
246 const struct in6_addr *addr2;
247 struct in6_addr addr1_mapped;
248 struct in6_addr addr2_mapped;
249
250 addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
251 if (likely(addr1 != NULL)) {
252 addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
253 if (likely(addr2 != NULL))
254 return ipv6_addr_equal(addr1, addr2);
255 }
256 return 0; 259 return 0;
257} 260}
258#else 261#endif
259static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, 262
260 const struct sockaddr_in *sa2) 263/*
264 * Test if two ip4 socket addresses refer to the same socket, by
265 * comparing relevant fields. The padding bytes specifically, are
266 * not compared.
267 *
268 * The caller should ensure both socket addresses are AF_INET.
269 */
270static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
271 const struct sockaddr *sa2)
261{ 272{
262 return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr; 273 const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
274 const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
275
276 return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
263} 277}
264 278
279static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
280 const struct sockaddr *sa2)
281{
282 const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
283 const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
284
285 return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
286 (sin1->sin6_port == sin2->sin6_port);
287}
288
289static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
290 const struct sockaddr *sa2)
291{
292 const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
293 const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
294
295 return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
296 (sin1->sin_port == sin2->sin_port);
297}
298
299/*
300 * Test if two socket addresses represent the same actual socket,
301 * by comparing (only) relevant fields, excluding the port number.
302 */
265static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, 303static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
266 const struct sockaddr *sa2) 304 const struct sockaddr *sa2)
267{ 305{
268 if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET)) 306 if (sa1->sa_family != sa2->sa_family)
269 return 0; 307 return 0;
270 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, 308
271 (const struct sockaddr_in *)sa2); 309 switch (sa1->sa_family) {
310 case AF_INET:
311 return nfs_sockaddr_match_ipaddr4(sa1, sa2);
312 case AF_INET6:
313 return nfs_sockaddr_match_ipaddr6(sa1, sa2);
314 }
315 return 0;
316}
317
318/*
319 * Test if two socket addresses represent the same actual socket,
320 * by comparing (only) relevant fields, including the port number.
321 */
322static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
323 const struct sockaddr *sa2)
324{
325 if (sa1->sa_family != sa2->sa_family)
326 return 0;
327
328 switch (sa1->sa_family) {
329 case AF_INET:
330 return nfs_sockaddr_cmp_ip4(sa1, sa2);
331 case AF_INET6:
332 return nfs_sockaddr_cmp_ip6(sa1, sa2);
333 }
334 return 0;
272} 335}
273#endif
274 336
275/* 337/*
276 * Find a client by IP address and protocol version 338 * Find a client by IP address and protocol version
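The new nfs_sockaddr_*() helpers compare addresses field by field instead of memcmp() over the whole sockaddr, so padding bytes never influence the result, and sin6_scope_id participates only for link-local addresses, where scope is meaningful. A userspace sketch of the v6 comparison with standard netinet/in.h types:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Sketch of nfs_sockaddr_match_ipaddr6(): compare only the relevant
 * fields; scope ids matter only for link-local addresses. */
static int match_ipaddr6(const struct sockaddr_in6 *a,
			 const struct sockaddr_in6 *b)
{
	if (IN6_IS_ADDR_LINKLOCAL(&a->sin6_addr) &&
	    a->sin6_scope_id != b->sin6_scope_id)
		return 0;
	return memcmp(&a->sin6_addr, &b->sin6_addr,
		      sizeof(a->sin6_addr)) == 0;
}

int main(void)
{
	struct sockaddr_in6 a = { .sin6_family = AF_INET6 }, b;

	/* fe80::1 is link-local, so differing scope ids must not match */
	a.sin6_addr.s6_addr[0] = 0xfe;
	a.sin6_addr.s6_addr[1] = 0x80;
	a.sin6_addr.s6_addr[15] = 1;
	a.sin6_scope_id = 2;

	b = a;
	printf("same scope:  %d\n", match_ipaddr6(&a, &b));	/* 1 */
	b.sin6_scope_id = 3;
	printf("other scope: %d\n", match_ipaddr6(&a, &b));	/* 0 */
	return 0;
}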
@@ -344,8 +406,10 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
344static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) 406static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
345{ 407{
346 struct nfs_client *clp; 408 struct nfs_client *clp;
409 const struct sockaddr *sap = data->addr;
347 410
348 list_for_each_entry(clp, &nfs_client_list, cl_share_link) { 411 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
412 const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
349 /* Don't match clients that failed to initialise properly */ 413 /* Don't match clients that failed to initialise properly */
350 if (clp->cl_cons_state < 0) 414 if (clp->cl_cons_state < 0)
351 continue; 415 continue;
@@ -358,7 +422,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
358 continue; 422 continue;
359 423
360 /* Match the full socket address */ 424 /* Match the full socket address */
361 if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0) 425 if (!nfs_sockaddr_cmp(sap, clap))
362 continue; 426 continue;
363 427
364 atomic_inc(&clp->cl_count); 428 atomic_inc(&clp->cl_count);
@@ -701,6 +765,7 @@ static int nfs_init_server(struct nfs_server *server,
701 765
702 /* Initialise the client representation from the mount data */ 766 /* Initialise the client representation from the mount data */
703 server->flags = data->flags; 767 server->flags = data->flags;
768 server->options = data->options;
704 769
705 if (data->rsize) 770 if (data->rsize)
706 server->rsize = nfs_block_size(data->rsize, NULL); 771 server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1089,6 +1154,7 @@ static int nfs4_init_server(struct nfs_server *server,
1089 /* Initialise the client representation from the mount data */ 1154 /* Initialise the client representation from the mount data */
1090 server->flags = data->flags; 1155 server->flags = data->flags;
1091 server->caps |= NFS_CAP_ATOMIC_OPEN; 1156 server->caps |= NFS_CAP_ATOMIC_OPEN;
1157 server->options = data->options;
1092 1158
1093 /* Get a client record */ 1159 /* Get a client record */
1094 error = nfs4_set_client(server, 1160 error = nfs4_set_client(server,
@@ -1500,7 +1566,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1500 1566
1501 /* display header on line 1 */ 1567 /* display header on line 1 */
1502 if (v == &nfs_volume_list) { 1568 if (v == &nfs_volume_list) {
1503 seq_puts(m, "NV SERVER PORT DEV FSID\n"); 1569 seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
1504 return 0; 1570 return 0;
1505 } 1571 }
1506 /* display one transport per line on subsequent lines */ 1572 /* display one transport per line on subsequent lines */
@@ -1514,12 +1580,13 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1514 (unsigned long long) server->fsid.major, 1580 (unsigned long long) server->fsid.major,
1515 (unsigned long long) server->fsid.minor); 1581 (unsigned long long) server->fsid.minor);
1516 1582
1517 seq_printf(m, "v%u %s %s %-7s %-17s\n", 1583 seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
1518 clp->rpc_ops->version, 1584 clp->rpc_ops->version,
1519 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), 1585 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
1520 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), 1586 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
1521 dev, 1587 dev,
1522 fsid); 1588 fsid,
1589 nfs_server_fscache_state(server));
1523 1590
1524 return 0; 1591 return 0;
1525} 1592}
@@ -1535,8 +1602,6 @@ int __init nfs_fs_proc_init(void)
1535 if (!proc_fs_nfs) 1602 if (!proc_fs_nfs)
1536 goto error_0; 1603 goto error_0;
1537 1604
1538 proc_fs_nfs->owner = THIS_MODULE;
1539
1540 /* a file of servers with which we're dealing */ 1605 /* a file of servers with which we're dealing */
1541 p = proc_create("servers", S_IFREG|S_IRUGO, 1606 p = proc_create("servers", S_IFREG|S_IRUGO,
1542 proc_fs_nfs, &nfs_server_list_fops); 1607 proc_fs_nfs, &nfs_server_list_fops);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e35c8199f82f..370b190a09d1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -899,7 +899,7 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
899 iput(inode); 899 iput(inode);
900} 900}
901 901
902struct dentry_operations nfs_dentry_operations = { 902const struct dentry_operations nfs_dentry_operations = {
903 .d_revalidate = nfs_lookup_revalidate, 903 .d_revalidate = nfs_lookup_revalidate,
904 .d_delete = nfs_dentry_delete, 904 .d_delete = nfs_dentry_delete,
905 .d_iput = nfs_dentry_iput, 905 .d_iput = nfs_dentry_iput,
@@ -967,7 +967,7 @@ out:
967#ifdef CONFIG_NFS_V4 967#ifdef CONFIG_NFS_V4
968static int nfs_open_revalidate(struct dentry *, struct nameidata *); 968static int nfs_open_revalidate(struct dentry *, struct nameidata *);
969 969
970struct dentry_operations nfs4_dentry_operations = { 970const struct dentry_operations nfs4_dentry_operations = {
971 .d_revalidate = nfs_open_revalidate, 971 .d_revalidate = nfs_open_revalidate,
972 .d_delete = nfs_dentry_delete, 972 .d_delete = nfs_dentry_delete,
973 .d_iput = nfs_dentry_iput, 973 .d_iput = nfs_dentry_iput,
@@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1624 } else if (atomic_read(&new_dentry->d_count) > 1) 1624 } else if (atomic_read(&new_dentry->d_count) > 1)
1625 /* dentry still busy? */ 1625 /* dentry still busy? */
1626 goto out; 1626 goto out;
1627 } else 1627 }
1628 nfs_drop_nlink(new_inode);
1629 1628
1630go_ahead: 1629go_ahead:
1631 /* 1630 /*
@@ -1638,10 +1637,8 @@ go_ahead:
1638 } 1637 }
1639 nfs_inode_return_delegation(old_inode); 1638 nfs_inode_return_delegation(old_inode);
1640 1639
1641 if (new_inode != NULL) { 1640 if (new_inode != NULL)
1642 nfs_inode_return_delegation(new_inode); 1641 nfs_inode_return_delegation(new_inode);
1643 d_delete(new_dentry);
1644 }
1645 1642
1646 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, 1643 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
1647 new_dir, &new_dentry->d_name); 1644 new_dir, &new_dentry->d_name);
@@ -1650,6 +1647,8 @@ out:
1650 if (rehash) 1647 if (rehash)
1651 d_rehash(rehash); 1648 d_rehash(rehash);
1652 if (!error) { 1649 if (!error) {
1650 if (new_inode != NULL)
1651 nfs_drop_nlink(new_inode);
1653 d_move(old_dentry, new_dentry); 1652 d_move(old_dentry, new_dentry);
1654 nfs_set_verifier(new_dentry, 1653 nfs_set_verifier(new_dentry,
1655 nfs_save_change_attribute(new_dir)); 1654 nfs_save_change_attribute(new_dir));
@@ -1892,8 +1891,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
1892 cache.cred = cred; 1891 cache.cred = cred;
1893 cache.jiffies = jiffies; 1892 cache.jiffies = jiffies;
1894 status = NFS_PROTO(inode)->access(inode, &cache); 1893 status = NFS_PROTO(inode)->access(inode, &cache);
1895 if (status != 0) 1894 if (status != 0) {
1895 if (status == -ESTALE) {
1896 nfs_zap_caches(inode);
1897 if (!S_ISDIR(inode->i_mode))
1898 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
1899 }
1896 return status; 1900 return status;
1901 }
1897 nfs_access_add_cache(inode, &cache); 1902 nfs_access_add_cache(inode, &cache);
1898out: 1903out:
1899 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 1904 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 90f292b520d2..3523b895eb4b 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -35,6 +35,7 @@
35#include "delegation.h" 35#include "delegation.h"
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h"
38 39
39#define NFSDBG_FACILITY NFSDBG_FILE 40#define NFSDBG_FACILITY NFSDBG_FILE
40 41
@@ -64,11 +65,7 @@ const struct file_operations nfs_file_operations = {
64 .write = do_sync_write, 65 .write = do_sync_write,
65 .aio_read = nfs_file_read, 66 .aio_read = nfs_file_read,
66 .aio_write = nfs_file_write, 67 .aio_write = nfs_file_write,
67#ifdef CONFIG_MMU
68 .mmap = nfs_file_mmap, 68 .mmap = nfs_file_mmap,
69#else
70 .mmap = generic_file_mmap,
71#endif
72 .open = nfs_file_open, 69 .open = nfs_file_open,
73 .flush = nfs_file_flush, 70 .flush = nfs_file_flush,
74 .release = nfs_file_release, 71 .release = nfs_file_release,
@@ -141,9 +138,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
141 dentry->d_parent->d_name.name, 138 dentry->d_parent->d_name.name,
142 dentry->d_name.name); 139 dentry->d_name.name);
143 140
144 /* Ensure that dirty pages are flushed out with the right creds */
145 if (filp->f_mode & FMODE_WRITE)
146 nfs_wb_all(dentry->d_inode);
147 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 141 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
148 return nfs_release(inode, filp); 142 return nfs_release(inode, filp);
149} 143}
@@ -235,7 +229,6 @@ nfs_file_flush(struct file *file, fl_owner_t id)
235 struct nfs_open_context *ctx = nfs_file_open_context(file); 229 struct nfs_open_context *ctx = nfs_file_open_context(file);
236 struct dentry *dentry = file->f_path.dentry; 230 struct dentry *dentry = file->f_path.dentry;
237 struct inode *inode = dentry->d_inode; 231 struct inode *inode = dentry->d_inode;
238 int status;
239 232
240 dprintk("NFS: flush(%s/%s)\n", 233 dprintk("NFS: flush(%s/%s)\n",
241 dentry->d_parent->d_name.name, 234 dentry->d_parent->d_name.name,
@@ -245,11 +238,8 @@ nfs_file_flush(struct file *file, fl_owner_t id)
245 return 0; 238 return 0;
246 nfs_inc_stats(inode, NFSIOS_VFSFLUSH); 239 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
247 240
248 /* Ensure that data+attribute caches are up to date after close() */ 241 /* Flush writes to the server and return any errors */
249 status = nfs_do_fsync(ctx, inode); 242 return nfs_do_fsync(ctx, inode);
250 if (!status)
251 nfs_revalidate_inode(NFS_SERVER(inode), inode);
252 return status;
253} 243}
254 244
255static ssize_t 245static ssize_t
@@ -304,11 +294,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
304 dprintk("NFS: mmap(%s/%s)\n", 294 dprintk("NFS: mmap(%s/%s)\n",
305 dentry->d_parent->d_name.name, dentry->d_name.name); 295 dentry->d_parent->d_name.name, dentry->d_name.name);
306 296
307 status = nfs_revalidate_mapping(inode, file->f_mapping); 297 /* Note: generic_file_mmap() returns ENOSYS on nommu systems
298 * so we call that before revalidating the mapping
299 */
300 status = generic_file_mmap(file, vma);
308 if (!status) { 301 if (!status) {
309 vma->vm_ops = &nfs_file_vm_ops; 302 vma->vm_ops = &nfs_file_vm_ops;
310 vma->vm_flags |= VM_CAN_NONLINEAR; 303 status = nfs_revalidate_mapping(inode, file->f_mapping);
311 file_accessed(file);
312 } 304 }
313 return status; 305 return status;
314} 306}
@@ -354,6 +346,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
354 file->f_path.dentry->d_name.name, 346 file->f_path.dentry->d_name.name,
355 mapping->host->i_ino, len, (long long) pos); 347 mapping->host->i_ino, len, (long long) pos);
356 348
349 /*
350 * Prevent starvation issues if someone is doing a consistency
351 * sync-to-disk
352 */
353 ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
354 nfs_wait_bit_killable, TASK_KILLABLE);
355 if (ret)
356 return ret;
357
357 page = grab_cache_page_write_begin(mapping, index, flags); 358 page = grab_cache_page_write_begin(mapping, index, flags);
358 if (!page) 359 if (!page)
359 return -ENOMEM; 360 return -ENOMEM;
@@ -409,6 +410,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
409 return copied; 410 return copied;
410} 411}
411 412
413/*
414 * Partially or wholly invalidate a page
415 * - Release the private state associated with a page if undergoing complete
416 * page invalidation
417 * - Called if either PG_private or PG_fscache is set on the page
418 * - Caller holds page lock
419 */
412static void nfs_invalidate_page(struct page *page, unsigned long offset) 420static void nfs_invalidate_page(struct page *page, unsigned long offset)
413{ 421{
414 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); 422 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
@@ -417,23 +425,43 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
417 return; 425 return;
418 /* Cancel any unstarted writes on this page */ 426 /* Cancel any unstarted writes on this page */
419 nfs_wb_page_cancel(page->mapping->host, page); 427 nfs_wb_page_cancel(page->mapping->host, page);
428
429 nfs_fscache_invalidate_page(page, page->mapping->host);
420} 430}
421 431
432/*
433 * Attempt to release the private state associated with a page
434 * - Called if either PG_private or PG_fscache is set on the page
435 * - Caller holds page lock
436 * - Return true (may release page) or false (may not)
437 */
422static int nfs_release_page(struct page *page, gfp_t gfp) 438static int nfs_release_page(struct page *page, gfp_t gfp)
423{ 439{
424 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 440 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
425 441
426 /* If PagePrivate() is set, then the page is not freeable */ 442 /* If PagePrivate() is set, then the page is not freeable */
427 return 0; 443 if (PagePrivate(page))
444 return 0;
445 return nfs_fscache_release_page(page, gfp);
428} 446}
429 447
448/*
449 * Attempt to clear the private state associated with a page when an error
450 * occurs that requires the cached contents of an inode to be written back or
451 * destroyed
452 * - Called if either PG_private or fscache is set on the page
453 * - Caller holds page lock
454 * - Return 0 if successful, -error otherwise
455 */
430static int nfs_launder_page(struct page *page) 456static int nfs_launder_page(struct page *page)
431{ 457{
432 struct inode *inode = page->mapping->host; 458 struct inode *inode = page->mapping->host;
459 struct nfs_inode *nfsi = NFS_I(inode);
433 460
434 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", 461 dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
435 inode->i_ino, (long long)page_offset(page)); 462 inode->i_ino, (long long)page_offset(page));
436 463
464 nfs_fscache_wait_on_page_write(nfsi, page);
437 return nfs_wb_page(inode, page); 465 return nfs_wb_page(inode, page);
438} 466}
439 467
@@ -451,8 +479,14 @@ const struct address_space_operations nfs_file_aops = {
451 .launder_page = nfs_launder_page, 479 .launder_page = nfs_launder_page,
452}; 480};
453 481
454static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) 482/*
483 * Notification that a PTE pointing to an NFS page is about to be made
484 * writable, implying that someone is about to modify the page through a
485 * shared-writable mapping
486 */
487static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
455{ 488{
489 struct page *page = vmf->page;
456 struct file *filp = vma->vm_file; 490 struct file *filp = vma->vm_file;
457 struct dentry *dentry = filp->f_path.dentry; 491 struct dentry *dentry = filp->f_path.dentry;
458 unsigned pagelen; 492 unsigned pagelen;
@@ -464,6 +498,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
464 filp->f_mapping->host->i_ino, 498 filp->f_mapping->host->i_ino,
465 (long long)page_offset(page)); 499 (long long)page_offset(page));
466 500
501 /* make sure the cache has finished storing the page */
502 nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
503
467 lock_page(page); 504 lock_page(page);
468 mapping = page->mapping; 505 mapping = page->mapping;
469 if (mapping != dentry->d_inode->i_mapping) 506 if (mapping != dentry->d_inode->i_mapping)
@@ -483,6 +520,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
483 ret = pagelen; 520 ret = pagelen;
484out_unlock: 521out_unlock:
485 unlock_page(page); 522 unlock_page(page);
523 if (ret)
524 ret = VM_FAULT_SIGBUS;
486 return ret; 525 return ret;
487} 526}
488 527
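
This hunk moves nfs_vm_page_mkwrite() to the newer ->page_mkwrite() calling convention: the faulting page now arrives inside struct vm_fault, and failures are reported as VM_FAULT_* codes rather than -errno. A minimal sketch of a handler under the new signature (illustrative only, not part of this patch):

    static int example_page_mkwrite(struct vm_area_struct *vma,
                                    struct vm_fault *vmf)
    {
            struct page *page = vmf->page;  /* page comes via vm_fault now */
            int ret = 0;

            lock_page(page);
            /* guard against the file being truncated under us */
            if (page->mapping != vma->vm_file->f_mapping)
                    ret = VM_FAULT_SIGBUS;
            /* ... otherwise make the page writable ... */
            unlock_page(page);
            return ret;
    }
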
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
new file mode 100644
index 000000000000..5b1006480bc2
--- /dev/null
+++ b/fs/nfs/fscache-index.c
@@ -0,0 +1,337 @@
1/* NFS FS-Cache index structure definition
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19
20#include "internal.h"
21#include "fscache.h"
22
23#define NFSDBG_FACILITY NFSDBG_FSCACHE
24
25/*
26 * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks
27 * the cookie for the top-level index object for NFS into here. The top-level
 28 * index can then have other cache objects inserted into it.
29 */
30struct fscache_netfs nfs_fscache_netfs = {
31 .name = "nfs",
32 .version = 0,
33};
34
35/*
36 * Register NFS for caching
37 */
38int nfs_fscache_register(void)
39{
40 return fscache_register_netfs(&nfs_fscache_netfs);
41}
42
43/*
44 * Unregister NFS for caching
45 */
46void nfs_fscache_unregister(void)
47{
48 fscache_unregister_netfs(&nfs_fscache_netfs);
49}
50
51/*
52 * Layout of the key for an NFS server cache object.
53 */
54struct nfs_server_key {
55 uint16_t nfsversion; /* NFS protocol version */
56 uint16_t family; /* address family */
57 uint16_t port; /* IP port */
58 union {
59 struct in_addr ipv4_addr; /* IPv4 address */
60 struct in6_addr ipv6_addr; /* IPv6 address */
61 } addr[0];
62};
63
64/*
65 * Generate a key to describe a server in the main NFS index
66 * - We return the length of the key, or 0 if we can't generate one
67 */
68static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
69 void *buffer, uint16_t bufmax)
70{
71 const struct nfs_client *clp = cookie_netfs_data;
72 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
73 const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
74 struct nfs_server_key *key = buffer;
75 uint16_t len = sizeof(struct nfs_server_key);
76
 77	memset(key, 0, len);	/* zero (incl. padding) before filling in the key */
 78	key->nfsversion = clp->rpc_ops->version;
 79	key->family = clp->cl_addr.ss_family;
 80
81
82 switch (clp->cl_addr.ss_family) {
83 case AF_INET:
84 key->port = sin->sin_port;
85 key->addr[0].ipv4_addr = sin->sin_addr;
86 len += sizeof(key->addr[0].ipv4_addr);
87 break;
88
89 case AF_INET6:
90 key->port = sin6->sin6_port;
91 key->addr[0].ipv6_addr = sin6->sin6_addr;
92 len += sizeof(key->addr[0].ipv6_addr);
93 break;
94
95 default:
96 printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
97 clp->cl_addr.ss_family);
98 len = 0;
99 break;
100 }
101
102 return len;
103}
104
105/*
106 * Define the server object for FS-Cache. This is used to describe a server
107 * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
108 * server address parameters.
109 */
110const struct fscache_cookie_def nfs_fscache_server_index_def = {
111 .name = "NFS.server",
112 .type = FSCACHE_COOKIE_TYPE_INDEX,
113 .get_key = nfs_server_get_key,
114};
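
The server key above is deliberately variable-length: the address union is a zero-length trailer, so an IPv4 key carries only 4 address bytes while an IPv6 key carries 16. A self-contained userspace sketch of the same layout and length computation (a model of the technique, not kernel code):

    #include <stdio.h>
    #include <stdint.h>
    #include <sys/socket.h>
    #include <netinet/in.h>

    struct server_key {
            uint16_t nfsversion;
            uint16_t family;
            uint16_t port;
            union {
                    struct in_addr  ipv4_addr;
                    struct in6_addr ipv6_addr;
            } addr[];                       /* flexible array member */
    };

    static unsigned key_len(uint16_t family)
    {
            unsigned len = sizeof(struct server_key);

            switch (family) {
            case AF_INET:
                    return len + sizeof(struct in_addr);   /* +4 bytes */
            case AF_INET6:
                    return len + sizeof(struct in6_addr);  /* +16 bytes */
            default:
                    return 0;  /* unknown family: refuse to generate a key */
            }
    }

    int main(void)
    {
            printf("IPv4 key: %u bytes, IPv6 key: %u bytes\n",
                   key_len(AF_INET), key_len(AF_INET6));
            return 0;
    }
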
115
116/*
117 * Generate a key to describe a superblock key in the main NFS index
118 */
119static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
120 void *buffer, uint16_t bufmax)
121{
122 const struct nfs_fscache_key *key;
123 const struct nfs_server *nfss = cookie_netfs_data;
124 uint16_t len;
125
126 key = nfss->fscache_key;
127 len = sizeof(key->key) + key->key.uniq_len;
128 if (len > bufmax) {
129 len = 0;
130 } else {
131 memcpy(buffer, &key->key, sizeof(key->key));
132 memcpy(buffer + sizeof(key->key),
133 key->key.uniquifier, key->key.uniq_len);
134 }
135
136 return len;
137}
138
139/*
140 * Define the superblock object for FS-Cache. This is used to describe a
141 * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
142 * parameters that might cause a separate superblock.
143 */
144const struct fscache_cookie_def nfs_fscache_super_index_def = {
145 .name = "NFS.super",
146 .type = FSCACHE_COOKIE_TYPE_INDEX,
147 .get_key = nfs_super_get_key,
148};
149
150/*
151 * Definition of the auxiliary data attached to NFS inode storage objects
152 * within the cache.
153 *
154 * The contents of this struct are recorded in the on-disk local cache in the
155 * auxiliary data attached to the data storage object backing an inode. This
156 * permits coherency to be managed when a new inode binds to an already extant
157 * cache object.
158 */
159struct nfs_fscache_inode_auxdata {
160 struct timespec mtime;
161 struct timespec ctime;
162 loff_t size;
163 u64 change_attr;
164};
165
166/*
167 * Generate a key to describe an NFS inode in an NFS server's index
168 */
169static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
170 void *buffer, uint16_t bufmax)
171{
172 const struct nfs_inode *nfsi = cookie_netfs_data;
173 uint16_t nsize;
174
175 /* use the inode's NFS filehandle as the key */
176 nsize = nfsi->fh.size;
177 memcpy(buffer, nfsi->fh.data, nsize);
178 return nsize;
179}
180
181/*
182 * Get certain file attributes from the netfs data
183 * - This function can be absent for an index
184 * - Not permitted to return an error
185 * - The netfs data from the cookie being used as the source is presented
186 */
187static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
188 uint64_t *size)
189{
190 const struct nfs_inode *nfsi = cookie_netfs_data;
191
192 *size = nfsi->vfs_inode.i_size;
193}
194
195/*
196 * Get the auxiliary data from netfs data
197 * - This function can be absent if the index carries no state data
198 * - Should store the auxiliary data in the buffer
 199 * - Should return the amount of data stored
200 * - Not permitted to return an error
201 * - The netfs data from the cookie being used as the source is presented
202 */
203static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
204 void *buffer, uint16_t bufmax)
205{
206 struct nfs_fscache_inode_auxdata auxdata;
207 const struct nfs_inode *nfsi = cookie_netfs_data;
208
209 memset(&auxdata, 0, sizeof(auxdata));
210 auxdata.size = nfsi->vfs_inode.i_size;
211 auxdata.mtime = nfsi->vfs_inode.i_mtime;
212 auxdata.ctime = nfsi->vfs_inode.i_ctime;
213
214 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
215 auxdata.change_attr = nfsi->change_attr;
216
217 if (bufmax > sizeof(auxdata))
218 bufmax = sizeof(auxdata);
219
220 memcpy(buffer, &auxdata, bufmax);
221 return bufmax;
222}
223
224/*
225 * Consult the netfs about the state of an object
226 * - This function can be absent if the index carries no state data
227 * - The netfs data from the cookie being used as the target is
228 * presented, as is the auxiliary data
229 */
230static
231enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
232 const void *data,
233 uint16_t datalen)
234{
235 struct nfs_fscache_inode_auxdata auxdata;
236 struct nfs_inode *nfsi = cookie_netfs_data;
237
238 if (datalen != sizeof(auxdata))
239 return FSCACHE_CHECKAUX_OBSOLETE;
240
241 memset(&auxdata, 0, sizeof(auxdata));
242 auxdata.size = nfsi->vfs_inode.i_size;
243 auxdata.mtime = nfsi->vfs_inode.i_mtime;
244 auxdata.ctime = nfsi->vfs_inode.i_ctime;
245
246 if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
247 auxdata.change_attr = nfsi->change_attr;
248
249 if (memcmp(data, &auxdata, datalen) != 0)
250 return FSCACHE_CHECKAUX_OBSOLETE;
251
252 return FSCACHE_CHECKAUX_OKAY;
253}
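
nfs_fscache_inode_check_aux() implements the coherency rule: rebuild the auxdata from the live inode, memcmp() it against what the cache stored, and declare the object obsolete on any mismatch (including a size change of the struct itself). A userspace model of that decision, keeping the zero-fill-before-compare trick so struct padding cannot produce false mismatches:

    #include <stdint.h>
    #include <string.h>
    #include <time.h>

    struct auxdata {
            struct timespec mtime;
            struct timespec ctime;
            int64_t size;
            uint64_t change_attr;
    };

    enum checkaux { CHECKAUX_OKAY, CHECKAUX_OBSOLETE };

    static enum checkaux check_aux(const void *stored, size_t stored_len,
                                   const struct timespec *mtime,
                                   const struct timespec *ctime,
                                   int64_t size, uint64_t change_attr)
    {
            struct auxdata cur;

            if (stored_len != sizeof(cur))
                    return CHECKAUX_OBSOLETE;  /* auxdata layout changed */

            memset(&cur, 0, sizeof(cur));      /* zero padding for memcmp */
            cur.mtime = *mtime;
            cur.ctime = *ctime;
            cur.size = size;
            cur.change_attr = change_attr;

            if (memcmp(stored, &cur, sizeof(cur)) != 0)
                    return CHECKAUX_OBSOLETE;  /* file changed on server */
            return CHECKAUX_OKAY;
    }

    int main(void)
    {
            struct timespec t = { 0, 0 };
            struct auxdata stored;

            memset(&stored, 0, sizeof(stored));
            return check_aux(&stored, sizeof(stored), &t, &t, 0, 0)
                    == CHECKAUX_OKAY ? 0 : 1;
    }
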
254
255/*
256 * Indication from FS-Cache that the cookie is no longer cached
257 * - This function is called when the backing store currently caching a cookie
258 * is removed
259 * - The netfs should use this to clean up any markers indicating cached pages
260 * - This is mandatory for any object that may have data
261 */
262static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
263{
264 struct nfs_inode *nfsi = cookie_netfs_data;
265 struct pagevec pvec;
266 pgoff_t first;
267 int loop, nr_pages;
268
269 pagevec_init(&pvec, 0);
270 first = 0;
271
272 dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
273
274 for (;;) {
275 /* grab a bunch of pages to unmark */
276 nr_pages = pagevec_lookup(&pvec,
277 nfsi->vfs_inode.i_mapping,
278 first,
279 PAGEVEC_SIZE - pagevec_count(&pvec));
280 if (!nr_pages)
281 break;
282
283 for (loop = 0; loop < nr_pages; loop++)
284 ClearPageFsCache(pvec.pages[loop]);
285
286 first = pvec.pages[nr_pages - 1]->index + 1;
287
288 pvec.nr = nr_pages;
289 pagevec_release(&pvec);
290 cond_resched();
291 }
292}
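
The uncache loop above is the standard resumable batch scan: fetch up to PAGEVEC_SIZE pages starting at 'first', clear their marks, then resume one index past the last page seen, with cond_resched() keeping latency bounded. The same shape in a self-contained userspace model:

    #include <stdio.h>

    #define BATCH  4
    #define NPAGES 10

    /* toy lookup: fills idx[] with up to 'max' indices >= first */
    static int lookup_batch(unsigned long first, unsigned long *idx, int max)
    {
            int n = 0;
            unsigned long i;

            for (i = first; i < NPAGES && n < max; i++)
                    idx[n++] = i;           /* every index "present" here */
            return n;
    }

    int main(void)
    {
            unsigned long idx[BATCH];
            unsigned long first = 0;
            int n, i;

            for (;;) {
                    n = lookup_batch(first, idx, BATCH);
                    if (!n)
                            break;
                    for (i = 0; i < n; i++)
                            printf("clearing mark on page %lu\n", idx[i]);
                    first = idx[n - 1] + 1; /* resume past the last hit */
            }
            return 0;
    }
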
293
294/*
295 * Get an extra reference on a read context.
296 * - This function can be absent if the completion function doesn't require a
297 * context.
298 * - The read context is passed back to NFS in the event that a data read on the
299 * cache fails with EIO - in which case the server must be contacted to
300 * retrieve the data, which requires the read context for security.
301 */
302static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
303{
304 get_nfs_open_context(context);
305}
306
307/*
308 * Release an extra reference on a read context.
309 * - This function can be absent if the completion function doesn't require a
310 * context.
311 */
312static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
313{
314 if (context)
315 put_nfs_open_context(context);
316}
317
318/*
319 * Define the inode object for FS-Cache. This is used to describe an inode
320 * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for
321 * an inode.
322 *
323 * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
324 * held in the cache auxiliary data for the data storage object with those in
325 * the inode struct in memory.
326 */
327const struct fscache_cookie_def nfs_fscache_inode_object_def = {
328 .name = "NFS.fh",
329 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
330 .get_key = nfs_fscache_inode_get_key,
331 .get_attr = nfs_fscache_inode_get_attr,
332 .get_aux = nfs_fscache_inode_get_aux,
333 .check_aux = nfs_fscache_inode_check_aux,
334 .now_uncached = nfs_fscache_inode_now_uncached,
335 .get_context = nfs_fh_get_context,
336 .put_context = nfs_fh_put_context,
337};
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644
index 000000000000..379be678cb7e
--- /dev/null
+++ b/fs/nfs/fscache.c
@@ -0,0 +1,523 @@
1/* NFS filesystem cache interface
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/nfs_fs.h>
17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h>
19#include <linux/seq_file.h>
20
21#include "internal.h"
22#include "iostat.h"
23#include "fscache.h"
24
25#define NFSDBG_FACILITY NFSDBG_FSCACHE
26
27static struct rb_root nfs_fscache_keys = RB_ROOT;
28static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
29
30/*
31 * Get the per-client index cookie for an NFS client if the appropriate mount
32 * flag was set
33 * - We always try and get an index cookie for the client, but get filehandle
34 * cookies on a per-superblock basis, depending on the mount flags
35 */
36void nfs_fscache_get_client_cookie(struct nfs_client *clp)
37{
38 /* create a cache index for looking up filehandles */
39 clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
40 &nfs_fscache_server_index_def,
41 clp);
42 dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
43 clp, clp->fscache);
44}
45
46/*
47 * Dispose of a per-client cookie
48 */
49void nfs_fscache_release_client_cookie(struct nfs_client *clp)
50{
51 dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
52 clp, clp->fscache);
53
54 fscache_relinquish_cookie(clp->fscache, 0);
55 clp->fscache = NULL;
56}
57
58/*
59 * Get the cache cookie for an NFS superblock. We have to handle
60 * uniquification here because the cache doesn't do it for us.
61 */
62void nfs_fscache_get_super_cookie(struct super_block *sb,
63 struct nfs_parsed_mount_data *data)
64{
65 struct nfs_fscache_key *key, *xkey;
66 struct nfs_server *nfss = NFS_SB(sb);
67 struct rb_node **p, *parent;
68 const char *uniq = data->fscache_uniq ?: "";
69 int diff, ulen;
70
71 ulen = strlen(uniq);
72 key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
73 if (!key)
74 return;
75
76 key->nfs_client = nfss->nfs_client;
77 key->key.super.s_flags = sb->s_flags & NFS_MS_MASK;
78 key->key.nfs_server.flags = nfss->flags;
79 key->key.nfs_server.rsize = nfss->rsize;
80 key->key.nfs_server.wsize = nfss->wsize;
81 key->key.nfs_server.acregmin = nfss->acregmin;
82 key->key.nfs_server.acregmax = nfss->acregmax;
83 key->key.nfs_server.acdirmin = nfss->acdirmin;
84 key->key.nfs_server.acdirmax = nfss->acdirmax;
85 key->key.nfs_server.fsid = nfss->fsid;
86 key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
87
88 key->key.uniq_len = ulen;
89 memcpy(key->key.uniquifier, uniq, ulen);
90
91 spin_lock(&nfs_fscache_keys_lock);
92 p = &nfs_fscache_keys.rb_node;
93 parent = NULL;
94 while (*p) {
95 parent = *p;
96 xkey = rb_entry(parent, struct nfs_fscache_key, node);
97
98 if (key->nfs_client < xkey->nfs_client)
99 goto go_left;
100 if (key->nfs_client > xkey->nfs_client)
101 goto go_right;
102
103 diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
104 if (diff < 0)
105 goto go_left;
106 if (diff > 0)
107 goto go_right;
108
109 if (key->key.uniq_len == 0)
110 goto non_unique;
111 diff = memcmp(key->key.uniquifier,
112 xkey->key.uniquifier,
113 key->key.uniq_len);
114 if (diff < 0)
115 goto go_left;
116 if (diff > 0)
117 goto go_right;
118 goto non_unique;
119
120 go_left:
121 p = &(*p)->rb_left;
122 continue;
123 go_right:
124 p = &(*p)->rb_right;
125 }
126
127 rb_link_node(&key->node, parent, p);
128 rb_insert_color(&key->node, &nfs_fscache_keys);
129 spin_unlock(&nfs_fscache_keys_lock);
130 nfss->fscache_key = key;
131
132 /* create a cache index for looking up filehandles */
133 nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
134 &nfs_fscache_super_index_def,
135 nfss);
136 dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
137 nfss, nfss->fscache);
138 return;
139
140non_unique:
141 spin_unlock(&nfs_fscache_keys_lock);
142 kfree(key);
143 nfss->fscache_key = NULL;
144 nfss->fscache = NULL;
145 printk(KERN_WARNING "NFS:"
146 " Cache request denied due to non-unique superblock keys\n");
147}
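
nfs_fscache_get_super_cookie() walks the rbtree comparing the client pointer first, then the fixed key block, then the uniquifier; finding an equal key means two live superblocks would map onto one cache object, so caching is refused rather than risk cross-mount corruption. A simplified userspace model of that insert-or-reject walk (a plain BST stands in for the kernel rbtree API; names are illustrative):

    #include <stdio.h>
    #include <string.h>

    struct key {
            const void *client;
            char fixed[32];
            size_t uniq_len;
            const char *uniq;
    };

    struct node {
            struct key k;
            struct node *left, *right;
    };

    static int key_cmp(const struct key *a, const struct key *b)
    {
            int diff;

            if (a->client != b->client)
                    return a->client < b->client ? -1 : 1;
            diff = memcmp(a->fixed, b->fixed, sizeof(a->fixed));
            if (diff)
                    return diff;
            if (a->uniq_len != b->uniq_len)
                    return a->uniq_len < b->uniq_len ? -1 : 1;
            if (a->uniq_len == 0)
                    return 0;               /* equal, and no uniquifier */
            return memcmp(a->uniq, b->uniq, a->uniq_len);
    }

    /* returns 0 on insert, -1 if an equal key already exists */
    static int insert_unique(struct node **root, struct node *n)
    {
            while (*root) {
                    int diff = key_cmp(&n->k, &(*root)->k);

                    if (diff == 0)
                            return -1;      /* duplicate: reject */
                    root = diff < 0 ? &(*root)->left : &(*root)->right;
            }
            n->left = n->right = NULL;
            *root = n;
            return 0;
    }

    int main(void)
    {
            struct node *root = NULL;
            static struct node a, b;

            a.k.client = b.k.client = "clp";
            insert_unique(&root, &a);
            if (insert_unique(&root, &b) < 0)
                    printf("duplicate key: caching refused\n");
            return 0;
    }
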
148
149/*
150 * release a per-superblock cookie
151 */
152void nfs_fscache_release_super_cookie(struct super_block *sb)
153{
154 struct nfs_server *nfss = NFS_SB(sb);
155
156 dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
157 nfss, nfss->fscache);
158
159 fscache_relinquish_cookie(nfss->fscache, 0);
160 nfss->fscache = NULL;
161
162 if (nfss->fscache_key) {
163 spin_lock(&nfs_fscache_keys_lock);
164 rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys);
165 spin_unlock(&nfs_fscache_keys_lock);
166 kfree(nfss->fscache_key);
167 nfss->fscache_key = NULL;
168 }
169}
170
171/*
172 * Initialise the per-inode cache cookie pointer for an NFS inode.
173 */
174void nfs_fscache_init_inode_cookie(struct inode *inode)
175{
176 NFS_I(inode)->fscache = NULL;
177 if (S_ISREG(inode->i_mode))
178 set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
179}
180
181/*
182 * Get the per-inode cache cookie for an NFS inode.
183 */
184static void nfs_fscache_enable_inode_cookie(struct inode *inode)
185{
186 struct super_block *sb = inode->i_sb;
187 struct nfs_inode *nfsi = NFS_I(inode);
188
189 if (nfsi->fscache || !NFS_FSCACHE(inode))
190 return;
191
192 if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
193 nfsi->fscache = fscache_acquire_cookie(
194 NFS_SB(sb)->fscache,
195 &nfs_fscache_inode_object_def,
196 nfsi);
197
198 dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
199 sb, nfsi, nfsi->fscache);
200 }
201}
202
203/*
204 * Release a per-inode cookie.
205 */
206void nfs_fscache_release_inode_cookie(struct inode *inode)
207{
208 struct nfs_inode *nfsi = NFS_I(inode);
209
210 dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
211 nfsi, nfsi->fscache);
212
213 fscache_relinquish_cookie(nfsi->fscache, 0);
214 nfsi->fscache = NULL;
215}
216
217/*
218 * Retire a per-inode cookie, destroying the data attached to it.
219 */
220void nfs_fscache_zap_inode_cookie(struct inode *inode)
221{
222 struct nfs_inode *nfsi = NFS_I(inode);
223
224 dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
225 nfsi, nfsi->fscache);
226
227 fscache_relinquish_cookie(nfsi->fscache, 1);
228 nfsi->fscache = NULL;
229}
230
231/*
232 * Turn off the cache with regard to a per-inode cookie if opened for writing,
233 * invalidating all the pages in the page cache relating to the associated
234 * inode to clear the per-page caching.
235 */
236static void nfs_fscache_disable_inode_cookie(struct inode *inode)
237{
238 clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
239
240 if (NFS_I(inode)->fscache) {
241 dfprintk(FSCACHE,
242 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
243
244 /* Need to invalidate any mapped pages that were read in before
245 * turning off the cache.
246 */
247 if (inode->i_mapping && inode->i_mapping->nrpages)
248 invalidate_inode_pages2(inode->i_mapping);
249
250 nfs_fscache_zap_inode_cookie(inode);
251 }
252}
253
254/*
255 * wait_on_bit() sleep function for uninterruptible waiting
256 */
257static int nfs_fscache_wait_bit(void *flags)
258{
259 schedule();
260 return 0;
261}
262
263/*
264 * Lock against someone else trying to also acquire or relinquish a cookie
265 */
266static inline void nfs_fscache_inode_lock(struct inode *inode)
267{
268 struct nfs_inode *nfsi = NFS_I(inode);
269
270 while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
271 wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
272 nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
273}
274
275/*
276 * Unlock cookie management lock
277 */
278static inline void nfs_fscache_inode_unlock(struct inode *inode)
279{
280 struct nfs_inode *nfsi = NFS_I(inode);
281
282 smp_mb__before_clear_bit();
283 clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
284 smp_mb__after_clear_bit();
285 wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
286}
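
The pair above is a hand-rolled sleeping bit lock: acquire by looping on test_and_set_bit() with wait_on_bit() putting the task to sleep, release by clearing the bit between memory barriers and then waking waiters. A userspace sketch of the same acquire/release shape using C11 atomics, yielding where the kernel version sleeps:

    #include <stdatomic.h>
    #include <sched.h>

    static atomic_flag cookie_lock = ATOMIC_FLAG_INIT;

    static void inode_lock(void)
    {
            while (atomic_flag_test_and_set_explicit(&cookie_lock,
                                                     memory_order_acquire))
                    sched_yield();  /* kernel version sleeps on the bit */
    }

    static void inode_unlock(void)
    {
            atomic_flag_clear_explicit(&cookie_lock, memory_order_release);
            /* kernel version additionally calls wake_up_bit() here */
    }

    int main(void)
    {
            inode_lock();
            /* ... cookie acquire/relinquish would go here ... */
            inode_unlock();
            return 0;
    }
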
287
288/*
289 * Decide if we should enable or disable local caching for this inode.
290 * - For now, with NFS, only regular files that are open read-only will be able
291 * to use the cache.
292 * - May be invoked multiple times in parallel by parallel nfs_open() functions.
293 */
294void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
295{
296 if (NFS_FSCACHE(inode)) {
297 nfs_fscache_inode_lock(inode);
298 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
299 nfs_fscache_disable_inode_cookie(inode);
300 else
301 nfs_fscache_enable_inode_cookie(inode);
302 nfs_fscache_inode_unlock(inode);
303 }
304}
305
306/*
307 * Replace a per-inode cookie due to revalidation detecting a file having
308 * changed on the server.
309 */
310void nfs_fscache_reset_inode_cookie(struct inode *inode)
311{
312 struct nfs_inode *nfsi = NFS_I(inode);
313 struct nfs_server *nfss = NFS_SERVER(inode);
314 struct fscache_cookie *old = nfsi->fscache;
315
316 nfs_fscache_inode_lock(inode);
317 if (nfsi->fscache) {
318 /* retire the current fscache cache and get a new one */
319 fscache_relinquish_cookie(nfsi->fscache, 1);
320
321 nfsi->fscache = fscache_acquire_cookie(
322 nfss->nfs_client->fscache,
323 &nfs_fscache_inode_object_def,
324 nfsi);
325
326 dfprintk(FSCACHE,
327 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
328 nfss, nfsi, old, nfsi->fscache);
329 }
330 nfs_fscache_inode_unlock(inode);
331}
332
333/*
334 * Release the caching state associated with a page, if the page isn't busy
335 * interacting with the cache.
336 * - Returns true (can release page) or false (page busy).
337 */
338int nfs_fscache_release_page(struct page *page, gfp_t gfp)
339{
340 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
341 struct fscache_cookie *cookie = nfsi->fscache;
342
343 BUG_ON(!cookie);
344
345 if (fscache_check_page_write(cookie, page)) {
346 if (!(gfp & __GFP_WAIT))
347 return 0;
348 fscache_wait_on_page_write(cookie, page);
349 }
350
351 if (PageFsCache(page)) {
352 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
353 cookie, page, nfsi);
354
355 fscache_uncache_page(cookie, page);
356 nfs_add_fscache_stats(page->mapping->host,
357 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
358 }
359
360 return 1;
361}
362
363/*
364 * Release the caching state associated with a page if undergoing complete page
365 * invalidation.
366 */
367void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
368{
369 struct nfs_inode *nfsi = NFS_I(inode);
370 struct fscache_cookie *cookie = nfsi->fscache;
371
372 BUG_ON(!cookie);
373
374 dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
375 cookie, page, nfsi);
376
377 fscache_wait_on_page_write(cookie, page);
378
379 BUG_ON(!PageLocked(page));
380 fscache_uncache_page(cookie, page);
381 nfs_add_fscache_stats(page->mapping->host,
382 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
383}
384
385/*
386 * Handle completion of a page being read from the cache.
387 * - Called in process (keventd) context.
388 */
389static void nfs_readpage_from_fscache_complete(struct page *page,
390 void *context,
391 int error)
392{
393 dfprintk(FSCACHE,
394 "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
395 page, context, error);
396
397 /* if the read completes with an error, we just unlock the page and let
398 * the VM reissue the readpage */
399 if (!error) {
400 SetPageUptodate(page);
401 unlock_page(page);
402 } else {
403 error = nfs_readpage_async(context, page->mapping->host, page);
404 if (error)
405 unlock_page(page);
406 }
407}
408
409/*
410 * Retrieve a page from fscache
411 */
412int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
413 struct inode *inode, struct page *page)
414{
415 int ret;
416
417 dfprintk(FSCACHE,
418 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
419 NFS_I(inode)->fscache, page, page->index, page->flags, inode);
420
421 ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
422 page,
423 nfs_readpage_from_fscache_complete,
424 ctx,
425 GFP_KERNEL);
426
427 switch (ret) {
428 case 0: /* read BIO submitted (page in fscache) */
429 dfprintk(FSCACHE,
430 "NFS: readpage_from_fscache: BIO submitted\n");
431 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1);
432 return ret;
433
434 case -ENOBUFS: /* inode not in cache */
435 case -ENODATA: /* page not in cache */
436 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
437 dfprintk(FSCACHE,
438 "NFS: readpage_from_fscache %d\n", ret);
439 return 1;
440
441 default:
442 dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret);
443 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
444 }
445 return ret;
446}
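
Note the three-way return convention established here: 0 means the cache accepted the page and will complete it asynchronously via nfs_readpage_from_fscache_complete(); a positive value means the page is not cached and must be read from the server; a negative value is a hard error. A sketch of how a caller is expected to dispatch on it (illustrative only, not the actual nfs_readpage() code):

    static int read_one_page(struct nfs_open_context *ctx,
                             struct inode *inode, struct page *page)
    {
            int ret = nfs_readpage_from_fscache(ctx, inode, page);

            if (ret == 0)
                    return 0;       /* cache read in flight */
            if (ret < 0 && ret != -ENOBUFS)
                    return ret;     /* hard error */
            /* not cached (1) or no cookie (-ENOBUFS): go to the server */
            return nfs_readpage_async(ctx, inode, page);
    }
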
447
448/*
449 * Retrieve a set of pages from fscache
450 */
451int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
452 struct inode *inode,
453 struct address_space *mapping,
454 struct list_head *pages,
455 unsigned *nr_pages)
456{
457 int ret, npages = *nr_pages;
458
459 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
460 NFS_I(inode)->fscache, npages, inode);
461
462 ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
463 mapping, pages, nr_pages,
464 nfs_readpage_from_fscache_complete,
465 ctx,
466 mapping_gfp_mask(mapping));
467 if (*nr_pages < npages)
468 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
469 npages);
470 if (*nr_pages > 0)
471 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
472 *nr_pages);
473
474 switch (ret) {
475 case 0: /* read submitted to the cache for all pages */
476 BUG_ON(!list_empty(pages));
477 BUG_ON(*nr_pages != 0);
478 dfprintk(FSCACHE,
479 "NFS: nfs_getpages_from_fscache: submitted\n");
480
481 return ret;
482
483 case -ENOBUFS: /* some pages aren't cached and can't be */
484 case -ENODATA: /* some pages aren't cached */
485 dfprintk(FSCACHE,
486 "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
487 return 1;
488
489 default:
490 dfprintk(FSCACHE,
491 "NFS: nfs_getpages_from_fscache: ret %d\n", ret);
492 }
493
494 return ret;
495}
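
*nr_pages is an in/out parameter here: on entry it is the number of pages requested, on exit the number the cache could not service and the caller must fetch from the server; the statistics above are derived from that delta. A trivial userspace model of the contract:

    #include <stdio.h>

    /* returns 0 if fully satisfied by the cache, 1 if a remainder must
     * be read from the server; *nr_pages is updated to that remainder */
    static int read_batch(unsigned *nr_pages, unsigned cached)
    {
            unsigned want = *nr_pages;

            if (cached >= want) {
                    *nr_pages = 0;
                    return 0;
            }
            *nr_pages = want - cached;
            return 1;
    }

    int main(void)
    {
            unsigned n = 16;

            if (read_batch(&n, 10))
                    printf("%u of 16 pages fall back to the server\n", n);
            return 0;
    }
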
496
497/*
498 * Store a newly fetched page in fscache
499 * - PG_fscache must be set on the page
500 */
501void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
502{
503 int ret;
504
505 dfprintk(FSCACHE,
506 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
507 NFS_I(inode)->fscache, page, page->index, page->flags, sync);
508
509 ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
510 dfprintk(FSCACHE,
511 "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
512 page, page->index, page->flags, ret);
513
514 if (ret != 0) {
515 fscache_uncache_page(NFS_I(inode)->fscache, page);
516 nfs_add_fscache_stats(inode,
517 NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
518 nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
519 } else {
520 nfs_add_fscache_stats(inode,
521 NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1);
522 }
523}
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644
index 000000000000..6e809bb0ff08
--- /dev/null
+++ b/fs/nfs/fscache.h
@@ -0,0 +1,220 @@
1/* NFS filesystem cache interface definitions
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#ifndef _NFS_FSCACHE_H
13#define _NFS_FSCACHE_H
14
15#include <linux/nfs_fs.h>
16#include <linux/nfs_mount.h>
17#include <linux/nfs4_mount.h>
18#include <linux/fscache.h>
19
20#ifdef CONFIG_NFS_FSCACHE
21
22/*
23 * set of NFS FS-Cache objects that form a superblock key
24 */
25struct nfs_fscache_key {
26 struct rb_node node;
27 struct nfs_client *nfs_client; /* the server */
28
29 /* the elements of the unique key - as used by nfs_compare_super() and
30 * nfs_compare_mount_options() to distinguish superblocks */
31 struct {
32 struct {
33 unsigned long s_flags; /* various flags
34 * (& NFS_MS_MASK) */
35 } super;
36
37 struct {
38 struct nfs_fsid fsid;
39 int flags;
40 unsigned int rsize; /* read size */
41 unsigned int wsize; /* write size */
42 unsigned int acregmin; /* attr cache timeouts */
43 unsigned int acregmax;
44 unsigned int acdirmin;
45 unsigned int acdirmax;
46 } nfs_server;
47
48 struct {
49 rpc_authflavor_t au_flavor;
50 } rpc_auth;
51
52 /* uniquifier - can be used if nfs_server.flags includes
53 * NFS_MOUNT_UNSHARED */
54 u8 uniq_len;
55 char uniquifier[0];
56 } key;
57};
58
59/*
60 * fscache-index.c
61 */
62extern struct fscache_netfs nfs_fscache_netfs;
63extern const struct fscache_cookie_def nfs_fscache_server_index_def;
64extern const struct fscache_cookie_def nfs_fscache_super_index_def;
65extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
66
67extern int nfs_fscache_register(void);
68extern void nfs_fscache_unregister(void);
69
70/*
71 * fscache.c
72 */
73extern void nfs_fscache_get_client_cookie(struct nfs_client *);
74extern void nfs_fscache_release_client_cookie(struct nfs_client *);
75
76extern void nfs_fscache_get_super_cookie(struct super_block *,
77 struct nfs_parsed_mount_data *);
78extern void nfs_fscache_release_super_cookie(struct super_block *);
79
80extern void nfs_fscache_init_inode_cookie(struct inode *);
81extern void nfs_fscache_release_inode_cookie(struct inode *);
82extern void nfs_fscache_zap_inode_cookie(struct inode *);
83extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
84extern void nfs_fscache_reset_inode_cookie(struct inode *);
85
86extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
87extern int nfs_fscache_release_page(struct page *, gfp_t);
88
89extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
90 struct inode *, struct page *);
91extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
92 struct inode *, struct address_space *,
93 struct list_head *, unsigned *);
94extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
95
96/*
97 * wait for a page to complete writing to the cache
98 */
99static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
100 struct page *page)
101{
102 if (PageFsCache(page))
103 fscache_wait_on_page_write(nfsi->fscache, page);
104}
105
106/*
107 * release the caching state associated with a page if undergoing complete page
108 * invalidation
109 */
110static inline void nfs_fscache_invalidate_page(struct page *page,
111 struct inode *inode)
112{
113 if (PageFsCache(page))
114 __nfs_fscache_invalidate_page(page, inode);
115}
116
117/*
118 * Retrieve a page from an inode data storage object.
119 */
120static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
121 struct inode *inode,
122 struct page *page)
123{
124 if (NFS_I(inode)->fscache)
125 return __nfs_readpage_from_fscache(ctx, inode, page);
126 return -ENOBUFS;
127}
128
129/*
130 * Retrieve a set of pages from an inode data storage object.
131 */
132static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
133 struct inode *inode,
134 struct address_space *mapping,
135 struct list_head *pages,
136 unsigned *nr_pages)
137{
138 if (NFS_I(inode)->fscache)
139 return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
140 nr_pages);
141 return -ENOBUFS;
142}
143
144/*
145 * Store a page newly fetched from the server in an inode data storage object
146 * in the cache.
147 */
148static inline void nfs_readpage_to_fscache(struct inode *inode,
149 struct page *page,
150 int sync)
151{
152 if (PageFsCache(page))
153 __nfs_readpage_to_fscache(inode, page, sync);
154}
155
156/*
157 * indicate the client caching state as readable text
158 */
159static inline const char *nfs_server_fscache_state(struct nfs_server *server)
160{
161 if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
162 return "yes";
163 return "no ";
164}
165
166
167#else /* CONFIG_NFS_FSCACHE */
168static inline int nfs_fscache_register(void) { return 0; }
169static inline void nfs_fscache_unregister(void) {}
170
171static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
172static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
173
174static inline void nfs_fscache_get_super_cookie(
175 struct super_block *sb,
176 struct nfs_parsed_mount_data *data)
177{
178}
179static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
180
181static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
182static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
183static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
184static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
185 struct file *filp) {}
186static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
187
188static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
189{
190 return 1; /* True: may release page */
191}
192static inline void nfs_fscache_invalidate_page(struct page *page,
193 struct inode *inode) {}
194static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
195 struct page *page) {}
196
197static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
198 struct inode *inode,
199 struct page *page)
200{
201 return -ENOBUFS;
202}
203static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
204 struct inode *inode,
205 struct address_space *mapping,
206 struct list_head *pages,
207 unsigned *nr_pages)
208{
209 return -ENOBUFS;
210}
211static inline void nfs_readpage_to_fscache(struct inode *inode,
212 struct page *page, int sync) {}
213
214static inline const char *nfs_server_fscache_state(struct nfs_server *server)
215{
216 return "no ";
217}
218
219#endif /* CONFIG_NFS_FSCACHE */
220#endif /* _NFS_FSCACHE_H */
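
The second half of this header is the usual configured-out idiom: every hook has an empty static inline twin under the #else branch, so call sites never carry #ifdef CONFIG_NFS_FSCACHE and the compiler simply discards the calls when the feature is off. The generic form (names hypothetical):

    #ifdef CONFIG_MYFEATURE
    extern void myfeature_hook(struct inode *inode);
    #else
    static inline void myfeature_hook(struct inode *inode) {}
    #endif
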
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b7c9b2df1f29..46177cb87064 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server,
156 return ret; 156 return ret;
157 } 157 }
158 158
159 if (fattr.type != NFDIR) { 159 if (!S_ISDIR(fattr.mode)) {
160 printk(KERN_ERR "nfs4_get_root:" 160 printk(KERN_ERR "nfs4_get_root:"
161 " getroot encountered non-directory\n"); 161 " getroot encountered non-directory\n");
162 return -ENOTDIR; 162 return -ENOTDIR;
@@ -213,7 +213,7 @@ eat_dot_dir:
213 return ret; 213 return ret;
214 } 214 }
215 215
216 if (fattr.type != NFDIR) { 216 if (!S_ISDIR(fattr.mode)) {
217 printk(KERN_ERR "nfs4_get_root:" 217 printk(KERN_ERR "nfs4_get_root:"
218 " lookupfh encountered non-directory\n"); 218 " lookupfh encountered non-directory\n");
219 return -ENOTDIR; 219 return -ENOTDIR;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0c381686171e..64f87194d390 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
46#include "delegation.h" 46#include "delegation.h"
47#include "iostat.h" 47#include "iostat.h"
48#include "internal.h" 48#include "internal.h"
49#include "fscache.h"
49 50
50#define NFSDBG_FACILITY NFSDBG_VFS 51#define NFSDBG_FACILITY NFSDBG_VFS
51 52
@@ -66,6 +67,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
66} 67}
67 68
68/** 69/**
70 * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
71 * @word: long word containing the bit lock
72 */
73int nfs_wait_bit_killable(void *word)
74{
75 if (fatal_signal_pending(current))
76 return -ERESTARTSYS;
77 schedule();
78 return 0;
79}
80
81/**
69 * nfs_compat_user_ino64 - returns the user-visible inode number 82 * nfs_compat_user_ino64 - returns the user-visible inode number
70 * @fileid: 64-bit fileid 83 * @fileid: 64-bit fileid
71 * 84 *
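
nfs_wait_bit_killable() is the sleep callback used by the wait_on_bit() call added to nfs_write_begin() earlier in this patch: a TASK_KILLABLE sleep ignores ordinary signals, but the fatal_signal_pending() check lets SIGKILL break the wait with -ERESTARTSYS. The call shape, for reference, as used in fs/nfs/file.c above:

    ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
                      nfs_wait_bit_killable, TASK_KILLABLE);
    if (ret)
            return ret;     /* killed: -ERESTARTSYS propagates up */
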
@@ -109,6 +122,7 @@ void nfs_clear_inode(struct inode *inode)
109 BUG_ON(!list_empty(&NFS_I(inode)->open_files)); 122 BUG_ON(!list_empty(&NFS_I(inode)->open_files));
110 nfs_zap_acl_cache(inode); 123 nfs_zap_acl_cache(inode);
111 nfs_access_zap_cache(inode); 124 nfs_access_zap_cache(inode);
125 nfs_fscache_release_inode_cookie(inode);
112} 126}
113 127
114/** 128/**
@@ -249,13 +263,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
249 struct inode *inode = ERR_PTR(-ENOENT); 263 struct inode *inode = ERR_PTR(-ENOENT);
250 unsigned long hash; 264 unsigned long hash;
251 265
252 if ((fattr->valid & NFS_ATTR_FATTR) == 0) 266 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
253 goto out_no_inode; 267 goto out_no_inode;
254 268 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
255 if (!fattr->nlink) {
256 printk("NFS: Buggy server - nlink == 0!\n");
257 goto out_no_inode; 269 goto out_no_inode;
258 }
259 270
260 hash = nfs_fattr_to_ino_t(fattr); 271 hash = nfs_fattr_to_ino_t(fattr);
261 272
@@ -291,7 +302,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
291 && fattr->size <= NFS_LIMIT_READDIRPLUS) 302 && fattr->size <= NFS_LIMIT_READDIRPLUS)
292 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 303 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
293 /* Deal with crossing mountpoints */ 304 /* Deal with crossing mountpoints */
294 if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { 305 if ((fattr->valid & NFS_ATTR_FATTR_FSID)
306 && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
295 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 307 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
296 inode->i_op = &nfs_referral_inode_operations; 308 inode->i_op = &nfs_referral_inode_operations;
297 else 309 else
@@ -304,30 +316,49 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
304 else 316 else
305 init_special_inode(inode, inode->i_mode, fattr->rdev); 317 init_special_inode(inode, inode->i_mode, fattr->rdev);
306 318
319 memset(&inode->i_atime, 0, sizeof(inode->i_atime));
320 memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
321 memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
322 nfsi->change_attr = 0;
323 inode->i_size = 0;
324 inode->i_nlink = 0;
325 inode->i_uid = -2;
326 inode->i_gid = -2;
327 inode->i_blocks = 0;
328 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
329
307 nfsi->read_cache_jiffies = fattr->time_start; 330 nfsi->read_cache_jiffies = fattr->time_start;
308 nfsi->attr_gencount = fattr->gencount; 331 nfsi->attr_gencount = fattr->gencount;
309 inode->i_atime = fattr->atime; 332 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
310 inode->i_mtime = fattr->mtime; 333 inode->i_atime = fattr->atime;
311 inode->i_ctime = fattr->ctime; 334 if (fattr->valid & NFS_ATTR_FATTR_MTIME)
312 if (fattr->valid & NFS_ATTR_FATTR_V4) 335 inode->i_mtime = fattr->mtime;
336 if (fattr->valid & NFS_ATTR_FATTR_CTIME)
337 inode->i_ctime = fattr->ctime;
338 if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
313 nfsi->change_attr = fattr->change_attr; 339 nfsi->change_attr = fattr->change_attr;
314 inode->i_size = nfs_size_to_loff_t(fattr->size); 340 if (fattr->valid & NFS_ATTR_FATTR_SIZE)
315 inode->i_nlink = fattr->nlink; 341 inode->i_size = nfs_size_to_loff_t(fattr->size);
316 inode->i_uid = fattr->uid; 342 if (fattr->valid & NFS_ATTR_FATTR_NLINK)
317 inode->i_gid = fattr->gid; 343 inode->i_nlink = fattr->nlink;
318 if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { 344 if (fattr->valid & NFS_ATTR_FATTR_OWNER)
345 inode->i_uid = fattr->uid;
346 if (fattr->valid & NFS_ATTR_FATTR_GROUP)
347 inode->i_gid = fattr->gid;
348 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
349 inode->i_blocks = fattr->du.nfs2.blocks;
350 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
319 /* 351 /*
320 * report the blocks in 512byte units 352 * report the blocks in 512byte units
321 */ 353 */
322 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 354 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
323 } else {
324 inode->i_blocks = fattr->du.nfs2.blocks;
325 } 355 }
326 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 356 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
327 nfsi->attrtimeo_timestamp = now; 357 nfsi->attrtimeo_timestamp = now;
328 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
329 nfsi->access_cache = RB_ROOT; 358 nfsi->access_cache = RB_ROOT;
330 359
360 nfs_fscache_init_inode_cookie(inode);
361
331 unlock_new_inode(inode); 362 unlock_new_inode(inode);
332 } else 363 } else
333 nfs_refresh_inode(inode, fattr); 364 nfs_refresh_inode(inode, fattr);
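
This hunk is the heart of the attribute rework: instead of one catch-all NFS_ATTR_FATTR flag, every field is now guarded by its own validity bit, so a reply carrying only some attributes updates only those fields (everything else having been reset to safe defaults first). A self-contained userspace sketch of the gating pattern, with illustrative flag names:

    #include <stdint.h>

    #define ATTR_SIZE  (1 << 0)
    #define ATTR_NLINK (1 << 1)
    #define ATTR_MTIME (1 << 2)

    struct attrs {
            uint32_t valid;
            uint64_t size;
            uint32_t nlink;
            int64_t  mtime;
    };

    static void apply(struct attrs *inode, const struct attrs *reply)
    {
            if (reply->valid & ATTR_SIZE)
                    inode->size = reply->size;
            if (reply->valid & ATTR_NLINK)
                    inode->nlink = reply->nlink;
            if (reply->valid & ATTR_MTIME)
                    inode->mtime = reply->mtime;
    }

    int main(void)
    {
            struct attrs inode = { 0, 0, 0, 0 };
            struct attrs reply = { ATTR_SIZE, 4096, 0, 0 };

            apply(&inode, &reply);  /* only size is updated */
            return inode.size == 4096 ? 0 : 1;
    }
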
@@ -514,6 +545,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
514 return err; 545 return err;
515} 546}
516 547
548/**
 549 * nfs_close_context - Common close_context() routine for NFSv2/v3
550 * @ctx: pointer to context
551 * @is_sync: is this a synchronous close
552 *
553 * always ensure that the attributes are up to date if we're mounted
554 * with close-to-open semantics
555 */
556void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
557{
558 struct inode *inode;
559 struct nfs_server *server;
560
561 if (!(ctx->mode & FMODE_WRITE))
562 return;
563 if (!is_sync)
564 return;
565 inode = ctx->path.dentry->d_inode;
566 if (!list_empty(&NFS_I(inode)->open_files))
567 return;
568 server = NFS_SERVER(inode);
569 if (server->flags & NFS_MOUNT_NOCTO)
570 return;
571 nfs_revalidate_inode(server, inode);
572}
573
517static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) 574static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
518{ 575{
519 struct nfs_open_context *ctx; 576 struct nfs_open_context *ctx;
@@ -540,24 +597,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
540 return ctx; 597 return ctx;
541} 598}
542 599
543static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) 600static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
544{ 601{
545 struct inode *inode; 602 struct inode *inode = ctx->path.dentry->d_inode;
546
547 if (ctx == NULL)
548 return;
549 603
550 inode = ctx->path.dentry->d_inode;
551 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) 604 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
552 return; 605 return;
553 list_del(&ctx->list); 606 list_del(&ctx->list);
554 spin_unlock(&inode->i_lock); 607 spin_unlock(&inode->i_lock);
555 if (ctx->state != NULL) { 608 NFS_PROTO(inode)->close_context(ctx, is_sync);
556 if (wait)
557 nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
558 else
559 nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
560 }
561 if (ctx->cred != NULL) 609 if (ctx->cred != NULL)
562 put_rpccred(ctx->cred); 610 put_rpccred(ctx->cred);
563 path_put(&ctx->path); 611 path_put(&ctx->path);
@@ -642,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp)
642 ctx->mode = filp->f_mode; 690 ctx->mode = filp->f_mode;
643 nfs_file_set_open_context(filp, ctx); 691 nfs_file_set_open_context(filp, ctx);
644 put_nfs_open_context(ctx); 692 put_nfs_open_context(ctx);
693 nfs_fscache_set_inode_cookie(inode, filp);
645 return 0; 694 return 0;
646} 695}
647 696
@@ -670,9 +719,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
670 if (NFS_STALE(inode)) 719 if (NFS_STALE(inode))
671 goto out; 720 goto out;
672 721
673 if (NFS_STALE(inode))
674 goto out;
675
676 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 722 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
677 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 723 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
678 if (status != 0) { 724 if (status != 0) {
@@ -745,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
745 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 791 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
746 spin_unlock(&inode->i_lock); 792 spin_unlock(&inode->i_lock);
747 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 793 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
794 nfs_fscache_reset_inode_cookie(inode);
748 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 795 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
749 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 796 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
750 return 0; 797 return 0;
@@ -815,25 +862,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
815{ 862{
816 struct nfs_inode *nfsi = NFS_I(inode); 863 struct nfs_inode *nfsi = NFS_I(inode);
817 864
818 if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && 865 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
819 nfsi->change_attr == fattr->pre_change_attr) { 866 && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
867 && nfsi->change_attr == fattr->pre_change_attr) {
820 nfsi->change_attr = fattr->change_attr; 868 nfsi->change_attr = fattr->change_attr;
821 if (S_ISDIR(inode->i_mode)) 869 if (S_ISDIR(inode->i_mode))
822 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 870 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
823 } 871 }
824 /* If we have atomic WCC data, we may update some attributes */ 872 /* If we have atomic WCC data, we may update some attributes */
825 if ((fattr->valid & NFS_ATTR_WCC) != 0) { 873 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
826 if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) 874 && (fattr->valid & NFS_ATTR_FATTR_CTIME)
875 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
827 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 876 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
828 if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 877
878 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
879 && (fattr->valid & NFS_ATTR_FATTR_MTIME)
880 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
829 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 881 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
830 if (S_ISDIR(inode->i_mode)) 882 if (S_ISDIR(inode->i_mode))
831 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 883 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
832 }
833 if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
834 nfsi->npages == 0)
835 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
836 } 884 }
885 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
886 && (fattr->valid & NFS_ATTR_FATTR_SIZE)
887 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
888 && nfsi->npages == 0)
889 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
837} 890}
838 891
839/** 892/**
@@ -853,35 +906,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
853 906
854 907
855 /* Has the inode gone and changed behind our back? */ 908 /* Has the inode gone and changed behind our back? */
856 if (nfsi->fileid != fattr->fileid 909 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
857 || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { 910 return -EIO;
911 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
858 return -EIO; 912 return -EIO;
859 }
860 913
861 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 914 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
862 nfsi->change_attr != fattr->change_attr) 915 nfsi->change_attr != fattr->change_attr)
863 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 916 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
864 917
865 /* Verify a few of the more important attributes */ 918 /* Verify a few of the more important attributes */
866 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) 919 if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
867 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 920 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
868 921
869 cur_size = i_size_read(inode); 922 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
870 new_isize = nfs_size_to_loff_t(fattr->size); 923 cur_size = i_size_read(inode);
871 if (cur_size != new_isize && nfsi->npages == 0) 924 new_isize = nfs_size_to_loff_t(fattr->size);
872 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 925 if (cur_size != new_isize && nfsi->npages == 0)
926 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
927 }
873 928
874 /* Have any file permissions changed? */ 929 /* Have any file permissions changed? */
875 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) 930 if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
876 || inode->i_uid != fattr->uid 931 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
877 || inode->i_gid != fattr->gid) 932 if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
933 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
934 if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
878 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; 935 invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
879 936
880 /* Has the link count changed? */ 937 /* Has the link count changed? */
881 if (inode->i_nlink != fattr->nlink) 938 if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
882 invalid |= NFS_INO_INVALID_ATTR; 939 invalid |= NFS_INO_INVALID_ATTR;
883 940
884 if (!timespec_equal(&inode->i_atime, &fattr->atime)) 941 if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
885 invalid |= NFS_INO_INVALID_ATIME; 942 invalid |= NFS_INO_INVALID_ATIME;
886 943
887 if (invalid != 0) 944 if (invalid != 0)
@@ -893,11 +950,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
893 950
894static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) 951static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
895{ 952{
953 if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
954 return 0;
896 return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; 955 return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
897} 956}
898 957
899static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) 958static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
900{ 959{
960 if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
961 return 0;
901 return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); 962 return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
902} 963}
903 964
@@ -975,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
975 spin_lock(&inode->i_lock); 1036 spin_lock(&inode->i_lock);
976 status = nfs_refresh_inode_locked(inode, fattr); 1037 status = nfs_refresh_inode_locked(inode, fattr);
977 spin_unlock(&inode->i_lock); 1038 spin_unlock(&inode->i_lock);
1039
978 return status; 1040 return status;
979} 1041}
980 1042
@@ -1033,20 +1095,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
1033 /* Don't do a WCC update if these attributes are already stale */ 1095 /* Don't do a WCC update if these attributes are already stale */
1034 if ((fattr->valid & NFS_ATTR_FATTR) == 0 || 1096 if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
1035 !nfs_inode_attrs_need_update(inode, fattr)) { 1097 !nfs_inode_attrs_need_update(inode, fattr)) {
1036 fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); 1098 fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
1099 | NFS_ATTR_FATTR_PRESIZE
1100 | NFS_ATTR_FATTR_PREMTIME
1101 | NFS_ATTR_FATTR_PRECTIME);
1037 goto out_noforce; 1102 goto out_noforce;
1038 } 1103 }
1039 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 1104 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
1040 (fattr->valid & NFS_ATTR_WCC_V4) == 0) { 1105 (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
1041 fattr->pre_change_attr = NFS_I(inode)->change_attr; 1106 fattr->pre_change_attr = NFS_I(inode)->change_attr;
1042 fattr->valid |= NFS_ATTR_WCC_V4; 1107 fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
1043 } 1108 }
1044 if ((fattr->valid & NFS_ATTR_FATTR) != 0 && 1109 if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
1045 (fattr->valid & NFS_ATTR_WCC) == 0) { 1110 (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
1046 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); 1111 memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
1112 fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
1113 }
1114 if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
1115 (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
1047 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); 1116 memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
1117 fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
1118 }
1119 if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
1120 (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
1048 fattr->pre_size = i_size_read(inode); 1121 fattr->pre_size = i_size_read(inode);
1049 fattr->valid |= NFS_ATTR_WCC; 1122 fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
1050 } 1123 }
1051out_noforce: 1124out_noforce:
1052 status = nfs_post_op_update_inode_locked(inode, fattr); 1125 status = nfs_post_op_update_inode_locked(inode, fattr);
@@ -1078,18 +1151,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1078 __func__, inode->i_sb->s_id, inode->i_ino, 1151 __func__, inode->i_sb->s_id, inode->i_ino,
1079 atomic_read(&inode->i_count), fattr->valid); 1152 atomic_read(&inode->i_count), fattr->valid);
1080 1153
1081 if (nfsi->fileid != fattr->fileid) 1154 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
1082 goto out_fileid; 1155 goto out_fileid;
1083 1156
1084 /* 1157 /*
1085 * Make sure the inode's type hasn't changed. 1158 * Make sure the inode's type hasn't changed.
1086 */ 1159 */
1087 if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) 1160 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
1088 goto out_changed; 1161 goto out_changed;
1089 1162
1090 server = NFS_SERVER(inode); 1163 server = NFS_SERVER(inode);
1091 /* Update the fsid? */ 1164 /* Update the fsid? */
1092 if (S_ISDIR(inode->i_mode) && 1165 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
1093 !nfs_fsid_equal(&server->fsid, &fattr->fsid) && 1166 !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
1094 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) 1167 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
1095 server->fsid = fattr->fsid; 1168 server->fsid = fattr->fsid;
@@ -1099,14 +1172,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1099 */ 1172 */
1100 nfsi->read_cache_jiffies = fattr->time_start; 1173 nfsi->read_cache_jiffies = fattr->time_start;
1101 1174
1102 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME 1175 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
1103 | NFS_INO_REVAL_PAGECACHE); 1176 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
1177 | NFS_INO_INVALID_ATIME
1178 | NFS_INO_REVAL_PAGECACHE);
1104 1179
1105 /* Do atomic weak cache consistency updates */ 1180 /* Do atomic weak cache consistency updates */
1106 nfs_wcc_update_inode(inode, fattr); 1181 nfs_wcc_update_inode(inode, fattr);
1107 1182
1108 /* More cache consistency checks */ 1183 /* More cache consistency checks */
1109 if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { 1184 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
1185 if (nfsi->change_attr != fattr->change_attr) {
1186 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1187 inode->i_sb->s_id, inode->i_ino);
1188 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1189 if (S_ISDIR(inode->i_mode))
1190 nfs_force_lookup_revalidate(inode);
1191 nfsi->change_attr = fattr->change_attr;
1192 }
1193 }
1194
1195 if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
1110 /* NFSv2/v3: Check if the mtime agrees */ 1196 /* NFSv2/v3: Check if the mtime agrees */
1111 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { 1197 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
1112 dprintk("NFS: mtime change on server for file %s/%ld\n", 1198 dprintk("NFS: mtime change on server for file %s/%ld\n",
@@ -1114,59 +1200,80 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1114 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1200 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1115 if (S_ISDIR(inode->i_mode)) 1201 if (S_ISDIR(inode->i_mode))
1116 nfs_force_lookup_revalidate(inode); 1202 nfs_force_lookup_revalidate(inode);
1203 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
1117 } 1204 }
1205 }
1206 if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
1118 /* If ctime has changed we should definitely clear access+acl caches */ 1207 /* If ctime has changed we should definitely clear access+acl caches */
1119 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) 1208 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
1120 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1209 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1121 } else if (nfsi->change_attr != fattr->change_attr) { 1210 /* and probably clear data for a directory too as utimes can cause
1122 dprintk("NFS: change_attr change on server for file %s/%ld\n", 1211 * havoc with our cache.
1123 inode->i_sb->s_id, inode->i_ino); 1212 */
1124 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1213 if (S_ISDIR(inode->i_mode)) {
1125 if (S_ISDIR(inode->i_mode)) 1214 invalid |= NFS_INO_INVALID_DATA;
1126 nfs_force_lookup_revalidate(inode); 1215 nfs_force_lookup_revalidate(inode);
1216 }
1217 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
1218 }
1127 } 1219 }
1128 1220
1129 /* Check if our cached file size is stale */ 1221 /* Check if our cached file size is stale */
1130 new_isize = nfs_size_to_loff_t(fattr->size); 1222 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
1131 cur_isize = i_size_read(inode); 1223 new_isize = nfs_size_to_loff_t(fattr->size);
1132 if (new_isize != cur_isize) { 1224 cur_isize = i_size_read(inode);
1133 /* Do we perhaps have any outstanding writes, or has 1225 if (new_isize != cur_isize) {
1134 * the file grown beyond our last write? */ 1226 /* Do we perhaps have any outstanding writes, or has
1135 if (nfsi->npages == 0 || new_isize > cur_isize) { 1227 * the file grown beyond our last write? */
1136 i_size_write(inode, new_isize); 1228 if (nfsi->npages == 0 || new_isize > cur_isize) {
1137 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1229 i_size_write(inode, new_isize);
1230 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1231 }
1232 dprintk("NFS: isize change on server for file %s/%ld\n",
1233 inode->i_sb->s_id, inode->i_ino);
1138 } 1234 }
1139 dprintk("NFS: isize change on server for file %s/%ld\n",
1140 inode->i_sb->s_id, inode->i_ino);
1141 } 1235 }
1142 1236
1143 1237
1144 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 1238 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
1145 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 1239 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1146 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1147 nfsi->change_attr = fattr->change_attr;
1148
1149 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
1150 inode->i_uid != fattr->uid ||
1151 inode->i_gid != fattr->gid)
1152 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1153 1240
1154 if (inode->i_nlink != fattr->nlink) 1241 if (fattr->valid & NFS_ATTR_FATTR_MODE) {
1155 invalid |= NFS_INO_INVALID_ATTR; 1242 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
1243 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1244 inode->i_mode = fattr->mode;
1245 }
1246 }
1247 if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
1248 if (inode->i_uid != fattr->uid) {
1249 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1250 inode->i_uid = fattr->uid;
1251 }
1252 }
1253 if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
1254 if (inode->i_gid != fattr->gid) {
1255 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1256 inode->i_gid = fattr->gid;
1257 }
1258 }
1156 1259
1157 inode->i_mode = fattr->mode; 1260 if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
1158 inode->i_nlink = fattr->nlink; 1261 if (inode->i_nlink != fattr->nlink) {
1159 inode->i_uid = fattr->uid; 1262 invalid |= NFS_INO_INVALID_ATTR;
1160 inode->i_gid = fattr->gid; 1263 if (S_ISDIR(inode->i_mode))
1264 invalid |= NFS_INO_INVALID_DATA;
1265 inode->i_nlink = fattr->nlink;
1266 }
1267 }
1161 1268
1162 if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { 1269 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
1163 /* 1270 /*
 1164 * report the blocks in 512-byte units 1271 * report the blocks in 512-byte units
1165 */ 1272 */
1166 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 1273 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
1167 } else {
1168 inode->i_blocks = fattr->du.nfs2.blocks;
1169 } 1274 }
1275 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
1276 inode->i_blocks = fattr->du.nfs2.blocks;
1170 1277
1171 /* Update attrtimeo value if we're out of the unstable period */ 1278 /* Update attrtimeo value if we're out of the unstable period */
1172 if (invalid & NFS_INO_INVALID_ATTR) { 1279 if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1274,7 +1381,6 @@ static void init_once(void *foo)
1274 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); 1381 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
1275 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1382 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1276 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1383 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1277 nfsi->ncommit = 0;
1278 nfsi->npages = 0; 1384 nfsi->npages = 0;
1279 atomic_set(&nfsi->silly_count, 1); 1385 atomic_set(&nfsi->silly_count, 1);
1280 INIT_HLIST_HEAD(&nfsi->silly_list); 1386 INIT_HLIST_HEAD(&nfsi->silly_list);
@@ -1337,6 +1443,10 @@ static int __init init_nfs_fs(void)
1337{ 1443{
1338 int err; 1444 int err;
1339 1445
1446 err = nfs_fscache_register();
1447 if (err < 0)
1448 goto out7;
1449
1340 err = nfsiod_start(); 1450 err = nfsiod_start();
1341 if (err) 1451 if (err)
1342 goto out6; 1452 goto out6;
@@ -1389,6 +1499,8 @@ out4:
1389out5: 1499out5:
1390 nfsiod_stop(); 1500 nfsiod_stop();
1391out6: 1501out6:
1502 nfs_fscache_unregister();
1503out7:
1392 return err; 1504 return err;
1393} 1505}
1394 1506
@@ -1399,6 +1511,7 @@ static void __exit exit_nfs_fs(void)
1399 nfs_destroy_readpagecache(); 1511 nfs_destroy_readpagecache();
1400 nfs_destroy_inodecache(); 1512 nfs_destroy_inodecache();
1401 nfs_destroy_nfspagecache(); 1513 nfs_destroy_nfspagecache();
1514 nfs_fscache_unregister();
1402#ifdef CONFIG_PROC_FS 1515#ifdef CONFIG_PROC_FS
1403 rpc_proc_unregister("nfs"); 1516 rpc_proc_unregister("nfs");
1404#endif 1517#endif
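
The inode.c hunks above all follow one pattern: instead of assuming every nfs_fattr arrives fully populated, each field update is gated on a per-attribute NFS_ATTR_FATTR_* bit in fattr->valid, so partial replies no longer clobber cached state. A minimal userspace sketch of that pattern; the flag values and struct layouts are invented for illustration, not the kernel's definitions:

    #include <stdint.h>
    #include <stdio.h>

    #define ATTR_MTIME 0x1u
    #define ATTR_SIZE  0x2u
    #define ATTR_NLINK 0x4u

    struct fake_fattr {
        uint32_t valid;        /* which fields the server actually sent */
        long mtime;
        long size;
        unsigned nlink;
    };

    struct fake_inode {
        long mtime;
        long size;
        unsigned nlink;
    };

    /* Copy a field only when the matching validity bit is set. */
    static void update_inode(struct fake_inode *ino, const struct fake_fattr *f)
    {
        if (f->valid & ATTR_MTIME)
            ino->mtime = f->mtime;
        if (f->valid & ATTR_SIZE)
            ino->size = f->size;
        if (f->valid & ATTR_NLINK)
            ino->nlink = f->nlink;
    }

    int main(void)
    {
        struct fake_inode ino = { .mtime = 10, .size = 100, .nlink = 1 };
        struct fake_fattr f = { .valid = ATTR_SIZE, .size = 200 };

        update_inode(&ino, &f);   /* mtime and nlink survive untouched */
        printf("mtime=%ld size=%ld nlink=%u\n", ino.mtime, ino.size, ino.nlink);
        return 0;
    }

The same guard appears in nfs_check_inode_attributes(), nfs_wcc_update_inode() and nfs_update_inode() above; only the set of flags consulted differs.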
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 340ede8f608f..e4d6a8348adf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,6 +5,8 @@
5#include <linux/mount.h> 5#include <linux/mount.h>
6#include <linux/security.h> 6#include <linux/security.h>
7 7
8#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
9
8struct nfs_string; 10struct nfs_string;
9 11
10/* Maximum number of readahead requests 12/* Maximum number of readahead requests
@@ -37,10 +39,12 @@ struct nfs_parsed_mount_data {
37 int acregmin, acregmax, 39 int acregmin, acregmax,
38 acdirmin, acdirmax; 40 acdirmin, acdirmax;
39 int namlen; 41 int namlen;
42 unsigned int options;
40 unsigned int bsize; 43 unsigned int bsize;
41 unsigned int auth_flavor_len; 44 unsigned int auth_flavor_len;
42 rpc_authflavor_t auth_flavors[1]; 45 rpc_authflavor_t auth_flavors[1];
43 char *client_address; 46 char *client_address;
47 char *fscache_uniq;
44 48
45 struct { 49 struct {
46 struct sockaddr_storage address; 50 struct sockaddr_storage address;
@@ -152,6 +156,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
152extern struct rpc_procinfo nfs4_procedures[]; 156extern struct rpc_procinfo nfs4_procedures[];
153#endif 157#endif
154 158
159/* proc.c */
160void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
161
155/* dir.c */ 162/* dir.c */
156extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); 163extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
157 164
@@ -165,6 +172,7 @@ extern void nfs_clear_inode(struct inode *);
165extern void nfs4_clear_inode(struct inode *); 172extern void nfs4_clear_inode(struct inode *);
166#endif 173#endif
167void nfs_zap_acl_cache(struct inode *inode); 174void nfs_zap_acl_cache(struct inode *inode);
175extern int nfs_wait_bit_killable(void *word);
168 176
169/* super.c */ 177/* super.c */
170void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); 178void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
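
The new NFS_MS_MASK define above is a flag mask: only the MS_* bits listed in it are significant when NFS compares or reuses mounts. A small sketch of how such a mask is applied; the F_* values are made up, not the real MS_* constants:

    #include <stdio.h>

    #define F_RDONLY  0x001u
    #define F_NOSUID  0x002u
    #define F_NODEV   0x004u
    #define F_NOATIME 0x400u               /* deliberately outside the mask */

    #define FLAG_MASK (F_RDONLY | F_NOSUID | F_NODEV)

    int main(void)
    {
        unsigned requested = F_RDONLY | F_NOATIME;

        /* F_NOATIME is ignored for comparison purposes. */
        printf("significant flags: %#x\n", requested & FLAG_MASK);
        return 0;
    }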
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a36952810032..a2ab2529b5ca 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -16,6 +16,9 @@
16 16
17struct nfs_iostats { 17struct nfs_iostats {
18 unsigned long long bytes[__NFSIOS_BYTESMAX]; 18 unsigned long long bytes[__NFSIOS_BYTESMAX];
19#ifdef CONFIG_NFS_FSCACHE
20 unsigned long long fscache[__NFSIOS_FSCACHEMAX];
21#endif
19 unsigned long events[__NFSIOS_COUNTSMAX]; 22 unsigned long events[__NFSIOS_COUNTSMAX];
20} ____cacheline_aligned; 23} ____cacheline_aligned;
21 24
@@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode,
57 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 60 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
58} 61}
59 62
63#ifdef CONFIG_NFS_FSCACHE
64static inline void nfs_add_fscache_stats(struct inode *inode,
65 enum nfs_stat_fscachecounters stat,
66 unsigned long addend)
67{
68 struct nfs_iostats *iostats;
69 int cpu;
70
71 cpu = get_cpu();
72 iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
73 iostats->fscache[stat] += addend;
74 put_cpu_no_resched();
75}
76#endif
77
60static inline struct nfs_iostats *nfs_alloc_iostats(void) 78static inline struct nfs_iostats *nfs_alloc_iostats(void)
61{ 79{
62 return alloc_percpu(struct nfs_iostats); 80 return alloc_percpu(struct nfs_iostats);
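
nfs_add_fscache_stats() above extends the existing per-CPU iostats with a CONFIG_NFS_FSCACHE-only counter array: the writer bumps its own CPU's slot without locking, and a reader sums every slot. A rough userspace analogue of that idea, using sched_getcpu() in place of get_cpu()/put_cpu_no_resched(); sched_getcpu() is glibc-specific and, unlike the kernel version, does not pin the task between lookup and increment:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    #define MAX_CPUS 64

    static unsigned long long fscache_stat[MAX_CPUS];

    static void add_stat(unsigned long long addend)
    {
        int cpu = sched_getcpu();

        if (cpu >= 0 && cpu < MAX_CPUS)
            fscache_stat[cpu] += addend;   /* no lock on the fast path */
    }

    static unsigned long long read_stat(void)
    {
        unsigned long long sum = 0;

        for (int cpu = 0; cpu < MAX_CPUS; cpu++)
            sum += fscache_stat[cpu];
        return sum;
    }

    int main(void)
    {
        add_stat(4096);
        add_stat(512);
        printf("total = %llu\n", read_stat());
        return 0;
    }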
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 28bab67d1519..c862c9340f9a 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep)
120static __be32 * 120static __be32 *
121xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 121xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
122{ 122{
123 u32 rdev; 123 u32 rdev, type;
124 fattr->type = (enum nfs_ftype) ntohl(*p++); 124 type = ntohl(*p++);
125 fattr->mode = ntohl(*p++); 125 fattr->mode = ntohl(*p++);
126 fattr->nlink = ntohl(*p++); 126 fattr->nlink = ntohl(*p++);
127 fattr->uid = ntohl(*p++); 127 fattr->uid = ntohl(*p++);
@@ -136,10 +136,9 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
136 p = xdr_decode_time(p, &fattr->atime); 136 p = xdr_decode_time(p, &fattr->atime);
137 p = xdr_decode_time(p, &fattr->mtime); 137 p = xdr_decode_time(p, &fattr->mtime);
138 p = xdr_decode_time(p, &fattr->ctime); 138 p = xdr_decode_time(p, &fattr->ctime);
139 fattr->valid |= NFS_ATTR_FATTR; 139 fattr->valid |= NFS_ATTR_FATTR_V2;
140 fattr->rdev = new_decode_dev(rdev); 140 fattr->rdev = new_decode_dev(rdev);
141 if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { 141 if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
142 fattr->type = NFFIFO;
143 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; 142 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
144 fattr->rdev = 0; 143 fattr->rdev = 0;
145 } 144 }
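
The nfs2xdr.c hunk drops fattr->type but keeps the NFSv2 FIFO quirk: the v2 wire protocol has no FIFO type, so servers send NFCHR with the sentinel rdev NFS2_FIFO_DEV and the client rewrites the mode bits. A sketch of that fixup; the two constants below are stand-ins for the wire values, not the protocol's:

    #include <sys/stat.h>
    #include <stdio.h>

    #define FAKE_NFCHR    4u
    #define FAKE_FIFO_DEV 0xffffffffu

    static mode_t fixup_mode(unsigned type, unsigned rdev, mode_t mode)
    {
        if (type == FAKE_NFCHR && rdev == FAKE_FIFO_DEV)
            return (mode & ~S_IFMT) | S_IFIFO;  /* it was really a FIFO */
        return mode;
    }

    int main(void)
    {
        mode_t m = fixup_mode(FAKE_NFCHR, FAKE_FIFO_DEV, S_IFCHR | 0644);

        printf("fifo? %s\n", S_ISFIFO(m) ? "yes" : "no");
        return 0;
    }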
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index cef62557c87d..6bbf0e6daad2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
292{ 292{
293 struct nfs_server *server = NFS_SERVER(inode); 293 struct nfs_server *server = NFS_SERVER(inode);
294 struct nfs_fattr fattr; 294 struct nfs_fattr fattr;
295 struct page *pages[NFSACL_MAXPAGES] = { }; 295 struct page *pages[NFSACL_MAXPAGES];
296 struct nfs3_setaclargs args = { 296 struct nfs3_setaclargs args = {
297 .inode = inode, 297 .inode = inode,
298 .mask = NFS_ACL, 298 .mask = NFS_ACL,
@@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
303 .rpc_argp = &args, 303 .rpc_argp = &args,
304 .rpc_resp = &fattr, 304 .rpc_resp = &fattr,
305 }; 305 };
306 int status, count; 306 int status;
307 307
308 status = -EOPNOTSUPP; 308 status = -EOPNOTSUPP;
309 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 309 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
319 if (S_ISDIR(inode->i_mode)) { 319 if (S_ISDIR(inode->i_mode)) {
320 args.mask |= NFS_DFACL; 320 args.mask |= NFS_DFACL;
321 args.acl_default = dfacl; 321 args.acl_default = dfacl;
322 args.len = nfsacl_size(acl, dfacl);
323 } else
324 args.len = nfsacl_size(acl, NULL);
325
326 if (args.len > NFS_ACL_INLINE_BUFSIZE) {
327 unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT);
328
329 status = -ENOMEM;
330 do {
331 args.pages[args.npages] = alloc_page(GFP_KERNEL);
332 if (args.pages[args.npages] == NULL)
333 goto out_freepages;
334 args.npages++;
335 } while (args.npages < npages);
322 } 336 }
323 337
324 dprintk("NFS call setacl\n"); 338 dprintk("NFS call setacl\n");
@@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
329 nfs_zap_acl_cache(inode); 343 nfs_zap_acl_cache(inode);
330 dprintk("NFS reply setacl: %d\n", status); 344 dprintk("NFS reply setacl: %d\n", status);
331 345
332 /* pages may have been allocated at the xdr layer. */
333 for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
334 __free_page(args.pages[count]);
335
336 switch (status) { 346 switch (status) {
337 case 0: 347 case 0:
338 status = nfs_refresh_inode(inode, &fattr); 348 status = nfs_refresh_inode(inode, &fattr);
@@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
346 case -ENOTSUPP: 356 case -ENOTSUPP:
347 status = -EOPNOTSUPP; 357 status = -EOPNOTSUPP;
348 } 358 }
359out_freepages:
360 while (args.npages != 0) {
361 args.npages--;
362 __free_page(args.pages[args.npages]);
363 }
349out: 364out:
350 return status; 365 return status;
351} 366}
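
nfs3_proc_setacls() above now sizes the ACL payload with nfsacl_size() and, when it exceeds NFS_ACL_INLINE_BUFSIZE, allocates the needed pages up front, unwinding via out_freepages on failure; the allocation used to happen down in the XDR layer. A userspace sketch of that allocate-or-unwind shape, with malloc() standing in for alloc_page(GFP_KERNEL):

    #include <stdlib.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1u << PAGE_SHIFT)

    static int alloc_payload_pages(void **pages, unsigned int *npages, size_t len)
    {
        unsigned int need = 1 + ((len - 1) >> PAGE_SHIFT); /* ceil(len / PAGE_SIZE) */

        *npages = 0;
        while (*npages < need) {
            pages[*npages] = malloc(PAGE_SIZE);
            if (pages[*npages] == NULL)
                goto out_free;
            (*npages)++;
        }
        return 0;

    out_free:                       /* mirrors the out_freepages label above */
        while (*npages != 0)
            free(pages[--(*npages)]);
        return -1;
    }

    int main(void)
    {
        void *pages[16];
        unsigned int n;

        if (alloc_payload_pages(pages, &n, 9000) == 0) {
            printf("allocated %u pages for 9000 bytes\n", n);
            while (n != 0)
                free(pages[--n]);
        }
        return 0;
    }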
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c55be7a7679e..d0cc5ce0edfe 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
328 data->arg.create.verifier[1] = current->pid; 328 data->arg.create.verifier[1] = current->pid;
329 } 329 }
330 330
331 sattr->ia_mode &= ~current->fs->umask; 331 sattr->ia_mode &= ~current_umask();
332 332
333 for (;;) { 333 for (;;) {
334 status = nfs3_do_create(dir, dentry, data); 334 status = nfs3_do_create(dir, dentry, data);
@@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
528 528
529 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 529 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
530 530
531 sattr->ia_mode &= ~current->fs->umask; 531 sattr->ia_mode &= ~current_umask();
532 532
533 data = nfs3_alloc_createdata(); 533 data = nfs3_alloc_createdata();
534 if (data == NULL) 534 if (data == NULL)
@@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
639 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, 639 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
640 MAJOR(rdev), MINOR(rdev)); 640 MAJOR(rdev), MINOR(rdev));
641 641
642 sattr->ia_mode &= ~current->fs->umask; 642 sattr->ia_mode &= ~current_umask();
643 643
644 data = nfs3_alloc_createdata(); 644 data = nfs3_alloc_createdata();
645 if (data == NULL) 645 if (data == NULL)
@@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
834 .commit_done = nfs3_commit_done, 834 .commit_done = nfs3_commit_done,
835 .lock = nfs3_proc_lock, 835 .lock = nfs3_proc_lock,
836 .clear_acl_cache = nfs3_forget_cached_acls, 836 .clear_acl_cache = nfs3_forget_cached_acls,
837 .close_context = nfs_close_context,
837}; 838};
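
The nfs3proc.c hunks swap the open-coded current->fs->umask for the current_umask() helper; the effect on the requested mode is unchanged. A tiny POSIX illustration of what applying a umask to a create mode means:

    #include <sys/stat.h>
    #include <stdio.h>

    int main(void)
    {
        mode_t requested = 0777;
        mode_t um = umask(0);       /* read the process umask ... */

        umask(um);                  /* ... and put it back */
        printf("effective mode: %04o\n", (unsigned)(requested & ~um));
        return 0;
    }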
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 11cdddec1432..e6a1932c7110 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -82,26 +82,24 @@
82#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) 82#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2)
83 83
84#define ACL3_getaclargs_sz (NFS3_fh_sz+1) 84#define ACL3_getaclargs_sz (NFS3_fh_sz+1)
85#define ACL3_setaclargs_sz (NFS3_fh_sz+1+2*(2+5*3)) 85#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \
86#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+2*(2+5*3)) 86 XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
87#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \
88 XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
87#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) 89#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz)
88 90
89/* 91/*
90 * Map file type to S_IFMT bits 92 * Map file type to S_IFMT bits
91 */ 93 */
92static struct { 94static const umode_t nfs_type2fmt[] = {
93 unsigned int mode; 95 [NF3BAD] = 0,
94 unsigned int nfs2type; 96 [NF3REG] = S_IFREG,
95} nfs_type2fmt[] = { 97 [NF3DIR] = S_IFDIR,
96 { 0, NFNON }, 98 [NF3BLK] = S_IFBLK,
97 { S_IFREG, NFREG }, 99 [NF3CHR] = S_IFCHR,
98 { S_IFDIR, NFDIR }, 100 [NF3LNK] = S_IFLNK,
99 { S_IFBLK, NFBLK }, 101 [NF3SOCK] = S_IFSOCK,
100 { S_IFCHR, NFCHR }, 102 [NF3FIFO] = S_IFIFO,
101 { S_IFLNK, NFLNK },
102 { S_IFSOCK, NFSOCK },
103 { S_IFIFO, NFFIFO },
104 { 0, NFBAD }
105}; 103};
106 104
107/* 105/*
@@ -146,13 +144,12 @@ static __be32 *
146xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) 144xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
147{ 145{
148 unsigned int type, major, minor; 146 unsigned int type, major, minor;
149 int fmode; 147 umode_t fmode;
150 148
151 type = ntohl(*p++); 149 type = ntohl(*p++);
152 if (type >= NF3BAD) 150 if (type > NF3FIFO)
153 type = NF3BAD; 151 type = NF3NON;
154 fmode = nfs_type2fmt[type].mode; 152 fmode = nfs_type2fmt[type];
155 fattr->type = nfs_type2fmt[type].nfs2type;
156 fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; 153 fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
157 fattr->nlink = ntohl(*p++); 154 fattr->nlink = ntohl(*p++);
158 fattr->uid = ntohl(*p++); 155 fattr->uid = ntohl(*p++);
@@ -175,7 +172,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
175 p = xdr_decode_time3(p, &fattr->ctime); 172 p = xdr_decode_time3(p, &fattr->ctime);
176 173
177 /* Update the mode bits */ 174 /* Update the mode bits */
178 fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); 175 fattr->valid |= NFS_ATTR_FATTR_V3;
179 return p; 176 return p;
180} 177}
181 178
@@ -231,7 +228,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
231 p = xdr_decode_hyper(p, &fattr->pre_size); 228 p = xdr_decode_hyper(p, &fattr->pre_size);
232 p = xdr_decode_time3(p, &fattr->pre_mtime); 229 p = xdr_decode_time3(p, &fattr->pre_mtime);
233 p = xdr_decode_time3(p, &fattr->pre_ctime); 230 p = xdr_decode_time3(p, &fattr->pre_ctime);
234 fattr->valid |= NFS_ATTR_WCC; 231 fattr->valid |= NFS_ATTR_FATTR_PRESIZE
232 | NFS_ATTR_FATTR_PREMTIME
233 | NFS_ATTR_FATTR_PRECTIME;
235 return p; 234 return p;
236} 235}
237 236
@@ -703,28 +702,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
703 struct nfs3_setaclargs *args) 702 struct nfs3_setaclargs *args)
704{ 703{
705 struct xdr_buf *buf = &req->rq_snd_buf; 704 struct xdr_buf *buf = &req->rq_snd_buf;
706 unsigned int base, len_in_head, len = nfsacl_size( 705 unsigned int base;
707 (args->mask & NFS_ACL) ? args->acl_access : NULL, 706 int err;
708 (args->mask & NFS_DFACL) ? args->acl_default : NULL);
709 int count, err;
710 707
711 p = xdr_encode_fhandle(p, NFS_FH(args->inode)); 708 p = xdr_encode_fhandle(p, NFS_FH(args->inode));
712 *p++ = htonl(args->mask); 709 *p++ = htonl(args->mask);
713 base = (char *)p - (char *)buf->head->iov_base; 710 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
714 /* put as much of the acls into head as possible. */ 711 base = req->rq_slen;
715 len_in_head = min_t(unsigned int, buf->head->iov_len - base, len); 712
716 len -= len_in_head; 713 if (args->npages != 0)
717 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2)); 714 xdr_encode_pages(buf, args->pages, 0, args->len);
718 715 else
719 for (count = 0; (count << PAGE_SHIFT) < len; count++) { 716 req->rq_slen += args->len;
720 args->pages[count] = alloc_page(GFP_KERNEL);
721 if (!args->pages[count]) {
722 while (count)
723 __free_page(args->pages[--count]);
724 return -ENOMEM;
725 }
726 }
727 xdr_encode_pages(buf, args->pages, 0, len);
728 717
729 err = nfsacl_encode(buf, base, args->inode, 718 err = nfsacl_encode(buf, base, args->inode,
730 (args->mask & NFS_ACL) ? 719 (args->mask & NFS_ACL) ?
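
nfs3xdr.c replaces the ordered {mode, nfs2type} pair array with a designated-initializer table mapping the NF3* wire value straight to S_IFMT bits, clamping out-of-range values before indexing. A sketch of that table style; the enum values below are invented, not the NF3* constants:

    #include <sys/stat.h>
    #include <stdio.h>

    enum { T_BAD, T_REG, T_DIR, T_FIFO, T_MAX };

    static const mode_t type2fmt[] = {
        [T_BAD]  = 0,
        [T_REG]  = S_IFREG,
        [T_DIR]  = S_IFDIR,
        [T_FIFO] = S_IFIFO,
    };

    int main(void)
    {
        unsigned type = 17;         /* bogus value off the wire */

        if (type >= T_MAX)
            type = T_BAD;           /* clamp, as the decoder now does */
        printf("fmt bits: %07o\n", (unsigned)type2fmt[type]);
        return 0;
    }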
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4e4d33204376..84345deab26f 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -179,7 +179,7 @@ struct nfs4_state_recovery_ops {
179 int (*recover_lock)(struct nfs4_state *, struct file_lock *); 179 int (*recover_lock)(struct nfs4_state *, struct file_lock *);
180}; 180};
181 181
182extern struct dentry_operations nfs4_dentry_operations; 182extern const struct dentry_operations nfs4_dentry_operations;
183extern const struct inode_operations nfs4_dir_inode_operations; 183extern const struct inode_operations nfs4_dir_inode_operations;
184 184
185/* inode.c */ 185/* inode.c */
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 30befc39b3c6..2a2a0a7143ad 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -21,7 +21,9 @@
21#define NFSDBG_FACILITY NFSDBG_VFS 21#define NFSDBG_FACILITY NFSDBG_VFS
22 22
23/* 23/*
24 * Check if fs_root is valid 24 * Convert the NFSv4 pathname components into a standard posix path.
25 *
26 * Note that the resulting string will be placed at the end of the buffer
25 */ 27 */
26static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, 28static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname,
27 char *buffer, ssize_t buflen) 29 char *buffer, ssize_t buflen)
@@ -99,21 +101,20 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
99{ 101{
100 struct vfsmount *mnt = ERR_PTR(-ENOENT); 102 struct vfsmount *mnt = ERR_PTR(-ENOENT);
101 char *mnt_path; 103 char *mnt_path;
102 int page2len; 104 unsigned int maxbuflen;
103 unsigned int s; 105 unsigned int s;
104 106
105 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); 107 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
106 if (IS_ERR(mnt_path)) 108 if (IS_ERR(mnt_path))
107 return mnt; 109 return mnt;
108 mountdata->mnt_path = mnt_path; 110 mountdata->mnt_path = mnt_path;
109 page2 += strlen(mnt_path) + 1; 111 maxbuflen = mnt_path - 1 - page2;
110 page2len = PAGE_SIZE - strlen(mnt_path) - 1;
111 112
112 for (s = 0; s < location->nservers; s++) { 113 for (s = 0; s < location->nservers; s++) {
113 const struct nfs4_string *buf = &location->servers[s]; 114 const struct nfs4_string *buf = &location->servers[s];
114 struct sockaddr_storage addr; 115 struct sockaddr_storage addr;
115 116
116 if (buf->len <= 0 || buf->len >= PAGE_SIZE) 117 if (buf->len <= 0 || buf->len >= maxbuflen)
117 continue; 118 continue;
118 119
119 mountdata->addr = (struct sockaddr *)&addr; 120 mountdata->addr = (struct sockaddr *)&addr;
@@ -126,8 +127,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
126 continue; 127 continue;
127 nfs_set_port(mountdata->addr, NFS_PORT); 128 nfs_set_port(mountdata->addr, NFS_PORT);
128 129
129 strncpy(page2, buf->data, page2len); 130 memcpy(page2, buf->data, buf->len);
130 page2[page2len] = '\0'; 131 page2[buf->len] = '\0';
131 mountdata->hostname = page2; 132 mountdata->hostname = page2;
132 133
133 snprintf(page, PAGE_SIZE, "%s:%s", 134 snprintf(page, PAGE_SIZE, "%s:%s",
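
The try_location() fix above is buffer arithmetic: nfs4_pathname_string() builds the mount path at the end of the scratch page, so the space left for a NUL-terminated hostname at the front is (mnt_path - 1 - page2), and a memcpy of the known length replaces the previous strncpy. A self-contained sketch of that layout; PAGE_SIZE and the strings are illustrative:

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096

    int main(void)
    {
        static char page2[PAGE_SIZE];
        const char *path = "/export/home";
        char *mnt_path = page2 + PAGE_SIZE - strlen(path) - 1;

        strcpy(mnt_path, path);     /* path string ends exactly at the page end */

        size_t maxbuflen = (size_t)(mnt_path - 1 - page2);
        const char *host = "server.example.com";
        size_t len = strlen(host);

        if (len > 0 && len < maxbuflen) {   /* same bound the hunk enforces */
            memcpy(page2, host, len);
            page2[len] = '\0';
            printf("%s:%s\n", page2, mnt_path);
        }
        return 0;
    }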
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dde84b988d9..a4d242680299 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
193 kunmap_atomic(start, KM_USER0); 193 kunmap_atomic(start, KM_USER0);
194} 194}
195 195
196static int nfs4_wait_bit_killable(void *word)
197{
198 if (fatal_signal_pending(current))
199 return -ERESTARTSYS;
200 schedule();
201 return 0;
202}
203
204static int nfs4_wait_clnt_recover(struct nfs_client *clp) 196static int nfs4_wait_clnt_recover(struct nfs_client *clp)
205{ 197{
206 int res; 198 int res;
@@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp)
208 might_sleep(); 200 might_sleep();
209 201
210 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, 202 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
211 nfs4_wait_bit_killable, TASK_KILLABLE); 203 nfs_wait_bit_killable, TASK_KILLABLE);
212 return res; 204 return res;
213} 205}
214 206
@@ -1439,7 +1431,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1439 if (calldata->arg.seqid == NULL) 1431 if (calldata->arg.seqid == NULL)
1440 goto out_free_calldata; 1432 goto out_free_calldata;
1441 calldata->arg.fmode = 0; 1433 calldata->arg.fmode = 0;
1442 calldata->arg.bitmask = server->attr_bitmask; 1434 calldata->arg.bitmask = server->cache_consistency_bitmask;
1443 calldata->res.fattr = &calldata->fattr; 1435 calldata->res.fattr = &calldata->fattr;
1444 calldata->res.seqid = calldata->arg.seqid; 1436 calldata->res.seqid = calldata->arg.seqid;
1445 calldata->res.server = server; 1437 calldata->res.server = server;
@@ -1509,7 +1501,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1509 attr.ia_mode = nd->intent.open.create_mode; 1501 attr.ia_mode = nd->intent.open.create_mode;
1510 attr.ia_valid = ATTR_MODE; 1502 attr.ia_valid = ATTR_MODE;
1511 if (!IS_POSIXACL(dir)) 1503 if (!IS_POSIXACL(dir))
1512 attr.ia_mode &= ~current->fs->umask; 1504 attr.ia_mode &= ~current_umask();
1513 } else { 1505 } else {
1514 attr.ia_valid = 0; 1506 attr.ia_valid = 0;
1515 BUG_ON(nd->intent.open.flags & O_CREAT); 1507 BUG_ON(nd->intent.open.flags & O_CREAT);
@@ -1580,6 +1572,15 @@ out_drop:
1580 return 0; 1572 return 0;
1581} 1573}
1582 1574
1575void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
1576{
1577 if (ctx->state == NULL)
1578 return;
1579 if (is_sync)
1580 nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
1581 else
1582 nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
1583}
1583 1584
1584static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) 1585static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
1585{ 1586{
@@ -1600,6 +1601,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
1600 server->caps |= NFS_CAP_HARDLINKS; 1601 server->caps |= NFS_CAP_HARDLINKS;
1601 if (res.has_symlinks != 0) 1602 if (res.has_symlinks != 0)
1602 server->caps |= NFS_CAP_SYMLINKS; 1603 server->caps |= NFS_CAP_SYMLINKS;
1604 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
1605 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
1606 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
1603 server->acl_bitmask = res.acl_bitmask; 1607 server->acl_bitmask = res.acl_bitmask;
1604 } 1608 }
1605 return status; 1609 return status;
@@ -2079,7 +2083,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
2079 struct nfs_removeargs *args = msg->rpc_argp; 2083 struct nfs_removeargs *args = msg->rpc_argp;
2080 struct nfs_removeres *res = msg->rpc_resp; 2084 struct nfs_removeres *res = msg->rpc_resp;
2081 2085
2082 args->bitmask = server->attr_bitmask; 2086 args->bitmask = server->cache_consistency_bitmask;
2083 res->server = server; 2087 res->server = server;
2084 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 2088 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
2085} 2089}
@@ -2323,7 +2327,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2323 .pages = &page, 2327 .pages = &page,
2324 .pgbase = 0, 2328 .pgbase = 0,
2325 .count = count, 2329 .count = count,
2326 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, 2330 .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
2327 }; 2331 };
2328 struct nfs4_readdir_res res; 2332 struct nfs4_readdir_res res;
2329 struct rpc_message msg = { 2333 struct rpc_message msg = {
@@ -2552,7 +2556,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
2552{ 2556{
2553 struct nfs_server *server = NFS_SERVER(data->inode); 2557 struct nfs_server *server = NFS_SERVER(data->inode);
2554 2558
2555 data->args.bitmask = server->attr_bitmask; 2559 data->args.bitmask = server->cache_consistency_bitmask;
2556 data->res.server = server; 2560 data->res.server = server;
2557 data->timestamp = jiffies; 2561 data->timestamp = jiffies;
2558 2562
@@ -2575,7 +2579,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
2575{ 2579{
2576 struct nfs_server *server = NFS_SERVER(data->inode); 2580 struct nfs_server *server = NFS_SERVER(data->inode);
2577 2581
2578 data->args.bitmask = server->attr_bitmask; 2582 data->args.bitmask = server->cache_consistency_bitmask;
2579 data->res.server = server; 2583 data->res.server = server;
2580 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 2584 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
2581} 2585}
@@ -3678,6 +3682,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
3678 return len; 3682 return len;
3679} 3683}
3680 3684
3685static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
3686{
3687 if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) &&
3688 (fattr->valid & NFS_ATTR_FATTR_FSID) &&
3689 (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
3690 return;
3691
3692 fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
3693 NFS_ATTR_FATTR_NLINK;
3694 fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
3695 fattr->nlink = 2;
3696}
3697
3681int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 3698int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3682 struct nfs4_fs_locations *fs_locations, struct page *page) 3699 struct nfs4_fs_locations *fs_locations, struct page *page)
3683{ 3700{
@@ -3704,6 +3721,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3704 fs_locations->server = server; 3721 fs_locations->server = server;
3705 fs_locations->nlocations = 0; 3722 fs_locations->nlocations = 0;
3706 status = rpc_call_sync(server->client, &msg, 0); 3723 status = rpc_call_sync(server->client, &msg, 0);
3724 nfs_fixup_referral_attributes(&fs_locations->fattr);
3707 dprintk("%s: returned status = %d\n", __func__, status); 3725 dprintk("%s: returned status = %d\n", __func__, status);
3708 return status; 3726 return status;
3709} 3727}
@@ -3767,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
3767 .commit_done = nfs4_commit_done, 3785 .commit_done = nfs4_commit_done,
3768 .lock = nfs4_proc_lock, 3786 .lock = nfs4_proc_lock,
3769 .clear_acl_cache = nfs4_zap_acl_attr, 3787 .clear_acl_cache = nfs4_zap_acl_attr,
3788 .close_context = nfs4_close_context,
3770}; 3789};
3771 3790
3772/* 3791/*
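
The nfs4proc.c hunks introduce server->cache_consistency_bitmask: _nfs4_server_capabilities() masks the server's supported-attribute words down to change, size, ctime and mtime, and the post-op paths (close, remove, readdir, write, commit) request only those instead of the full attr_bitmask. A sketch of that masking step; the bit values below are placeholders for the FATTR4_WORD* constants:

    #include <stdint.h>
    #include <stdio.h>

    #define W0_CHANGE 0x01u
    #define W0_SIZE   0x02u
    #define W0_OTHER  0xf0u
    #define W1_CTIME  0x01u
    #define W1_MTIME  0x02u
    #define W1_OTHER  0xf0u

    int main(void)
    {
        uint32_t supported[2] = { W0_CHANGE | W0_SIZE | W0_OTHER,
                                  W1_CTIME | W1_MTIME | W1_OTHER };
        uint32_t cc[2];

        cc[0] = supported[0] & (W0_CHANGE | W0_SIZE);
        cc[1] = supported[1] & (W1_CTIME | W1_MTIME);
        printf("cache-consistency bitmask: %#x %#x\n", cc[0], cc[1]);
        return 0;
    }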
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2022fe47966f..0298e909559f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list);
62 62
63static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) 63static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
64{ 64{
65 int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, 65 unsigned short port;
66 nfs_callback_tcpport, cred); 66 int status;
67
68 port = nfs_callback_tcpport;
69 if (clp->cl_addr.ss_family == AF_INET6)
70 port = nfs_callback_tcpport6;
71
72 status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
67 if (status == 0) 73 if (status == 0)
68 status = nfs4_proc_setclientid_confirm(clp, cred); 74 status = nfs4_proc_setclientid_confirm(clp, cred);
69 if (status == 0) 75 if (status == 0)
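
nfs4_init_client() above now picks the callback port by address family, using nfs_callback_tcpport6 for AF_INET6 clients. The selection itself is trivial; a sketch with placeholder port numbers, not the real callback ports:

    #include <sys/socket.h>
    #include <stdio.h>

    static unsigned short pick_callback_port(sa_family_t family,
                                             unsigned short port4,
                                             unsigned short port6)
    {
        return family == AF_INET6 ? port6 : port4;
    }

    int main(void)
    {
        printf("v4 -> %u, v6 -> %u\n",
               pick_callback_port(AF_INET, 2049, 2050),
               pick_callback_port(AF_INET6, 2049, 2050));
        return 0;
    }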
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d1e4c8f8a0a9..1690f0e44b91 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int);
522 decode_lookup_maxsz + \ 522 decode_lookup_maxsz + \
523 decode_fs_locations_maxsz) 523 decode_fs_locations_maxsz)
524 524
525static struct { 525static const umode_t nfs_type2fmt[] = {
526 unsigned int mode; 526 [NF4BAD] = 0,
527 unsigned int nfs2type; 527 [NF4REG] = S_IFREG,
528} nfs_type2fmt[] = { 528 [NF4DIR] = S_IFDIR,
529 { 0, NFNON }, 529 [NF4BLK] = S_IFBLK,
530 { S_IFREG, NFREG }, 530 [NF4CHR] = S_IFCHR,
531 { S_IFDIR, NFDIR }, 531 [NF4LNK] = S_IFLNK,
532 { S_IFBLK, NFBLK }, 532 [NF4SOCK] = S_IFSOCK,
533 { S_IFCHR, NFCHR }, 533 [NF4FIFO] = S_IFIFO,
534 { S_IFLNK, NFLNK }, 534 [NF4ATTRDIR] = 0,
535 { S_IFSOCK, NFSOCK }, 535 [NF4NAMEDATTR] = 0,
536 { S_IFIFO, NFFIFO },
537 { 0, NFNON },
538 { 0, NFNON },
539}; 536};
540 537
541struct compound_hdr { 538struct compound_hdr {
@@ -2160,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
2160static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) 2157static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
2161{ 2158{
2162 __be32 *p; 2159 __be32 *p;
2160 int ret = 0;
2163 2161
2164 *type = 0; 2162 *type = 0;
2165 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) 2163 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2172,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2172 return -EIO; 2170 return -EIO;
2173 } 2171 }
2174 bitmap[0] &= ~FATTR4_WORD0_TYPE; 2172 bitmap[0] &= ~FATTR4_WORD0_TYPE;
2173 ret = NFS_ATTR_FATTR_TYPE;
2175 } 2174 }
2176 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); 2175 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
2177 return 0; 2176 return ret;
2178} 2177}
2179 2178
2180static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 2179static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
2181{ 2180{
2182 __be32 *p; 2181 __be32 *p;
2182 int ret = 0;
2183 2183
2184 *change = 0; 2184 *change = 0;
2185 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) 2185 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2188,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2188 READ_BUF(8); 2188 READ_BUF(8);
2189 READ64(*change); 2189 READ64(*change);
2190 bitmap[0] &= ~FATTR4_WORD0_CHANGE; 2190 bitmap[0] &= ~FATTR4_WORD0_CHANGE;
2191 ret = NFS_ATTR_FATTR_CHANGE;
2191 } 2192 }
2192 dprintk("%s: change attribute=%Lu\n", __func__, 2193 dprintk("%s: change attribute=%Lu\n", __func__,
2193 (unsigned long long)*change); 2194 (unsigned long long)*change);
2194 return 0; 2195 return ret;
2195} 2196}
2196 2197
2197static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) 2198static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
2198{ 2199{
2199 __be32 *p; 2200 __be32 *p;
2201 int ret = 0;
2200 2202
2201 *size = 0; 2203 *size = 0;
2202 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) 2204 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2205,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
2205 READ_BUF(8); 2207 READ_BUF(8);
2206 READ64(*size); 2208 READ64(*size);
2207 bitmap[0] &= ~FATTR4_WORD0_SIZE; 2209 bitmap[0] &= ~FATTR4_WORD0_SIZE;
2210 ret = NFS_ATTR_FATTR_SIZE;
2208 } 2211 }
2209 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); 2212 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
2210 return 0; 2213 return ret;
2211} 2214}
2212 2215
2213static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2216static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2245,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2245static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) 2248static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
2246{ 2249{
2247 __be32 *p; 2250 __be32 *p;
2251 int ret = 0;
2248 2252
2249 fsid->major = 0; 2253 fsid->major = 0;
2250 fsid->minor = 0; 2254 fsid->minor = 0;
@@ -2255,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2255 READ64(fsid->major); 2259 READ64(fsid->major);
2256 READ64(fsid->minor); 2260 READ64(fsid->minor);
2257 bitmap[0] &= ~FATTR4_WORD0_FSID; 2261 bitmap[0] &= ~FATTR4_WORD0_FSID;
2262 ret = NFS_ATTR_FATTR_FSID;
2258 } 2263 }
2259 dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, 2264 dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
2260 (unsigned long long)fsid->major, 2265 (unsigned long long)fsid->major,
2261 (unsigned long long)fsid->minor); 2266 (unsigned long long)fsid->minor);
2262 return 0; 2267 return ret;
2263} 2268}
2264 2269
2265static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2270static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2297,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
2297static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2302static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2298{ 2303{
2299 __be32 *p; 2304 __be32 *p;
2305 int ret = 0;
2300 2306
2301 *fileid = 0; 2307 *fileid = 0;
2302 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) 2308 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2305,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2305 READ_BUF(8); 2311 READ_BUF(8);
2306 READ64(*fileid); 2312 READ64(*fileid);
2307 bitmap[0] &= ~FATTR4_WORD0_FILEID; 2313 bitmap[0] &= ~FATTR4_WORD0_FILEID;
2314 ret = NFS_ATTR_FATTR_FILEID;
2308 } 2315 }
2309 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2316 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2310 return 0; 2317 return ret;
2311} 2318}
2312 2319
2313static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2320static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2314{ 2321{
2315 __be32 *p; 2322 __be32 *p;
2323 int ret = 0;
2316 2324
2317 *fileid = 0; 2325 *fileid = 0;
2318 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) 2326 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2321,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
2321 READ_BUF(8); 2329 READ_BUF(8);
2322 READ64(*fileid); 2330 READ64(*fileid);
2323 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 2331 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
2332 ret = NFS_ATTR_FATTR_FILEID;
2324 } 2333 }
2325 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2334 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2326 return 0; 2335 return ret;
2327} 2336}
2328 2337
2329static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2338static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2479,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2479 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) 2488 if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
2480 res->nlocations++; 2489 res->nlocations++;
2481 } 2490 }
2491 if (res->nlocations != 0)
2492 status = NFS_ATTR_FATTR_V4_REFERRAL;
2482out: 2493out:
2483 dprintk("%s: fs_locations done, error = %d\n", __func__, status); 2494 dprintk("%s: fs_locations done, error = %d\n", __func__, status);
2484 return status; 2495 return status;
@@ -2580,26 +2591,30 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
2580 return status; 2591 return status;
2581} 2592}
2582 2593
2583static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) 2594static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
2584{ 2595{
2596 uint32_t tmp;
2585 __be32 *p; 2597 __be32 *p;
2598 int ret = 0;
2586 2599
2587 *mode = 0; 2600 *mode = 0;
2588 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) 2601 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
2589 return -EIO; 2602 return -EIO;
2590 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { 2603 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
2591 READ_BUF(4); 2604 READ_BUF(4);
2592 READ32(*mode); 2605 READ32(tmp);
2593 *mode &= ~S_IFMT; 2606 *mode = tmp & ~S_IFMT;
2594 bitmap[1] &= ~FATTR4_WORD1_MODE; 2607 bitmap[1] &= ~FATTR4_WORD1_MODE;
2608 ret = NFS_ATTR_FATTR_MODE;
2595 } 2609 }
2596 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); 2610 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
2597 return 0; 2611 return ret;
2598} 2612}
2599 2613
2600static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) 2614static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
2601{ 2615{
2602 __be32 *p; 2616 __be32 *p;
2617 int ret = 0;
2603 2618
2604 *nlink = 1; 2619 *nlink = 1;
2605 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) 2620 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2608,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
2608 READ_BUF(4); 2623 READ_BUF(4);
2609 READ32(*nlink); 2624 READ32(*nlink);
2610 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; 2625 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
2626 ret = NFS_ATTR_FATTR_NLINK;
2611 } 2627 }
2612 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); 2628 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
2613 return 0; 2629 return ret;
2614} 2630}
2615 2631
2616static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) 2632static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
2617{ 2633{
2618 uint32_t len; 2634 uint32_t len;
2619 __be32 *p; 2635 __be32 *p;
2636 int ret = 0;
2620 2637
2621 *uid = -2; 2638 *uid = -2;
2622 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) 2639 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2626,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2626 READ32(len); 2643 READ32(len);
2627 READ_BUF(len); 2644 READ_BUF(len);
2628 if (len < XDR_MAX_NETOBJ) { 2645 if (len < XDR_MAX_NETOBJ) {
2629 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0) 2646 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
2647 ret = NFS_ATTR_FATTR_OWNER;
2648 else
2630 dprintk("%s: nfs_map_name_to_uid failed!\n", 2649 dprintk("%s: nfs_map_name_to_uid failed!\n",
2631 __func__); 2650 __func__);
2632 } else 2651 } else
@@ -2635,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2635 bitmap[1] &= ~FATTR4_WORD1_OWNER; 2654 bitmap[1] &= ~FATTR4_WORD1_OWNER;
2636 } 2655 }
2637 dprintk("%s: uid=%d\n", __func__, (int)*uid); 2656 dprintk("%s: uid=%d\n", __func__, (int)*uid);
2638 return 0; 2657 return ret;
2639} 2658}
2640 2659
2641static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) 2660static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
2642{ 2661{
2643 uint32_t len; 2662 uint32_t len;
2644 __be32 *p; 2663 __be32 *p;
2664 int ret = 0;
2645 2665
2646 *gid = -2; 2666 *gid = -2;
2647 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) 2667 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2651,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2651 READ32(len); 2671 READ32(len);
2652 READ_BUF(len); 2672 READ_BUF(len);
2653 if (len < XDR_MAX_NETOBJ) { 2673 if (len < XDR_MAX_NETOBJ) {
2654 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0) 2674 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
2675 ret = NFS_ATTR_FATTR_GROUP;
2676 else
2655 dprintk("%s: nfs_map_group_to_gid failed!\n", 2677 dprintk("%s: nfs_map_group_to_gid failed!\n",
2656 __func__); 2678 __func__);
2657 } else 2679 } else
@@ -2660,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2660 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; 2682 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
2661 } 2683 }
2662 dprintk("%s: gid=%d\n", __func__, (int)*gid); 2684 dprintk("%s: gid=%d\n", __func__, (int)*gid);
2663 return 0; 2685 return ret;
2664} 2686}
2665 2687
2666static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) 2688static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
2667{ 2689{
2668 uint32_t major = 0, minor = 0; 2690 uint32_t major = 0, minor = 0;
2669 __be32 *p; 2691 __be32 *p;
2692 int ret = 0;
2670 2693
2671 *rdev = MKDEV(0,0); 2694 *rdev = MKDEV(0,0);
2672 if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) 2695 if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2681,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
2681 if (MAJOR(tmp) == major && MINOR(tmp) == minor) 2704 if (MAJOR(tmp) == major && MINOR(tmp) == minor)
2682 *rdev = tmp; 2705 *rdev = tmp;
2683 bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; 2706 bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
2707 ret = NFS_ATTR_FATTR_RDEV;
2684 } 2708 }
2685 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); 2709 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
2686 return 0; 2710 return ret;
2687} 2711}
2688 2712
2689static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2713static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2740,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2740static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) 2764static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
2741{ 2765{
2742 __be32 *p; 2766 __be32 *p;
2767 int ret = 0;
2743 2768
2744 *used = 0; 2769 *used = 0;
2745 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) 2770 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2748,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
2748 READ_BUF(8); 2773 READ_BUF(8);
2749 READ64(*used); 2774 READ64(*used);
2750 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; 2775 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
2776 ret = NFS_ATTR_FATTR_SPACE_USED;
2751 } 2777 }
2752 dprintk("%s: space used=%Lu\n", __func__, 2778 dprintk("%s: space used=%Lu\n", __func__,
2753 (unsigned long long)*used); 2779 (unsigned long long)*used);
2754 return 0; 2780 return ret;
2755} 2781}
2756 2782
2757static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) 2783static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -2778,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
2778 return -EIO; 2804 return -EIO;
2779 if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { 2805 if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
2780 status = decode_attr_time(xdr, time); 2806 status = decode_attr_time(xdr, time);
2807 if (status == 0)
2808 status = NFS_ATTR_FATTR_ATIME;
2781 bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; 2809 bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
2782 } 2810 }
2783 dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); 2811 dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
@@ -2794,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
2794 return -EIO; 2822 return -EIO;
2795 if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { 2823 if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
2796 status = decode_attr_time(xdr, time); 2824 status = decode_attr_time(xdr, time);
2825 if (status == 0)
2826 status = NFS_ATTR_FATTR_CTIME;
2797 bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; 2827 bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
2798 } 2828 }
2799 dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); 2829 dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
@@ -2810,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
2810 return -EIO; 2840 return -EIO;
2811 if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { 2841 if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
2812 status = decode_attr_time(xdr, time); 2842 status = decode_attr_time(xdr, time);
2843 if (status == 0)
2844 status = NFS_ATTR_FATTR_MTIME;
2813 bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; 2845 bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
2814 } 2846 }
2815 dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); 2847 dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
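
Every decode_attr_* hunk in nfs4xdr.c follows the same new convention: return a positive NFS_ATTR_FATTR_* flag when the attribute was present on the wire, 0 when it was absent, and a negative errno on decode failure, so the caller can OR the result into fattr->valid (the decode_getfattr() hunk below does exactly that). A minimal sketch of the convention; ATTR_SIZE is an invented flag value:

    #include <stdio.h>

    #define ATTR_SIZE 0x2

    static int decode_size(int present_on_wire, unsigned long *size)
    {
        if (!present_on_wire)
            return 0;               /* absent: caller sets no flag */
        *size = 4096;               /* pretend this was read off the wire */
        return ATTR_SIZE;           /* present: caller ORs this into valid */
    }

    int main(void)
    {
        unsigned long size = 0;
        unsigned valid = 0;
        int status = decode_size(1, &size);

        if (status < 0)
            return 1;               /* the xdr_error path */
        valid |= status;
        printf("valid=%#x size=%lu\n", valid, size);
        return 0;
    }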
@@ -2994,63 +3026,116 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2994 uint32_t attrlen, 3026 uint32_t attrlen,
2995 bitmap[2] = {0}, 3027 bitmap[2] = {0},
2996 type; 3028 type;
2997 int status, fmode = 0; 3029 int status;
3030 umode_t fmode = 0;
2998 uint64_t fileid; 3031 uint64_t fileid;
2999 3032
3000 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 3033 status = decode_op_hdr(xdr, OP_GETATTR);
3001 goto xdr_error; 3034 if (status < 0)
3002 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
3003 goto xdr_error; 3035 goto xdr_error;
3004 3036
3005 fattr->bitmap[0] = bitmap[0]; 3037 status = decode_attr_bitmap(xdr, bitmap);
3006 fattr->bitmap[1] = bitmap[1]; 3038 if (status < 0)
3039 goto xdr_error;
3007 3040
3008 if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) 3041 status = decode_attr_length(xdr, &attrlen, &savep);
3042 if (status < 0)
3009 goto xdr_error; 3043 goto xdr_error;
3010 3044
3011 3045
3012 if ((status = decode_attr_type(xdr, bitmap, &type)) != 0) 3046 status = decode_attr_type(xdr, bitmap, &type);
3047 if (status < 0)
3013 goto xdr_error; 3048 goto xdr_error;
3014 fattr->type = nfs_type2fmt[type].nfs2type; 3049 fattr->mode = 0;
3015 fmode = nfs_type2fmt[type].mode; 3050 if (status != 0) {
3051 fattr->mode |= nfs_type2fmt[type];
3052 fattr->valid |= status;
3053 }
3016 3054
3017 if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) 3055 status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
3056 if (status < 0)
3018 goto xdr_error; 3057 goto xdr_error;
3019 if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) 3058 fattr->valid |= status;
3059
3060 status = decode_attr_size(xdr, bitmap, &fattr->size);
3061 if (status < 0)
3020 goto xdr_error; 3062 goto xdr_error;
3021 if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) 3063 fattr->valid |= status;
3064
3065 status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
3066 if (status < 0)
3022 goto xdr_error; 3067 goto xdr_error;
3023 if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) 3068 fattr->valid |= status;
3069
3070 status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
3071 if (status < 0)
3024 goto xdr_error; 3072 goto xdr_error;
3025 if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, 3073 fattr->valid |= status;
3074
3075 status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
3026 struct nfs4_fs_locations, 3076 struct nfs4_fs_locations,
3027 fattr))) != 0) 3077 fattr));
3078 if (status < 0)
3028 goto xdr_error; 3079 goto xdr_error;
3029 if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) 3080 fattr->valid |= status;
3081
3082 status = decode_attr_mode(xdr, bitmap, &fmode);
3083 if (status < 0)
3030 goto xdr_error; 3084 goto xdr_error;
3031 fattr->mode |= fmode; 3085 if (status != 0) {
3032 if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) 3086 fattr->mode |= fmode;
3087 fattr->valid |= status;
3088 }
3089
3090 status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
3091 if (status < 0)
3033 goto xdr_error; 3092 goto xdr_error;
3034 if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0) 3093 fattr->valid |= status;
3094
3095 status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
3096 if (status < 0)
3035 goto xdr_error; 3097 goto xdr_error;
3036 if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0) 3098 fattr->valid |= status;
3099
3100 status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
3101 if (status < 0)
3037 goto xdr_error; 3102 goto xdr_error;
3038 if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0) 3103 fattr->valid |= status;
3104
3105 status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
3106 if (status < 0)
3039 goto xdr_error; 3107 goto xdr_error;
3040 if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0) 3108 fattr->valid |= status;
3109
3110 status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
3111 if (status < 0)
3041 goto xdr_error; 3112 goto xdr_error;
3042 if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0) 3113 fattr->valid |= status;
3114
3115 status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
3116 if (status < 0)
3043 goto xdr_error; 3117 goto xdr_error;
3044 if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0) 3118 fattr->valid |= status;
3119
3120 status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
3121 if (status < 0)
3045 goto xdr_error; 3122 goto xdr_error;
3046 if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) 3123 fattr->valid |= status;
3124
3125 status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
3126 if (status < 0)
3047 goto xdr_error; 3127 goto xdr_error;
3048 if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0) 3128 fattr->valid |= status;
3129
3130 status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
3131 if (status < 0)
3049 goto xdr_error; 3132 goto xdr_error;
3050 if (fattr->fileid == 0 && fileid != 0) 3133 if (status != 0 && !(fattr->valid & status)) {
3051 fattr->fileid = fileid; 3134 fattr->fileid = fileid;
3052 if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) 3135 fattr->valid |= status;
3053 fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; 3136 }
3137
3138 status = verify_attr_len(xdr, savep, attrlen);
3054xdr_error: 3139xdr_error:
3055 dprintk("%s: xdr returned %d\n", __func__, -status); 3140 dprintk("%s: xdr returned %d\n", __func__, -status);
3056 return status; 3141 return status;
@@ -4078,9 +4163,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
4078 status = decode_setattr(&xdr, res); 4163 status = decode_setattr(&xdr, res);
4079 if (status) 4164 if (status)
4080 goto out; 4165 goto out;
4081 status = decode_getfattr(&xdr, res->fattr, res->server); 4166 decode_getfattr(&xdr, res->fattr, res->server);
4082 if (status == NFS4ERR_DELAY)
4083 status = 0;
4084out: 4167out:
4085 return status; 4168 return status;
4086} 4169}
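
The decode_getfattr() rework above changes every decode_attr_*() helper to return a negative errno on failure, 0 when the attribute was absent, or a positive NFS_ATTR_FATTR_* flag that the caller ORs into fattr->valid. A minimal userspace sketch of that return convention, with illustrative names rather than the kernel's:

	#include <stdio.h>

	#define ATTR_SIZE 0x1	/* stand-in for an NFS_ATTR_FATTR_* flag */

	static int decode_size(int present, unsigned long *size)
	{
		if (!present)
			return 0;	/* attribute not sent by the server */
		*size = 4096;		/* pretend we parsed it from the XDR stream */
		return ATTR_SIZE;	/* tell the caller which field is now valid */
	}

	int main(void)
	{
		unsigned long size = 0;
		unsigned valid = 0;
		int status = decode_size(1, &size);

		if (status < 0)
			return 1;	/* a real decoder error aborts the parse */
		valid |= status;	/* ORing in 0 is a harmless no-op when absent */

		printf("valid=%#x size=%lu\n", valid, size);
		return 0;
	}

The payoff is that fattr->valid now records exactly which fields the server supplied, instead of being set wholesale to NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4 after a successful parse, as the removed line did.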
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7f079209d70a..e2975939126a 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req)
176 kref_put(&req->wb_kref, nfs_free_request); 176 kref_put(&req->wb_kref, nfs_free_request);
177} 177}
178 178
179static int nfs_wait_bit_killable(void *word)
180{
181 int ret = 0;
182
183 if (fatal_signal_pending(current))
184 ret = -ERESTARTSYS;
185 else
186 schedule();
187 return ret;
188}
189
190/** 179/**
191 * nfs_wait_on_request - Wait for a request to complete. 180 * nfs_wait_on_request - Wait for a request to complete.
192 * @req: request to wait upon. 181 * @req: request to wait upon.
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 193465210d7c..7be72d90d49d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
663 .commit_setup = nfs_proc_commit_setup, 663 .commit_setup = nfs_proc_commit_setup,
664 .lock = nfs_proc_lock, 664 .lock = nfs_proc_lock,
665 .lock_check_bounds = nfs_lock_check_bounds, 665 .lock_check_bounds = nfs_lock_check_bounds,
666 .close_context = nfs_close_context,
666}; 667};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f856004bb7fa..4ace3c50a8eb 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -24,6 +24,7 @@
24 24
25#include "internal.h" 25#include "internal.h"
26#include "iostat.h" 26#include "iostat.h"
27#include "fscache.h"
27 28
28#define NFSDBG_FACILITY NFSDBG_PAGECACHE 29#define NFSDBG_FACILITY NFSDBG_PAGECACHE
29 30
@@ -111,8 +112,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
111 } 112 }
112} 113}
113 114
114static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 115int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
115 struct page *page) 116 struct page *page)
116{ 117{
117 LIST_HEAD(one_request); 118 LIST_HEAD(one_request);
118 struct nfs_page *new; 119 struct nfs_page *new;
@@ -139,6 +140,11 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
139 140
140static void nfs_readpage_release(struct nfs_page *req) 141static void nfs_readpage_release(struct nfs_page *req)
141{ 142{
143 struct inode *d_inode = req->wb_context->path.dentry->d_inode;
144
145 if (PageUptodate(req->wb_page))
146 nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
147
142 unlock_page(req->wb_page); 148 unlock_page(req->wb_page);
143 149
144 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", 150 dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
@@ -510,8 +516,15 @@ int nfs_readpage(struct file *file, struct page *page)
510 } else 516 } else
511 ctx = get_nfs_open_context(nfs_file_open_context(file)); 517 ctx = get_nfs_open_context(nfs_file_open_context(file));
512 518
519 if (!IS_SYNC(inode)) {
520 error = nfs_readpage_from_fscache(ctx, inode, page);
521 if (error == 0)
522 goto out;
523 }
524
513 error = nfs_readpage_async(ctx, inode, page); 525 error = nfs_readpage_async(ctx, inode, page);
514 526
527out:
515 put_nfs_open_context(ctx); 528 put_nfs_open_context(ctx);
516 return error; 529 return error;
517out_unlock: 530out_unlock:
@@ -584,6 +597,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
584 return -EBADF; 597 return -EBADF;
585 } else 598 } else
586 desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); 599 desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
600
601 /* attempt to read as many of the pages as possible from the cache
602 * - this returns -ENOBUFS immediately if the cookie is negative
603 */
604 ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
605 pages, &nr_pages);
606 if (ret == 0)
607 goto read_complete; /* all pages were read */
608
587 if (rsize < PAGE_CACHE_SIZE) 609 if (rsize < PAGE_CACHE_SIZE)
588 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 610 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
589 else 611 else
@@ -594,6 +616,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
594 nfs_pageio_complete(&pgio); 616 nfs_pageio_complete(&pgio);
595 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 617 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
596 nfs_add_stats(inode, NFSIOS_READPAGES, npages); 618 nfs_add_stats(inode, NFSIOS_READPAGES, npages);
619read_complete:
597 put_nfs_open_context(desc.ctx); 620 put_nfs_open_context(desc.ctx);
598out: 621out:
599 return ret; 622 return ret;
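
The nfs_readpage()/nfs_readpages() hunks above follow a cache-then-network pattern: try fscache first, treat a return of 0 as "page satisfied", and fall through to the normal RPC read otherwise (per the comment, the cache read fails fast with -ENOBUFS when no cookie is attached). A hedged standalone sketch of that fallback shape, with stand-in functions:

	#include <errno.h>
	#include <stdio.h>

	static int read_from_cache(int have_cookie)
	{
		if (!have_cookie)
			return -ENOBUFS;	/* no cache attached: fall back */
		return 0;			/* page filled from the cache */
	}

	static int read_from_server(void)
	{
		return 0;			/* pretend the network read succeeded */
	}

	int main(void)
	{
		int error = read_from_cache(0);

		if (error == 0) {
			puts("served from cache");
			return 0;
		}
		error = read_from_server();	/* the fallback path */
		printf("served from server: %d\n", error);
		return 0;
	}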
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d6686f4786dc..82eaadbff408 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -60,6 +60,7 @@
60#include "delegation.h" 60#include "delegation.h"
61#include "iostat.h" 61#include "iostat.h"
62#include "internal.h" 62#include "internal.h"
63#include "fscache.h"
63 64
64#define NFSDBG_FACILITY NFSDBG_VFS 65#define NFSDBG_FACILITY NFSDBG_VFS
65 66
@@ -76,6 +77,7 @@ enum {
76 Opt_rdirplus, Opt_nordirplus, 77 Opt_rdirplus, Opt_nordirplus,
77 Opt_sharecache, Opt_nosharecache, 78 Opt_sharecache, Opt_nosharecache,
78 Opt_resvport, Opt_noresvport, 79 Opt_resvport, Opt_noresvport,
80 Opt_fscache, Opt_nofscache,
79 81
80 /* Mount options that take integer arguments */ 82 /* Mount options that take integer arguments */
81 Opt_port, 83 Opt_port,
@@ -93,6 +95,7 @@ enum {
93 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, 95 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
94 Opt_addr, Opt_mountaddr, Opt_clientaddr, 96 Opt_addr, Opt_mountaddr, Opt_clientaddr,
95 Opt_lookupcache, 97 Opt_lookupcache,
98 Opt_fscache_uniq,
96 99
97 /* Special mount options */ 100 /* Special mount options */
98 Opt_userspace, Opt_deprecated, Opt_sloppy, 101 Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -132,6 +135,9 @@ static const match_table_t nfs_mount_option_tokens = {
132 { Opt_nosharecache, "nosharecache" }, 135 { Opt_nosharecache, "nosharecache" },
133 { Opt_resvport, "resvport" }, 136 { Opt_resvport, "resvport" },
134 { Opt_noresvport, "noresvport" }, 137 { Opt_noresvport, "noresvport" },
138 { Opt_fscache, "fsc" },
139 { Opt_fscache_uniq, "fsc=%s" },
140 { Opt_nofscache, "nofsc" },
135 141
136 { Opt_port, "port=%u" }, 142 { Opt_port, "port=%u" },
137 { Opt_rsize, "rsize=%u" }, 143 { Opt_rsize, "rsize=%u" },
@@ -563,6 +569,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
563 if (clp->rpc_ops->version == 4) 569 if (clp->rpc_ops->version == 4)
564 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); 570 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
565#endif 571#endif
572 if (nfss->options & NFS_OPTION_FSCACHE)
573 seq_printf(m, ",fsc");
566} 574}
567 575
568/* 576/*
@@ -641,6 +649,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
641 totals.events[i] += stats->events[i]; 649 totals.events[i] += stats->events[i];
642 for (i = 0; i < __NFSIOS_BYTESMAX; i++) 650 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
643 totals.bytes[i] += stats->bytes[i]; 651 totals.bytes[i] += stats->bytes[i];
652#ifdef CONFIG_NFS_FSCACHE
653 for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
654 totals.fscache[i] += stats->fscache[i];
655#endif
644 656
645 preempt_enable(); 657 preempt_enable();
646 } 658 }
@@ -651,6 +663,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
651 seq_printf(m, "\n\tbytes:\t"); 663 seq_printf(m, "\n\tbytes:\t");
652 for (i = 0; i < __NFSIOS_BYTESMAX; i++) 664 for (i = 0; i < __NFSIOS_BYTESMAX; i++)
653 seq_printf(m, "%Lu ", totals.bytes[i]); 665 seq_printf(m, "%Lu ", totals.bytes[i]);
666#ifdef CONFIG_NFS_FSCACHE
667 if (nfss->options & NFS_OPTION_FSCACHE) {
668 seq_printf(m, "\n\tfsc:\t");
669 for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
 670 seq_printf(m, "%Lu ", totals.fscache[i]);
671 }
672#endif
654 seq_printf(m, "\n"); 673 seq_printf(m, "\n");
655 674
656 rpc_print_iostats(m, nfss->client); 675 rpc_print_iostats(m, nfss->client);
@@ -1018,6 +1037,7 @@ static int nfs_parse_mount_options(char *raw,
1018 case Opt_rdma: 1037 case Opt_rdma:
1019 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ 1038 mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
1020 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1039 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1040 xprt_load_transport(p);
1021 break; 1041 break;
1022 case Opt_acl: 1042 case Opt_acl:
1023 mnt->flags &= ~NFS_MOUNT_NOACL; 1043 mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -1043,6 +1063,24 @@ static int nfs_parse_mount_options(char *raw,
1043 case Opt_noresvport: 1063 case Opt_noresvport:
1044 mnt->flags |= NFS_MOUNT_NORESVPORT; 1064 mnt->flags |= NFS_MOUNT_NORESVPORT;
1045 break; 1065 break;
1066 case Opt_fscache:
1067 mnt->options |= NFS_OPTION_FSCACHE;
1068 kfree(mnt->fscache_uniq);
1069 mnt->fscache_uniq = NULL;
1070 break;
1071 case Opt_nofscache:
1072 mnt->options &= ~NFS_OPTION_FSCACHE;
1073 kfree(mnt->fscache_uniq);
1074 mnt->fscache_uniq = NULL;
1075 break;
1076 case Opt_fscache_uniq:
1077 string = match_strdup(args);
1078 if (!string)
1079 goto out_nomem;
1080 kfree(mnt->fscache_uniq);
1081 mnt->fscache_uniq = string;
1082 mnt->options |= NFS_OPTION_FSCACHE;
1083 break;
1046 1084
1047 /* 1085 /*
1048 * options that take numeric values 1086 * options that take numeric values
@@ -1205,12 +1243,14 @@ static int nfs_parse_mount_options(char *raw,
1205 /* vector side protocols to TCP */ 1243 /* vector side protocols to TCP */
1206 mnt->flags |= NFS_MOUNT_TCP; 1244 mnt->flags |= NFS_MOUNT_TCP;
1207 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1245 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1246 xprt_load_transport(string);
1208 break; 1247 break;
1209 default: 1248 default:
1210 errors++; 1249 errors++;
1211 dfprintk(MOUNT, "NFS: unrecognized " 1250 dfprintk(MOUNT, "NFS: unrecognized "
1212 "transport protocol\n"); 1251 "transport protocol\n");
1213 } 1252 }
1253 kfree(string);
1214 break; 1254 break;
1215 case Opt_mountproto: 1255 case Opt_mountproto:
1216 string = match_strdup(args); 1256 string = match_strdup(args);
@@ -1218,7 +1258,6 @@ static int nfs_parse_mount_options(char *raw,
1218 goto out_nomem; 1258 goto out_nomem;
1219 token = match_token(string, 1259 token = match_token(string,
1220 nfs_xprt_protocol_tokens, args); 1260 nfs_xprt_protocol_tokens, args);
1221 kfree(string);
1222 1261
1223 switch (token) { 1262 switch (token) {
1224 case Opt_xprt_udp: 1263 case Opt_xprt_udp:
@@ -1868,8 +1907,6 @@ static void nfs_clone_super(struct super_block *sb,
1868 nfs_initialise_sb(sb); 1907 nfs_initialise_sb(sb);
1869} 1908}
1870 1909
1871#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
1872
1873static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) 1910static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
1874{ 1911{
1875 const struct nfs_server *a = s->s_fs_info; 1912 const struct nfs_server *a = s->s_fs_info;
@@ -2034,6 +2071,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2034 if (!s->s_root) { 2071 if (!s->s_root) {
2035 /* initial superblock/root creation */ 2072 /* initial superblock/root creation */
2036 nfs_fill_super(s, data); 2073 nfs_fill_super(s, data);
2074 nfs_fscache_get_super_cookie(s, data);
2037 } 2075 }
2038 2076
2039 mntroot = nfs_get_root(s, mntfh); 2077 mntroot = nfs_get_root(s, mntfh);
@@ -2054,6 +2092,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2054out: 2092out:
2055 kfree(data->nfs_server.hostname); 2093 kfree(data->nfs_server.hostname);
2056 kfree(data->mount_server.hostname); 2094 kfree(data->mount_server.hostname);
2095 kfree(data->fscache_uniq);
2057 security_free_mnt_opts(&data->lsm_opts); 2096 security_free_mnt_opts(&data->lsm_opts);
2058out_free_fh: 2097out_free_fh:
2059 kfree(mntfh); 2098 kfree(mntfh);
@@ -2081,6 +2120,7 @@ static void nfs_kill_super(struct super_block *s)
2081 2120
2082 bdi_unregister(&server->backing_dev_info); 2121 bdi_unregister(&server->backing_dev_info);
2083 kill_anon_super(s); 2122 kill_anon_super(s);
2123 nfs_fscache_release_super_cookie(s);
2084 nfs_free_server(server); 2124 nfs_free_server(server);
2085} 2125}
2086 2126
@@ -2388,6 +2428,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2388 if (!s->s_root) { 2428 if (!s->s_root) {
2389 /* initial superblock/root creation */ 2429 /* initial superblock/root creation */
2390 nfs4_fill_super(s); 2430 nfs4_fill_super(s);
2431 nfs_fscache_get_super_cookie(s, data);
2391 } 2432 }
2392 2433
2393 mntroot = nfs4_get_root(s, mntfh); 2434 mntroot = nfs4_get_root(s, mntfh);
@@ -2409,6 +2450,7 @@ out:
2409 kfree(data->client_address); 2450 kfree(data->client_address);
2410 kfree(data->nfs_server.export_path); 2451 kfree(data->nfs_server.export_path);
2411 kfree(data->nfs_server.hostname); 2452 kfree(data->nfs_server.hostname);
2453 kfree(data->fscache_uniq);
2412 security_free_mnt_opts(&data->lsm_opts); 2454 security_free_mnt_opts(&data->lsm_opts);
2413out_free_fh: 2455out_free_fh:
2414 kfree(mntfh); 2456 kfree(mntfh);
@@ -2435,6 +2477,7 @@ static void nfs4_kill_super(struct super_block *sb)
2435 kill_anon_super(sb); 2477 kill_anon_super(sb);
2436 2478
2437 nfs4_renewd_prepare_shutdown(server); 2479 nfs4_renewd_prepare_shutdown(server);
2480 nfs_fscache_release_super_cookie(sb);
2438 nfs_free_server(server); 2481 nfs_free_server(server);
2439} 2482}
2440 2483
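
The new mount options plug into the existing match_table_t machinery: "fsc" enables caching with no uniquifier, "fsc=<uniq>" duplicates the string into mnt->fscache_uniq, and "nofsc" clears both. A rough userspace analogue of that case logic, with strncmp()/strdup() standing in for the kernel's match_token()/match_strdup():

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define OPTION_FSCACHE 0x1

	struct parsed {
		unsigned options;
		char *fscache_uniq;
	};

	static void parse_one(struct parsed *p, const char *opt)
	{
		if (!strcmp(opt, "fsc")) {
			p->options |= OPTION_FSCACHE;
			free(p->fscache_uniq);
			p->fscache_uniq = NULL;	/* plain fsc drops any earlier uniquifier */
		} else if (!strncmp(opt, "fsc=", 4)) {
			free(p->fscache_uniq);
			p->fscache_uniq = strdup(opt + 4);
			p->options |= OPTION_FSCACHE;
		} else if (!strcmp(opt, "nofsc")) {
			p->options &= ~OPTION_FSCACHE;
			free(p->fscache_uniq);
			p->fscache_uniq = NULL;
		}
	}

	int main(void)
	{
		struct parsed p = { 0, NULL };

		parse_one(&p, "fsc=site1");
		printf("options=%#x uniq=%s\n", p.options,
		       p.fscache_uniq ? p.fscache_uniq : "(none)");
		free(p.fscache_uniq);
		return 0;
	}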
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f9845859fc1..e560a78995a3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
313int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 313int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
314{ 314{
315 struct inode *inode = mapping->host; 315 struct inode *inode = mapping->host;
316 unsigned long *bitlock = &NFS_I(inode)->flags;
316 struct nfs_pageio_descriptor pgio; 317 struct nfs_pageio_descriptor pgio;
317 int err; 318 int err;
318 319
320 /* Stop dirtying of new pages while we sync */
321 err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
322 nfs_wait_bit_killable, TASK_KILLABLE);
323 if (err)
324 goto out_err;
325
319 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 326 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
320 327
321 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); 328 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
322 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); 329 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
323 nfs_pageio_complete(&pgio); 330 nfs_pageio_complete(&pgio);
331
332 clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
333 smp_mb__after_clear_bit();
334 wake_up_bit(bitlock, NFS_INO_FLUSHING);
335
324 if (err < 0) 336 if (err < 0)
325 return err; 337 goto out_err;
326 if (pgio.pg_error < 0) 338 err = pgio.pg_error;
327 return pgio.pg_error; 339 if (err < 0)
340 goto out_err;
328 return 0; 341 return 0;
342out_err:
343 return err;
329} 344}
330 345
331/* 346/*
@@ -404,7 +419,6 @@ nfs_mark_request_commit(struct nfs_page *req)
404 struct nfs_inode *nfsi = NFS_I(inode); 419 struct nfs_inode *nfsi = NFS_I(inode);
405 420
406 spin_lock(&inode->i_lock); 421 spin_lock(&inode->i_lock);
407 nfsi->ncommit++;
408 set_bit(PG_CLEAN, &(req)->wb_flags); 422 set_bit(PG_CLEAN, &(req)->wb_flags);
409 radix_tree_tag_set(&nfsi->nfs_page_tree, 423 radix_tree_tag_set(&nfsi->nfs_page_tree,
410 req->wb_index, 424 req->wb_index,
@@ -524,6 +538,12 @@ static void nfs_cancel_commit_list(struct list_head *head)
524} 538}
525 539
526#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 540#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
541static int
542nfs_need_commit(struct nfs_inode *nfsi)
543{
544 return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
545}
546
527/* 547/*
528 * nfs_scan_commit - Scan an inode for commit requests 548 * nfs_scan_commit - Scan an inode for commit requests
529 * @inode: NFS inode to scan 549 * @inode: NFS inode to scan
@@ -538,16 +558,18 @@ static int
538nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 558nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
539{ 559{
540 struct nfs_inode *nfsi = NFS_I(inode); 560 struct nfs_inode *nfsi = NFS_I(inode);
541 int res = 0;
542 561
543 if (nfsi->ncommit != 0) { 562 if (!nfs_need_commit(nfsi))
544 res = nfs_scan_list(nfsi, dst, idx_start, npages, 563 return 0;
545 NFS_PAGE_TAG_COMMIT); 564
546 nfsi->ncommit -= res; 565 return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
547 }
548 return res;
549} 566}
550#else 567#else
568static inline int nfs_need_commit(struct nfs_inode *nfsi)
569{
570 return 0;
571}
572
551static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 573static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
552{ 574{
553 return 0; 575 return 0;
@@ -820,7 +842,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
820 data->args.stable = NFS_UNSTABLE; 842 data->args.stable = NFS_UNSTABLE;
821 if (how & FLUSH_STABLE) { 843 if (how & FLUSH_STABLE) {
822 data->args.stable = NFS_DATA_SYNC; 844 data->args.stable = NFS_DATA_SYNC;
823 if (!NFS_I(inode)->ncommit) 845 if (!nfs_need_commit(NFS_I(inode)))
824 data->args.stable = NFS_FILE_SYNC; 846 data->args.stable = NFS_FILE_SYNC;
825 } 847 }
826 848
@@ -1425,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1425{ 1447{
1426 struct writeback_control wbc = { 1448 struct writeback_control wbc = {
1427 .bdi = mapping->backing_dev_info, 1449 .bdi = mapping->backing_dev_info,
1428 .sync_mode = WB_SYNC_NONE, 1450 .sync_mode = WB_SYNC_ALL,
1429 .nr_to_write = LONG_MAX, 1451 .nr_to_write = LONG_MAX,
1430 .range_start = 0, 1452 .range_start = 0,
1431 .range_end = LLONG_MAX, 1453 .range_end = LLONG_MAX,
1432 .for_writepages = 1, 1454 .for_writepages = 1,
1433 }; 1455 };
1434 int ret;
1435 1456
1436 ret = __nfs_write_mapping(mapping, &wbc, how);
1437 if (ret < 0)
1438 return ret;
1439 wbc.sync_mode = WB_SYNC_ALL;
1440 return __nfs_write_mapping(mapping, &wbc, how); 1457 return __nfs_write_mapping(mapping, &wbc, how);
1441} 1458}
1442 1459
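
nfs_writepages() above gates writeback behind a per-inode NFS_INO_FLUSHING bit: wait_on_bit_lock() blocks (killably, via nfs_wait_bit_killable) until the bit can be taken, and clear_bit_unlock()/smp_mb__after_clear_bit()/wake_up_bit() release it and rouse waiters. As a loose illustration only, the same acquire/release shape with C11 atomics; the kernel primitives sleep instead of spinning:

	#include <stdatomic.h>
	#include <stdio.h>

	#define INO_FLUSHING 0x1UL

	static _Atomic unsigned long flags;

	static void flush_lock(void)
	{
		unsigned long old;
		/* loop until we are the thread that set the bit */
		do {
			old = atomic_fetch_or(&flags, INO_FLUSHING);
		} while (old & INO_FLUSHING);
	}

	static void flush_unlock(void)
	{
		/* the store pairs with the fetch_or above; the kernel
		 * additionally calls wake_up_bit() to rouse sleepers */
		atomic_fetch_and(&flags, ~INO_FLUSHING);
	}

	int main(void)
	{
		flush_lock();
		puts("writing pages back...");
		flush_unlock();
		return 0;
	}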
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f65953be39c0..9250067943d8 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2596,6 +2596,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
2596 [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, 2596 [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop,
2597 [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, 2597 [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop,
2598 [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, 2598 [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open,
2599 [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop,
2599 [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, 2600 [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm,
2600 [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, 2601 [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade,
2601 [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, 2602 [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3d93b2064ce5..a4ed8644d69c 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -938,10 +938,12 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
938 char transport[16]; 938 char transport[16];
939 int port; 939 int port;
940 if (sscanf(buf, "%15s %4d", transport, &port) == 2) { 940 if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
941 if (port < 1 || port > 65535)
942 return -EINVAL;
941 err = nfsd_create_serv(); 943 err = nfsd_create_serv();
942 if (!err) { 944 if (!err) {
943 err = svc_create_xprt(nfsd_serv, 945 err = svc_create_xprt(nfsd_serv,
944 transport, port, 946 transport, PF_INET, port,
945 SVC_SOCK_ANONYMOUS); 947 SVC_SOCK_ANONYMOUS);
946 if (err == -ENOENT) 948 if (err == -ENOENT)
947 /* Give a reasonable perror msg for 949 /* Give a reasonable perror msg for
@@ -960,7 +962,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
960 char transport[16]; 962 char transport[16];
961 int port; 963 int port;
962 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { 964 if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
963 if (port == 0) 965 if (port < 1 || port > 65535)
964 return -EINVAL; 966 return -EINVAL;
965 if (nfsd_serv) { 967 if (nfsd_serv) {
966 xprt = svc_find_xprt(nfsd_serv, transport, 968 xprt = svc_find_xprt(nfsd_serv, transport,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 07e4f5d7baa8..7c09852be713 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,7 +229,6 @@ int nfsd_create_serv(void)
229 229
230 atomic_set(&nfsd_busy, 0); 230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 AF_INET,
233 nfsd_last_thread, nfsd, THIS_MODULE); 232 nfsd_last_thread, nfsd, THIS_MODULE);
234 if (nfsd_serv == NULL) 233 if (nfsd_serv == NULL)
235 err = -ENOMEM; 234 err = -ENOMEM;
@@ -244,7 +243,7 @@ static int nfsd_init_socks(int port)
244 if (!list_empty(&nfsd_serv->sv_permsocks)) 243 if (!list_empty(&nfsd_serv->sv_permsocks))
245 return 0; 244 return 0;
246 245
247 error = svc_create_xprt(nfsd_serv, "udp", port, 246 error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
248 SVC_SOCK_DEFAULTS); 247 SVC_SOCK_DEFAULTS);
249 if (error < 0) 248 if (error < 0)
250 return error; 249 return error;
@@ -253,7 +252,7 @@ static int nfsd_init_socks(int port)
253 if (error < 0) 252 if (error < 0)
254 return error; 253 return error;
255 254
256 error = svc_create_xprt(nfsd_serv, "tcp", port, 255 error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
257 SVC_SOCK_DEFAULTS); 256 SVC_SOCK_DEFAULTS);
258 if (error < 0) 257 if (error < 0)
259 return error; 258 return error;
@@ -404,7 +403,6 @@ static int
404nfsd(void *vrqstp) 403nfsd(void *vrqstp)
405{ 404{
406 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 405 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
407 struct fs_struct *fsp;
408 int err, preverr = 0; 406 int err, preverr = 0;
409 407
410 /* Lock module and set up kernel thread */ 408 /* Lock module and set up kernel thread */
@@ -413,13 +411,11 @@ nfsd(void *vrqstp)
413 /* At this point, the thread shares current->fs 411 /* At this point, the thread shares current->fs
414 * with the init process. We need to create files with a 412 * with the init process. We need to create files with a
415 * umask of 0 instead of init's umask. */ 413 * umask of 0 instead of init's umask. */
416 fsp = copy_fs_struct(current->fs); 414 if (unshare_fs_struct() < 0) {
417 if (!fsp) {
418 printk("Unable to start nfsd thread: out of memory\n"); 415 printk("Unable to start nfsd thread: out of memory\n");
419 goto out; 416 goto out;
420 } 417 }
421 exit_fs(current); 418
422 current->fs = fsp;
423 current->fs->umask = 0; 419 current->fs->umask = 0;
424 420
425 /* 421 /*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6e50aaa56ca2..78376b6c0236 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -356,7 +356,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
356 put_write_access(inode); 356 put_write_access(inode);
357 goto out_nfserr; 357 goto out_nfserr;
358 } 358 }
359 DQUOT_INIT(inode); 359 vfs_dq_init(inode);
360 } 360 }
361 361
362 /* sanitize the mode change */ 362 /* sanitize the mode change */
@@ -723,7 +723,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
723 else 723 else
724 flags = O_WRONLY|O_LARGEFILE; 724 flags = O_WRONLY|O_LARGEFILE;
725 725
726 DQUOT_INIT(inode); 726 vfs_dq_init(inode);
727 } 727 }
728 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), 728 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
729 flags, cred); 729 flags, cred);
@@ -998,8 +998,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
998 998
999 if (!EX_ISSYNC(exp)) 999 if (!EX_ISSYNC(exp))
1000 stable = 0; 1000 stable = 0;
1001 if (stable && !EX_WGATHER(exp)) 1001 if (stable && !EX_WGATHER(exp)) {
1002 spin_lock(&file->f_lock);
1002 file->f_flags |= O_SYNC; 1003 file->f_flags |= O_SYNC;
1004 spin_unlock(&file->f_lock);
1005 }
1003 1006
1004 /* Write the data. */ 1007 /* Write the data. */
1005 oldfs = get_fs(); set_fs(KERNEL_DS); 1008 oldfs = get_fs(); set_fs(KERNEL_DS);
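
The nfsd_vfs_write() hunk exists because f_flags |= O_SYNC is a read-modify-write on a field that other tasks may update concurrently (for instance via fcntl(F_SETFL)), so the update must run under file->f_lock. A hypothetical pthread analogue of the same rule:

	#include <pthread.h>
	#include <stdio.h>

	#define O_SYNC_BIT 0x1000	/* illustrative flag value */

	struct file_like {
		unsigned flags;
		pthread_mutex_t lock;
	};

	static void set_sync(struct file_like *f)
	{
		pthread_mutex_lock(&f->lock);	/* make the |= atomic w.r.t. peers */
		f->flags |= O_SYNC_BIT;
		pthread_mutex_unlock(&f->lock);
	}

	int main(void)
	{
		struct file_like f = { 0, PTHREAD_MUTEX_INITIALIZER };

		set_sync(&f);
		printf("flags=%#x\n", f.flags);
		return 0;
	}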
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 331f2e88e284..220c13f0d73d 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -380,6 +380,14 @@ void inotify_unmount_inodes(struct list_head *list)
380 struct list_head *watches; 380 struct list_head *watches;
381 381
382 /* 382 /*
383 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
384 * I_WILL_FREE, or I_NEW which is fine because by that point
385 * the inode cannot have any associated watches.
386 */
387 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
388 continue;
389
390 /*
383 * If i_count is zero, the inode cannot have any watches and 391 * If i_count is zero, the inode cannot have any watches and
384 * doing an __iget/iput with MS_ACTIVE clear would actually 392 * doing an __iget/iput with MS_ACTIVE clear would actually
385 * evict all inodes with zero i_count from icache which is 393 * evict all inodes with zero i_count from icache which is
@@ -388,14 +396,6 @@ void inotify_unmount_inodes(struct list_head *list)
388 if (!atomic_read(&inode->i_count)) 396 if (!atomic_read(&inode->i_count))
389 continue; 397 continue;
390 398
391 /*
392 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, or
393 * I_WILL_FREE which is fine because by that point the inode
394 * cannot have any associated watches.
395 */
396 if (inode->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))
397 continue;
398
399 need_iput_tmp = need_iput; 399 need_iput_tmp = need_iput;
400 need_iput = NULL; 400 need_iput = NULL;
401 /* In case inotify_remove_watch_locked() drops a reference. */ 401 /* In case inotify_remove_watch_locked() drops a reference. */
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 34314b33dbd4..5a9e34475e37 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -32,8 +32,8 @@
32/** 32/**
33 * The little endian Unicode string $I30 as a global constant. 33 * The little endian Unicode string $I30 as a global constant.
34 */ 34 */
35ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'), 35ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
36 const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 }; 36 cpu_to_le16('3'), cpu_to_le16('0'), 0 };
37 37
38/** 38/**
39 * ntfs_lookup_inode_by_name - find an inode in a directory given its name 39 * ntfs_lookup_inode_by_name - find an inode in a directory given its name
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 86bef156cf0a..82c5085559c6 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1975,8 +1975,7 @@ int ntfs_read_inode_mount(struct inode *vi)
1975 goto em_put_err_out; 1975 goto em_put_err_out;
1976 next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + 1976 next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
1977 le16_to_cpu(al_entry->length)); 1977 le16_to_cpu(al_entry->length));
1978 if (le32_to_cpu(al_entry->type) > 1978 if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
1979 const_le32_to_cpu(AT_DATA))
1980 goto em_put_err_out; 1979 goto em_put_err_out;
1981 if (AT_DATA != al_entry->type) 1980 if (AT_DATA != al_entry->type)
1982 continue; 1981 continue;
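
The NTFS churn in this series is mechanical: cpu_to_le16/32/64() now fold to compile-time constants for constant arguments, so the const_cpu_to_le*() and const_le*_to_cpu() wrappers removed from layout.h below are redundant. For reference, a userspace sketch of the same byte-order conversions using glibc's htole32()/htole64(), assumed available via <endian.h>:

	#define _DEFAULT_SOURCE		/* exposes htole32()/htole64() on glibc */
	#include <endian.h>
	#include <stdint.h>
	#include <stdio.h>

	#define magicNTFS htole64(0x202020205346544eULL)	/* "NTFS    " */

	int main(void)
	{
		uint32_t magic_FILE = htole32(0x454c4946);	/* "FILE" on disk */

		printf("%08x %016llx\n", magic_FILE,
		       (unsigned long long)magicNTFS);
		return 0;
	}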
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 1e383328eceb..50931b1ce4b9 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -31,19 +31,8 @@
31 31
32#include "types.h" 32#include "types.h"
33 33
34/*
35 * Constant endianness conversion defines.
36 */
37#define const_le16_to_cpu(x) __constant_le16_to_cpu(x)
38#define const_le32_to_cpu(x) __constant_le32_to_cpu(x)
39#define const_le64_to_cpu(x) __constant_le64_to_cpu(x)
40
41#define const_cpu_to_le16(x) __constant_cpu_to_le16(x)
42#define const_cpu_to_le32(x) __constant_cpu_to_le32(x)
43#define const_cpu_to_le64(x) __constant_cpu_to_le64(x)
44
45/* The NTFS oem_id "NTFS " */ 34/* The NTFS oem_id "NTFS " */
46#define magicNTFS const_cpu_to_le64(0x202020205346544eULL) 35#define magicNTFS cpu_to_le64(0x202020205346544eULL)
47 36
48/* 37/*
49 * Location of bootsector on partition: 38 * Location of bootsector on partition:
@@ -114,25 +103,25 @@ typedef struct {
114 */ 103 */
115enum { 104enum {
116 /* Found in $MFT/$DATA. */ 105 /* Found in $MFT/$DATA. */
117 magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */ 106 magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
118 magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */ 107 magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
119 magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ 108 magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
120 109
121 /* Found in $LogFile/$DATA. */ 110 /* Found in $LogFile/$DATA. */
122 magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */ 111 magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
123 magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */ 112 magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
124 113
125 /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ 114 /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */
126 magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ 115 magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
127 116
128 /* Found in all ntfs record containing records. */ 117 /* Found in all ntfs record containing records. */
129 magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector 118 magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
130 transfer was detected. */ 119 transfer was detected. */
131 /* 120 /*
132 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is 121 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
133 * thus not initialized. Page must be initialized before using it. 122 * thus not initialized. Page must be initialized before using it.
134 */ 123 */
135 magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */ 124 magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
136}; 125};
137 126
138typedef le32 NTFS_RECORD_TYPE; 127typedef le32 NTFS_RECORD_TYPE;
@@ -258,8 +247,8 @@ typedef enum {
258 * information about the mft record in which they are present. 247 * information about the mft record in which they are present.
259 */ 248 */
260enum { 249enum {
261 MFT_RECORD_IN_USE = const_cpu_to_le16(0x0001), 250 MFT_RECORD_IN_USE = cpu_to_le16(0x0001),
262 MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002), 251 MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
263} __attribute__ ((__packed__)); 252} __attribute__ ((__packed__));
264 253
265typedef le16 MFT_RECORD_FLAGS; 254typedef le16 MFT_RECORD_FLAGS;
@@ -309,7 +298,7 @@ typedef le16 MFT_RECORD_FLAGS;
309 * Note: The _LE versions will return a CPU endian formatted value! 298 * Note: The _LE versions will return a CPU endian formatted value!
310 */ 299 */
311#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL 300#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
312#define MFT_REF_MASK_LE const_cpu_to_le64(MFT_REF_MASK_CPU) 301#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
313 302
314typedef u64 MFT_REF; 303typedef u64 MFT_REF;
315typedef le64 leMFT_REF; 304typedef le64 leMFT_REF;
@@ -477,25 +466,25 @@ typedef struct {
477 * a revealing choice of symbol I do not know what is... (-; 466 * a revealing choice of symbol I do not know what is... (-;
478 */ 467 */
479enum { 468enum {
480 AT_UNUSED = const_cpu_to_le32( 0), 469 AT_UNUSED = cpu_to_le32( 0),
481 AT_STANDARD_INFORMATION = const_cpu_to_le32( 0x10), 470 AT_STANDARD_INFORMATION = cpu_to_le32( 0x10),
482 AT_ATTRIBUTE_LIST = const_cpu_to_le32( 0x20), 471 AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20),
483 AT_FILE_NAME = const_cpu_to_le32( 0x30), 472 AT_FILE_NAME = cpu_to_le32( 0x30),
484 AT_OBJECT_ID = const_cpu_to_le32( 0x40), 473 AT_OBJECT_ID = cpu_to_le32( 0x40),
485 AT_SECURITY_DESCRIPTOR = const_cpu_to_le32( 0x50), 474 AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50),
486 AT_VOLUME_NAME = const_cpu_to_le32( 0x60), 475 AT_VOLUME_NAME = cpu_to_le32( 0x60),
487 AT_VOLUME_INFORMATION = const_cpu_to_le32( 0x70), 476 AT_VOLUME_INFORMATION = cpu_to_le32( 0x70),
488 AT_DATA = const_cpu_to_le32( 0x80), 477 AT_DATA = cpu_to_le32( 0x80),
489 AT_INDEX_ROOT = const_cpu_to_le32( 0x90), 478 AT_INDEX_ROOT = cpu_to_le32( 0x90),
490 AT_INDEX_ALLOCATION = const_cpu_to_le32( 0xa0), 479 AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0),
491 AT_BITMAP = const_cpu_to_le32( 0xb0), 480 AT_BITMAP = cpu_to_le32( 0xb0),
492 AT_REPARSE_POINT = const_cpu_to_le32( 0xc0), 481 AT_REPARSE_POINT = cpu_to_le32( 0xc0),
493 AT_EA_INFORMATION = const_cpu_to_le32( 0xd0), 482 AT_EA_INFORMATION = cpu_to_le32( 0xd0),
494 AT_EA = const_cpu_to_le32( 0xe0), 483 AT_EA = cpu_to_le32( 0xe0),
495 AT_PROPERTY_SET = const_cpu_to_le32( 0xf0), 484 AT_PROPERTY_SET = cpu_to_le32( 0xf0),
496 AT_LOGGED_UTILITY_STREAM = const_cpu_to_le32( 0x100), 485 AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100),
497 AT_FIRST_USER_DEFINED_ATTRIBUTE = const_cpu_to_le32( 0x1000), 486 AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000),
498 AT_END = const_cpu_to_le32(0xffffffff) 487 AT_END = cpu_to_le32(0xffffffff)
499}; 488};
500 489
501typedef le32 ATTR_TYPE; 490typedef le32 ATTR_TYPE;
@@ -539,13 +528,13 @@ typedef le32 ATTR_TYPE;
539 * equal then the second le32 values would be compared, etc. 528 * equal then the second le32 values would be compared, etc.
540 */ 529 */
541enum { 530enum {
542 COLLATION_BINARY = const_cpu_to_le32(0x00), 531 COLLATION_BINARY = cpu_to_le32(0x00),
543 COLLATION_FILE_NAME = const_cpu_to_le32(0x01), 532 COLLATION_FILE_NAME = cpu_to_le32(0x01),
544 COLLATION_UNICODE_STRING = const_cpu_to_le32(0x02), 533 COLLATION_UNICODE_STRING = cpu_to_le32(0x02),
545 COLLATION_NTOFS_ULONG = const_cpu_to_le32(0x10), 534 COLLATION_NTOFS_ULONG = cpu_to_le32(0x10),
546 COLLATION_NTOFS_SID = const_cpu_to_le32(0x11), 535 COLLATION_NTOFS_SID = cpu_to_le32(0x11),
547 COLLATION_NTOFS_SECURITY_HASH = const_cpu_to_le32(0x12), 536 COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12),
548 COLLATION_NTOFS_ULONGS = const_cpu_to_le32(0x13), 537 COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13),
549}; 538};
550 539
551typedef le32 COLLATION_RULE; 540typedef le32 COLLATION_RULE;
@@ -559,25 +548,25 @@ typedef le32 COLLATION_RULE;
559 * NT4. 548 * NT4.
560 */ 549 */
561enum { 550enum {
562 ATTR_DEF_INDEXABLE = const_cpu_to_le32(0x02), /* Attribute can be 551 ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be
563 indexed. */ 552 indexed. */
564 ATTR_DEF_MULTIPLE = const_cpu_to_le32(0x04), /* Attribute type 553 ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type
565 can be present multiple times in the 554 can be present multiple times in the
566 mft records of an inode. */ 555 mft records of an inode. */
567 ATTR_DEF_NOT_ZERO = const_cpu_to_le32(0x08), /* Attribute value 556 ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value
568 must contain at least one non-zero 557 must contain at least one non-zero
569 byte. */ 558 byte. */
570 ATTR_DEF_INDEXED_UNIQUE = const_cpu_to_le32(0x10), /* Attribute must be 559 ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be
571 indexed and the attribute value must be 560 indexed and the attribute value must be
572 unique for the attribute type in all of 561 unique for the attribute type in all of
573 the mft records of an inode. */ 562 the mft records of an inode. */
574 ATTR_DEF_NAMED_UNIQUE = const_cpu_to_le32(0x20), /* Attribute must be 563 ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be
575 named and the name must be unique for 564 named and the name must be unique for
576 the attribute type in all of the mft 565 the attribute type in all of the mft
577 records of an inode. */ 566 records of an inode. */
578 ATTR_DEF_RESIDENT = const_cpu_to_le32(0x40), /* Attribute must be 567 ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be
579 resident. */ 568 resident. */
580 ATTR_DEF_ALWAYS_LOG = const_cpu_to_le32(0x80), /* Always log 569 ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log
581 modifications to this attribute, 570 modifications to this attribute,
582 regardless of whether it is resident or 571 regardless of whether it is resident or
583 non-resident. Without this, only log 572 non-resident. Without this, only log
@@ -614,12 +603,12 @@ typedef struct {
614 * Attribute flags (16-bit). 603 * Attribute flags (16-bit).
615 */ 604 */
616enum { 605enum {
617 ATTR_IS_COMPRESSED = const_cpu_to_le16(0x0001), 606 ATTR_IS_COMPRESSED = cpu_to_le16(0x0001),
618 ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method 607 ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
619 mask. Also, first 608 mask. Also, first
620 illegal value. */ 609 illegal value. */
621 ATTR_IS_ENCRYPTED = const_cpu_to_le16(0x4000), 610 ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000),
622 ATTR_IS_SPARSE = const_cpu_to_le16(0x8000), 611 ATTR_IS_SPARSE = cpu_to_le16(0x8000),
623} __attribute__ ((__packed__)); 612} __attribute__ ((__packed__));
624 613
625typedef le16 ATTR_FLAGS; 614typedef le16 ATTR_FLAGS;
@@ -811,32 +800,32 @@ typedef ATTR_RECORD ATTR_REC;
811 * flags appear in all of the above. 800 * flags appear in all of the above.
812 */ 801 */
813enum { 802enum {
814 FILE_ATTR_READONLY = const_cpu_to_le32(0x00000001), 803 FILE_ATTR_READONLY = cpu_to_le32(0x00000001),
815 FILE_ATTR_HIDDEN = const_cpu_to_le32(0x00000002), 804 FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002),
816 FILE_ATTR_SYSTEM = const_cpu_to_le32(0x00000004), 805 FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004),
817 /* Old DOS volid. Unused in NT. = const_cpu_to_le32(0x00000008), */ 806 /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */
818 807
819 FILE_ATTR_DIRECTORY = const_cpu_to_le32(0x00000010), 808 FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010),
820 /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is 809 /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is
821 reserved for the DOS SUBDIRECTORY flag. */ 810 reserved for the DOS SUBDIRECTORY flag. */
822 FILE_ATTR_ARCHIVE = const_cpu_to_le32(0x00000020), 811 FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020),
823 FILE_ATTR_DEVICE = const_cpu_to_le32(0x00000040), 812 FILE_ATTR_DEVICE = cpu_to_le32(0x00000040),
824 FILE_ATTR_NORMAL = const_cpu_to_le32(0x00000080), 813 FILE_ATTR_NORMAL = cpu_to_le32(0x00000080),
825 814
826 FILE_ATTR_TEMPORARY = const_cpu_to_le32(0x00000100), 815 FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100),
827 FILE_ATTR_SPARSE_FILE = const_cpu_to_le32(0x00000200), 816 FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200),
828 FILE_ATTR_REPARSE_POINT = const_cpu_to_le32(0x00000400), 817 FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400),
829 FILE_ATTR_COMPRESSED = const_cpu_to_le32(0x00000800), 818 FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800),
830 819
831 FILE_ATTR_OFFLINE = const_cpu_to_le32(0x00001000), 820 FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000),
832 FILE_ATTR_NOT_CONTENT_INDEXED = const_cpu_to_le32(0x00002000), 821 FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000),
833 FILE_ATTR_ENCRYPTED = const_cpu_to_le32(0x00004000), 822 FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000),
834 823
835 FILE_ATTR_VALID_FLAGS = const_cpu_to_le32(0x00007fb7), 824 FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7),
836 /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the 825 /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
837 FILE_ATTR_DEVICE and preserves everything else. This mask is used 826 FILE_ATTR_DEVICE and preserves everything else. This mask is used
838 to obtain all flags that are valid for reading. */ 827 to obtain all flags that are valid for reading. */
839 FILE_ATTR_VALID_SET_FLAGS = const_cpu_to_le32(0x000031a7), 828 FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7),
840 /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the 829 /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
841 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, 830 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
842 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask 831 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
@@ -846,11 +835,11 @@ enum {
846 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION 835 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
847 * attribute of an mft record. 836 * attribute of an mft record.
848 */ 837 */
849 FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000), 838 FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000),
850 /* Note, this is a copy of the corresponding bit from the mft record, 839 /* Note, this is a copy of the corresponding bit from the mft record,
851 telling us whether this is a directory or not, i.e. whether it has 840 telling us whether this is a directory or not, i.e. whether it has
852 an index root attribute or not. */ 841 an index root attribute or not. */
853 FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000), 842 FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000),
854 /* Note, this is a copy of the corresponding bit from the mft record, 843 /* Note, this is a copy of the corresponding bit from the mft record,
855 telling us whether this file has a view index present (eg. object id 844 telling us whether this file has a view index present (eg. object id
856 index, quota index, one of the security indexes or the encrypting 845 index, quota index, one of the security indexes or the encrypting
@@ -1446,42 +1435,42 @@ enum {
1446 /* Specific rights for files and directories are as follows: */ 1435 /* Specific rights for files and directories are as follows: */
1447 1436
1448 /* Right to read data from the file. (FILE) */ 1437 /* Right to read data from the file. (FILE) */
1449 FILE_READ_DATA = const_cpu_to_le32(0x00000001), 1438 FILE_READ_DATA = cpu_to_le32(0x00000001),
1450 /* Right to list contents of a directory. (DIRECTORY) */ 1439 /* Right to list contents of a directory. (DIRECTORY) */
1451 FILE_LIST_DIRECTORY = const_cpu_to_le32(0x00000001), 1440 FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001),
1452 1441
1453 /* Right to write data to the file. (FILE) */ 1442 /* Right to write data to the file. (FILE) */
1454 FILE_WRITE_DATA = const_cpu_to_le32(0x00000002), 1443 FILE_WRITE_DATA = cpu_to_le32(0x00000002),
1455 /* Right to create a file in the directory. (DIRECTORY) */ 1444 /* Right to create a file in the directory. (DIRECTORY) */
1456 FILE_ADD_FILE = const_cpu_to_le32(0x00000002), 1445 FILE_ADD_FILE = cpu_to_le32(0x00000002),
1457 1446
1458 /* Right to append data to the file. (FILE) */ 1447 /* Right to append data to the file. (FILE) */
1459 FILE_APPEND_DATA = const_cpu_to_le32(0x00000004), 1448 FILE_APPEND_DATA = cpu_to_le32(0x00000004),
1460 /* Right to create a subdirectory. (DIRECTORY) */ 1449 /* Right to create a subdirectory. (DIRECTORY) */
1461 FILE_ADD_SUBDIRECTORY = const_cpu_to_le32(0x00000004), 1450 FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004),
1462 1451
1463 /* Right to read extended attributes. (FILE/DIRECTORY) */ 1452 /* Right to read extended attributes. (FILE/DIRECTORY) */
1464 FILE_READ_EA = const_cpu_to_le32(0x00000008), 1453 FILE_READ_EA = cpu_to_le32(0x00000008),
1465 1454
1466 /* Right to write extended attributes. (FILE/DIRECTORY) */ 1455 /* Right to write extended attributes. (FILE/DIRECTORY) */
1467 FILE_WRITE_EA = const_cpu_to_le32(0x00000010), 1456 FILE_WRITE_EA = cpu_to_le32(0x00000010),
1468 1457
1469 /* Right to execute a file. (FILE) */ 1458 /* Right to execute a file. (FILE) */
1470 FILE_EXECUTE = const_cpu_to_le32(0x00000020), 1459 FILE_EXECUTE = cpu_to_le32(0x00000020),
1471 /* Right to traverse the directory. (DIRECTORY) */ 1460 /* Right to traverse the directory. (DIRECTORY) */
1472 FILE_TRAVERSE = const_cpu_to_le32(0x00000020), 1461 FILE_TRAVERSE = cpu_to_le32(0x00000020),
1473 1462
1474 /* 1463 /*
1475 * Right to delete a directory and all the files it contains (its 1464 * Right to delete a directory and all the files it contains (its
1476 * children), even if the files are read-only. (DIRECTORY) 1465 * children), even if the files are read-only. (DIRECTORY)
1477 */ 1466 */
1478 FILE_DELETE_CHILD = const_cpu_to_le32(0x00000040), 1467 FILE_DELETE_CHILD = cpu_to_le32(0x00000040),
1479 1468
1480 /* Right to read file attributes. (FILE/DIRECTORY) */ 1469 /* Right to read file attributes. (FILE/DIRECTORY) */
1481 FILE_READ_ATTRIBUTES = const_cpu_to_le32(0x00000080), 1470 FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080),
1482 1471
1483 /* Right to change file attributes. (FILE/DIRECTORY) */ 1472 /* Right to change file attributes. (FILE/DIRECTORY) */
1484 FILE_WRITE_ATTRIBUTES = const_cpu_to_le32(0x00000100), 1473 FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100),
1485 1474
1486 /* 1475 /*
1487 * The standard rights (bits 16 to 23). These are independent of the 1476 * The standard rights (bits 16 to 23). These are independent of the
@@ -1489,27 +1478,27 @@ enum {
1489 */ 1478 */
1490 1479
1491 /* Right to delete the object. */ 1480 /* Right to delete the object. */
1492 DELETE = const_cpu_to_le32(0x00010000), 1481 DELETE = cpu_to_le32(0x00010000),
1493 1482
1494 /* 1483 /*
1495 * Right to read the information in the object's security descriptor, 1484 * Right to read the information in the object's security descriptor,
1496 * not including the information in the SACL, i.e. right to read the 1485 * not including the information in the SACL, i.e. right to read the
1497 * security descriptor and owner. 1486 * security descriptor and owner.
1498 */ 1487 */
1499 READ_CONTROL = const_cpu_to_le32(0x00020000), 1488 READ_CONTROL = cpu_to_le32(0x00020000),
1500 1489
1501 /* Right to modify the DACL in the object's security descriptor. */ 1490 /* Right to modify the DACL in the object's security descriptor. */
1502 WRITE_DAC = const_cpu_to_le32(0x00040000), 1491 WRITE_DAC = cpu_to_le32(0x00040000),
1503 1492
1504 /* Right to change the owner in the object's security descriptor. */ 1493 /* Right to change the owner in the object's security descriptor. */
1505 WRITE_OWNER = const_cpu_to_le32(0x00080000), 1494 WRITE_OWNER = cpu_to_le32(0x00080000),
1506 1495
1507 /* 1496 /*
1508 * Right to use the object for synchronization. Enables a process to 1497 * Right to use the object for synchronization. Enables a process to
1509 * wait until the object is in the signalled state. Some object types 1498 * wait until the object is in the signalled state. Some object types
1510 * do not support this access right. 1499 * do not support this access right.
1511 */ 1500 */
1512 SYNCHRONIZE = const_cpu_to_le32(0x00100000), 1501 SYNCHRONIZE = cpu_to_le32(0x00100000),
1513 1502
1514 /* 1503 /*
1515 * The following STANDARD_RIGHTS_* are combinations of the above for 1504 * The following STANDARD_RIGHTS_* are combinations of the above for
@@ -1517,25 +1506,25 @@ enum {
1517 */ 1506 */
1518 1507
1519 /* These are currently defined to READ_CONTROL. */ 1508 /* These are currently defined to READ_CONTROL. */
1520 STANDARD_RIGHTS_READ = const_cpu_to_le32(0x00020000), 1509 STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000),
1521 STANDARD_RIGHTS_WRITE = const_cpu_to_le32(0x00020000), 1510 STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000),
1522 STANDARD_RIGHTS_EXECUTE = const_cpu_to_le32(0x00020000), 1511 STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000),
1523 1512
1524 /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ 1513 /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
1525 STANDARD_RIGHTS_REQUIRED = const_cpu_to_le32(0x000f0000), 1514 STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000),
1526 1515
1527 /* 1516 /*
1528 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and 1517 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
1529 * SYNCHRONIZE access. 1518 * SYNCHRONIZE access.
1530 */ 1519 */
1531 STANDARD_RIGHTS_ALL = const_cpu_to_le32(0x001f0000), 1520 STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000),
1532 1521
1533 /* 1522 /*
1534 * The access system ACL and maximum allowed access types (bits 24 to 1523 * The access system ACL and maximum allowed access types (bits 24 to
1535 * 25, bits 26 to 27 are reserved). 1524 * 25, bits 26 to 27 are reserved).
1536 */ 1525 */
1537 ACCESS_SYSTEM_SECURITY = const_cpu_to_le32(0x01000000), 1526 ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000),
1538 MAXIMUM_ALLOWED = const_cpu_to_le32(0x02000000), 1527 MAXIMUM_ALLOWED = cpu_to_le32(0x02000000),
1539 1528
1540 /* 1529 /*
1541 * The generic rights (bits 28 to 31). These map onto the standard and 1530 * The generic rights (bits 28 to 31). These map onto the standard and
@@ -1543,10 +1532,10 @@ enum {
1543 */ 1532 */
1544 1533
1545 /* Read, write, and execute access. */ 1534 /* Read, write, and execute access. */
1546 GENERIC_ALL = const_cpu_to_le32(0x10000000), 1535 GENERIC_ALL = cpu_to_le32(0x10000000),
1547 1536
1548 /* Execute access. */ 1537 /* Execute access. */
1549 GENERIC_EXECUTE = const_cpu_to_le32(0x20000000), 1538 GENERIC_EXECUTE = cpu_to_le32(0x20000000),
1550 1539
1551 /* 1540 /*
1552 * Write access. For files, this maps onto: 1541 * Write access. For files, this maps onto:
@@ -1555,7 +1544,7 @@ enum {
1555 * For directories, the mapping has the same numerical value. See 1544 * For directories, the mapping has the same numerical value. See
1556 * above for the descriptions of the rights granted. 1545 * above for the descriptions of the rights granted.
1557 */ 1546 */
1558 GENERIC_WRITE = const_cpu_to_le32(0x40000000), 1547 GENERIC_WRITE = cpu_to_le32(0x40000000),
1559 1548
1560 /* 1549 /*
1561 * Read access. For files, this maps onto: 1550 * Read access. For files, this maps onto:
@@ -1564,7 +1553,7 @@ enum {
1564 * For directories, the mapping has the same numerical value. See 1553 * For directories, the mapping has the same numerical value. See
1565 * above for the descriptions of the rights granted. 1554 * above for the descriptions of the rights granted.
1566 */ 1555 */
1567 GENERIC_READ = const_cpu_to_le32(0x80000000), 1556 GENERIC_READ = cpu_to_le32(0x80000000),
1568}; 1557};
1569 1558
1570typedef le32 ACCESS_MASK; 1559typedef le32 ACCESS_MASK;
@@ -1604,8 +1593,8 @@ typedef struct {
1604 * The object ACE flags (32-bit). 1593 * The object ACE flags (32-bit).
1605 */ 1594 */
1606enum { 1595enum {
1607 ACE_OBJECT_TYPE_PRESENT = const_cpu_to_le32(1), 1596 ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1),
1608 ACE_INHERITED_OBJECT_TYPE_PRESENT = const_cpu_to_le32(2), 1597 ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2),
1609}; 1598};
1610 1599
1611typedef le32 OBJECT_ACE_FLAGS; 1600typedef le32 OBJECT_ACE_FLAGS;
@@ -1706,23 +1695,23 @@ typedef enum {
1706 * expressed as offsets from the beginning of the security descriptor. 1695 * expressed as offsets from the beginning of the security descriptor.
1707 */ 1696 */
1708enum { 1697enum {
1709 SE_OWNER_DEFAULTED = const_cpu_to_le16(0x0001), 1698 SE_OWNER_DEFAULTED = cpu_to_le16(0x0001),
1710 SE_GROUP_DEFAULTED = const_cpu_to_le16(0x0002), 1699 SE_GROUP_DEFAULTED = cpu_to_le16(0x0002),
1711 SE_DACL_PRESENT = const_cpu_to_le16(0x0004), 1700 SE_DACL_PRESENT = cpu_to_le16(0x0004),
1712 SE_DACL_DEFAULTED = const_cpu_to_le16(0x0008), 1701 SE_DACL_DEFAULTED = cpu_to_le16(0x0008),
1713 1702
1714 SE_SACL_PRESENT = const_cpu_to_le16(0x0010), 1703 SE_SACL_PRESENT = cpu_to_le16(0x0010),
1715 SE_SACL_DEFAULTED = const_cpu_to_le16(0x0020), 1704 SE_SACL_DEFAULTED = cpu_to_le16(0x0020),
1716 1705
1717 SE_DACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0100), 1706 SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100),
1718 SE_SACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0200), 1707 SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200),
1719 SE_DACL_AUTO_INHERITED = const_cpu_to_le16(0x0400), 1708 SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400),
1720 SE_SACL_AUTO_INHERITED = const_cpu_to_le16(0x0800), 1709 SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800),
1721 1710
1722 SE_DACL_PROTECTED = const_cpu_to_le16(0x1000), 1711 SE_DACL_PROTECTED = cpu_to_le16(0x1000),
1723 SE_SACL_PROTECTED = const_cpu_to_le16(0x2000), 1712 SE_SACL_PROTECTED = cpu_to_le16(0x2000),
1724 SE_RM_CONTROL_VALID = const_cpu_to_le16(0x4000), 1713 SE_RM_CONTROL_VALID = cpu_to_le16(0x4000),
1725 SE_SELF_RELATIVE = const_cpu_to_le16(0x8000) 1714 SE_SELF_RELATIVE = cpu_to_le16(0x8000)
1726} __attribute__ ((__packed__)); 1715} __attribute__ ((__packed__));
1727 1716
1728typedef le16 SECURITY_DESCRIPTOR_CONTROL; 1717typedef le16 SECURITY_DESCRIPTOR_CONTROL;
@@ -1910,21 +1899,21 @@ typedef struct {
1910 * Possible flags for the volume (16-bit). 1899 * Possible flags for the volume (16-bit).
1911 */ 1900 */
1912enum { 1901enum {
1913 VOLUME_IS_DIRTY = const_cpu_to_le16(0x0001), 1902 VOLUME_IS_DIRTY = cpu_to_le16(0x0001),
1914 VOLUME_RESIZE_LOG_FILE = const_cpu_to_le16(0x0002), 1903 VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002),
1915 VOLUME_UPGRADE_ON_MOUNT = const_cpu_to_le16(0x0004), 1904 VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004),
1916 VOLUME_MOUNTED_ON_NT4 = const_cpu_to_le16(0x0008), 1905 VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008),
1917 1906
1918 VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010), 1907 VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010),
1919 VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020), 1908 VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020),
1920 1909
1921 VOLUME_CHKDSK_UNDERWAY = const_cpu_to_le16(0x4000), 1910 VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000),
1922 VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000), 1911 VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000),
1923 1912
1924 VOLUME_FLAGS_MASK = const_cpu_to_le16(0xc03f), 1913 VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f),
1925 1914
1926 /* To make our life easier when checking if we must mount read-only. */ 1915 /* To make our life easier when checking if we must mount read-only. */
1927 VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0xc027), 1916 VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027),
1928} __attribute__ ((__packed__)); 1917} __attribute__ ((__packed__));
1929 1918
1930typedef le16 VOLUME_FLAGS; 1919typedef le16 VOLUME_FLAGS;
@@ -2109,26 +2098,26 @@ typedef struct {
2109 * The user quota flags. Names explain meaning. 2098 * The user quota flags. Names explain meaning.
2110 */ 2099 */
2111enum { 2100enum {
2112 QUOTA_FLAG_DEFAULT_LIMITS = const_cpu_to_le32(0x00000001), 2101 QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001),
2113 QUOTA_FLAG_LIMIT_REACHED = const_cpu_to_le32(0x00000002), 2102 QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002),
2114 QUOTA_FLAG_ID_DELETED = const_cpu_to_le32(0x00000004), 2103 QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004),
2115 2104
2116 QUOTA_FLAG_USER_MASK = const_cpu_to_le32(0x00000007), 2105 QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007),
2117 /* This is a bit mask for the user quota flags. */ 2106 /* This is a bit mask for the user quota flags. */
2118 2107
2119 /* 2108 /*
2120 * These flags are only present in the quota defaults index entry, i.e. 2109 * These flags are only present in the quota defaults index entry, i.e.
2121 * in the entry where owner_id = QUOTA_DEFAULTS_ID. 2110 * in the entry where owner_id = QUOTA_DEFAULTS_ID.
2122 */ 2111 */
2123 QUOTA_FLAG_TRACKING_ENABLED = const_cpu_to_le32(0x00000010), 2112 QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010),
2124 QUOTA_FLAG_ENFORCEMENT_ENABLED = const_cpu_to_le32(0x00000020), 2113 QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020),
2125 QUOTA_FLAG_TRACKING_REQUESTED = const_cpu_to_le32(0x00000040), 2114 QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040),
2126 QUOTA_FLAG_LOG_THRESHOLD = const_cpu_to_le32(0x00000080), 2115 QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080),
2127 2116
2128 QUOTA_FLAG_LOG_LIMIT = const_cpu_to_le32(0x00000100), 2117 QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100),
2129 QUOTA_FLAG_OUT_OF_DATE = const_cpu_to_le32(0x00000200), 2118 QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200),
2130 QUOTA_FLAG_CORRUPT = const_cpu_to_le32(0x00000400), 2119 QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400),
2131 QUOTA_FLAG_PENDING_DELETES = const_cpu_to_le32(0x00000800), 2120 QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800),
2132}; 2121};
2133 2122
2134typedef le32 QUOTA_FLAGS; 2123typedef le32 QUOTA_FLAGS;
@@ -2172,9 +2161,9 @@ typedef struct {
2172 * Predefined owner_id values (32-bit). 2161 * Predefined owner_id values (32-bit).
2173 */ 2162 */
2174enum { 2163enum {
2175 QUOTA_INVALID_ID = const_cpu_to_le32(0x00000000), 2164 QUOTA_INVALID_ID = cpu_to_le32(0x00000000),
2176 QUOTA_DEFAULTS_ID = const_cpu_to_le32(0x00000001), 2165 QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001),
2177 QUOTA_FIRST_USER_ID = const_cpu_to_le32(0x00000100), 2166 QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100),
2178}; 2167};
2179 2168
2180/* 2169/*
@@ -2189,14 +2178,14 @@ typedef enum {
2189 * Index entry flags (16-bit). 2178 * Index entry flags (16-bit).
2190 */ 2179 */
2191enum { 2180enum {
2192 INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a 2181 INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
2193 sub-node, i.e. a reference to an index block in form of 2182 sub-node, i.e. a reference to an index block in form of
2194 a virtual cluster number (see below). */ 2183 a virtual cluster number (see below). */
2195 INDEX_ENTRY_END = const_cpu_to_le16(2), /* This signifies the last 2184 INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last
2196 entry in an index block. The index entry does not 2185 entry in an index block. The index entry does not
2197 represent a file but it can point to a sub-node. */ 2186 represent a file but it can point to a sub-node. */
2198 2187
2199 INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force 2188 INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
2200 enum bit width to 16-bit. */ 2189 enum bit width to 16-bit. */
2201} __attribute__ ((__packed__)); 2190} __attribute__ ((__packed__));
2202 2191
@@ -2334,26 +2323,26 @@ typedef struct {
2334 * These are the predefined reparse point tags: 2323 * These are the predefined reparse point tags:
2335 */ 2324 */
2336enum { 2325enum {
2337 IO_REPARSE_TAG_IS_ALIAS = const_cpu_to_le32(0x20000000), 2326 IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000),
2338 IO_REPARSE_TAG_IS_HIGH_LATENCY = const_cpu_to_le32(0x40000000), 2327 IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000),
2339 IO_REPARSE_TAG_IS_MICROSOFT = const_cpu_to_le32(0x80000000), 2328 IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000),
2340 2329
2341 IO_REPARSE_TAG_RESERVED_ZERO = const_cpu_to_le32(0x00000000), 2330 IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000),
2342 IO_REPARSE_TAG_RESERVED_ONE = const_cpu_to_le32(0x00000001), 2331 IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001),
2343 IO_REPARSE_TAG_RESERVED_RANGE = const_cpu_to_le32(0x00000001), 2332 IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001),
2344 2333
2345 IO_REPARSE_TAG_NSS = const_cpu_to_le32(0x68000005), 2334 IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005),
2346 IO_REPARSE_TAG_NSS_RECOVER = const_cpu_to_le32(0x68000006), 2335 IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006),
2347 IO_REPARSE_TAG_SIS = const_cpu_to_le32(0x68000007), 2336 IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007),
2348 IO_REPARSE_TAG_DFS = const_cpu_to_le32(0x68000008), 2337 IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008),
2349 2338
2350 IO_REPARSE_TAG_MOUNT_POINT = const_cpu_to_le32(0x88000003), 2339 IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003),
2351 2340
2352 IO_REPARSE_TAG_HSM = const_cpu_to_le32(0xa8000004), 2341 IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004),
2353 2342
2354 IO_REPARSE_TAG_SYMBOLIC_LINK = const_cpu_to_le32(0xe8000000), 2343 IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000),
2355 2344
2356 IO_REPARSE_TAG_VALID_VALUES = const_cpu_to_le32(0xe000ffff), 2345 IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff),
2357}; 2346};
2358 2347
2359/* 2348/*
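
All of the fs/ntfs/layout.h hunks above are the same mechanical substitution: the const_cpu_to_le32()/const_cpu_to_le16() wrappers become plain cpu_to_le32()/cpu_to_le16(), which can now be used in constant contexts such as enum initializers. The sketch below shows why such a macro can be an integer constant expression; it is a simplified illustration with demo_* placeholder names, not the kernel's actual byteorder definitions.

#include <stdint.h>

/* Big-endian case: a byte swap built only from shifts and masks, so the
 * result is still a constant expression when the argument is one. */
#define demo_swab32(x) ((uint32_t)(			\
	(((uint32_t)(x) & 0x000000ffUL) << 24) |	\
	(((uint32_t)(x) & 0x0000ff00UL) <<  8) |	\
	(((uint32_t)(x) & 0x00ff0000UL) >>  8) |	\
	(((uint32_t)(x) & 0xff000000UL) >> 24)))

#ifdef DEMO_BIG_ENDIAN
#define demo_cpu_to_le32(x) demo_swab32(x)
#else
#define demo_cpu_to_le32(x) ((uint32_t)(x))	/* little-endian: identity */
#endif

/* Legal in an enum, exactly like the STANDARD_RIGHTS_* values above. */
enum { DEMO_RIGHTS_READ = demo_cpu_to_le32(0x00020000) };
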
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 9468e1c45ae3..b5a6f08bd35c 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -104,7 +104,7 @@ typedef struct {
104 * in this particular client array. Also inside the client records themselves, 104 * in this particular client array. Also inside the client records themselves,
105 * this means that there are no client records preceding or following this one. 105 * this means that there are no client records preceding or following this one.
106 */ 106 */
107#define LOGFILE_NO_CLIENT const_cpu_to_le16(0xffff) 107#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff)
108#define LOGFILE_NO_CLIENT_CPU 0xffff 108#define LOGFILE_NO_CLIENT_CPU 0xffff
109 109
110/* 110/*
@@ -112,8 +112,8 @@ typedef struct {
112 * information about the log file in which they are present. 112 * information about the log file in which they are present.
113 */ 113 */
114enum { 114enum {
115 RESTART_VOLUME_IS_CLEAN = const_cpu_to_le16(0x0002), 115 RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002),
116 RESTART_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ 116 RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
117} __attribute__ ((__packed__)); 117} __attribute__ ((__packed__));
118 118
119typedef le16 RESTART_AREA_FLAGS; 119typedef le16 RESTART_AREA_FLAGS;
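
A note on the RESTART_SPACE_FILLER entry above (and INDEX_ENTRY_SPACE_FILLER earlier): together with __attribute__((__packed__)), a 0xffff member forces gcc to lay the enum out in exactly 16 bits, so typedefs such as RESTART_AREA_FLAGS can overlay on-disk fields directly. A minimal sketch of the idiom with placeholder names:

/* With gcc, sizeof(enum demo_flags) == 2, matching an on-disk le16. */
enum demo_flags {
	DEMO_FLAG_A		= 0x0001,
	DEMO_FLAG_B		= 0x0002,
	DEMO_SPACE_FILLER	= 0xffff,	/* force 16-bit width */
} __attribute__ ((__packed__));
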
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 17d32ca6bc35..23bf68453d7d 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2839,7 +2839,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
2839 */ 2839 */
2840 2840
2841 /* Mark the mft record as not in use. */ 2841 /* Mark the mft record as not in use. */
2842 m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE)); 2842 m->flags &= ~MFT_RECORD_IN_USE;
2843 2843
2844 /* Increment the sequence number, skipping zero, if it is not zero. */ 2844 /* Increment the sequence number, skipping zero, if it is not zero. */
2845 old_seq_no = m->sequence_number; 2845 old_seq_no = m->sequence_number;
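
The mft.c hunk depends on bitwise NOT commuting with byte swapping: ~cpu_to_le16(x) and cpu_to_le16(~x) produce the same bit pattern, so clearing MFT_RECORD_IN_USE no longer needs the const_cpu_to_le16(~const_le16_to_cpu(...)) round trip. A small self-contained check of that identity; demo_bswap16 is a hypothetical stand-in for the kernel's swab16():

#include <assert.h>
#include <stdint.h>

static uint16_t demo_bswap16(uint16_t x)
{
	return (uint16_t)((x << 8) | (x >> 8));
}

int main(void)
{
	uint16_t flag = 0x0001;	/* MFT_RECORD_IN_USE in CPU byte order */

	/* NOT commutes with the swap, so ~flag clears the same bit no
	 * matter which byte order the field is stored in. */
	assert((uint16_t)~demo_bswap16(flag) == demo_bswap16((uint16_t)~flag));
	return 0;
}
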
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 4a46743b5077..f76951dcd4a6 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -618,7 +618,7 @@ static bool is_boot_sector_ntfs(const struct super_block *sb,
618 * many BIOSes will refuse to boot from a bootsector if the magic is 618 * many BIOSes will refuse to boot from a bootsector if the magic is
619 * incorrect, so we emit a warning. 619 * incorrect, so we emit a warning.
620 */ 620 */
621 if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55)) 621 if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
622 ntfs_warning(sb, "Invalid end of sector marker."); 622 ntfs_warning(sb, "Invalid end of sector marker.");
623 return true; 623 return true;
624not_ntfs: 624not_ntfs:
@@ -1242,13 +1242,13 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
1242 u32 *kaddr, *kend; 1242 u32 *kaddr, *kend;
1243 ntfs_name *name = NULL; 1243 ntfs_name *name = NULL;
1244 int ret = 1; 1244 int ret = 1;
1245 static const ntfschar hiberfil[13] = { const_cpu_to_le16('h'), 1245 static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
1246 const_cpu_to_le16('i'), const_cpu_to_le16('b'), 1246 cpu_to_le16('i'), cpu_to_le16('b'),
1247 const_cpu_to_le16('e'), const_cpu_to_le16('r'), 1247 cpu_to_le16('e'), cpu_to_le16('r'),
1248 const_cpu_to_le16('f'), const_cpu_to_le16('i'), 1248 cpu_to_le16('f'), cpu_to_le16('i'),
1249 const_cpu_to_le16('l'), const_cpu_to_le16('.'), 1249 cpu_to_le16('l'), cpu_to_le16('.'),
1250 const_cpu_to_le16('s'), const_cpu_to_le16('y'), 1250 cpu_to_le16('s'), cpu_to_le16('y'),
1251 const_cpu_to_le16('s'), 0 }; 1251 cpu_to_le16('s'), 0 };
1252 1252
1253 ntfs_debug("Entering."); 1253 ntfs_debug("Entering.");
1254 /* 1254 /*
@@ -1296,7 +1296,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
1296 goto iput_out; 1296 goto iput_out;
1297 } 1297 }
1298 kaddr = (u32*)page_address(page); 1298 kaddr = (u32*)page_address(page);
1299 if (*(le32*)kaddr == const_cpu_to_le32(0x72626968)/*'hibr'*/) { 1299 if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
1300 ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " 1300 ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is "
1301 "hibernated on the volume. This is the " 1301 "hibernated on the volume. This is the "
1302 "system volume."); 1302 "system volume.");
@@ -1337,12 +1337,12 @@ static bool load_and_init_quota(ntfs_volume *vol)
1337 MFT_REF mref; 1337 MFT_REF mref;
1338 struct inode *tmp_ino; 1338 struct inode *tmp_ino;
1339 ntfs_name *name = NULL; 1339 ntfs_name *name = NULL;
1340 static const ntfschar Quota[7] = { const_cpu_to_le16('$'), 1340 static const ntfschar Quota[7] = { cpu_to_le16('$'),
1341 const_cpu_to_le16('Q'), const_cpu_to_le16('u'), 1341 cpu_to_le16('Q'), cpu_to_le16('u'),
1342 const_cpu_to_le16('o'), const_cpu_to_le16('t'), 1342 cpu_to_le16('o'), cpu_to_le16('t'),
1343 const_cpu_to_le16('a'), 0 }; 1343 cpu_to_le16('a'), 0 };
1344 static ntfschar Q[3] = { const_cpu_to_le16('$'), 1344 static ntfschar Q[3] = { cpu_to_le16('$'),
1345 const_cpu_to_le16('Q'), 0 }; 1345 cpu_to_le16('Q'), 0 };
1346 1346
1347 ntfs_debug("Entering."); 1347 ntfs_debug("Entering.");
1348 /* 1348 /*
@@ -1416,16 +1416,16 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
1416 struct page *page; 1416 struct page *page;
1417 ntfs_name *name = NULL; 1417 ntfs_name *name = NULL;
1418 USN_HEADER *uh; 1418 USN_HEADER *uh;
1419 static const ntfschar UsnJrnl[9] = { const_cpu_to_le16('$'), 1419 static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
1420 const_cpu_to_le16('U'), const_cpu_to_le16('s'), 1420 cpu_to_le16('U'), cpu_to_le16('s'),
1421 const_cpu_to_le16('n'), const_cpu_to_le16('J'), 1421 cpu_to_le16('n'), cpu_to_le16('J'),
1422 const_cpu_to_le16('r'), const_cpu_to_le16('n'), 1422 cpu_to_le16('r'), cpu_to_le16('n'),
1423 const_cpu_to_le16('l'), 0 }; 1423 cpu_to_le16('l'), 0 };
1424 static ntfschar Max[5] = { const_cpu_to_le16('$'), 1424 static ntfschar Max[5] = { cpu_to_le16('$'),
1425 const_cpu_to_le16('M'), const_cpu_to_le16('a'), 1425 cpu_to_le16('M'), cpu_to_le16('a'),
1426 const_cpu_to_le16('x'), 0 }; 1426 cpu_to_le16('x'), 0 };
1427 static ntfschar J[3] = { const_cpu_to_le16('$'), 1427 static ntfschar J[3] = { cpu_to_le16('$'),
1428 const_cpu_to_le16('J'), 0 }; 1428 cpu_to_le16('J'), 0 };
1429 1429
1430 ntfs_debug("Entering."); 1430 ntfs_debug("Entering.");
1431 /* 1431 /*
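
The super.c hunks spell out little-endian UTF-16 name constants ("hiberfil.sys", "$Quota", "$UsnJrnl", "$Max", "$J") one character at a time; this only works in static initializers because cpu_to_le16() is now constant-foldable. A hedged userspace sketch of the same per-character conversion, where to_le16() is a hypothetical helper mirroring cpu_to_le16():

#include <stdint.h>
#include <string.h>

static uint16_t to_le16(uint16_t x)
{
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	return (uint16_t)((x << 8) | (x >> 8));
#else
	return x;
#endif
}

/* NTFS stores names as UTF-16LE, so each ASCII character must be
 * widened and converted before comparing against on-disk data. */
static int name_is_hiberfil(const uint16_t *ondisk, size_t len)
{
	static const char ascii[] = "hiberfil.sys";
	size_t i;

	if (len != strlen(ascii))
		return 0;
	for (i = 0; i < len; i++)
		if (ondisk[i] != to_le16((uint16_t)ascii[i]))
			return 0;
	return 1;
}
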
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 4087fbdac327..00d8e6bd7c36 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -116,27 +116,27 @@ typedef struct {
116 * documentation: http://www.linux-ntfs.org/ 116 * documentation: http://www.linux-ntfs.org/
117 */ 117 */
118enum { 118enum {
119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), 119 USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001),
120 USN_REASON_DATA_EXTEND = const_cpu_to_le32(0x00000002), 120 USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002),
121 USN_REASON_DATA_TRUNCATION = const_cpu_to_le32(0x00000004), 121 USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004),
122 USN_REASON_NAMED_DATA_OVERWRITE = const_cpu_to_le32(0x00000010), 122 USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010),
123 USN_REASON_NAMED_DATA_EXTEND = const_cpu_to_le32(0x00000020), 123 USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020),
124 USN_REASON_NAMED_DATA_TRUNCATION= const_cpu_to_le32(0x00000040), 124 USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
125 USN_REASON_FILE_CREATE = const_cpu_to_le32(0x00000100), 125 USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100),
126 USN_REASON_FILE_DELETE = const_cpu_to_le32(0x00000200), 126 USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200),
127 USN_REASON_EA_CHANGE = const_cpu_to_le32(0x00000400), 127 USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400),
128 USN_REASON_SECURITY_CHANGE = const_cpu_to_le32(0x00000800), 128 USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800),
129 USN_REASON_RENAME_OLD_NAME = const_cpu_to_le32(0x00001000), 129 USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000),
130 USN_REASON_RENAME_NEW_NAME = const_cpu_to_le32(0x00002000), 130 USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000),
131 USN_REASON_INDEXABLE_CHANGE = const_cpu_to_le32(0x00004000), 131 USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000),
132 USN_REASON_BASIC_INFO_CHANGE = const_cpu_to_le32(0x00008000), 132 USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000),
133 USN_REASON_HARD_LINK_CHANGE = const_cpu_to_le32(0x00010000), 133 USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000),
134 USN_REASON_COMPRESSION_CHANGE = const_cpu_to_le32(0x00020000), 134 USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000),
135 USN_REASON_ENCRYPTION_CHANGE = const_cpu_to_le32(0x00040000), 135 USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000),
136 USN_REASON_OBJECT_ID_CHANGE = const_cpu_to_le32(0x00080000), 136 USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000),
137 USN_REASON_REPARSE_POINT_CHANGE = const_cpu_to_le32(0x00100000), 137 USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000),
138 USN_REASON_STREAM_CHANGE = const_cpu_to_le32(0x00200000), 138 USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000),
139 USN_REASON_CLOSE = const_cpu_to_le32(0x80000000), 139 USN_REASON_CLOSE = cpu_to_le32(0x80000000),
140}; 140};
141 141
142typedef le32 USN_REASON_FLAGS; 142typedef le32 USN_REASON_FLAGS;
@@ -148,9 +148,9 @@ typedef le32 USN_REASON_FLAGS;
148 * http://www.linux-ntfs.org/ 148 * http://www.linux-ntfs.org/
149 */ 149 */
150enum { 150enum {
151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), 151 USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001),
152 USN_SOURCE_AUXILIARY_DATA = const_cpu_to_le32(0x00000002), 152 USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002),
153 USN_SOURCE_REPLICATION_MANAGEMENT = const_cpu_to_le32(0x00000004), 153 USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
154}; 154};
155 155
156typedef le32 USN_SOURCE_INFO_FLAGS; 156typedef le32 USN_SOURCE_INFO_FLAGS;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 12dfb44c22e5..fbeaec762103 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle,
296 return PTR_ERR(acl); 296 return PTR_ERR(acl);
297 } 297 }
298 if (!acl) 298 if (!acl)
299 inode->i_mode &= ~current->fs->umask; 299 inode->i_mode &= ~current_umask();
300 } 300 }
301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
302 struct posix_acl *clone; 302 struct posix_acl *clone;
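
The one-line acl.c change replaces the open-coded current->fs->umask dereference with the current_umask() accessor; the masking semantics are unchanged. A userspace-style sketch of the POSIX behaviour being preserved when no default ACL supplies permissions:

#include <sys/types.h>

/* Sketch: with no default ACL, the requested create mode is masked by
 * the process umask, exactly as open(2) and mkdir(2) describe. */
static mode_t apply_umask(mode_t requested, mode_t umask_bits)
{
	return requested & ~umask_bits;	/* e.g. 0666 & ~0022 == 0644 */
}
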
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 3a9e5deed74d..678a067d9251 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -176,7 +176,8 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
176 176
177 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); 177 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
178 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 178 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
179 (OCFS2_I(inode)->ip_clusters != rec->e_cpos), 179 (OCFS2_I(inode)->ip_clusters !=
180 le32_to_cpu(rec->e_cpos)),
180 "Device %s, asking for sparse allocation: inode %llu, " 181 "Device %s, asking for sparse allocation: inode %llu, "
181 "cpos %u, clusters %u\n", 182 "cpos %u, clusters %u\n",
182 osb->dev_str, 183 osb->dev_str,
@@ -293,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
293 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, 294 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
294}; 295};
295 296
297static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
298 u64 blkno)
299{
300 struct ocfs2_dx_root_block *dx_root = et->et_object;
301
302 dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
303}
304
305static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
306{
307 struct ocfs2_dx_root_block *dx_root = et->et_object;
308
309 return le64_to_cpu(dx_root->dr_last_eb_blk);
310}
311
312static void ocfs2_dx_root_update_clusters(struct inode *inode,
313 struct ocfs2_extent_tree *et,
314 u32 clusters)
315{
316 struct ocfs2_dx_root_block *dx_root = et->et_object;
317
318 le32_add_cpu(&dx_root->dr_clusters, clusters);
319}
320
321static int ocfs2_dx_root_sanity_check(struct inode *inode,
322 struct ocfs2_extent_tree *et)
323{
324 struct ocfs2_dx_root_block *dx_root = et->et_object;
325
326 BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
327
328 return 0;
329}
330
331static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
332{
333 struct ocfs2_dx_root_block *dx_root = et->et_object;
334
335 et->et_root_el = &dx_root->dr_list;
336}
337
338static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
339 .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
340 .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
341 .eo_update_clusters = ocfs2_dx_root_update_clusters,
342 .eo_sanity_check = ocfs2_dx_root_sanity_check,
343 .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
344};
345
296static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 346static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
297 struct inode *inode, 347 struct inode *inode,
298 struct buffer_head *bh, 348 struct buffer_head *bh,
@@ -338,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
338 &ocfs2_xattr_value_et_ops); 388 &ocfs2_xattr_value_et_ops);
339} 389}
340 390
391void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
392 struct inode *inode,
393 struct buffer_head *bh)
394{
395 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
396 NULL, &ocfs2_dx_root_et_ops);
397}
398
341static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 399static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
342 u64 new_last_eb_blk) 400 u64 new_last_eb_blk)
343{ 401{
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cceff5c37f47..353254ba29e1 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf;
75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
76 struct inode *inode, 76 struct inode *inode,
77 struct ocfs2_xattr_value_buf *vb); 77 struct ocfs2_xattr_value_buf *vb);
78void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
79 struct inode *inode,
80 struct buffer_head *bh);
78 81
79/* 82/*
80 * Read an extent block into *bh. If *bh is NULL, a bh will be 83 * Read an extent block into *bh. If *bh is NULL, a bh will be
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a067a6cffb01..b2c52b3a1484 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
227 size = i_size_read(inode); 227 size = i_size_read(inode);
228 228
229 if (size > PAGE_CACHE_SIZE || 229 if (size > PAGE_CACHE_SIZE ||
230 size > ocfs2_max_inline_data(inode->i_sb)) { 230 size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
231 ocfs2_error(inode->i_sb, 231 ocfs2_error(inode->i_sb,
232 "Inode %llu has with inline data has bad size: %Lu", 232 "Inode %llu has with inline data has bad size: %Lu",
233 (unsigned long long)OCFS2_I(inode)->ip_blkno, 233 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1555,6 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1555 int ret, written = 0; 1555 int ret, written = 0;
1556 loff_t end = pos + len; 1556 loff_t end = pos + len;
1557 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1557 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1558 struct ocfs2_dinode *di = NULL;
1558 1559
1559 mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", 1560 mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
1560 (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, 1561 (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
@@ -1587,7 +1588,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1587 /* 1588 /*
1588 * Check whether the write can fit. 1589 * Check whether the write can fit.
1589 */ 1590 */
1590 if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb)) 1591 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1592 if (mmap_page ||
1593 end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
1591 return 0; 1594 return 0;
1592 1595
1593do_inline_write: 1596do_inline_write:
@@ -1953,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1953} 1956}
1954 1957
1955const struct address_space_operations ocfs2_aops = { 1958const struct address_space_operations ocfs2_aops = {
1956 .readpage = ocfs2_readpage, 1959 .readpage = ocfs2_readpage,
1957 .readpages = ocfs2_readpages, 1960 .readpages = ocfs2_readpages,
1958 .writepage = ocfs2_writepage, 1961 .writepage = ocfs2_writepage,
1959 .write_begin = ocfs2_write_begin, 1962 .write_begin = ocfs2_write_begin,
1960 .write_end = ocfs2_write_end, 1963 .write_end = ocfs2_write_end,
1961 .bmap = ocfs2_bmap, 1964 .bmap = ocfs2_bmap,
1962 .sync_page = block_sync_page, 1965 .sync_page = block_sync_page,
1963 .direct_IO = ocfs2_direct_IO, 1966 .direct_IO = ocfs2_direct_IO,
1964 .invalidatepage = ocfs2_invalidatepage, 1967 .invalidatepage = ocfs2_invalidatepage,
1965 .releasepage = ocfs2_releasepage, 1968 .releasepage = ocfs2_releasepage,
1966 .migratepage = buffer_migrate_page, 1969 .migratepage = buffer_migrate_page,
1970 .is_partially_uptodate = block_is_partially_uptodate,
1967}; 1971};
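
The aops.c hunks swap ocfs2_max_inline_data() for ocfs2_max_inline_data_with_xattr() because inline file data and inline extended attributes share the inode block: once an xattr region is reserved, the usable data area shrinks by that amount. A sketch of the capacity calculation; the exact expression and field path are assumptions based on struct ocfs2_dinode's layout:

/* Sketch: block size, minus the dinode header up to the id_data
 * payload, minus whatever the inline xattr region reserves. */
static int demo_max_inline_data(struct super_block *sb, u16 xattr_inline_size)
{
	return sb->s_blocksize -
	       offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
	       xattr_inline_size;	/* 0 when no inline xattrs exist */
}
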
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 04697ba7f73e..4f85eceab376 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -33,6 +33,7 @@
33#include <linux/random.h> 33#include <linux/random.h>
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h>
36 37
37#include "heartbeat.h" 38#include "heartbeat.h"
38#include "tcp.h" 39#include "tcp.h"
@@ -60,6 +61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
60static LIST_HEAD(o2hb_node_events); 61static LIST_HEAD(o2hb_node_events);
61static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); 62static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
62 63
64#define O2HB_DEBUG_DIR "o2hb"
65#define O2HB_DEBUG_LIVENODES "livenodes"
66static struct dentry *o2hb_debug_dir;
67static struct dentry *o2hb_debug_livenodes;
68
63static LIST_HEAD(o2hb_all_regions); 69static LIST_HEAD(o2hb_all_regions);
64 70
65static struct o2hb_callback { 71static struct o2hb_callback {
@@ -905,7 +911,77 @@ static int o2hb_thread(void *data)
905 return 0; 911 return 0;
906} 912}
907 913
908void o2hb_init(void) 914#ifdef CONFIG_DEBUG_FS
915static int o2hb_debug_open(struct inode *inode, struct file *file)
916{
917 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
918 char *buf = NULL;
919 int i = -1;
920 int out = 0;
921
922 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
923 if (!buf)
924 goto bail;
925
926 o2hb_fill_node_map(map, sizeof(map));
927
928 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
929 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
930 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
931
932 i_size_write(inode, out);
933
934 file->private_data = buf;
935
936 return 0;
937bail:
938 return -ENOMEM;
939}
940
941static int o2hb_debug_release(struct inode *inode, struct file *file)
942{
943 kfree(file->private_data);
944 return 0;
945}
946
947static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
948 size_t nbytes, loff_t *ppos)
949{
950 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
951 i_size_read(file->f_mapping->host));
952}
953#else
954static int o2hb_debug_open(struct inode *inode, struct file *file)
955{
956 return 0;
957}
958static int o2hb_debug_release(struct inode *inode, struct file *file)
959{
960 return 0;
961}
962static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
963 size_t nbytes, loff_t *ppos)
964{
965 return 0;
966}
967#endif /* CONFIG_DEBUG_FS */
968
969static struct file_operations o2hb_debug_fops = {
970 .open = o2hb_debug_open,
971 .release = o2hb_debug_release,
972 .read = o2hb_debug_read,
973 .llseek = generic_file_llseek,
974};
975
976void o2hb_exit(void)
977{
978 if (o2hb_debug_livenodes)
979 debugfs_remove(o2hb_debug_livenodes);
980 if (o2hb_debug_dir)
981 debugfs_remove(o2hb_debug_dir);
982}
983
984int o2hb_init(void)
909{ 985{
910 int i; 986 int i;
911 987
@@ -918,6 +994,24 @@ void o2hb_init(void)
918 INIT_LIST_HEAD(&o2hb_node_events); 994 INIT_LIST_HEAD(&o2hb_node_events);
919 995
920 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); 996 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
997
998 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
999 if (!o2hb_debug_dir) {
1000 mlog_errno(-ENOMEM);
1001 return -ENOMEM;
1002 }
1003
1004 o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
1005 S_IFREG|S_IRUSR,
1006 o2hb_debug_dir, NULL,
1007 &o2hb_debug_fops);
1008 if (!o2hb_debug_livenodes) {
1009 mlog_errno(-ENOMEM);
1010 debugfs_remove(o2hb_debug_dir);
1011 return -ENOMEM;
1012 }
1013
1014 return 0;
921} 1015}
922 1016
923/* if we're already in a callback then we're already serialized by the sem */ 1017/* if we're already in a callback then we're already serialized by the sem */
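
The heartbeat changes add a read-only "livenodes" file using a common one-shot debugfs pattern: format the whole report into a buffer at open(), let simple_read_from_buffer() handle offsets and short reads, and free the buffer at release(). A stripped-down sketch of the same shape, with demo_* placeholder names:

static int demo_open(struct inode *inode, struct file *file)
{
	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	/* All formatting happens here, once per open(). */
	i_size_write(inode, snprintf(buf, PAGE_SIZE, "0 1 3\n"));
	file->private_data = buf;
	return 0;
}

static ssize_t demo_read(struct file *file, char __user *ubuf,
			 size_t nbytes, loff_t *ppos)
{
	return simple_read_from_buffer(ubuf, nbytes, ppos, file->private_data,
				       i_size_read(file->f_mapping->host));
}

static int demo_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}
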
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index e511339886b3..2f1649253b49 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid,
75 struct o2hb_callback_func *hc); 75 struct o2hb_callback_func *hc);
76void o2hb_fill_node_map(unsigned long *map, 76void o2hb_fill_node_map(unsigned long *map,
77 unsigned bytes); 77 unsigned bytes);
78void o2hb_init(void); 78void o2hb_exit(void);
79int o2hb_init(void);
79int o2hb_check_node_heartbeating(u8 node_num); 80int o2hb_check_node_heartbeating(u8 node_num);
80int o2hb_check_node_heartbeating_from_callback(u8 node_num); 81int o2hb_check_node_heartbeating_from_callback(u8 node_num);
81int o2hb_check_local_node_heartbeating(void); 82int o2hb_check_local_node_heartbeating(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 70e8fa9e2539..7ee6188bc79a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -881,6 +881,7 @@ static void __exit exit_o2nm(void)
881 o2cb_sys_shutdown(); 881 o2cb_sys_shutdown();
882 882
883 o2net_exit(); 883 o2net_exit();
884 o2hb_exit();
884} 885}
885 886
886static int __init init_o2nm(void) 887static int __init init_o2nm(void)
@@ -889,11 +890,13 @@ static int __init init_o2nm(void)
889 890
890 cluster_print_version(); 891 cluster_print_version();
891 892
892 o2hb_init(); 893 ret = o2hb_init();
894 if (ret)
895 goto out;
893 896
894 ret = o2net_init(); 897 ret = o2net_init();
895 if (ret) 898 if (ret)
896 goto out; 899 goto out_o2hb;
897 900
898 ret = o2net_register_hb_callbacks(); 901 ret = o2net_register_hb_callbacks();
899 if (ret) 902 if (ret)
@@ -916,6 +919,8 @@ out_callbacks:
916 o2net_unregister_hb_callbacks(); 919 o2net_unregister_hb_callbacks();
917out_o2net: 920out_o2net:
918 o2net_exit(); 921 o2net_exit();
922out_o2hb:
923 o2hb_exit();
919out: 924out:
920 return ret; 925 return ret;
921} 926}
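
The nodemanager.c fix is a textbook goto-unwind ladder: now that o2hb_init() can fail and owns debugfs state, every later failure has to release it in reverse order of acquisition. The idiom, reduced to the two subsystems involved:

/* Sketch: tear down in the reverse order of setup on any failure. */
static int __init demo_init(void)
{
	int ret;

	ret = o2hb_init();
	if (ret)
		goto out;

	ret = o2net_init();
	if (ret)
		goto out_o2hb;

	return 0;

out_o2hb:
	o2hb_exit();
out:
	return ret;
}
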
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index e9d7c2038c0f..7d604480557a 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -455,7 +455,7 @@ out_move:
455 d_move(dentry, target); 455 d_move(dentry, target);
456} 456}
457 457
458struct dentry_operations ocfs2_dentry_ops = { 458const struct dentry_operations ocfs2_dentry_ops = {
459 .d_revalidate = ocfs2_dentry_revalidate, 459 .d_revalidate = ocfs2_dentry_revalidate,
460 .d_iput = ocfs2_dentry_iput, 460 .d_iput = ocfs2_dentry_iput,
461}; 461};
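
Constifying ocfs2_dentry_ops follows the VFS-wide move of dentry_operations tables into read-only data: the table is never written after load, so const lets it live in .rodata and documents the intent. A sketch with placeholder names (the d_revalidate signature shown matches this kernel generation):

static int demo_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	return 1;	/* placeholder: always treat the dentry as valid */
}

const struct dentry_operations demo_dentry_ops = {
	.d_revalidate	= demo_revalidate,
};
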
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index d06e16c06640..faa12e75f98d 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -26,7 +26,7 @@
26#ifndef OCFS2_DCACHE_H 26#ifndef OCFS2_DCACHE_H
27#define OCFS2_DCACHE_H 27#define OCFS2_DCACHE_H
28 28
29extern struct dentry_operations ocfs2_dentry_ops; 29extern const struct dentry_operations ocfs2_dentry_ops;
30 30
31struct ocfs2_dentry_lock { 31struct ocfs2_dentry_lock {
32 /* Use count of dentry lock */ 32 /* Use count of dentry lock */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2c4098cf337..e71160cda110 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -41,6 +41,7 @@
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/sort.h>
44 45
45#define MLOG_MASK_PREFIX ML_NAMEI 46#define MLOG_MASK_PREFIX ML_NAMEI
46#include <cluster/masklog.h> 47#include <cluster/masklog.h>
@@ -58,6 +59,7 @@
58#include "namei.h" 59#include "namei.h"
59#include "suballoc.h" 60#include "suballoc.h"
60#include "super.h" 61#include "super.h"
62#include "sysfile.h"
61#include "uptodate.h" 63#include "uptodate.h"
62 64
63#include "buffer_head_io.h" 65#include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
71 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 73 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
72}; 74};
73 75
74static int ocfs2_extend_dir(struct ocfs2_super *osb,
75 struct inode *dir,
76 struct buffer_head *parent_fe_bh,
77 unsigned int blocks_wanted,
78 struct buffer_head **new_de_bh);
79static int ocfs2_do_extend_dir(struct super_block *sb, 76static int ocfs2_do_extend_dir(struct super_block *sb,
80 handle_t *handle, 77 handle_t *handle,
81 struct inode *dir, 78 struct inode *dir,
@@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
83 struct ocfs2_alloc_context *data_ac, 80 struct ocfs2_alloc_context *data_ac,
84 struct ocfs2_alloc_context *meta_ac, 81 struct ocfs2_alloc_context *meta_ac,
85 struct buffer_head **new_bh); 82 struct buffer_head **new_bh);
83static int ocfs2_dir_indexed(struct inode *inode);
86 84
87/* 85/*
88 * These are distinct checks because future versions of the file system will 86 * These are distinct checks because future versions of the file system will
89 * want to have a trailing dirent structure independent of indexing. 87 * want to have a trailing dirent structure independent of indexing.
90 */ 88 */
91static int ocfs2_dir_has_trailer(struct inode *dir) 89static int ocfs2_supports_dir_trailer(struct inode *dir)
92{ 90{
91 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
92
93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
94 return 0; 94 return 0;
95 95
96 return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb)); 96 return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
97} 97}
98 98
99static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb) 99/*
100 * "new' here refers to the point at which we're creating a new
101 * directory via "mkdir()", but also when we're expanding an inline
102 * directory. In either case, we don't yet have the indexing bit set
103 * on the directory, so the standard checks will fail in when metaecc
104 * is turned off. Only directory-initialization type functions should
105 * use this then. Everything else wants ocfs2_supports_dir_trailer()
106 */
107static int ocfs2_new_dir_wants_trailer(struct inode *dir)
100{ 108{
101 return ocfs2_meta_ecc(osb); 109 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
110
111 return ocfs2_meta_ecc(osb) ||
112 ocfs2_supports_indexed_dirs(osb);
102} 113}
103 114
104static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) 115static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
@@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
130{ 141{
131 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); 142 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
132 143
133 if (!ocfs2_dir_has_trailer(dir)) 144 if (!ocfs2_supports_dir_trailer(dir))
134 return 0; 145 return 0;
135 146
136 if (offset != toff) 147 if (offset != toff)
@@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
140} 151}
141 152
142static void ocfs2_init_dir_trailer(struct inode *inode, 153static void ocfs2_init_dir_trailer(struct inode *inode,
143 struct buffer_head *bh) 154 struct buffer_head *bh, u16 rec_len)
144{ 155{
145 struct ocfs2_dir_block_trailer *trailer; 156 struct ocfs2_dir_block_trailer *trailer;
146 157
@@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
150 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); 161 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
151 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); 162 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
152 trailer->db_blkno = cpu_to_le64(bh->b_blocknr); 163 trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
164 trailer->db_free_rec_len = cpu_to_le16(rec_len);
165}
166/*
167 * Link an unindexed block with a dir trailer structure into the index free
168 * list. This function will modify dirdata_bh, but assumes you've already
169 * passed it to the journal.
170 */
171static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
172 struct buffer_head *dx_root_bh,
173 struct buffer_head *dirdata_bh)
174{
175 int ret;
176 struct ocfs2_dx_root_block *dx_root;
177 struct ocfs2_dir_block_trailer *trailer;
178
179 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
180 OCFS2_JOURNAL_ACCESS_WRITE);
181 if (ret) {
182 mlog_errno(ret);
183 goto out;
184 }
185 trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
186 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
187
188 trailer->db_free_next = dx_root->dr_free_blk;
189 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
190
191 ocfs2_journal_dirty(handle, dx_root_bh);
192
193out:
194 return ret;
195}
196
197static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
198{
199 return res->dl_prev_leaf_bh == NULL;
200}
201
202void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
203{
204 brelse(res->dl_dx_root_bh);
205 brelse(res->dl_leaf_bh);
206 brelse(res->dl_dx_leaf_bh);
207 brelse(res->dl_prev_leaf_bh);
208}
209
210static int ocfs2_dir_indexed(struct inode *inode)
211{
212 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
213 return 1;
214 return 0;
215}
216
217static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
218{
219 return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
220}
221
222/*
223 * Hashing code adapted from ext3
224 */
225#define DELTA 0x9E3779B9
226
227static void TEA_transform(__u32 buf[4], __u32 const in[])
228{
229 __u32 sum = 0;
230 __u32 b0 = buf[0], b1 = buf[1];
231 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
232 int n = 16;
233
234 do {
235 sum += DELTA;
236 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
237 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
238 } while (--n);
239
240 buf[0] += b0;
241 buf[1] += b1;
242}
243
244static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
245{
246 __u32 pad, val;
247 int i;
248
249 pad = (__u32)len | ((__u32)len << 8);
250 pad |= pad << 16;
251
252 val = pad;
253 if (len > num*4)
254 len = num * 4;
255 for (i = 0; i < len; i++) {
256 if ((i % 4) == 0)
257 val = pad;
258 val = msg[i] + (val << 8);
259 if ((i % 4) == 3) {
260 *buf++ = val;
261 val = pad;
262 num--;
263 }
264 }
265 if (--num >= 0)
266 *buf++ = val;
267 while (--num >= 0)
268 *buf++ = pad;
269}
270
271static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
272 struct ocfs2_dx_hinfo *hinfo)
273{
274 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
275 const char *p;
276 __u32 in[8], buf[4];
277
278 /*
279 * XXX: Is this really necessary, if the index is never looked
280 * at by readdir? Is a hash value of '0' a bad idea?
281 */
282 if ((len == 1 && !strncmp(".", name, 1)) ||
283 (len == 2 && !strncmp("..", name, 2))) {
284 buf[0] = buf[1] = 0;
285 goto out;
286 }
287
288#ifdef OCFS2_DEBUG_DX_DIRS
289 /*
290 * This makes it very easy to debug indexing problems. We
291 * should never allow this to be selected without hand editing
292 * this file though.
293 */
294 buf[0] = buf[1] = len;
295 goto out;
296#endif
297
298 memcpy(buf, osb->osb_dx_seed, sizeof(buf));
299
300 p = name;
301 while (len > 0) {
302 str2hashbuf(p, len, in, 4);
303 TEA_transform(buf, in);
304 len -= 16;
305 p += 16;
306 }
307
308out:
309 hinfo->major_hash = buf[0];
310 hinfo->minor_hash = buf[1];
153} 311}
154 312
155/* 313/*
@@ -312,6 +470,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
312} 470}
313 471
314/* 472/*
473 * Validate a directory trailer.
474 *
475 * We check the trailer here rather than in ocfs2_validate_dir_block()
476 * because that function doesn't have the inode to test.
477 */
478static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
479{
480 int rc = 0;
481 struct ocfs2_dir_block_trailer *trailer;
482
483 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
484 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
485 rc = -EINVAL;
486 ocfs2_error(dir->i_sb,
487 "Invalid dirblock #%llu: "
488 "signature = %.*s\n",
489 (unsigned long long)bh->b_blocknr, 7,
490 trailer->db_signature);
491 goto out;
492 }
493 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
494 rc = -EINVAL;
495 ocfs2_error(dir->i_sb,
496 "Directory block #%llu has an invalid "
497 "db_blkno of %llu",
498 (unsigned long long)bh->b_blocknr,
499 (unsigned long long)le64_to_cpu(trailer->db_blkno));
500 goto out;
501 }
502 if (le64_to_cpu(trailer->db_parent_dinode) !=
503 OCFS2_I(dir)->ip_blkno) {
504 rc = -EINVAL;
505 ocfs2_error(dir->i_sb,
506 "Directory block #%llu on dinode "
507 "#%llu has an invalid parent_dinode "
508 "of %llu",
509 (unsigned long long)bh->b_blocknr,
510 (unsigned long long)OCFS2_I(dir)->ip_blkno,
511 (unsigned long long)le64_to_cpu(trailer->db_parent_dinode));
512 goto out;
513 }
514out:
515 return rc;
516}
517
518/*
315 * This function forces all errors to -EIO for consistency with its 519 * This function forces all errors to -EIO for consistency with its
316 * predecessor, ocfs2_bread(). We haven't audited what returning the 520 * predecessor, ocfs2_bread(). We haven't audited what returning the
317 * real error codes would do to callers. We log the real codes with 521 * real error codes would do to callers. We log the real codes with
@@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
322{ 526{
323 int rc = 0; 527 int rc = 0;
324 struct buffer_head *tmp = *bh; 528 struct buffer_head *tmp = *bh;
325 struct ocfs2_dir_block_trailer *trailer;
326 529
327 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, 530 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
328 ocfs2_validate_dir_block); 531 ocfs2_validate_dir_block);
@@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
331 goto out; 534 goto out;
332 } 535 }
333 536
334 /*
335 * We check the trailer here rather than in
336 * ocfs2_validate_dir_block() because that function doesn't have
337 * the inode to test.
338 */
339 if (!(flags & OCFS2_BH_READAHEAD) && 537 if (!(flags & OCFS2_BH_READAHEAD) &&
340 ocfs2_dir_has_trailer(inode)) { 538 ocfs2_supports_dir_trailer(inode)) {
341 trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); 539 rc = ocfs2_check_dir_trailer(inode, tmp);
342 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { 540 if (rc) {
343 rc = -EINVAL; 541 if (!*bh)
344 ocfs2_error(inode->i_sb, 542 brelse(tmp);
345 "Invalid dirblock #%llu: " 543 mlog_errno(rc);
346 "signature = %.*s\n",
347 (unsigned long long)tmp->b_blocknr, 7,
348 trailer->db_signature);
349 goto out;
350 }
351 if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
352 rc = -EINVAL;
353 ocfs2_error(inode->i_sb,
354 "Directory block #%llu has an invalid "
355 "db_blkno of %llu",
356 (unsigned long long)tmp->b_blocknr,
357 (unsigned long long)le64_to_cpu(trailer->db_blkno));
358 goto out;
359 }
360 if (le64_to_cpu(trailer->db_parent_dinode) !=
361 OCFS2_I(inode)->ip_blkno) {
362 rc = -EINVAL;
363 ocfs2_error(inode->i_sb,
364 "Directory block #%llu on dinode "
365 "#%llu has an invalid parent_dinode "
366 "of %llu",
367 (unsigned long long)tmp->b_blocknr,
368 (unsigned long long)OCFS2_I(inode)->ip_blkno,
369 (unsigned long long)le64_to_cpu(trailer->db_blkno));
370 goto out; 544 goto out;
371 } 545 }
372 } 546 }
@@ -379,6 +553,141 @@ out:
379 return rc ? -EIO : 0; 553 return rc ? -EIO : 0;
380} 554}
381 555
556/*
557 * Read the block at 'phys' which belongs to this directory
558 * inode. This function does no virtual->physical block translation -
559 * what's passed in is assumed to be a valid directory block.
560 */
561static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
562 struct buffer_head **bh)
563{
564 int ret;
565 struct buffer_head *tmp = *bh;
566
567 ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
568 if (ret) {
569 mlog_errno(ret);
570 goto out;
571 }
572
573 if (ocfs2_supports_dir_trailer(dir)) {
574 ret = ocfs2_check_dir_trailer(dir, tmp);
575 if (ret) {
576 if (!*bh)
577 brelse(tmp);
578 mlog_errno(ret);
579 goto out;
580 }
581 }
582
583 if (!ret && !*bh)
584 *bh = tmp;
585out:
586 return ret;
587}
588
589static int ocfs2_validate_dx_root(struct super_block *sb,
590 struct buffer_head *bh)
591{
592 int ret;
593 struct ocfs2_dx_root_block *dx_root;
594
595 BUG_ON(!buffer_uptodate(bh));
596
597 dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
598
599 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
600 if (ret) {
601 mlog(ML_ERROR,
602 "Checksum failed for dir index root block %llu\n",
603 (unsigned long long)bh->b_blocknr);
604 return ret;
605 }
606
607 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
608 ocfs2_error(sb,
609 "Dir Index Root # %llu has bad signature %.*s",
610 (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
611 7, dx_root->dr_signature);
612 return -EINVAL;
613 }
614
615 return 0;
616}
617
618static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
619 struct buffer_head **dx_root_bh)
620{
621 int ret;
622 u64 blkno = le64_to_cpu(di->i_dx_root);
623 struct buffer_head *tmp = *dx_root_bh;
624
625 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
626
627 /* If ocfs2_read_block() got us a new bh, pass it up. */
628 if (!ret && !*dx_root_bh)
629 *dx_root_bh = tmp;
630
631 return ret;
632}
633
634static int ocfs2_validate_dx_leaf(struct super_block *sb,
635 struct buffer_head *bh)
636{
637 int ret;
638 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
639
640 BUG_ON(!buffer_uptodate(bh));
641
642 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
643 if (ret) {
644 mlog(ML_ERROR,
645 "Checksum failed for dir index leaf block %llu\n",
646 (unsigned long long)bh->b_blocknr);
647 return ret;
648 }
649
650 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
651 ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
652 7, dx_leaf->dl_signature);
653 return -EROFS;
654 }
655
656 return 0;
657}
658
659static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
660 struct buffer_head **dx_leaf_bh)
661{
662 int ret;
663 struct buffer_head *tmp = *dx_leaf_bh;
664
665 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
666
667 /* If ocfs2_read_block() got us a new bh, pass it up. */
668 if (!ret && !*dx_leaf_bh)
669 *dx_leaf_bh = tmp;
670
671 return ret;
672}
673
674/*
675 * Read a series of dx_leaf blocks. This expects all buffer_head
676 * pointers to be NULL on function entry.
677 */
678static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
679 struct buffer_head **dx_leaf_bhs)
680{
681 int ret;
682
683 ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
684 ocfs2_validate_dx_leaf);
685 if (ret)
686 mlog_errno(ret);
687
688 return ret;
689}
690
382static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, 691static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
383 struct inode *dir, 692 struct inode *dir,
384 struct ocfs2_dir_entry **res_dir) 693 struct ocfs2_dir_entry **res_dir)
@@ -480,39 +789,340 @@ cleanup_and_exit:
480 return ret; 789 return ret;
481} 790}
482 791
792static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
793 struct ocfs2_extent_list *el,
794 u32 major_hash,
795 u32 *ret_cpos,
796 u64 *ret_phys_blkno,
797 unsigned int *ret_clen)
798{
799 int ret = 0, i, found;
800 struct buffer_head *eb_bh = NULL;
801 struct ocfs2_extent_block *eb;
802 struct ocfs2_extent_rec *rec = NULL;
803
804 if (el->l_tree_depth) {
805 ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
806 if (ret) {
807 mlog_errno(ret);
808 goto out;
809 }
810
811 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
812 el = &eb->h_list;
813
814 if (el->l_tree_depth) {
815 ocfs2_error(inode->i_sb,
816 "Inode %lu has non zero tree depth in "
817 "btree tree block %llu\n", inode->i_ino,
818 (unsigned long long)eb_bh->b_blocknr);
819 ret = -EROFS;
820 goto out;
821 }
822 }
823
824 found = 0;
825 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
826 rec = &el->l_recs[i];
827
828 if (le32_to_cpu(rec->e_cpos) <= major_hash) {
829 found = 1;
830 break;
831 }
832 }
833
834 if (!found) {
835 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
836 "record (%u, %u, 0) in btree", inode->i_ino,
837 le32_to_cpu(rec->e_cpos),
838 ocfs2_rec_clusters(el, rec));
839 ret = -EROFS;
840 goto out;
841 }
842
843 if (ret_phys_blkno)
844 *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
845 if (ret_cpos)
846 *ret_cpos = le32_to_cpu(rec->e_cpos);
847 if (ret_clen)
848 *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
849
850out:
851 brelse(eb_bh);
852 return ret;
853}
854
855/*
856 * Returns the block index, from the start of the cluster, to which
857 * this hash belongs.
858 */
859static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
860 u32 minor_hash)
861{
862 return minor_hash & osb->osb_dx_mask;
863}
864
865static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
866 struct ocfs2_dx_hinfo *hinfo)
867{
868 return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
869}
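
/*
 * Illustrative sketch, not part of the patch: the full name-to-block
 * mapping composed by the helpers above. major_hash selects a cluster
 * through the dx_root's extent list; the low bits of minor_hash then
 * select one block within that cluster, osb_dx_mask being assumed to
 * be blocks-per-cluster minus one.
 */
static inline u64 demo_dx_block(u64 cluster_first_blkno, u32 minor_hash,
				u32 dx_mask)
{
	return cluster_first_blkno + (minor_hash & dx_mask);
}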
870
871static int ocfs2_dx_dir_lookup(struct inode *inode,
872 struct ocfs2_extent_list *el,
873 struct ocfs2_dx_hinfo *hinfo,
874 u32 *ret_cpos,
875 u64 *ret_phys_blkno)
876{
877 int ret = 0;
878 unsigned int cend, uninitialized_var(clen);
879 u32 uninitialized_var(cpos);
880 u64 uninitialized_var(blkno);
881 u32 name_hash = hinfo->major_hash;
882
883 ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
884 &clen);
885 if (ret) {
886 mlog_errno(ret);
887 goto out;
888 }
889
890 cend = cpos + clen;
891 if (name_hash >= cend) {
892 /* We want the last cluster */
893 blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
894 cpos += clen - 1;
895 } else {
896 blkno += ocfs2_clusters_to_blocks(inode->i_sb,
897 name_hash - cpos);
898 cpos = name_hash;
899 }
900
901 /*
902 * We now have the cluster which should hold our entry. To
903 * find the exact block from the start of the cluster to
904 * search, we take the lower bits of the hash.
905 */
906 blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
907
908 if (ret_phys_blkno)
909 *ret_phys_blkno = blkno;
910 if (ret_cpos)
911 *ret_cpos = cpos;
912
913out:
914
915 return ret;
916}
917
918static int ocfs2_dx_dir_search(const char *name, int namelen,
919 struct inode *dir,
920 struct ocfs2_dx_root_block *dx_root,
921 struct ocfs2_dir_lookup_result *res)
922{
923 int ret, i, found;
924 u64 uninitialized_var(phys);
925 struct buffer_head *dx_leaf_bh = NULL;
926 struct ocfs2_dx_leaf *dx_leaf;
927 struct ocfs2_dx_entry *dx_entry = NULL;
928 struct buffer_head *dir_ent_bh = NULL;
929 struct ocfs2_dir_entry *dir_ent = NULL;
930 struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
931 struct ocfs2_extent_list *dr_el;
932 struct ocfs2_dx_entry_list *entry_list;
933
934 ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
935
936 if (ocfs2_dx_root_inline(dx_root)) {
937 entry_list = &dx_root->dr_entries;
938 goto search;
939 }
940
941 dr_el = &dx_root->dr_list;
942
943 ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
944 if (ret) {
945 mlog_errno(ret);
946 goto out;
947 }
948
949 mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
950 "returns: %llu\n",
951 (unsigned long long)OCFS2_I(dir)->ip_blkno,
952 namelen, name, hinfo->major_hash, hinfo->minor_hash,
953 (unsigned long long)phys);
954
955 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
956 if (ret) {
957 mlog_errno(ret);
958 goto out;
959 }
960
961 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
962
963 mlog(0, "leaf info: num_used: %d, count: %d\n",
964 le16_to_cpu(dx_leaf->dl_list.de_num_used),
965 le16_to_cpu(dx_leaf->dl_list.de_count));
966
967 entry_list = &dx_leaf->dl_list;
968
969search:
970 /*
971 * Empty leaf is legal, so no need to check for that.
972 */
973 found = 0;
974 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
975 dx_entry = &entry_list->de_entries[i];
976
977 if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
978 || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
979 continue;
980
981 /*
982 * Search unindexed leaf block now. We're not
983 * guaranteed to find anything.
984 */
985 ret = ocfs2_read_dir_block_direct(dir,
986 le64_to_cpu(dx_entry->dx_dirent_blk),
987 &dir_ent_bh);
988 if (ret) {
989 mlog_errno(ret);
990 goto out;
991 }
992
993 /*
994 * XXX: We should check the unindexed block here,
995 * before using it.
996 */
997
998 found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
999 0, dir_ent_bh->b_data,
1000 dir->i_sb->s_blocksize, &dir_ent);
1001 if (found == 1)
1002 break;
1003
1004 if (found == -1) {
1005 /* This means we found a bad directory entry. */
1006 ret = -EIO;
1007 mlog_errno(ret);
1008 goto out;
1009 }
1010
1011 brelse(dir_ent_bh);
1012 dir_ent_bh = NULL;
1013 }
1014
1015 if (found <= 0) {
1016 ret = -ENOENT;
1017 goto out;
1018 }
1019
1020 res->dl_leaf_bh = dir_ent_bh;
1021 res->dl_entry = dir_ent;
1022 res->dl_dx_leaf_bh = dx_leaf_bh;
1023 res->dl_dx_entry = dx_entry;
1024
1025 ret = 0;
1026out:
1027 if (ret) {
1028 brelse(dx_leaf_bh);
1029 brelse(dir_ent_bh);
1030 }
1031 return ret;
1032}
1033
1034static int ocfs2_find_entry_dx(const char *name, int namelen,
1035 struct inode *dir,
1036 struct ocfs2_dir_lookup_result *lookup)
1037{
1038 int ret;
1039 struct buffer_head *di_bh = NULL;
1040 struct ocfs2_dinode *di;
1041 struct buffer_head *dx_root_bh = NULL;
1042 struct ocfs2_dx_root_block *dx_root;
1043
1044 ret = ocfs2_read_inode_block(dir, &di_bh);
1045 if (ret) {
1046 mlog_errno(ret);
1047 goto out;
1048 }
1049
1050 di = (struct ocfs2_dinode *)di_bh->b_data;
1051
1052 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
1053 if (ret) {
1054 mlog_errno(ret);
1055 goto out;
1056 }
1057 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
1058
1059 ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
1060 if (ret) {
1061 if (ret != -ENOENT)
1062 mlog_errno(ret);
1063 goto out;
1064 }
1065
1066 lookup->dl_dx_root_bh = dx_root_bh;
1067 dx_root_bh = NULL;
1068out:
1069 brelse(di_bh);
1070 brelse(dx_root_bh);
1071 return ret;
1072}
1073
483/* 1074/*
484 * Try to find an entry of the provided name within 'dir'. 1075 * Try to find an entry of the provided name within 'dir'.
485 * 1076 *
486 * If nothing was found, NULL is returned. Otherwise, a buffer_head 1077 * If nothing was found, -ENOENT is returned. Otherwise, zero is
487 * and pointer to the dir entry are passed back. 1078 * returned and the struct 'res' will contain information useful to
1079 * other directory manipulation functions.
488 * 1080 *
489 * Caller can NOT assume anything about the contents of the 1081 * Caller can NOT assume anything about the contents of the
490 * buffer_head - it is passed back only so that it can be passed into 1082 * buffer_heads - they are passed back only so that it can be passed
491 * any one of the manipulation functions (add entry, delete entry, 1083 * into any one of the manipulation functions (add entry, delete
492 * etc). As an example, bh in the extent directory case is a data 1084 * entry, etc). As an example, bh in the extent directory case is a
493 * block, in the inline-data case it actually points to an inode. 1085 * data block, in the inline-data case it actually points to an inode,
1086 * in the indexed directory case, multiple buffers are involved.
494 */ 1087 */
495struct buffer_head *ocfs2_find_entry(const char *name, int namelen, 1088int ocfs2_find_entry(const char *name, int namelen,
496 struct inode *dir, 1089 struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
497 struct ocfs2_dir_entry **res_dir)
498{ 1090{
499 *res_dir = NULL; 1091 struct buffer_head *bh;
1092 struct ocfs2_dir_entry *res_dir = NULL;
500 1093
1094 if (ocfs2_dir_indexed(dir))
1095 return ocfs2_find_entry_dx(name, namelen, dir, lookup);
1096
1097 /*
1098 * The unindexed dir code only uses part of the lookup
1099 * structure, so there's no reason to push it down further
1100 * than this.
1101 */
501 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1102 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
502 return ocfs2_find_entry_id(name, namelen, dir, res_dir); 1103 bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
1104 else
1105 bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
1106
1107 if (bh == NULL)
1108 return -ENOENT;
503 1109
504 return ocfs2_find_entry_el(name, namelen, dir, res_dir); 1110 lookup->dl_leaf_bh = bh;
1111 lookup->dl_entry = res_dir;
1112 return 0;
505} 1113}
506 1114
507/* 1115/*
508 * Update inode number and type of a previously found directory entry. 1116 * Update inode number and type of a previously found directory entry.
509 */ 1117 */
510int ocfs2_update_entry(struct inode *dir, handle_t *handle, 1118int ocfs2_update_entry(struct inode *dir, handle_t *handle,
511 struct buffer_head *de_bh, struct ocfs2_dir_entry *de, 1119 struct ocfs2_dir_lookup_result *res,
512 struct inode *new_entry_inode) 1120 struct inode *new_entry_inode)
513{ 1121{
514 int ret; 1122 int ret;
515 ocfs2_journal_access_func access = ocfs2_journal_access_db; 1123 ocfs2_journal_access_func access = ocfs2_journal_access_db;
1124 struct ocfs2_dir_entry *de = res->dl_entry;
1125 struct buffer_head *de_bh = res->dl_leaf_bh;
516 1126
517 /* 1127 /*
518 * The same code works fine for both inline-data and extent 1128 * The same code works fine for both inline-data and extent
@@ -538,6 +1148,10 @@ out:
538 return ret; 1148 return ret;
539} 1149}
540 1150
1151/*
1152 * __ocfs2_delete_entry deletes a directory entry by merging it with the
1153 * previous entry
1154 */
541static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, 1155static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
542 struct ocfs2_dir_entry *de_del, 1156 struct ocfs2_dir_entry *de_del,
543 struct buffer_head *bh, char *first_de, 1157 struct buffer_head *bh, char *first_de,
@@ -587,6 +1201,181 @@ bail:
587 return status; 1201 return status;
588} 1202}
589 1203
1204static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
1205{
1206 unsigned int hole;
1207
1208 if (le64_to_cpu(de->inode) == 0)
1209 hole = le16_to_cpu(de->rec_len);
1210 else
1211 hole = le16_to_cpu(de->rec_len) -
1212 OCFS2_DIR_REC_LEN(de->name_len);
1213
1214 return hole;
1215}
1216
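A worked example of the hole computation, assuming (for illustration) the usual ocfs2 dirent layout where OCFS2_DIR_REC_LEN(n) == (n + 12 + 3) & ~3, i.e. a 12-byte fixed header padded to a 4-byte boundary, which makes OCFS2_DIR_MIN_REC_LEN == OCFS2_DIR_REC_LEN(1) == 16:

    /* live entry:    rec_len = 40, name_len = 11
     *   used = OCFS2_DIR_REC_LEN(11) = (11 + 12 + 3) & ~3 = 24
     *   hole = 40 - 24 = 16 bytes -> just enough for a minimal dirent
     *
     * deleted entry: de->inode == 0, so the whole rec_len (40) is hole
     */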
1217static int ocfs2_find_max_rec_len(struct super_block *sb,
1218 struct buffer_head *dirblock_bh)
1219{
1220 int size, this_hole, largest_hole = 0;
1221 char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
1222 struct ocfs2_dir_entry *de;
1223
1224 trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
1225 size = ocfs2_dir_trailer_blk_off(sb);
1226 limit = start + size;
1227 de_buf = start;
1228 de = (struct ocfs2_dir_entry *)de_buf;
1229 do {
1230 if (de_buf != trailer) {
1231 this_hole = ocfs2_figure_dirent_hole(de);
1232 if (this_hole > largest_hole)
1233 largest_hole = this_hole;
1234 }
1235
1236 de_buf += le16_to_cpu(de->rec_len);
1237 de = (struct ocfs2_dir_entry *)de_buf;
1238 } while (de_buf < limit);
1239
1240 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
1241 return largest_hole;
1242 return 0;
1243}
1244
1245static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
1246 int index)
1247{
1248 int num_used = le16_to_cpu(entry_list->de_num_used);
1249
1250 if (num_used == 1 || index == (num_used - 1))
1251 goto clear;
1252
1253 memmove(&entry_list->de_entries[index],
1254 &entry_list->de_entries[index + 1],
1255 (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
1256clear:
1257 num_used--;
1258 memset(&entry_list->de_entries[num_used], 0,
1259 sizeof(struct ocfs2_dx_entry));
1260 entry_list->de_num_used = cpu_to_le16(num_used);
1261}
1262
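The removal above is a plain array compaction; a small illustration with hypothetical entries (A-D stand for whole ocfs2_dx_entry records):

    /* before:  [A][B][C][D]  num_used = 4, remove index 1
     * memmove: [A][C][D][D]  (tail shifted down over the victim)
     * memset:  [A][C][D][0]  num_used = 3 (stale tail slot zeroed)
     */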
1263static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1264 struct ocfs2_dir_lookup_result *lookup)
1265{
1266 int ret, index, max_rec_len, add_to_free_list = 0;
1267 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1268 struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
1269 struct ocfs2_dx_leaf *dx_leaf;
1270 struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
1271 struct ocfs2_dir_block_trailer *trailer;
1272 struct ocfs2_dx_root_block *dx_root;
1273 struct ocfs2_dx_entry_list *entry_list;
1274
1275 /*
1276 * This function gets a bit messy because we might have to
1277 * modify the root block, regardless of whether the indexed
1278 * entries are stored inline.
1279 */
1280
1281 /*
1282 * *Only* set 'entry_list' here, based on where we're looking
1283 * for the indexed entries. Later, we might still want to
1284 * journal both blocks, based on free list state.
1285 */
1286 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
1287 if (ocfs2_dx_root_inline(dx_root)) {
1288 entry_list = &dx_root->dr_entries;
1289 } else {
1290 dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
1291 entry_list = &dx_leaf->dl_list;
1292 }
1293
1294 /* Neither of these indicates disk corruption - that should
1295 * have been caught by lookup, before we got here. */
1296 BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
1297 BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
1298
1299 index = (char *)dx_entry - (char *)entry_list->de_entries;
1300 index /= sizeof(*dx_entry);
1301
1302 if (index >= le16_to_cpu(entry_list->de_num_used)) {
1303 mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
1304 (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
1305 entry_list, dx_entry);
1306 return -EIO;
1307 }
1308
1309 /*
1310 * We know that removal of this dirent will leave enough room
1311 * for a new one, so add this block to the free list if it
1312 * isn't already there.
1313 */
1314 trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
1315 if (trailer->db_free_rec_len == 0)
1316 add_to_free_list = 1;
1317
1318 /*
1319 * Add the block holding our index into the journal before
1320 * removing the unindexed entry. If we get an error return
1321 * from __ocfs2_delete_entry(), then it hasn't removed the
1322 * entry yet. Likewise, successful return means we *must*
1323 * remove the indexed entry.
1324 *
1325 * We're also careful to journal the root tree block here as
1326 * the entry count needs to be updated. Also, we might be
1327 * adding to the start of the free list.
1328 */
1329 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
1330 OCFS2_JOURNAL_ACCESS_WRITE);
1331 if (ret) {
1332 mlog_errno(ret);
1333 goto out;
1334 }
1335
1336 if (!ocfs2_dx_root_inline(dx_root)) {
1337 ret = ocfs2_journal_access_dl(handle, dir,
1338 lookup->dl_dx_leaf_bh,
1339 OCFS2_JOURNAL_ACCESS_WRITE);
1340 if (ret) {
1341 mlog_errno(ret);
1342 goto out;
1343 }
1344 }
1345
1346 mlog(0, "Dir %llu: delete entry at index: %d\n",
1347 (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
1348
1349 ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
1350 leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
1351 if (ret) {
1352 mlog_errno(ret);
1353 goto out;
1354 }
1355
1356 max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
1357 trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
1358 if (add_to_free_list) {
1359 trailer->db_free_next = dx_root->dr_free_blk;
1360 dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
1361 ocfs2_journal_dirty(handle, dx_root_bh);
1362 }
1363
1364 /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
1365 ocfs2_journal_dirty(handle, leaf_bh);
1366
1367 le32_add_cpu(&dx_root->dr_num_entries, -1);
1368 ocfs2_journal_dirty(handle, dx_root_bh);
1369
1370 ocfs2_dx_list_remove_entry(entry_list, index);
1371
1372 if (!ocfs2_dx_root_inline(dx_root))
1373 ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
1374
1375out:
1376 return ret;
1377}
1378
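The error handling above leans on a common journaling pattern: every buffer that might be modified is declared to the journal before the first byte changes, so any failure path exits with the transaction untouched. A hedged, compilable sketch of the shape (the helper names here are invented, not kernel API):

    struct jhandle;
    struct buf;

    int journal_access(struct jhandle *h, struct buf *b); /* declare intent */
    void modify(struct buf *b);
    void journal_dirty(struct jhandle *h, struct buf *b);

    static int update_pair(struct jhandle *h, struct buf *a, struct buf *b)
    {
            int ret;

            ret = journal_access(h, a);
            if (ret)
                    return ret;     /* nothing modified yet */
            ret = journal_access(h, b);
            if (ret)
                    return ret;     /* still nothing modified */

            modify(a);              /* past this point we must finish */
            modify(b);
            journal_dirty(h, a);
            journal_dirty(h, b);
            return 0;
    }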
590static inline int ocfs2_delete_entry_id(handle_t *handle, 1379static inline int ocfs2_delete_entry_id(handle_t *handle,
591 struct inode *dir, 1380 struct inode *dir,
592 struct ocfs2_dir_entry *de_del, 1381 struct ocfs2_dir_entry *de_del,
@@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
624} 1413}
625 1414
626/* 1415/*
627 * ocfs2_delete_entry deletes a directory entry by merging it with the 1416 * Delete a directory entry. Hide the details of directory
628 * previous entry 1417 * implementation from the caller.
629 */ 1418 */
630int ocfs2_delete_entry(handle_t *handle, 1419int ocfs2_delete_entry(handle_t *handle,
631 struct inode *dir, 1420 struct inode *dir,
632 struct ocfs2_dir_entry *de_del, 1421 struct ocfs2_dir_lookup_result *res)
633 struct buffer_head *bh)
634{ 1422{
1423 if (ocfs2_dir_indexed(dir))
1424 return ocfs2_delete_entry_dx(handle, dir, res);
1425
635 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1426 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
636 return ocfs2_delete_entry_id(handle, dir, de_del, bh); 1427 return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
1428 res->dl_leaf_bh);
637 1429
638 return ocfs2_delete_entry_el(handle, dir, de_del, bh); 1430 return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
1431 res->dl_leaf_bh);
639} 1432}
640 1433
641/* 1434/*
@@ -663,18 +1456,166 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
663 return 0; 1456 return 0;
664} 1457}
665 1458
1459static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
1460 struct ocfs2_dx_entry *dx_new_entry)
1461{
1462 int i;
1463
1464 i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
1465 dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
1466
1467 le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
1468}
1469
1470static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
1471 struct ocfs2_dx_hinfo *hinfo,
1472 u64 dirent_blk)
1473{
1474 int i;
1475 struct ocfs2_dx_entry *dx_entry;
1476
1477 i = le16_to_cpu(entry_list->de_num_used);
1478 dx_entry = &entry_list->de_entries[i];
1479
1480 memset(dx_entry, 0, sizeof(*dx_entry));
1481 dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
1482 dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
1483 dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
1484
1485 le16_add_cpu(&entry_list->de_num_used, 1);
1486}
1487
1488static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1489 struct ocfs2_dx_hinfo *hinfo,
1490 u64 dirent_blk,
1491 struct buffer_head *dx_leaf_bh)
1492{
1493 int ret;
1494 struct ocfs2_dx_leaf *dx_leaf;
1495
1496 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
1497 OCFS2_JOURNAL_ACCESS_WRITE);
1498 if (ret) {
1499 mlog_errno(ret);
1500 goto out;
1501 }
1502
1503 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
1504 ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
1505 ocfs2_journal_dirty(handle, dx_leaf_bh);
1506
1507out:
1508 return ret;
1509}
1510
1511static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
1512 struct ocfs2_dx_hinfo *hinfo,
1513 u64 dirent_blk,
1514 struct ocfs2_dx_root_block *dx_root)
1515{
1516 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
1517}
1518
1519static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
1520 struct ocfs2_dir_lookup_result *lookup)
1521{
1522 int ret = 0;
1523 struct ocfs2_dx_root_block *dx_root;
1524 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1525
1526 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
1527 OCFS2_JOURNAL_ACCESS_WRITE);
1528 if (ret) {
1529 mlog_errno(ret);
1530 goto out;
1531 }
1532
1533 dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
1534 if (ocfs2_dx_root_inline(dx_root)) {
1535 ocfs2_dx_inline_root_insert(dir, handle,
1536 &lookup->dl_hinfo,
1537 lookup->dl_leaf_bh->b_blocknr,
1538 dx_root);
1539 } else {
1540 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
1541 lookup->dl_leaf_bh->b_blocknr,
1542 lookup->dl_dx_leaf_bh);
1543 if (ret)
1544 goto out;
1545 }
1546
1547 le32_add_cpu(&dx_root->dr_num_entries, 1);
1548 ocfs2_journal_dirty(handle, dx_root_bh);
1549
1550out:
1551 return ret;
1552}
1553
1554static void ocfs2_remove_block_from_free_list(struct inode *dir,
1555 handle_t *handle,
1556 struct ocfs2_dir_lookup_result *lookup)
1557{
1558 struct ocfs2_dir_block_trailer *trailer, *prev;
1559 struct ocfs2_dx_root_block *dx_root;
1560 struct buffer_head *bh;
1561
1562 trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
1563
1564 if (ocfs2_free_list_at_root(lookup)) {
1565 bh = lookup->dl_dx_root_bh;
1566 dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
1567 dx_root->dr_free_blk = trailer->db_free_next;
1568 } else {
1569 bh = lookup->dl_prev_leaf_bh;
1570 prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
1571 prev->db_free_next = trailer->db_free_next;
1572 }
1573
1574 trailer->db_free_rec_len = cpu_to_le16(0);
1575 trailer->db_free_next = cpu_to_le64(0);
1576
1577 ocfs2_journal_dirty(handle, bh);
1578 ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
1579}
1580
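The free list is a singly linked chain threaded through each block's trailer (db_free_next), with its head in the dx root (dr_free_blk); the two branches above are just the "unlink head" and "unlink middle" cases of one splice. A host-side sketch with invented types:

    #include <stdint.h>

    struct fnode { uint64_t next; };

    /* 'plink' is whichever field currently points at 'victim': the
     * list head or the previous node's next field - exactly the two
     * cases ocfs2_free_list_at_root() distinguishes above. */
    static void free_list_unlink(uint64_t *plink, struct fnode *victim)
    {
            *plink = victim->next;
            victim->next = 0;
    }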
1581/*
1582 * This expects that a journal write has been reserved on
1583 * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh.
1584 */
1585static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
1586 struct ocfs2_dir_lookup_result *lookup)
1587{
1588 int max_rec_len;
1589 struct ocfs2_dir_block_trailer *trailer;
1590
1591 /* Walk dl_leaf_bh to figure out what the new free rec_len is. */
1592 max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
1593 if (max_rec_len) {
1594 /*
1595 * There's still room in this block, so no need to remove it
1596 * from the free list. In this case, we just want to update
1597 * the rec len accounting.
1598 */
1599 trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
1600 trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
1601 ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
1602 } else {
1603 ocfs2_remove_block_from_free_list(dir, handle, lookup);
1604 }
1605}
1606
666/* we don't always have a dentry for what we want to add, so people 1607/* we don't always have a dentry for what we want to add, so people
667 * like orphan dir can call this instead. 1608 * like orphan dir can call this instead.
668 * 1609 *
669 * If you pass me insert_bh, I'll skip the search of the other dir 1610 * The lookup context must have been filled from
670 * blocks and put the record in there. 1611 * ocfs2_prepare_dir_for_insert.
671 */ 1612 */
672int __ocfs2_add_entry(handle_t *handle, 1613int __ocfs2_add_entry(handle_t *handle,
673 struct inode *dir, 1614 struct inode *dir,
674 const char *name, int namelen, 1615 const char *name, int namelen,
675 struct inode *inode, u64 blkno, 1616 struct inode *inode, u64 blkno,
676 struct buffer_head *parent_fe_bh, 1617 struct buffer_head *parent_fe_bh,
677 struct buffer_head *insert_bh) 1618 struct ocfs2_dir_lookup_result *lookup)
678{ 1619{
679 unsigned long offset; 1620 unsigned long offset;
680 unsigned short rec_len; 1621 unsigned short rec_len;
@@ -683,6 +1624,7 @@ int __ocfs2_add_entry(handle_t *handle,
683 struct super_block *sb = dir->i_sb; 1624 struct super_block *sb = dir->i_sb;
684 int retval, status; 1625 int retval, status;
685 unsigned int size = sb->s_blocksize; 1626 unsigned int size = sb->s_blocksize;
1627 struct buffer_head *insert_bh = lookup->dl_leaf_bh;
686 char *data_start = insert_bh->b_data; 1628 char *data_start = insert_bh->b_data;
687 1629
688 mlog_entry_void(); 1630 mlog_entry_void();
@@ -690,7 +1632,31 @@ int __ocfs2_add_entry(handle_t *handle,
690 if (!namelen) 1632 if (!namelen)
691 return -EINVAL; 1633 return -EINVAL;
692 1634
693 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1635 if (ocfs2_dir_indexed(dir)) {
1636 struct buffer_head *bh;
1637
1638 /*
1639 * An indexed dir may require that we update the free space
1640 * list. Reserve a write to the previous node in the list so
1641 * that we don't fail later.
1642 *
1643 * XXX: This can be either a dx_root_block, or an unindexed
1644 * directory tree leaf block.
1645 */
1646 if (ocfs2_free_list_at_root(lookup)) {
1647 bh = lookup->dl_dx_root_bh;
1648 retval = ocfs2_journal_access_dr(handle, dir, bh,
1649 OCFS2_JOURNAL_ACCESS_WRITE);
1650 } else {
1651 bh = lookup->dl_prev_leaf_bh;
1652 retval = ocfs2_journal_access_db(handle, dir, bh,
1653 OCFS2_JOURNAL_ACCESS_WRITE);
1654 }
1655 if (retval) {
1656 mlog_errno(retval);
1657 return retval;
1658 }
1659 } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
694 data_start = di->id2.i_data.id_data; 1660 data_start = di->id2.i_data.id_data;
695 size = i_size_read(dir); 1661 size = i_size_read(dir);
696 1662
@@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle,
737 status = ocfs2_journal_access_di(handle, dir, 1703 status = ocfs2_journal_access_di(handle, dir,
738 insert_bh, 1704 insert_bh,
739 OCFS2_JOURNAL_ACCESS_WRITE); 1705 OCFS2_JOURNAL_ACCESS_WRITE);
740 else 1706 else {
741 status = ocfs2_journal_access_db(handle, dir, 1707 status = ocfs2_journal_access_db(handle, dir,
742 insert_bh, 1708 insert_bh,
743 OCFS2_JOURNAL_ACCESS_WRITE); 1709 OCFS2_JOURNAL_ACCESS_WRITE);
1710
1711 if (ocfs2_dir_indexed(dir)) {
1712 status = ocfs2_dx_dir_insert(dir,
1713 handle,
1714 lookup);
1715 if (status) {
1716 mlog_errno(status);
1717 goto bail;
1718 }
1719 }
1720 }
1721
744 /* By now the buffer is marked for journaling */ 1722 /* By now the buffer is marked for journaling */
745 offset += le16_to_cpu(de->rec_len); 1723 offset += le16_to_cpu(de->rec_len);
746 if (le64_to_cpu(de->inode)) { 1724 if (le64_to_cpu(de->inode)) {
@@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle,
761 de->name_len = namelen; 1739 de->name_len = namelen;
762 memcpy(de->name, name, namelen); 1740 memcpy(de->name, name, namelen);
763 1741
1742 if (ocfs2_dir_indexed(dir))
1743 ocfs2_recalc_free_list(dir, handle, lookup);
1744
764 dir->i_version++; 1745 dir->i_version++;
765 status = ocfs2_journal_dirty(handle, insert_bh); 1746 status = ocfs2_journal_dirty(handle, insert_bh);
766 retval = 0; 1747 retval = 0;
@@ -870,6 +1851,10 @@ out:
870 return 0; 1851 return 0;
871} 1852}
872 1853
1854/*
1855 * NOTE: This function can be called against unindexed directories,
1856 * and indexed ones.
1857 */
873static int ocfs2_dir_foreach_blk_el(struct inode *inode, 1858static int ocfs2_dir_foreach_blk_el(struct inode *inode,
874 u64 *f_version, 1859 u64 *f_version,
875 loff_t *f_pos, void *priv, 1860 loff_t *f_pos, void *priv,
@@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name,
1071 int namelen, 2056 int namelen,
1072 u64 *blkno, 2057 u64 *blkno,
1073 struct inode *inode, 2058 struct inode *inode,
1074 struct buffer_head **dirent_bh, 2059 struct ocfs2_dir_lookup_result *lookup)
1075 struct ocfs2_dir_entry **dirent)
1076{ 2060{
1077 int status = -ENOENT; 2061 int status = -ENOENT;
1078 2062
1079 mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n", 2063 mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
1080 namelen, name, blkno, inode, dirent_bh, dirent); 2064 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1081 2065
1082 *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); 2066 status = ocfs2_find_entry(name, namelen, inode, lookup);
1083 if (!*dirent_bh || !*dirent) { 2067 if (status)
1084 status = -ENOENT;
1085 goto leave; 2068 goto leave;
1086 }
1087 2069
1088 *blkno = le64_to_cpu((*dirent)->inode); 2070 *blkno = le64_to_cpu(lookup->dl_entry->inode);
1089 2071
1090 status = 0; 2072 status = 0;
1091leave: 2073leave:
1092 if (status < 0) {
1093 *dirent = NULL;
1094 brelse(*dirent_bh);
1095 *dirent_bh = NULL;
1096 }
1097 2074
1098 mlog_exit(status);
1099 return status; 2075 return status;
1100} 2076}
1101 2077
@@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
1107 int namelen, u64 *blkno) 2083 int namelen, u64 *blkno)
1108{ 2084{
1109 int ret; 2085 int ret;
1110 struct buffer_head *bh = NULL; 2086 struct ocfs2_dir_lookup_result lookup = { NULL, };
1111 struct ocfs2_dir_entry *dirent = NULL;
1112 2087
1113 ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent); 2088 ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
1114 brelse(bh); 2089 ocfs2_free_dir_lookup_result(&lookup);
1115 2090
1116 return ret; 2091 return ret;
1117} 2092}
@@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
1128 int namelen) 2103 int namelen)
1129{ 2104{
1130 int ret; 2105 int ret;
1131 struct buffer_head *dirent_bh = NULL; 2106 struct ocfs2_dir_lookup_result lookup = { NULL, };
1132 struct ocfs2_dir_entry *dirent = NULL;
1133 2107
1134 mlog_entry("dir %llu, name '%.*s'\n", 2108 mlog_entry("dir %llu, name '%.*s'\n",
1135 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); 2109 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
1136 2110
1137 ret = -EEXIST; 2111 ret = -EEXIST;
1138 dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); 2112 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
1139 if (dirent_bh)
1140 goto bail; 2113 goto bail;
1141 2114
1142 ret = 0; 2115 ret = 0;
1143bail: 2116bail:
1144 brelse(dirent_bh); 2117 ocfs2_free_dir_lookup_result(&lookup);
1145 2118
1146 mlog_exit(ret); 2119 mlog_exit(ret);
1147 return ret; 2120 return ret;
@@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv {
1151 unsigned seen_dot; 2124 unsigned seen_dot;
1152 unsigned seen_dot_dot; 2125 unsigned seen_dot_dot;
1153 unsigned seen_other; 2126 unsigned seen_other;
2127 unsigned dx_dir;
1154}; 2128};
1155static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, 2129static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
1156 loff_t pos, u64 ino, unsigned type) 2130 loff_t pos, u64 ino, unsigned type)
@@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
1160 /* 2134 /*
1161 * Check the positions of "." and ".." records to be sure 2135 * Check the positions of "." and ".." records to be sure
1162 * they're in the correct place. 2136 * they're in the correct place.
2137 *
2138 * Indexed directories don't need to proceed past the first
2139 * two entries, so we end the scan after seeing '..'. Despite
2140 * that, we allow the scan to proceed in the event that we
2141 * have a corrupted indexed directory (no dot or dot dot
2142 * entries). This allows us to double check for existing
2143 * entries which might not have been found in the index.
1163 */ 2144 */
1164 if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { 2145 if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
1165 p->seen_dot = 1; 2146 p->seen_dot = 1;
@@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
1169 if (name_len == 2 && !strncmp("..", name, 2) && 2150 if (name_len == 2 && !strncmp("..", name, 2) &&
1170 pos == OCFS2_DIR_REC_LEN(1)) { 2151 pos == OCFS2_DIR_REC_LEN(1)) {
1171 p->seen_dot_dot = 1; 2152 p->seen_dot_dot = 1;
2153
2154 if (p->dx_dir && p->seen_dot)
2155 return 1;
2156
1172 return 0; 2157 return 0;
1173 } 2158 }
1174 2159
1175 p->seen_other = 1; 2160 p->seen_other = 1;
1176 return 1; 2161 return 1;
1177} 2162}
2163
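As elsewhere in this file, the filldir callback contract is assumed to be "return nonzero to stop the walk"; returning 1 after '..' on an indexed directory is what lets the empty-dir scan finish after two entries. A toy walk under that assumption:

    #include <string.h>

    typedef int (*filldir_t)(void *priv, const char *name, int name_len);

    /* Returns 1 if the callback stopped the scan early, 0 otherwise. */
    static int walk(const char **names, int n, filldir_t cb, void *priv)
    {
            int i;

            for (i = 0; i < n; i++)
                    if (cb(priv, names[i], (int)strlen(names[i])))
                            return 1;
            return 0;
    }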
2164static int ocfs2_empty_dir_dx(struct inode *inode,
2165 struct ocfs2_empty_dir_priv *priv)
2166{
2167 int ret;
2168 struct buffer_head *di_bh = NULL;
2169 struct buffer_head *dx_root_bh = NULL;
2170 struct ocfs2_dinode *di;
2171 struct ocfs2_dx_root_block *dx_root;
2172
2173 priv->dx_dir = 1;
2174
2175 ret = ocfs2_read_inode_block(inode, &di_bh);
2176 if (ret) {
2177 mlog_errno(ret);
2178 goto out;
2179 }
2180 di = (struct ocfs2_dinode *)di_bh->b_data;
2181
2182 ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
2183 if (ret) {
2184 mlog_errno(ret);
2185 goto out;
2186 }
2187 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2188
2189 if (le32_to_cpu(dx_root->dr_num_entries) != 2)
2190 priv->seen_other = 1;
2191
2192out:
2193 brelse(di_bh);
2194 brelse(dx_root_bh);
2195 return ret;
2196}
2197
1178/* 2198/*
1179 * routine to check that the specified directory is empty (for rmdir) 2199 * routine to check that the specified directory is empty (for rmdir)
1180 * 2200 *
1181 * Returns 1 if dir is empty, zero otherwise. 2201 * Returns 1 if dir is empty, zero otherwise.
2202 *
2203 * XXX: This is a performance problem for unindexed directories.
1182 */ 2204 */
1183int ocfs2_empty_dir(struct inode *inode) 2205int ocfs2_empty_dir(struct inode *inode)
1184{ 2206{
@@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode)
1188 2210
1189 memset(&priv, 0, sizeof(priv)); 2211 memset(&priv, 0, sizeof(priv));
1190 2212
2213 if (ocfs2_dir_indexed(inode)) {
2214 ret = ocfs2_empty_dir_dx(inode, &priv);
2215 if (ret)
2216 mlog_errno(ret);
2217 /*
2218 * We still run ocfs2_dir_foreach to get the checks
2219 * for "." and "..".
2220 */
2221 }
2222
1191 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); 2223 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
1192 if (ret) 2224 if (ret)
1193 mlog_errno(ret); 2225 mlog_errno(ret);
@@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1280 struct inode *parent, 2312 struct inode *parent,
1281 struct inode *inode, 2313 struct inode *inode,
1282 struct buffer_head *fe_bh, 2314 struct buffer_head *fe_bh,
1283 struct ocfs2_alloc_context *data_ac) 2315 struct ocfs2_alloc_context *data_ac,
2316 struct buffer_head **ret_new_bh)
1284{ 2317{
1285 int status; 2318 int status;
1286 unsigned int size = osb->sb->s_blocksize; 2319 unsigned int size = osb->sb->s_blocksize;
@@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1289 2322
1290 mlog_entry_void(); 2323 mlog_entry_void();
1291 2324
1292 if (ocfs2_supports_dir_trailer(osb)) 2325 if (ocfs2_new_dir_wants_trailer(inode))
1293 size = ocfs2_dir_trailer_blk_off(parent->i_sb); 2326 size = ocfs2_dir_trailer_blk_off(parent->i_sb);
1294 2327
1295 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, 2328 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
@@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1310 memset(new_bh->b_data, 0, osb->sb->s_blocksize); 2343 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
1311 2344
1312 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); 2345 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
1313 if (ocfs2_supports_dir_trailer(osb)) 2346 if (ocfs2_new_dir_wants_trailer(inode)) {
1314 ocfs2_init_dir_trailer(inode, new_bh); 2347 int size = le16_to_cpu(de->rec_len);
2348
2349 /*
2350 * Figure out the size of the hole left over after
2351 * insertion of '.' and '..'. The trailer wants this
2352 * information.
2353 */
2354 size -= OCFS2_DIR_REC_LEN(2);
2355 size -= sizeof(struct ocfs2_dir_block_trailer);
2356
2357 ocfs2_init_dir_trailer(inode, new_bh, size);
2358 }
1315 2359
1316 status = ocfs2_journal_dirty(handle, new_bh); 2360 status = ocfs2_journal_dirty(handle, new_bh);
1317 if (status < 0) { 2361 if (status < 0) {
@@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1329 } 2373 }
1330 2374
1331 status = 0; 2375 status = 0;
2376 if (ret_new_bh) {
2377 *ret_new_bh = new_bh;
2378 new_bh = NULL;
2379 }
1332bail: 2380bail:
1333 brelse(new_bh); 2381 brelse(new_bh);
1334 2382
@@ -1336,20 +2384,427 @@ bail:
1336 return status; 2384 return status;
1337} 2385}
1338 2386
2387static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2388 handle_t *handle, struct inode *dir,
2389 struct buffer_head *di_bh,
2390 struct buffer_head *dirdata_bh,
2391 struct ocfs2_alloc_context *meta_ac,
2392 int dx_inline, u32 num_entries,
2393 struct buffer_head **ret_dx_root_bh)
2394{
2395 int ret;
2396 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2397 u16 dr_suballoc_bit;
2398 u64 dr_blkno;
2399 unsigned int num_bits;
2400 struct buffer_head *dx_root_bh = NULL;
2401 struct ocfs2_dx_root_block *dx_root;
2402 struct ocfs2_dir_block_trailer *trailer =
2403 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2404
2405 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
2406 &num_bits, &dr_blkno);
2407 if (ret) {
2408 mlog_errno(ret);
2409 goto out;
2410 }
2411
2412 mlog(0, "Dir %llu, attach new index block: %llu\n",
2413 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2414 (unsigned long long)dr_blkno);
2415
2416 dx_root_bh = sb_getblk(osb->sb, dr_blkno);
2417 if (dx_root_bh == NULL) {
2418 ret = -EIO;
2419 goto out;
2420 }
2421 ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
2422
2423 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
2424 OCFS2_JOURNAL_ACCESS_CREATE);
2425 if (ret < 0) {
2426 mlog_errno(ret);
2427 goto out;
2428 }
2429
2430 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2431 memset(dx_root, 0, osb->sb->s_blocksize);
2432 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2433 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
2434 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2435 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2436 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
2437 dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
2438 dx_root->dr_num_entries = cpu_to_le32(num_entries);
2439 if (le16_to_cpu(trailer->db_free_rec_len))
2440 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
2441 else
2442 dx_root->dr_free_blk = cpu_to_le64(0);
2443
2444 if (dx_inline) {
2445 dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
2446 dx_root->dr_entries.de_count =
2447 cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
2448 } else {
2449 dx_root->dr_list.l_count =
2450 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2451 }
2452
2453 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2454 if (ret)
2455 mlog_errno(ret);
2456
2457 ret = ocfs2_journal_access_di(handle, dir, di_bh,
2458 OCFS2_JOURNAL_ACCESS_CREATE);
2459 if (ret) {
2460 mlog_errno(ret);
2461 goto out;
2462 }
2463
2464 di->i_dx_root = cpu_to_le64(dr_blkno);
2465
2466 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2467 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2468
2469 ret = ocfs2_journal_dirty(handle, di_bh);
2470 if (ret)
2471 mlog_errno(ret);
2472
2473 *ret_dx_root_bh = dx_root_bh;
2474 dx_root_bh = NULL;
2475
2476out:
2477 brelse(dx_root_bh);
2478 return ret;
2479}
2480
2481static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2482 handle_t *handle, struct inode *dir,
2483 struct buffer_head **dx_leaves,
2484 int num_dx_leaves, u64 start_blk)
2485{
2486 int ret, i;
2487 struct ocfs2_dx_leaf *dx_leaf;
2488 struct buffer_head *bh;
2489
2490 for (i = 0; i < num_dx_leaves; i++) {
2491 bh = sb_getblk(osb->sb, start_blk + i);
2492 if (bh == NULL) {
2493 ret = -EIO;
2494 goto out;
2495 }
2496 dx_leaves[i] = bh;
2497
2498 ocfs2_set_new_buffer_uptodate(dir, bh);
2499
2500 ret = ocfs2_journal_access_dl(handle, dir, bh,
2501 OCFS2_JOURNAL_ACCESS_CREATE);
2502 if (ret < 0) {
2503 mlog_errno(ret);
2504 goto out;
2505 }
2506
2507 dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
2508
2509 memset(dx_leaf, 0, osb->sb->s_blocksize);
2510 strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
2511 dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
2512 dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
2513 dx_leaf->dl_list.de_count =
2514 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
2515
2516 mlog(0,
2517 "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
2518 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2519 (unsigned long long)bh->b_blocknr,
2520 le16_to_cpu(dx_leaf->dl_list.de_count));
2521
2522 ocfs2_journal_dirty(handle, bh);
2523 }
2524
2525 ret = 0;
2526out:
2527 return ret;
2528}
2529
2530/*
2531 * Allocates and formats a new cluster for use in an indexed dir
2532 * leaf. This version will not do the extent insert, so that it can be
2533 * used by operations which need careful ordering.
2534 */
2535static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2536 u32 cpos, handle_t *handle,
2537 struct ocfs2_alloc_context *data_ac,
2538 struct buffer_head **dx_leaves,
2539 int num_dx_leaves, u64 *ret_phys_blkno)
2540{
2541 int ret;
2542 u32 phys, num;
2543 u64 phys_blkno;
2544 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2545
2546 /*
2547 * XXX: For create, this should claim the cluster for the index
2548 * *before* the unindexed insert so that we have a better
2549 * chance of contiguity as the directory grows in number
2550 * of entries.
2551 */
2552 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
2553 if (ret) {
2554 mlog_errno(ret);
2555 goto out;
2556 }
2557
2558 /*
2559 * Format the new cluster first. That way, we're inserting
2560 * valid data.
2561 */
2562 phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
2563 ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
2564 num_dx_leaves, phys_blkno);
2565 if (ret) {
2566 mlog_errno(ret);
2567 goto out;
2568 }
2569
2570 *ret_phys_blkno = phys_blkno;
2571out:
2572 return ret;
2573}
2574
2575static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2576 struct ocfs2_extent_tree *et,
2577 u32 cpos, handle_t *handle,
2578 struct ocfs2_alloc_context *data_ac,
2579 struct ocfs2_alloc_context *meta_ac,
2580 struct buffer_head **dx_leaves,
2581 int num_dx_leaves)
2582{
2583 int ret;
2584 u64 phys_blkno;
2585 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2586
2587 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2588 num_dx_leaves, &phys_blkno);
2589 if (ret) {
2590 mlog_errno(ret);
2591 goto out;
2592 }
2593
2594 ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
2595 meta_ac);
2596 if (ret)
2597 mlog_errno(ret);
2598out:
2599 return ret;
2600}
2601
2602static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
2603 int *ret_num_leaves)
2604{
2605 int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
2606 struct buffer_head **dx_leaves;
2607
2608 dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
2609 GFP_NOFS);
2610 if (dx_leaves && ret_num_leaves)
2611 *ret_num_leaves = num_dx_leaves;
2612
2613 return dx_leaves;
2614}
2615
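A quick sizing example, assuming ocfs2_clusters_to_blocks(sb, 1) is simply clustersize / blocksize: with 4 KB blocks and a 1 MB cluster,

    num_dx_leaves = 1048576 / 4096 = 256

so the kcalloc() above allocates 256 buffer_head pointers and a name hash selects one of 256 leaf buckets within the index cluster.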
2616static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
2617 handle_t *handle,
2618 struct inode *parent,
2619 struct inode *inode,
2620 struct buffer_head *di_bh,
2621 struct ocfs2_alloc_context *data_ac,
2622 struct ocfs2_alloc_context *meta_ac)
2623{
2624 int ret;
2625 struct buffer_head *leaf_bh = NULL;
2626 struct buffer_head *dx_root_bh = NULL;
2627 struct ocfs2_dx_hinfo hinfo;
2628 struct ocfs2_dx_root_block *dx_root;
2629 struct ocfs2_dx_entry_list *entry_list;
2630
2631 /*
2632 * Our strategy is to create the directory as though it were
2633 * unindexed, then add the index block. This works with very
2634 * little complication since the state of a new directory is a
2635 * very well-known quantity.
2636 *
2637 * Essentially, we have two dirents ("." and "..") in the 1st
2638 * block that need indexing. These are easily inserted into
2639 * the index block.
2640 */
2641
2642 ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
2643 data_ac, &leaf_bh);
2644 if (ret) {
2645 mlog_errno(ret);
2646 goto out;
2647 }
2648
2649 ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
2650 meta_ac, 1, 2, &dx_root_bh);
2651 if (ret) {
2652 mlog_errno(ret);
2653 goto out;
2654 }
2655 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2656 entry_list = &dx_root->dr_entries;
2657
2658 /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
2659 ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
2660 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2661
2662 ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
2663 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2664
2665out:
2666 brelse(dx_root_bh);
2667 brelse(leaf_bh);
2668 return ret;
2669}
2670
1339int ocfs2_fill_new_dir(struct ocfs2_super *osb, 2671int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1340 handle_t *handle, 2672 handle_t *handle,
1341 struct inode *parent, 2673 struct inode *parent,
1342 struct inode *inode, 2674 struct inode *inode,
1343 struct buffer_head *fe_bh, 2675 struct buffer_head *fe_bh,
1344 struct ocfs2_alloc_context *data_ac) 2676 struct ocfs2_alloc_context *data_ac,
2677 struct ocfs2_alloc_context *meta_ac)
2678
1345{ 2679{
1346 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); 2680 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
1347 2681
1348 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2682 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1349 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); 2683 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
1350 2684
2685 if (ocfs2_supports_indexed_dirs(osb))
2686 return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
2687 data_ac, meta_ac);
2688
1351 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, 2689 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
1352 data_ac); 2690 data_ac, NULL);
2691}
2692
2693static int ocfs2_dx_dir_index_block(struct inode *dir,
2694 handle_t *handle,
2695 struct buffer_head **dx_leaves,
2696 int num_dx_leaves,
2697 u32 *num_dx_entries,
2698 struct buffer_head *dirent_bh)
2699{
2700 int ret = 0, namelen, i;
2701 char *de_buf, *limit;
2702 struct ocfs2_dir_entry *de;
2703 struct buffer_head *dx_leaf_bh;
2704 struct ocfs2_dx_hinfo hinfo;
2705 u64 dirent_blk = dirent_bh->b_blocknr;
2706
2707 de_buf = dirent_bh->b_data;
2708 limit = de_buf + dir->i_sb->s_blocksize;
2709
2710 while (de_buf < limit) {
2711 de = (struct ocfs2_dir_entry *)de_buf;
2712
2713 namelen = de->name_len;
2714 if (!namelen || !de->inode)
2715 goto inc;
2716
2717 ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
2718
2719 i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
2720 dx_leaf_bh = dx_leaves[i];
2721
2722 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
2723 dirent_blk, dx_leaf_bh);
2724 if (ret) {
2725 mlog_errno(ret);
2726 goto out;
2727 }
2728
2729 *num_dx_entries += 1;
2730
2731inc:
2732 de_buf += le16_to_cpu(de->rec_len);
2733 }
2734
2735out:
2736 return ret;
2737}
2738
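The leaf picked for each live dirent comes from ocfs2_dx_dir_hash_idx(); a hedged stand-in, assuming the bucket count (blocks per cluster) is a power of two so the minor hash can simply be masked:

    #include <stdint.h>

    static unsigned int hash_idx(uint32_t minor_hash, unsigned int num_leaves)
    {
            /* num_leaves must be a power of two for this to be uniform */
            return minor_hash & (num_leaves - 1);
    }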
2739/*
2740 * XXX: This expects dx_root_bh to already be part of the transaction.
2741 */
2742static void ocfs2_dx_dir_index_root_block(struct inode *dir,
2743 struct buffer_head *dx_root_bh,
2744 struct buffer_head *dirent_bh)
2745{
2746 char *de_buf, *limit;
2747 struct ocfs2_dx_root_block *dx_root;
2748 struct ocfs2_dir_entry *de;
2749 struct ocfs2_dx_hinfo hinfo;
2750 u64 dirent_blk = dirent_bh->b_blocknr;
2751
2752 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2753
2754 de_buf = dirent_bh->b_data;
2755 limit = de_buf + dir->i_sb->s_blocksize;
2756
2757 while (de_buf < limit) {
2758 de = (struct ocfs2_dir_entry *)de_buf;
2759
2760 if (!de->name_len || !de->inode)
2761 goto inc;
2762
2763 ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
2764
2765 mlog(0,
2766 "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
2767 (unsigned long long)dir->i_ino, hinfo.major_hash,
2768 hinfo.minor_hash,
2769 le16_to_cpu(dx_root->dr_entries.de_num_used),
2770 de->name_len, de->name);
2771
2772 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
2773 dirent_blk);
2774
2775 le32_add_cpu(&dx_root->dr_num_entries, 1);
2776inc:
2777 de_buf += le16_to_cpu(de->rec_len);
2778 }
2779}
2780
2781/*
2782 * Count the number of inline directory entries in di_bh and compare
2783 * them against the number of entries we can hold in an inline dx root
2784 * block.
2785 */
2786static int ocfs2_new_dx_should_be_inline(struct inode *dir,
2787 struct buffer_head *di_bh)
2788{
2789 int dirent_count = 0;
2790 char *de_buf, *limit;
2791 struct ocfs2_dir_entry *de;
2792 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2793
2794 de_buf = di->id2.i_data.id_data;
2795 limit = de_buf + i_size_read(dir);
2796
2797 while (de_buf < limit) {
2798 de = (struct ocfs2_dir_entry *)de_buf;
2799
2800 if (de->name_len && de->inode)
2801 dirent_count++;
2802
2803 de_buf += le16_to_cpu(de->rec_len);
2804 }
2805
2806 /* We are careful to leave room for one extra record. */
2807 return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
1353} 2808}
1354 2809
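A capacity example for the inline check, with purely illustrative numbers (a 16-byte ocfs2_dx_entry and roughly 128 bytes of dx root header in a 4 KB block):

    entries_per_root ~= (4096 - 128) / 16 = 248

so a directory counting fewer live dirents than that stays inline, and the strict "<" comparison keeps one slot spare for the insert that triggered the conversion.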
1355/* 2810/*
@@ -1358,18 +2813,26 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1358 * expansion from an inline directory to one with extents. The first dir block 2813 * expansion from an inline directory to one with extents. The first dir block
1359 * in that case is taken from the inline data portion of the inode block. 2814 * in that case is taken from the inline data portion of the inode block.
1360 * 2815 *
2816 * This will also return the largest amount of contiguous space for a dirent
2817 * in the block. That space is *not* necessarily in the last dirent, even after
2818 * expansion. The directory indexing code wants this value for free space
2819 * accounting. We do this here since we're already walking the entire dir
2820 * block.
2821 *
1361 * We add the dir trailer if this filesystem wants it. 2822 * We add the dir trailer if this filesystem wants it.
1362 */ 2823 */
1363static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, 2824static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1364 struct super_block *sb) 2825 struct inode *dir)
1365{ 2826{
2827 struct super_block *sb = dir->i_sb;
1366 struct ocfs2_dir_entry *de; 2828 struct ocfs2_dir_entry *de;
1367 struct ocfs2_dir_entry *prev_de; 2829 struct ocfs2_dir_entry *prev_de;
1368 char *de_buf, *limit; 2830 char *de_buf, *limit;
1369 unsigned int new_size = sb->s_blocksize; 2831 unsigned int new_size = sb->s_blocksize;
1370 unsigned int bytes; 2832 unsigned int bytes, this_hole;
2833 unsigned int largest_hole = 0;
1371 2834
1372 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 2835 if (ocfs2_new_dir_wants_trailer(dir))
1373 new_size = ocfs2_dir_trailer_blk_off(sb); 2836 new_size = ocfs2_dir_trailer_blk_off(sb);
1374 2837
1375 bytes = new_size - old_size; 2838 bytes = new_size - old_size;
@@ -1378,12 +2841,26 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1378 de_buf = start; 2841 de_buf = start;
1379 de = (struct ocfs2_dir_entry *)de_buf; 2842 de = (struct ocfs2_dir_entry *)de_buf;
1380 do { 2843 do {
2844 this_hole = ocfs2_figure_dirent_hole(de);
2845 if (this_hole > largest_hole)
2846 largest_hole = this_hole;
2847
1381 prev_de = de; 2848 prev_de = de;
1382 de_buf += le16_to_cpu(de->rec_len); 2849 de_buf += le16_to_cpu(de->rec_len);
1383 de = (struct ocfs2_dir_entry *)de_buf; 2850 de = (struct ocfs2_dir_entry *)de_buf;
1384 } while (de_buf < limit); 2851 } while (de_buf < limit);
1385 2852
1386 le16_add_cpu(&prev_de->rec_len, bytes); 2853 le16_add_cpu(&prev_de->rec_len, bytes);
2854
2855 /* We need to double check this after modification of the final
2856 * dirent. */
2857 this_hole = ocfs2_figure_dirent_hole(prev_de);
2858 if (this_hole > largest_hole)
2859 largest_hole = this_hole;
2860
2861 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
2862 return largest_hole;
2863 return 0;
1387} 2864}
1388 2865
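A worked instance of the stretch above, with illustrative sizes (4 KB block, and assuming a 64-byte dir block trailer): an inline directory of i_size 200 expands with old_size = 200 and new_size = ocfs2_dir_trailer_blk_off() = 4096 - 64 = 4032, so

    bytes = 4032 - 200 = 3832

is added to the final dirent's rec_len - and that enlarged record is typically the largest hole the function reports back for free-space accounting.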
1389/* 2866/*
@@ -1396,29 +2873,61 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1396 */ 2873 */
1397static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, 2874static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1398 unsigned int blocks_wanted, 2875 unsigned int blocks_wanted,
2876 struct ocfs2_dir_lookup_result *lookup,
1399 struct buffer_head **first_block_bh) 2877 struct buffer_head **first_block_bh)
1400{ 2878{
1401 u32 alloc, bit_off, len; 2879 u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
1402 struct super_block *sb = dir->i_sb; 2880 struct super_block *sb = dir->i_sb;
1403 int ret, credits = ocfs2_inline_to_extents_credits(sb); 2881 int ret, i, num_dx_leaves = 0, dx_inline = 0,
1404 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; 2882 credits = ocfs2_inline_to_extents_credits(sb);
2883 u64 dx_insert_blkno, blkno,
2884 bytes = blocks_wanted << sb->s_blocksize_bits;
1405 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 2885 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
1406 struct ocfs2_inode_info *oi = OCFS2_I(dir); 2886 struct ocfs2_inode_info *oi = OCFS2_I(dir);
1407 struct ocfs2_alloc_context *data_ac; 2887 struct ocfs2_alloc_context *data_ac;
2888 struct ocfs2_alloc_context *meta_ac = NULL;
1408 struct buffer_head *dirdata_bh = NULL; 2889 struct buffer_head *dirdata_bh = NULL;
2890 struct buffer_head *dx_root_bh = NULL;
2891 struct buffer_head **dx_leaves = NULL;
1409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2892 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1410 handle_t *handle; 2893 handle_t *handle;
1411 struct ocfs2_extent_tree et; 2894 struct ocfs2_extent_tree et;
1412 int did_quota = 0; 2895 struct ocfs2_extent_tree dx_et;
2896 int did_quota = 0, bytes_allocated = 0;
1413 2897
1414 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 2898 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1415 2899
1416 alloc = ocfs2_clusters_for_bytes(sb, bytes); 2900 alloc = ocfs2_clusters_for_bytes(sb, bytes);
2901 dx_alloc = 0;
2902
2903 if (ocfs2_supports_indexed_dirs(osb)) {
2904 credits += ocfs2_add_dir_index_credits(sb);
2905
2906 dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
2907 if (!dx_inline) {
2908 /* Add one more cluster for an index leaf */
2909 dx_alloc++;
2910 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
2911 &num_dx_leaves);
2912 if (!dx_leaves) {
2913 ret = -ENOMEM;
2914 mlog_errno(ret);
2915 goto out;
2916 }
2917 }
2918
2919 /* This gets us the dx_root */
2920 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
2921 if (ret) {
2922 mlog_errno(ret);
2923 goto out;
2924 }
2925 }
1417 2926
1418 /* 2927 /*
1419 * We should never need more than 2 clusters for this - 2928 * We should never need more than 2 clusters for the unindexed
1420 * maximum dirent size is far less than one block. In fact, 2929 * tree - maximum dirent size is far less than one block. In
1421 * the only time we'd need more than one cluster is if 2930 * fact, the only time we'd need more than one cluster is if
1422 * blocksize == clustersize and the dirent won't fit in the 2931 * blocksize == clustersize and the dirent won't fit in the
1423 * extra space that the expansion to a single block gives. As 2932 * extra space that the expansion to a single block gives. As
1424 * of today, that only happens on 4k/4k file systems. 2933 * of today, that only happens on 4k/4k file systems.
@@ -1435,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1435 2944
1436 /* 2945 /*
1437 * Prepare for worst case allocation scenario of two separate 2946 * Prepare for worst case allocation scenario of two separate
1438 * extents. 2947 * extents in the unindexed tree.
1439 */ 2948 */
1440 if (alloc == 2) 2949 if (alloc == 2)
1441 credits += OCFS2_SUBALLOC_ALLOC; 2950 credits += OCFS2_SUBALLOC_ALLOC;
@@ -1448,11 +2957,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1448 } 2957 }
1449 2958
1450 if (vfs_dq_alloc_space_nodirty(dir, 2959 if (vfs_dq_alloc_space_nodirty(dir,
1451 ocfs2_clusters_to_bytes(osb->sb, alloc))) { 2960 ocfs2_clusters_to_bytes(osb->sb,
2961 alloc + dx_alloc))) {
1452 ret = -EDQUOT; 2962 ret = -EDQUOT;
1453 goto out_commit; 2963 goto out_commit;
1454 } 2964 }
1455 did_quota = 1; 2965 did_quota = 1;
2966
2967 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
2968 /*
2969 * Allocate our index cluster first, to maximize the
2970 * possibility that unindexed leaves grow
2971 * contiguously.
2972 */
2973 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
2974 dx_leaves, num_dx_leaves,
2975 &dx_insert_blkno);
2976 if (ret) {
2977 mlog_errno(ret);
2978 goto out_commit;
2979 }
2980 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2981 }
2982
1456 /* 2983 /*
1457 * Try to claim as many clusters as the bitmap can give though 2984 * Try to claim as many clusters as the bitmap can give though
1458 * if we only get one now, that's enough to continue. The rest 2985 * if we only get one now, that's enough to continue. The rest
@@ -1463,6 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1463 mlog_errno(ret); 2990 mlog_errno(ret);
1464 goto out_commit; 2991 goto out_commit;
1465 } 2992 }
2993 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1466 2994
1467 /* 2995 /*
1468 * Operations are carefully ordered so that we set up the new 2996 * Operations are carefully ordered so that we set up the new
@@ -1489,9 +3017,16 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1489 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); 3017 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
1490 memset(dirdata_bh->b_data + i_size_read(dir), 0, 3018 memset(dirdata_bh->b_data + i_size_read(dir), 0,
1491 sb->s_blocksize - i_size_read(dir)); 3019 sb->s_blocksize - i_size_read(dir));
1492 ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb); 3020 i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
1493 if (ocfs2_supports_dir_trailer(osb)) 3021 if (ocfs2_new_dir_wants_trailer(dir)) {
1494 ocfs2_init_dir_trailer(dir, dirdata_bh); 3022 /*
3023 * Prepare the dir trailer up front. It will otherwise look
3024 * like a valid dirent. Even if inserting the index fails
3025 * (unlikely), all we'll have done is give the first dir
3026 * block a small amount of fragmentation.
3027 */
3028 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3029 }
1495 3030
1496 ret = ocfs2_journal_dirty(handle, dirdata_bh); 3031 ret = ocfs2_journal_dirty(handle, dirdata_bh);
1497 if (ret) { 3032 if (ret) {
@@ -1499,6 +3034,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1499 goto out_commit; 3034 goto out_commit;
1500 } 3035 }
1501 3036
3037 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3038 /*
3039 * Dx dirs with an external cluster need to do this up
3040 * front. Inline dx roots get handled later, after
3041 * we've allocated our root block. We get passed back
3042 * a total number of items so that dr_num_entries can
3043 * be correctly set once the dx_root has been
3044 * allocated.
3045 */
3046 ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
3047 num_dx_leaves, &num_dx_entries,
3048 dirdata_bh);
3049 if (ret) {
3050 mlog_errno(ret);
3051 goto out_commit;
3052 }
3053 }
3054
1502 /* 3055 /*
1503 * Set extent, i_size, etc on the directory. After this, the 3056 * Set extent, i_size, etc on the directory. After this, the
1504 * inode should contain the same exact dirents as before and 3057 * inode should contain the same exact dirents as before and
@@ -1551,6 +3104,27 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1551 goto out_commit; 3104 goto out_commit;
1552 } 3105 }
1553 3106
3107 if (ocfs2_supports_indexed_dirs(osb)) {
3108 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
3109 dirdata_bh, meta_ac, dx_inline,
3110 num_dx_entries, &dx_root_bh);
3111 if (ret) {
3112 mlog_errno(ret);
3113 goto out_commit;
3114 }
3115
3116 if (dx_inline) {
3117 ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
3118 dirdata_bh);
3119 } else {
3120 ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
3121 ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
3122 dx_insert_blkno, 1, 0, NULL);
3123 if (ret)
3124 mlog_errno(ret);
3125 }
3126 }
3127
1554 /* 3128 /*
1555 * We asked for two clusters, but only got one in the 1st 3129 * We asked for two clusters, but only got one in the 1st
1556 * pass. Claim the 2nd cluster as a separate extent. 3130 * pass. Claim the 2nd cluster as a separate extent.
@@ -1570,15 +3144,32 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1570 mlog_errno(ret); 3144 mlog_errno(ret);
1571 goto out_commit; 3145 goto out_commit;
1572 } 3146 }
3147 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
1573 } 3148 }
1574 3149
1575 *first_block_bh = dirdata_bh; 3150 *first_block_bh = dirdata_bh;
1576 dirdata_bh = NULL; 3151 dirdata_bh = NULL;
3152 if (ocfs2_supports_indexed_dirs(osb)) {
3153 unsigned int off;
3154
3155 if (!dx_inline) {
3156 /*
3157 * We need to return the correct block within the
3158 * cluster which should hold our entry.
3159 */
3160 off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
3161 &lookup->dl_hinfo);
3162 get_bh(dx_leaves[off]);
3163 lookup->dl_dx_leaf_bh = dx_leaves[off];
3164 }
3165 lookup->dl_dx_root_bh = dx_root_bh;
3166 dx_root_bh = NULL;
3167 }
1577 3168
1578out_commit: 3169out_commit:
1579 if (ret < 0 && did_quota) 3170 if (ret < 0 && did_quota)
1580 vfs_dq_free_space_nodirty(dir, 3171 vfs_dq_free_space_nodirty(dir, bytes_allocated);
1581 ocfs2_clusters_to_bytes(osb->sb, 2)); 3172
1582 ocfs2_commit_trans(osb, handle); 3173 ocfs2_commit_trans(osb, handle);
1583 3174
1584out_sem: 3175out_sem:
@@ -1587,8 +3178,17 @@ out_sem:
1587out: 3178out:
1588 if (data_ac) 3179 if (data_ac)
1589 ocfs2_free_alloc_context(data_ac); 3180 ocfs2_free_alloc_context(data_ac);
3181 if (meta_ac)
3182 ocfs2_free_alloc_context(meta_ac);
3183
3184 if (dx_leaves) {
3185 for (i = 0; i < num_dx_leaves; i++)
3186 brelse(dx_leaves[i]);
3187 kfree(dx_leaves);
3188 }
1590 3189
1591 brelse(dirdata_bh); 3190 brelse(dirdata_bh);
3191 brelse(dx_root_bh);
1592 3192
1593 return ret; 3193 return ret;
1594} 3194}
@@ -1658,11 +3258,14 @@ bail:
1658 * is to be turned into an extent based one. The size of the dirent to 3258 * is to be turned into an extent based one. The size of the dirent to
1659 * insert might be larger than the space gained by growing to just one 3259 * insert might be larger than the space gained by growing to just one
1660 * block, so we may have to grow the inode by two blocks in that case. 3260 * block, so we may have to grow the inode by two blocks in that case.
3261 *
3262 * If the directory is already indexed, dx_root_bh must be provided.
1661 */ 3263 */
1662static int ocfs2_extend_dir(struct ocfs2_super *osb, 3264static int ocfs2_extend_dir(struct ocfs2_super *osb,
1663 struct inode *dir, 3265 struct inode *dir,
1664 struct buffer_head *parent_fe_bh, 3266 struct buffer_head *parent_fe_bh,
1665 unsigned int blocks_wanted, 3267 unsigned int blocks_wanted,
3268 struct ocfs2_dir_lookup_result *lookup,
1666 struct buffer_head **new_de_bh) 3269 struct buffer_head **new_de_bh)
1667{ 3270{
1668 int status = 0; 3271 int status = 0;
@@ -1677,17 +3280,29 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1677 struct ocfs2_dir_entry * de; 3280 struct ocfs2_dir_entry * de;
1678 struct super_block *sb = osb->sb; 3281 struct super_block *sb = osb->sb;
1679 struct ocfs2_extent_tree et; 3282 struct ocfs2_extent_tree et;
3283 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1680 3284
1681 mlog_entry_void(); 3285 mlog_entry_void();
1682 3286
1683 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 3287 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
3288 /*
3289 * This would be a code error as an inline directory should
3290 * never have an index root.
3291 */
3292 BUG_ON(dx_root_bh);
3293
1684 status = ocfs2_expand_inline_dir(dir, parent_fe_bh, 3294 status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
1685 blocks_wanted, &new_bh); 3295 blocks_wanted, lookup,
3296 &new_bh);
1686 if (status) { 3297 if (status) {
1687 mlog_errno(status); 3298 mlog_errno(status);
1688 goto bail; 3299 goto bail;
1689 } 3300 }
1690 3301
3302 /* Expansion from inline to an indexed directory will
3303 * have given us this. */
3304 dx_root_bh = lookup->dl_dx_root_bh;
3305
1691 if (blocks_wanted == 1) { 3306 if (blocks_wanted == 1) {
1692 /* 3307 /*
1693 * If the new dirent will fit inside the space 3308 * If the new dirent will fit inside the space
@@ -1751,6 +3366,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1751 } 3366 }
1752 3367
1753do_extend: 3368do_extend:
3369 if (ocfs2_dir_indexed(dir))
3370 credits++; /* For attaching the new dirent block to the
3371 * dx_root */
3372
1754 down_write(&OCFS2_I(dir)->ip_alloc_sem); 3373 down_write(&OCFS2_I(dir)->ip_alloc_sem);
1755 drop_alloc_sem = 1; 3374 drop_alloc_sem = 1;
1756 3375
@@ -1781,9 +3400,19 @@ do_extend:
1781 3400
1782 de = (struct ocfs2_dir_entry *) new_bh->b_data; 3401 de = (struct ocfs2_dir_entry *) new_bh->b_data;
1783 de->inode = 0; 3402 de->inode = 0;
1784 if (ocfs2_dir_has_trailer(dir)) { 3403 if (ocfs2_supports_dir_trailer(dir)) {
1785 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); 3404 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
1786 ocfs2_init_dir_trailer(dir, new_bh); 3405
3406 ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
3407
3408 if (ocfs2_dir_indexed(dir)) {
3409 status = ocfs2_dx_dir_link_trailer(dir, handle,
3410 dx_root_bh, new_bh);
3411 if (status) {
3412 mlog_errno(status);
3413 goto bail;
3414 }
3415 }
1787 } else { 3416 } else {
1788 de->rec_len = cpu_to_le16(sb->s_blocksize); 3417 de->rec_len = cpu_to_le16(sb->s_blocksize);
1789 } 3418 }
@@ -1839,7 +3468,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1839 * This calculates how many free bytes we'd have in block zero, should 3468 * This calculates how many free bytes we'd have in block zero, should
1840 * this function force expansion to an extent tree. 3469 * this function force expansion to an extent tree.
1841 */ 3470 */
1842 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) 3471 if (ocfs2_new_dir_wants_trailer(dir))
1843 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); 3472 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
1844 else 3473 else
1845 free_space = dir->i_sb->s_blocksize - i_size_read(dir); 3474 free_space = dir->i_sb->s_blocksize - i_size_read(dir);
@@ -1970,12 +3599,766 @@ bail:
1970 return status; 3599 return status;
1971} 3600}
1972 3601
3602static int dx_leaf_sort_cmp(const void *a, const void *b)
3603{
3604 const struct ocfs2_dx_entry *entry1 = a;
3605 const struct ocfs2_dx_entry *entry2 = b;
3606 u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
3607 u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
3608 u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
3609 u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
3610
3611 if (major_hash1 > major_hash2)
3612 return 1;
3613 if (major_hash1 < major_hash2)
3614 return -1;
3615
3616 /*
3617 * It is not strictly necessary to sort by minor hash.
3618 */
3619 if (minor_hash1 > minor_hash2)
3620 return 1;
3621 if (minor_hash1 < minor_hash2)
3622 return -1;
3623 return 0;
3624}
3625
3626static void dx_leaf_sort_swap(void *a, void *b, int size)
3627{
3628 struct ocfs2_dx_entry *entry1 = a;
3629 struct ocfs2_dx_entry *entry2 = b;
3630 struct ocfs2_dx_entry tmp;
3631
3632 BUG_ON(size != sizeof(*entry1));
3633
3634 tmp = *entry1;
3635 *entry1 = *entry2;
3636 *entry2 = tmp;
3637}
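
The comparator above orders index entries by major hash first and minor hash second, and the explicit swap callback exists because the kernel's sort() works on untyped byte ranges. A minimal userspace sketch of the same ordering, using qsort() and host-order fields in place of the on-disk little-endian ocfs2_dx_entry (struct name and values here are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Host-order stand-in for the on-disk ocfs2_dx_entry. */
struct dx_entry {
        uint32_t major_hash;
        uint32_t minor_hash;
};

/* Same ordering as dx_leaf_sort_cmp(): major hash first, minor second. */
static int dx_cmp(const void *a, const void *b)
{
        const struct dx_entry *e1 = a;
        const struct dx_entry *e2 = b;

        if (e1->major_hash != e2->major_hash)
                return e1->major_hash > e2->major_hash ? 1 : -1;
        if (e1->minor_hash != e2->minor_hash)
                return e1->minor_hash > e2->minor_hash ? 1 : -1;
        return 0;
}

int main(void)
{
        struct dx_entry leaf[] = { {7, 2}, {3, 9}, {7, 1}, {3, 4} };
        size_t i, n = sizeof(leaf) / sizeof(leaf[0]);

        qsort(leaf, n, sizeof(leaf[0]), dx_cmp);
        for (i = 0; i < n; i++)         /* prints 3.4 3.9 7.1 7.2 */
                printf("%u.%u\n", leaf[i].major_hash, leaf[i].minor_hash);
        return 0;
}
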
3638
3639static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
3640{
3641 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3642 int i, num = le16_to_cpu(dl_list->de_num_used);
3643
3644 for (i = 0; i < (num - 1); i++) {
3645 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
3646 le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
3647 return 0;
3648 }
3649
3650 return 1;
3651}
3652
3653/*
3654 * Find the optimal value to split this leaf on. This expects the leaf
3655 * entries to be in sorted order.
3656 *
3657 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3658 * the hash we want to insert.
3659 *
3660 * This function is only concerned with the major hash - that which
3661 * determines which cluster an item belongs to.
3662 */
3663static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3664 u32 leaf_cpos, u32 insert_hash,
3665 u32 *split_hash)
3666{
3667 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3668 int i, num_used = le16_to_cpu(dl_list->de_num_used);
3669 int allsame;
3670
3671 /*
3672 * There are a couple of rare but nasty corner cases we have to
3673 * check for here. All of them involve a leaf where all values
3674 * have the same hash, which is what we look for first.
3675 *
3676 * Most of the time, all of the above is false, and we simply
3677 * pick the median value for a split.
3678 */
3679 allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3680 if (allsame) {
3681 u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3682
3683 if (val == insert_hash) {
3684 /*
3685 * No matter where we would choose to split,
3686 * the new entry would want to occupy the same
3687 * block as these. Since there's no space left
3688 * in their existing block, we know there
3689 * won't be space after the split.
3690 */
3691 return -ENOSPC;
3692 }
3693
3694 if (val == leaf_cpos) {
3695 /*
3696 * Because val is the same as leaf_cpos (which
3697 * is the smallest value this leaf can have),
3698 * yet is not equal to insert_hash, then we
3699 * know that insert_hash *must* be larger than
3700 * val (and leaf_cpos). At least cpos+1 in value.
3701 *
3702 * We also know then, that there cannot be an
3703 * adjacent extent (otherwise we'd be looking
3704 * at it). Choosing this value gives us a
3705 * chance to get some contiguousness.
3706 */
3707 *split_hash = leaf_cpos + 1;
3708 return 0;
3709 }
3710
3711 if (val > insert_hash) {
3712 /*
3713 * val can not be the same as insert hash, and
3714 * also must be larger than leaf_cpos. Also,
3715 * we know that there can't be a leaf between
3716 * cpos and val, otherwise the entries with
3717 * hash 'val' would be there.
3718 */
3719 *split_hash = val;
3720 return 0;
3721 }
3722
3723 *split_hash = insert_hash;
3724 return 0;
3725 }
3726
3727 /*
3728 * Since the records are sorted and the checks above
3729 * guarantee that not all records in this block are the same,
3730 * we simply walk forward from the median and pick the first
3731 * record whose value is larger than leaf_cpos.
3732 */
3733 for (i = (num_used / 2); i < num_used; i++)
3734 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3735 leaf_cpos)
3736 break;
3737
3738 BUG_ON(i == num_used); /* Should be impossible */
3739 *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3740 return 0;
3741}
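
Stripped of the on-disk types, the split selection above is: if every entry shares one major hash, handle three degenerate cases (the insert collides, the value equals the leaf minimum, or the value exceeds the insert hash); otherwise scan from the median for the first value above leaf_cpos. A hedged userspace sketch over a sorted array of major hashes (the helper name and host-order values are illustrative):

#include <errno.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Pick a split point for a sorted array of major hashes, mirroring
 * ocfs2_dx_dir_find_leaf_split(). Returns -ENOSPC when the new entry
 * hashes into a full, single-valued leaf. Hypothetical helper; the
 * real code walks struct ocfs2_dx_entry_list instead of a bare array.
 */
static int find_leaf_split(const uint32_t *major, size_t num_used,
                           uint32_t leaf_cpos, uint32_t insert_hash,
                           uint32_t *split_hash)
{
        size_t i;
        int allsame = 1;

        for (i = 0; i + 1 < num_used; i++)
                if (major[i] != major[i + 1])
                        allsame = 0;

        if (allsame) {
                uint32_t val = major[0];

                if (val == insert_hash)
                        return -ENOSPC; /* no split can make room */
                if (val == leaf_cpos) {
                        *split_hash = leaf_cpos + 1;
                        return 0;
                }
                *split_hash = val > insert_hash ? val : insert_hash;
                return 0;
        }

        /* Scan from the median for the first value above leaf_cpos. */
        for (i = num_used / 2; i < num_used; i++)
                if (major[i] > leaf_cpos)
                        break;
        if (i == num_used)      /* the kernel BUG()s here: impossible */
                return -EIO;
        *split_hash = major[i];
        return 0;
}
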
3742
3743/*
3744 * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3745 * larger than split_hash into new_dx_leaves. We use a temporary
3746 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3747 *
3748 * Since the block offset inside a leaf (cluster) is a constant mask
3749 * of minor_hash, we can optimize: an item at block offset X within
3750 * the original cluster will be at offset X within the new cluster.
3751 */
3752static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3753 handle_t *handle,
3754 struct ocfs2_dx_leaf *tmp_dx_leaf,
3755 struct buffer_head **orig_dx_leaves,
3756 struct buffer_head **new_dx_leaves,
3757 int num_dx_leaves)
3758{
3759 int i, j, num_used;
3760 u32 major_hash;
3761 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3762 struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
3763 struct ocfs2_dx_entry *dx_entry;
3764
3765 tmp_list = &tmp_dx_leaf->dl_list;
3766
3767 for (i = 0; i < num_dx_leaves; i++) {
3768 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3769 orig_list = &orig_dx_leaf->dl_list;
3770 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3771 new_list = &new_dx_leaf->dl_list;
3772
3773 num_used = le16_to_cpu(orig_list->de_num_used);
3774
3775 memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3776 tmp_list->de_num_used = cpu_to_le16(0);
3777 memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3778
3779 for (j = 0; j < num_used; j++) {
3780 dx_entry = &orig_list->de_entries[j];
3781 major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3782 if (major_hash >= split_hash)
3783 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3784 dx_entry);
3785 else
3786 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3787 dx_entry);
3788 }
3789 memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3790
3791 ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3792 ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3793 }
3794}
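
Per block, the transfer above is a stable partition: entries whose major hash is at or above split_hash are appended to the new leaf, and the remainder is compacted through a scratch copy so the original never needs per-entry memmove(). A sketch of one block's partition step under simplified types (names and the fixed capacity are illustrative):

#include <stdint.h>
#include <string.h>

#define LEAF_CAP 64     /* illustrative per-leaf capacity */

struct leaf {
        uint16_t num_used;
        uint32_t major[LEAF_CAP];
};

static void leaf_insert_tail(struct leaf *l, uint32_t major)
{
        l->major[l->num_used++] = major;
}

/*
 * Move entries with major >= split_hash from orig into new_leaf
 * (assumed empty), compacting the rest through a scratch leaf, as
 * ocfs2_dx_dir_transfer_leaf() does with tmp_dx_leaf.
 */
static void transfer_leaf(struct leaf *orig, struct leaf *new_leaf,
                          struct leaf *tmp, uint32_t split_hash)
{
        uint16_t j, num_used = orig->num_used;

        memset(tmp, 0, sizeof(*tmp));
        for (j = 0; j < num_used; j++) {
                if (orig->major[j] >= split_hash)
                        leaf_insert_tail(new_leaf, orig->major[j]);
                else
                        leaf_insert_tail(tmp, orig->major[j]);
        }
        memcpy(orig, tmp, sizeof(*orig));
}
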
3795
3796static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3797 struct ocfs2_dx_root_block *dx_root)
3798{
3799 int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
3800
3801 credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
3802 credits += ocfs2_quota_trans_credits(osb->sb);
3803 return credits;
3804}
3805
3806/*
3807 * Find the median value in dx_leaf_bh and allocate a new leaf to move
3808 * half our entries into.
3809 */
3810static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3811 struct buffer_head *dx_root_bh,
3812 struct buffer_head *dx_leaf_bh,
3813 struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
3814 u64 leaf_blkno)
3815{
3816 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3817 int credits, ret, i, num_used, did_quota = 0;
3818 u32 cpos, split_hash, insert_hash = hinfo->major_hash;
3819 u64 orig_leaves_start;
3820 int num_dx_leaves;
3821 struct buffer_head **orig_dx_leaves = NULL;
3822 struct buffer_head **new_dx_leaves = NULL;
3823 struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
3824 struct ocfs2_extent_tree et;
3825 handle_t *handle = NULL;
3826 struct ocfs2_dx_root_block *dx_root;
3827 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3828
3829 mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
3830 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3831 (unsigned long long)leaf_blkno, insert_hash);
3832
3833 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
3834
3835 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3836 /*
3837 * XXX: This is a rather large limit. We should use a more
3838 * realistic value.
3839 */
3840 if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
3841 return -ENOSPC;
3842
3843 num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
3844 if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
3845 mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance non-full leaf: "
3846 "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
3847 (unsigned long long)leaf_blkno, num_used);
3848 ret = -EIO;
3849 goto out;
3850 }
3851
3852 orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
3853 if (!orig_dx_leaves) {
3854 ret = -ENOMEM;
3855 mlog_errno(ret);
3856 goto out;
3857 }
3858
3859 new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
3860 if (!new_dx_leaves) {
3861 ret = -ENOMEM;
3862 mlog_errno(ret);
3863 goto out;
3864 }
3865
3866 ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
3867 if (ret) {
3868 if (ret != -ENOSPC)
3869 mlog_errno(ret);
3870 goto out;
3871 }
3872
3873 credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
3874 handle = ocfs2_start_trans(osb, credits);
3875 if (IS_ERR(handle)) {
3876 ret = PTR_ERR(handle);
3877 handle = NULL;
3878 mlog_errno(ret);
3879 goto out;
3880 }
3881
3882 if (vfs_dq_alloc_space_nodirty(dir,
3883 ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
3884 ret = -EDQUOT;
3885 goto out_commit;
3886 }
3887 did_quota = 1;
3888
3889 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
3890 OCFS2_JOURNAL_ACCESS_WRITE);
3891 if (ret) {
3892 mlog_errno(ret);
3893 goto out_commit;
3894 }
3895
3896 /*
3897 * This block is changing anyway, so we can sort it in place.
3898 */
3899 sort(dx_leaf->dl_list.de_entries, num_used,
3900 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3901 dx_leaf_sort_swap);
3902
3903 ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
3904 if (ret) {
3905 mlog_errno(ret);
3906 goto out_commit;
3907 }
3908
3909 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3910 &split_hash);
3911 if (ret) {
3912 mlog_errno(ret);
3913 goto out_commit;
3914 }
3915
3916 mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
3917 leaf_cpos, split_hash, insert_hash);
3918
3919 /*
3920 * We have to carefully order operations here. There are items
3921 * which want to be in the new cluster before insert, but in
3922 * order to put those items in the new cluster, we alter the
3923 * old cluster. A failure to insert gets nasty.
3924 *
3925 * So, start by reserving writes to the old
3926 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
3927 * the new cluster for us, before inserting it. The insert
3928 * won't happen if there's an error before that. Once the
3929 * insert is done, we can then transfer from one leaf into the
3930 * other without fear of hitting any error.
3931 */
3932
3933 /*
3934 * The leaf transfer wants some scratch space so that we don't
3935 * wind up doing a bunch of expensive memmove().
3936 */
3937 tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
3938 if (!tmp_dx_leaf) {
3939 ret = -ENOMEM;
3940 mlog_errno(ret);
3941 goto out_commit;
3942 }
3943
3944 orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
3945 ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
3946 orig_dx_leaves);
3947 if (ret) {
3948 mlog_errno(ret);
3949 goto out_commit;
3950 }
3951
3952 for (i = 0; i < num_dx_leaves; i++) {
3953 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
3954 OCFS2_JOURNAL_ACCESS_WRITE);
3955 if (ret) {
3956 mlog_errno(ret);
3957 goto out_commit;
3958 }
3959 }
3960
3961 cpos = split_hash;
3962 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3963 data_ac, meta_ac, new_dx_leaves,
3964 num_dx_leaves);
3965 if (ret) {
3966 mlog_errno(ret);
3967 goto out_commit;
3968 }
3969
3970 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
3971 orig_dx_leaves, new_dx_leaves, num_dx_leaves);
3972
3973out_commit:
3974 if (ret < 0 && did_quota)
3975 vfs_dq_free_space_nodirty(dir,
3976 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3977
3978 ocfs2_commit_trans(osb, handle);
3979
3980out:
3981 if (orig_dx_leaves || new_dx_leaves) {
3982 for (i = 0; i < num_dx_leaves; i++) {
3983 if (orig_dx_leaves)
3984 brelse(orig_dx_leaves[i]);
3985 if (new_dx_leaves)
3986 brelse(new_dx_leaves[i]);
3987 }
3988 kfree(orig_dx_leaves);
3989 kfree(new_dx_leaves);
3990 }
3991
3992 if (meta_ac)
3993 ocfs2_free_alloc_context(meta_ac);
3994 if (data_ac)
3995 ocfs2_free_alloc_context(data_ac);
3996
3997 kfree(tmp_dx_leaf);
3998 return ret;
3999}
4000
4001static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
4002 struct buffer_head *di_bh,
4003 struct buffer_head *dx_root_bh,
4004 const char *name, int namelen,
4005 struct ocfs2_dir_lookup_result *lookup)
4006{
4007 int ret, rebalanced = 0;
4008 struct ocfs2_dx_root_block *dx_root;
4009 struct buffer_head *dx_leaf_bh = NULL;
4010 struct ocfs2_dx_leaf *dx_leaf;
4011 u64 blkno;
4012 u32 leaf_cpos;
4013
4014 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4015
4016restart_search:
4017 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
4018 &leaf_cpos, &blkno);
4019 if (ret) {
4020 mlog_errno(ret);
4021 goto out;
4022 }
4023
4024 ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
4025 if (ret) {
4026 mlog_errno(ret);
4027 goto out;
4028 }
4029
4030 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
4031
4032 if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
4033 le16_to_cpu(dx_leaf->dl_list.de_count)) {
4034 if (rebalanced) {
4035 /*
4036 * Rebalancing should have provided us with
4037 * space in an appropriate leaf.
4038 *
4039 * XXX: Is this an abnormal condition then?
4040 * Should we print a message here?
4041 */
4042 ret = -ENOSPC;
4043 goto out;
4044 }
4045
4046 ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
4047 &lookup->dl_hinfo, leaf_cpos,
4048 blkno);
4049 if (ret) {
4050 if (ret != -ENOSPC)
4051 mlog_errno(ret);
4052 goto out;
4053 }
4054
4055 /*
4056 * Restart the lookup. The rebalance might have
4057 * changed which block our item fits into. Mark our
4058 * progress, so we only execute this once.
4059 */
4060 brelse(dx_leaf_bh);
4061 dx_leaf_bh = NULL;
4062 rebalanced = 1;
4063 goto restart_search;
4064 }
4065
4066 lookup->dl_dx_leaf_bh = dx_leaf_bh;
4067 dx_leaf_bh = NULL;
4068
4069out:
4070 brelse(dx_leaf_bh);
4071 return ret;
4072}
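
The control flow above is a try-once-then-retry idiom: a full leaf triggers at most one rebalance, after which the hash lookup restarts because the target block may have moved. A generic sketch of the pattern (both callbacks are hypothetical):

#include <errno.h>

/*
 * Generic one-shot retry in the style of ocfs2_find_dir_space_dx():
 * look up a slot, and if the target is full, rebalance once and look
 * up again.
 */
static int find_space(int (*lookup)(void *ctx), int (*rebalance)(void *ctx),
                      void *ctx)
{
        int ret, rebalanced = 0;

restart:
        ret = lookup(ctx);
        if (ret != -ENOSPC)
                return ret;     /* success, or a hard error */

        if (rebalanced)
                return -ENOSPC; /* rebalance already ran once */

        ret = rebalance(ctx);
        if (ret)
                return ret;

        rebalanced = 1;
        goto restart;
}
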
4073
4074static int ocfs2_search_dx_free_list(struct inode *dir,
4075 struct buffer_head *dx_root_bh,
4076 int namelen,
4077 struct ocfs2_dir_lookup_result *lookup)
4078{
4079 int ret = -ENOSPC;
4080 struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
4081 struct ocfs2_dir_block_trailer *db;
4082 u64 next_block;
4083 int rec_len = OCFS2_DIR_REC_LEN(namelen);
4084 struct ocfs2_dx_root_block *dx_root;
4085
4086 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4087 next_block = le64_to_cpu(dx_root->dr_free_blk);
4088
4089 while (next_block) {
4090 brelse(prev_leaf_bh);
4091 prev_leaf_bh = leaf_bh;
4092 leaf_bh = NULL;
4093
4094 ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
4095 if (ret) {
4096 mlog_errno(ret);
4097 goto out;
4098 }
4099
4100 db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
4101 if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
4102 lookup->dl_leaf_bh = leaf_bh;
4103 lookup->dl_prev_leaf_bh = prev_leaf_bh;
4104 leaf_bh = NULL;
4105 prev_leaf_bh = NULL;
4106 break;
4107 }
4108
4109 next_block = le64_to_cpu(db->db_free_next);
4110 }
4111
4112 if (!next_block)
4113 ret = -ENOSPC;
4114
4115out:
4116
4117 brelse(leaf_bh);
4118 brelse(prev_leaf_bh);
4119 return ret;
4120}
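
The walk above keeps a trailing reference so the caller can later unlink the chosen block from the singly linked dr_free_blk chain. A userspace sketch of the same first-fit search that also reports the predecessor (types are illustrative):

#include <errno.h>
#include <stddef.h>

struct free_blk {
        struct free_blk *next;
        unsigned int free_rec_len;
};

/*
 * First-fit search that also reports the predecessor, as
 * ocfs2_search_dx_free_list() does with dl_prev_leaf_bh. A NULL
 * *prev means the match is the list head (the dx_root's dr_free_blk).
 */
static int search_free_list(struct free_blk *head, unsigned int rec_len,
                            struct free_blk **found, struct free_blk **prev)
{
        struct free_blk *p = NULL, *cur;

        for (cur = head; cur; p = cur, cur = cur->next) {
                if (rec_len <= cur->free_rec_len) {
                        *found = cur;
                        *prev = p;
                        return 0;
                }
        }
        return -ENOSPC;
}
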
4121
4122static int ocfs2_expand_inline_dx_root(struct inode *dir,
4123 struct buffer_head *dx_root_bh)
4124{
4125 int ret, num_dx_leaves, i, j, did_quota = 0;
4126 struct buffer_head **dx_leaves = NULL;
4127 struct ocfs2_extent_tree et;
4128 u64 insert_blkno;
4129 struct ocfs2_alloc_context *data_ac = NULL;
4130 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4131 handle_t *handle = NULL;
4132 struct ocfs2_dx_root_block *dx_root;
4133 struct ocfs2_dx_entry_list *entry_list;
4134 struct ocfs2_dx_entry *dx_entry;
4135 struct ocfs2_dx_leaf *target_leaf;
4136
4137 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
4138 if (ret) {
4139 mlog_errno(ret);
4140 goto out;
4141 }
4142
4143 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
4144 if (!dx_leaves) {
4145 ret = -ENOMEM;
4146 mlog_errno(ret);
4147 goto out;
4148 }
4149
4150 handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
4151 if (IS_ERR(handle)) {
4152 ret = PTR_ERR(handle);
4153 mlog_errno(ret);
4154 goto out;
4155 }
4156
4157 if (vfs_dq_alloc_space_nodirty(dir,
4158 ocfs2_clusters_to_bytes(osb->sb, 1))) {
4159 ret = -EDQUOT;
4160 goto out_commit;
4161 }
4162 did_quota = 1;
4163
4164 /*
4165 * We do this up front, before the allocation, so that a
4166 * failure to add the dx_root_bh to the journal won't result
4167 * in us losing clusters.
4168 */
4169 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
4170 OCFS2_JOURNAL_ACCESS_WRITE);
4171 if (ret) {
4172 mlog_errno(ret);
4173 goto out_commit;
4174 }
4175
4176 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
4177 num_dx_leaves, &insert_blkno);
4178 if (ret) {
4179 mlog_errno(ret);
4180 goto out_commit;
4181 }
4182
4183 /*
4184 * Transfer the entries from our dx_root into the appropriate
4185 * block
4186 */
4187 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4188 entry_list = &dx_root->dr_entries;
4189
4190 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
4191 dx_entry = &entry_list->de_entries[i];
4192
4193 j = __ocfs2_dx_dir_hash_idx(osb,
4194 le32_to_cpu(dx_entry->dx_minor_hash));
4195 target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
4196
4197 ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
4198
4199 /* Each leaf has been passed to the journal already
4200 * via __ocfs2_dx_dir_new_cluster() */
4201 }
4202
4203 dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
4204 memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
4205 offsetof(struct ocfs2_dx_root_block, dr_list));
4206 dx_root->dr_list.l_count =
4207 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
4208
4209 /* This should never fail considering we start with an empty
4210 * dx_root. */
4211 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
4212 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
4213 insert_blkno, 1, 0, NULL);
4214 if (ret)
4215 mlog_errno(ret);
4216 did_quota = 0;
4217
4218 ocfs2_journal_dirty(handle, dx_root_bh);
4219
4220out_commit:
4221 if (ret < 0 && did_quota)
4222 vfs_dq_free_space_nodirty(dir,
4223 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4224
4225 ocfs2_commit_trans(osb, handle);
4226
4227out:
4228 if (data_ac)
4229 ocfs2_free_alloc_context(data_ac);
4230
4231 if (dx_leaves) {
4232 for (i = 0; i < num_dx_leaves; i++)
4233 brelse(dx_leaves[i]);
4234 kfree(dx_leaves);
4235 }
4236 return ret;
4237}
4238
4239static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
4240{
4241 struct ocfs2_dx_root_block *dx_root;
4242 struct ocfs2_dx_entry_list *entry_list;
4243
4244 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4245 entry_list = &dx_root->dr_entries;
4246
4247 if (le16_to_cpu(entry_list->de_num_used) >=
4248 le16_to_cpu(entry_list->de_count))
4249 return -ENOSPC;
4250
4251 return 0;
4252}
4253
4254static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
4255 struct buffer_head *di_bh,
4256 const char *name,
4257 int namelen,
4258 struct ocfs2_dir_lookup_result *lookup)
4259{
4260 int ret, free_dx_root = 1;
4261 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4262 struct buffer_head *dx_root_bh = NULL;
4263 struct buffer_head *leaf_bh = NULL;
4264 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4265 struct ocfs2_dx_root_block *dx_root;
4266
4267 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4268 if (ret) {
4269 mlog_errno(ret);
4270 goto out;
4271 }
4272
4273 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4274 if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
4275 ret = -ENOSPC;
4276 mlog_errno(ret);
4277 goto out;
4278 }
4279
4280 if (ocfs2_dx_root_inline(dx_root)) {
4281 ret = ocfs2_inline_dx_has_space(dx_root_bh);
4282
4283 if (ret == 0)
4284 goto search_el;
4285
4286 /*
4287 * We ran out of room in the root block. Expand it to
4288 * an extent, then allow ocfs2_find_dir_space_dx to do
4289 * the rest.
4290 */
4291 ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
4292 if (ret) {
4293 mlog_errno(ret);
4294 goto out;
4295 }
4296 }
4297
4298 /*
4299 * Insert preparation for an indexed directory is split into two
4300 * steps. The call to find_dir_space_dx reserves room in the index for
4301 * an additional item. If we run out of space there, it's a real error
4302 * and we can't continue.
4303 */
4304 ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
4305 namelen, lookup);
4306 if (ret) {
4307 mlog_errno(ret);
4308 goto out;
4309 }
4310
4311search_el:
4312 /*
4313 * Next, we need to find space in the unindexed tree. This call
4314 * searches using the free space linked list. If the unindexed tree
4315 * lacks sufficient space, we'll expand it below. The expansion code
4316 * is smart enough to add any new blocks to the free space list.
4317 */
4318 ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
4319 if (ret && ret != -ENOSPC) {
4320 mlog_errno(ret);
4321 goto out;
4322 }
4323
4324 /* Do this up here - ocfs2_extend_dir might need the dx_root */
4325 lookup->dl_dx_root_bh = dx_root_bh;
4326 free_dx_root = 0;
4327
4328 if (ret == -ENOSPC) {
4329 ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
4330
4331 if (ret) {
4332 mlog_errno(ret);
4333 goto out;
4334 }
4335
4336 /*
4337 * We make the assumption here that new leaf blocks are added
4338 * to the front of our free list.
4339 */
4340 lookup->dl_prev_leaf_bh = NULL;
4341 lookup->dl_leaf_bh = leaf_bh;
4342 }
4343
4344out:
4345 if (free_dx_root)
4346 brelse(dx_root_bh);
4347 return ret;
4348}
4349
4350/*
4351 * Get a directory ready for insert. Any directory allocation required
4352 * happens here. Success returns zero, and enough context in the dir
4353 * lookup result that ocfs2_add_entry() will be able to complete the task
4354 * with minimal performance impact.
4355 */
1973int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, 4356int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
1974 struct inode *dir, 4357 struct inode *dir,
1975 struct buffer_head *parent_fe_bh, 4358 struct buffer_head *parent_fe_bh,
1976 const char *name, 4359 const char *name,
1977 int namelen, 4360 int namelen,
1978 struct buffer_head **ret_de_bh) 4361 struct ocfs2_dir_lookup_result *lookup)
1979{ 4362{
1980 int ret; 4363 int ret;
1981 unsigned int blocks_wanted = 1; 4364 unsigned int blocks_wanted = 1;
@@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
1984 mlog(0, "getting ready to insert namelen %d into dir %llu\n", 4367 mlog(0, "getting ready to insert namelen %d into dir %llu\n",
1985 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno); 4368 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
1986 4369
1987 *ret_de_bh = NULL;
1988
1989 if (!namelen) { 4370 if (!namelen) {
1990 ret = -EINVAL; 4371 ret = -EINVAL;
1991 mlog_errno(ret); 4372 mlog_errno(ret);
1992 goto out; 4373 goto out;
1993 } 4374 }
1994 4375
4376 /*
4377 * Do this up front to reduce confusion.
4378 *
4379 * The directory might start inline, then be turned into an
4380 * indexed one, in which case we'd need to hash deep inside
4381 * ocfs2_find_dir_space_id(). Since
4382 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
4383 * done, there seems no point in spreading out the calls. We
4384 * can optimize away the case where the file system doesn't
4385 * support indexing.
4386 */
4387 if (ocfs2_supports_indexed_dirs(osb))
4388 ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
4389
4390 if (ocfs2_dir_indexed(dir)) {
4391 ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
4392 name, namelen, lookup);
4393 if (ret)
4394 mlog_errno(ret);
4395 goto out;
4396 }
4397
1995 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 4398 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1996 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name, 4399 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
1997 namelen, &bh, &blocks_wanted); 4400 namelen, &bh, &blocks_wanted);
@@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
2010 BUG_ON(bh); 4413 BUG_ON(bh);
2011 4414
2012 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted, 4415 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
2013 &bh); 4416 lookup, &bh);
2014 if (ret) { 4417 if (ret) {
2015 if (ret != -ENOSPC) 4418 if (ret != -ENOSPC)
2016 mlog_errno(ret); 4419 mlog_errno(ret);
@@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
2020 BUG_ON(!bh); 4423 BUG_ON(!bh);
2021 } 4424 }
2022 4425
2023 *ret_de_bh = bh; 4426 lookup->dl_leaf_bh = bh;
2024 bh = NULL; 4427 bh = NULL;
2025out: 4428out:
2026 brelse(bh); 4429 brelse(bh);
2027 return ret; 4430 return ret;
2028} 4431}
4432
4433static int ocfs2_dx_dir_remove_index(struct inode *dir,
4434 struct buffer_head *di_bh,
4435 struct buffer_head *dx_root_bh)
4436{
4437 int ret;
4438 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4439 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4440 struct ocfs2_dx_root_block *dx_root;
4441 struct inode *dx_alloc_inode = NULL;
4442 struct buffer_head *dx_alloc_bh = NULL;
4443 handle_t *handle;
4444 u64 blk;
4445 u16 bit;
4446 u64 bg_blkno;
4447
4448 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4449
4450 dx_alloc_inode = ocfs2_get_system_file_inode(osb,
4451 EXTENT_ALLOC_SYSTEM_INODE,
4452 le16_to_cpu(dx_root->dr_suballoc_slot));
4453 if (!dx_alloc_inode) {
4454 ret = -ENOMEM;
4455 mlog_errno(ret);
4456 goto out;
4457 }
4458 mutex_lock(&dx_alloc_inode->i_mutex);
4459
4460 ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
4461 if (ret) {
4462 mlog_errno(ret);
4463 goto out_mutex;
4464 }
4465
4466 handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
4467 if (IS_ERR(handle)) {
4468 ret = PTR_ERR(handle);
4469 mlog_errno(ret);
4470 goto out_unlock;
4471 }
4472
4473 ret = ocfs2_journal_access_di(handle, dir, di_bh,
4474 OCFS2_JOURNAL_ACCESS_WRITE);
4475 if (ret) {
4476 mlog_errno(ret);
4477 goto out_commit;
4478 }
4479
4480 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
4481 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4482 di->i_dx_root = cpu_to_le64(0ULL);
4483
4484 ocfs2_journal_dirty(handle, di_bh);
4485
4486 blk = le64_to_cpu(dx_root->dr_blkno);
4487 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4488 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4489 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4490 bit, bg_blkno, 1);
4491 if (ret)
4492 mlog_errno(ret);
4493
4494out_commit:
4495 ocfs2_commit_trans(osb, handle);
4496
4497out_unlock:
4498 ocfs2_inode_unlock(dx_alloc_inode, 1);
4499
4500out_mutex:
4501 mutex_unlock(&dx_alloc_inode->i_mutex);
4502 brelse(dx_alloc_bh);
4503out:
4504 iput(dx_alloc_inode);
4505 return ret;
4506}
4507
4508int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4509{
4510 int ret;
4511 unsigned int uninitialized_var(clen);
4512 u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
4513 u64 uninitialized_var(blkno);
4514 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4515 struct buffer_head *dx_root_bh = NULL;
4516 struct ocfs2_dx_root_block *dx_root;
4517 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4518 struct ocfs2_cached_dealloc_ctxt dealloc;
4519 struct ocfs2_extent_tree et;
4520
4521 ocfs2_init_dealloc_ctxt(&dealloc);
4522
4523 if (!ocfs2_dir_indexed(dir))
4524 return 0;
4525
4526 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4527 if (ret) {
4528 mlog_errno(ret);
4529 goto out;
4530 }
4531 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4532
4533 if (ocfs2_dx_root_inline(dx_root))
4534 goto remove_index;
4535
4536 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
4537
4538 /* XXX: What if dr_clusters is too large? */
4539 while (le32_to_cpu(dx_root->dr_clusters)) {
4540 ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
4541 major_hash, &cpos, &blkno, &clen);
4542 if (ret) {
4543 mlog_errno(ret);
4544 goto out;
4545 }
4546
4547 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4548
4549 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
4550 &dealloc);
4551 if (ret) {
4552 mlog_errno(ret);
4553 goto out;
4554 }
4555
4556 if (cpos == 0)
4557 break;
4558
4559 major_hash = cpos - 1;
4560 }
4561
4562remove_index:
4563 ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
4564 if (ret) {
4565 mlog_errno(ret);
4566 goto out;
4567 }
4568
4569 ocfs2_remove_from_cache(dir, dx_root_bh);
4570out:
4571 ocfs2_schedule_truncate_log_flush(osb, 1);
4572 ocfs2_run_deallocs(osb, &dealloc);
4573
4574 brelse(dx_root_bh);
4575 return ret;
4576}
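
The truncate loop above consumes the index from the top of the hash space downward: look up the record covering major_hash, free its clusters, then continue just below that record's start until the record at cpos 0 is removed. A sketch of the descending walk (both callbacks are hypothetical):

#include <stdint.h>

struct rec {
        uint32_t cpos;          /* first hash value the record covers */
        uint32_t clusters;
};

/*
 * Descend through hash-ordered records the way ocfs2_dx_dir_truncate()
 * does: start at the top of the hash space and step to cpos - 1 of
 * each record removed, stopping once the record at 0 is gone.
 */
static void truncate_walk(const struct rec *(*lookup)(uint32_t hash),
                          void (*remove_rec)(const struct rec *r))
{
        uint32_t major_hash = UINT32_MAX;
        const struct rec *r;

        for (;;) {
                uint32_t cpos;

                r = lookup(major_hash);
                cpos = r->cpos;         /* read before removal */
                remove_rec(r);
                if (cpos == 0)
                        break;
                major_hash = cpos - 1;
        }
}
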
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index c511e2e18e9f..e683f3deb645 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,44 +26,70 @@
26#ifndef OCFS2_DIR_H 26#ifndef OCFS2_DIR_H
27#define OCFS2_DIR_H 27#define OCFS2_DIR_H
28 28
29struct buffer_head *ocfs2_find_entry(const char *name, 29struct ocfs2_dx_hinfo {
30 int namelen, 30 u32 major_hash;
31 struct inode *dir, 31 u32 minor_hash;
32 struct ocfs2_dir_entry **res_dir); 32};
33
34struct ocfs2_dir_lookup_result {
35 struct buffer_head *dl_leaf_bh; /* Unindexed leaf
36 * block */
37 struct ocfs2_dir_entry *dl_entry; /* Target dirent in
38 * unindexed leaf */
39
40 struct buffer_head *dl_dx_root_bh; /* Root of indexed
41 * tree */
42
43 struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */
44 struct ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in
45 * indexed leaf */
46 struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */
47
48 struct buffer_head *dl_prev_leaf_bh;/* Previous entry in
49 * dir free space
50 * list. NULL if
51 * previous entry is
52 * dx root block. */
53};
54
55void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
56
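
Given the buffer_head members declared above, the release helper presumably just drops every reference the lookup may hold; brelse() ignores NULL, so unset members need no checks. A plausible sketch, not verified against the actual body in fs/ocfs2/dir.c:

/*
 * Plausible body for ocfs2_free_dir_lookup_result() (sketch only):
 * drop each buffer_head reference the lookup result may hold.
 */
void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
{
        brelse(res->dl_leaf_bh);
        brelse(res->dl_dx_root_bh);
        brelse(res->dl_dx_leaf_bh);
        brelse(res->dl_prev_leaf_bh);
}
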
57int ocfs2_find_entry(const char *name, int namelen,
58 struct inode *dir,
59 struct ocfs2_dir_lookup_result *lookup);
33int ocfs2_delete_entry(handle_t *handle, 60int ocfs2_delete_entry(handle_t *handle,
34 struct inode *dir, 61 struct inode *dir,
35 struct ocfs2_dir_entry *de_del, 62 struct ocfs2_dir_lookup_result *res);
36 struct buffer_head *bh);
37int __ocfs2_add_entry(handle_t *handle, 63int __ocfs2_add_entry(handle_t *handle,
38 struct inode *dir, 64 struct inode *dir,
39 const char *name, int namelen, 65 const char *name, int namelen,
40 struct inode *inode, u64 blkno, 66 struct inode *inode, u64 blkno,
41 struct buffer_head *parent_fe_bh, 67 struct buffer_head *parent_fe_bh,
42 struct buffer_head *insert_bh); 68 struct ocfs2_dir_lookup_result *lookup);
43static inline int ocfs2_add_entry(handle_t *handle, 69static inline int ocfs2_add_entry(handle_t *handle,
44 struct dentry *dentry, 70 struct dentry *dentry,
45 struct inode *inode, u64 blkno, 71 struct inode *inode, u64 blkno,
46 struct buffer_head *parent_fe_bh, 72 struct buffer_head *parent_fe_bh,
47 struct buffer_head *insert_bh) 73 struct ocfs2_dir_lookup_result *lookup)
48{ 74{
49 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, 75 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
50 dentry->d_name.name, dentry->d_name.len, 76 dentry->d_name.name, dentry->d_name.len,
51 inode, blkno, parent_fe_bh, insert_bh); 77 inode, blkno, parent_fe_bh, lookup);
52} 78}
53int ocfs2_update_entry(struct inode *dir, handle_t *handle, 79int ocfs2_update_entry(struct inode *dir, handle_t *handle,
54 struct buffer_head *de_bh, struct ocfs2_dir_entry *de, 80 struct ocfs2_dir_lookup_result *res,
55 struct inode *new_entry_inode); 81 struct inode *new_entry_inode);
56 82
57int ocfs2_check_dir_for_entry(struct inode *dir, 83int ocfs2_check_dir_for_entry(struct inode *dir,
58 const char *name, 84 const char *name,
59 int namelen); 85 int namelen);
60int ocfs2_empty_dir(struct inode *inode); 86int ocfs2_empty_dir(struct inode *inode);
87
61int ocfs2_find_files_on_disk(const char *name, 88int ocfs2_find_files_on_disk(const char *name,
62 int namelen, 89 int namelen,
63 u64 *blkno, 90 u64 *blkno,
64 struct inode *inode, 91 struct inode *inode,
65 struct buffer_head **dirent_bh, 92 struct ocfs2_dir_lookup_result *res);
66 struct ocfs2_dir_entry **dirent);
67int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, 93int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
68 int namelen, u64 *blkno); 94 int namelen, u64 *blkno);
69int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); 95int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
@@ -74,14 +100,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
74 struct buffer_head *parent_fe_bh, 100 struct buffer_head *parent_fe_bh,
75 const char *name, 101 const char *name,
76 int namelen, 102 int namelen,
77 struct buffer_head **ret_de_bh); 103 struct ocfs2_dir_lookup_result *lookup);
78struct ocfs2_alloc_context; 104struct ocfs2_alloc_context;
79int ocfs2_fill_new_dir(struct ocfs2_super *osb, 105int ocfs2_fill_new_dir(struct ocfs2_super *osb,
80 handle_t *handle, 106 handle_t *handle,
81 struct inode *parent, 107 struct inode *parent,
82 struct inode *inode, 108 struct inode *inode,
83 struct buffer_head *fe_bh, 109 struct buffer_head *fe_bh,
84 struct ocfs2_alloc_context *data_ac); 110 struct ocfs2_alloc_context *data_ac,
111 struct ocfs2_alloc_context *meta_ac);
112
113int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
85 114
86struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, 115struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
87 void *data); 116 void *data);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index bb53714813ab..0102be35980c 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -52,16 +52,12 @@
52enum dlm_mle_type { 52enum dlm_mle_type {
53 DLM_MLE_BLOCK, 53 DLM_MLE_BLOCK,
54 DLM_MLE_MASTER, 54 DLM_MLE_MASTER,
55 DLM_MLE_MIGRATION 55 DLM_MLE_MIGRATION,
56}; 56 DLM_MLE_NUM_TYPES
57
58struct dlm_lock_name {
59 u8 len;
60 u8 name[DLM_LOCKID_NAME_MAX];
61}; 57};
62 58
63struct dlm_master_list_entry { 59struct dlm_master_list_entry {
64 struct list_head list; 60 struct hlist_node master_hash_node;
65 struct list_head hb_events; 61 struct list_head hb_events;
66 struct dlm_ctxt *dlm; 62 struct dlm_ctxt *dlm;
67 spinlock_t spinlock; 63 spinlock_t spinlock;
@@ -78,10 +74,10 @@ struct dlm_master_list_entry {
78 enum dlm_mle_type type; 74 enum dlm_mle_type type;
79 struct o2hb_callback_func mle_hb_up; 75 struct o2hb_callback_func mle_hb_up;
80 struct o2hb_callback_func mle_hb_down; 76 struct o2hb_callback_func mle_hb_down;
81 union { 77 struct dlm_lock_resource *mleres;
82 struct dlm_lock_resource *res; 78 unsigned char mname[DLM_LOCKID_NAME_MAX];
83 struct dlm_lock_name name; 79 unsigned int mnamelen;
84 } u; 80 unsigned int mnamehash;
85}; 81};
86 82
87enum dlm_ast_type { 83enum dlm_ast_type {
@@ -151,13 +147,14 @@ struct dlm_ctxt
151 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 147 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
152 struct dlm_recovery_ctxt reco; 148 struct dlm_recovery_ctxt reco;
153 spinlock_t master_lock; 149 spinlock_t master_lock;
154 struct list_head master_list; 150 struct hlist_head **master_hash;
155 struct list_head mle_hb_events; 151 struct list_head mle_hb_events;
156 152
157 /* these give a really vague idea of the system load */ 153 /* these give a really vague idea of the system load */
158 atomic_t local_resources; 154 atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
159 atomic_t remote_resources; 155 atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
160 atomic_t unknown_resources; 156 atomic_t res_tot_count;
157 atomic_t res_cur_count;
161 158
162 struct dlm_debug_ctxt *dlm_debug_ctxt; 159 struct dlm_debug_ctxt *dlm_debug_ctxt;
163 struct dentry *dlm_debugfs_subroot; 160 struct dentry *dlm_debugfs_subroot;
@@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned
195 return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); 192 return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
196} 193}
197 194
195static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
196 unsigned i)
197{
198 return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] +
199 (i % DLM_BUCKETS_PER_PAGE);
200}
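
The helper above indexes a two-level table: the hash picks a page via division, then a slot within the page via the remainder. A standalone sketch of the same paged indexing (the capacities are illustrative, not DLM_HASH_PAGES or DLM_BUCKETS_PER_PAGE):

#include <stdio.h>

#define HASH_PAGES       4      /* illustrative page count */
#define BUCKETS_PER_PAGE 8      /* illustrative buckets per page */

static int table[HASH_PAGES][BUCKETS_PER_PAGE];

/* Same page/offset split as dlm_master_hash(). */
static int *bucket_for(unsigned int i)
{
        return &table[(i / BUCKETS_PER_PAGE) % HASH_PAGES]
                     [i % BUCKETS_PER_PAGE];
}

int main(void)
{
        *bucket_for(11) = 42;           /* bucket 11 -> page 1, slot 3 */
        printf("%d\n", table[1][3]);    /* prints 42 */
        return 0;
}
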
201
198/* these keventd work queue items are for less-frequently 202/* these keventd work queue items are for less-frequently
199 * called functions that cannot be directly called from the 203 * called functions that cannot be directly called from the
200 * net message handlers for some reason, usually because 204 * net message handlers for some reason, usually because
@@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
848 unsigned int len); 852 unsigned int len);
849 853
850int dlm_is_host_down(int errno); 854int dlm_is_host_down(int errno);
851void dlm_change_lockres_owner(struct dlm_ctxt *dlm, 855
852 struct dlm_lock_resource *res,
853 u8 owner);
854struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, 856struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
855 const char *lockid, 857 const char *lockid,
856 int namelen, 858 int namelen,
@@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
1008 DLM_LOCK_RES_MIGRATING)); 1010 DLM_LOCK_RES_MIGRATING));
1009} 1011}
1010 1012
1013void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
1014void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
1015
1011/* create/destroy slab caches */ 1016/* create/destroy slab caches */
1012int dlm_init_master_caches(void); 1017int dlm_init_master_caches(void);
1013void dlm_destroy_master_caches(void); 1018void dlm_destroy_master_caches(void);
@@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
1110 return bit; 1115 return bit;
1111} 1116}
1112 1117
1118static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
1119 struct dlm_lock_resource *res,
1120 u8 owner)
1121{
1122 assert_spin_locked(&res->spinlock);
1123
1124 res->owner = owner;
1125}
1113 1126
1127static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
1128 struct dlm_lock_resource *res,
1129 u8 owner)
1130{
1131 assert_spin_locked(&res->spinlock);
1132
1133 if (owner != res->owner)
1134 dlm_set_lockres_owner(dlm, res, owner);
1135}
1114 1136
1115#endif /* DLMCOMMON_H */ 1137#endif /* DLMCOMMON_H */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index b32f60a5acfb..df52f706f669 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
287static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) 287static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
288{ 288{
289 int out = 0; 289 int out = 0;
290 unsigned int namelen;
291 const char *name;
292 char *mle_type; 290 char *mle_type;
293 291
294 if (mle->type != DLM_MLE_MASTER) {
295 namelen = mle->u.name.len;
296 name = mle->u.name.name;
297 } else {
298 namelen = mle->u.res->lockname.len;
299 name = mle->u.res->lockname.name;
300 }
301
302 if (mle->type == DLM_MLE_BLOCK) 292 if (mle->type == DLM_MLE_BLOCK)
303 mle_type = "BLK"; 293 mle_type = "BLK";
304 else if (mle->type == DLM_MLE_MASTER) 294 else if (mle->type == DLM_MLE_MASTER)
@@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
306 else 296 else
307 mle_type = "MIG"; 297 mle_type = "MIG";
308 298
309 out += stringify_lockname(name, namelen, buf + out, len - out); 299 out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
310 out += snprintf(buf + out, len - out, 300 out += snprintf(buf + out, len - out,
311 "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", 301 "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
312 mle_type, mle->master, mle->new_master, 302 mle_type, mle->master, mle->new_master,
@@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = {
501static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) 491static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
502{ 492{
503 struct dlm_master_list_entry *mle; 493 struct dlm_master_list_entry *mle;
504 int out = 0; 494 struct hlist_head *bucket;
505 unsigned long total = 0; 495 struct hlist_node *list;
496 int i, out = 0;
497 unsigned long total = 0, longest = 0, bktcnt = 0;
506 498
507 out += snprintf(db->buf + out, db->len - out, 499 out += snprintf(db->buf + out, db->len - out,
508 "Dumping MLEs for Domain: %s\n", dlm->name); 500 "Dumping MLEs for Domain: %s\n", dlm->name);
509 501
510 spin_lock(&dlm->master_lock); 502 spin_lock(&dlm->master_lock);
511 list_for_each_entry(mle, &dlm->master_list, list) { 503 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
512 ++total; 504 bucket = dlm_master_hash(dlm, i);
513 if (db->len - out < 200) 505 hlist_for_each(list, bucket) {
514 continue; 506 mle = hlist_entry(list, struct dlm_master_list_entry,
515 out += dump_mle(mle, db->buf + out, db->len - out); 507 master_hash_node);
508 ++total;
509 ++bktcnt;
510 if (db->len - out < 200)
511 continue;
512 out += dump_mle(mle, db->buf + out, db->len - out);
513 }
514 longest = max(longest, bktcnt);
515 bktcnt = 0;
516 } 516 }
517 spin_unlock(&dlm->master_lock); 517 spin_unlock(&dlm->master_lock);
518 518
519 out += snprintf(db->buf + out, db->len - out, 519 out += snprintf(db->buf + out, db->len - out,
520 "Total on list: %ld\n", total); 520 "Total: %ld, Longest: %ld\n", total, longest);
521 return out; 521 return out;
522} 522}
523 523
@@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
756 int out = 0; 756 int out = 0;
757 struct dlm_reco_node_data *node; 757 struct dlm_reco_node_data *node;
758 char *state; 758 char *state;
759 int lres, rres, ures, tres; 759 int cur_mles = 0, tot_mles = 0;
760 760 int i;
761 lres = atomic_read(&dlm->local_resources);
762 rres = atomic_read(&dlm->remote_resources);
763 ures = atomic_read(&dlm->unknown_resources);
764 tres = lres + rres + ures;
765 761
766 spin_lock(&dlm->spinlock); 762 spin_lock(&dlm->spinlock);
767 763
@@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
804 db->buf + out, db->len - out); 800 db->buf + out, db->len - out);
805 out += snprintf(db->buf + out, db->len - out, "\n"); 801 out += snprintf(db->buf + out, db->len - out, "\n");
806 802
807 /* Mastered Resources Total: xxx Locally: xxx Remotely: ... */ 803 /* Lock Resources: xxx (xxx) */
804 out += snprintf(db->buf + out, db->len - out,
805 "Lock Resources: %d (%d)\n",
806 atomic_read(&dlm->res_cur_count),
807 atomic_read(&dlm->res_tot_count));
808
809 for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
810 tot_mles += atomic_read(&dlm->mle_tot_count[i]);
811
812 for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
813 cur_mles += atomic_read(&dlm->mle_cur_count[i]);
814
815 /* MLEs: xxx (xxx) */
816 out += snprintf(db->buf + out, db->len - out,
817 "MLEs: %d (%d)\n", cur_mles, tot_mles);
818
819 /* Blocking: xxx (xxx) */
820 out += snprintf(db->buf + out, db->len - out,
821 " Blocking: %d (%d)\n",
822 atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
823 atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
824
825 /* Mastery: xxx (xxx) */
826 out += snprintf(db->buf + out, db->len - out,
827 " Mastery: %d (%d)\n",
828 atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
829 atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
830
831 /* Migration: xxx (xxx) */
808 out += snprintf(db->buf + out, db->len - out, 832 out += snprintf(db->buf + out, db->len - out,
809 "Mastered Resources Total: %d Locally: %d " 833 " Migration: %d (%d)\n",
810 "Remotely: %d Unknown: %d\n", 834 atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
811 tres, lres, rres, ures); 835 atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
812 836
813 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ 837 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
814 out += snprintf(db->buf + out, db->len - out, 838 out += snprintf(db->buf + out, db->len - out,
815 "Lists: Dirty=%s Purge=%s PendingASTs=%s " 839 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
816 "PendingBASTs=%s Master=%s\n", 840 "PendingBASTs=%s\n",
817 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), 841 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
818 (list_empty(&dlm->purge_list) ? "Empty" : "InUse"), 842 (list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
819 (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"), 843 (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
820 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"), 844 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
821 (list_empty(&dlm->master_list) ? "Empty" : "InUse"));
822 845
823 /* Purge Count: xxx Refs: xxx */ 846 /* Purge Count: xxx Refs: xxx */
824 out += snprintf(db->buf + out, db->len - out, 847 out += snprintf(db->buf + out, db->len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d8d578f45613..4d9e6b288dd8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
304 if (dlm->lockres_hash) 304 if (dlm->lockres_hash)
305 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); 305 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
306 306
307 if (dlm->master_hash)
308 dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
309
307 if (dlm->name) 310 if (dlm->name)
308 kfree(dlm->name); 311 kfree(dlm->name);
309 312
@@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1534 for (i = 0; i < DLM_HASH_BUCKETS; i++) 1537 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1535 INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); 1538 INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
1536 1539
1540 dlm->master_hash = (struct hlist_head **)
1541 dlm_alloc_pagevec(DLM_HASH_PAGES);
1542 if (!dlm->master_hash) {
1543 mlog_errno(-ENOMEM);
1544 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
1545 kfree(dlm->name);
1546 kfree(dlm);
1547 dlm = NULL;
1548 goto leave;
1549 }
1550
1551 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1552 INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
1553
1537 strcpy(dlm->name, domain); 1554 strcpy(dlm->name, domain);
1538 dlm->key = key; 1555 dlm->key = key;
1539 dlm->node_num = o2nm_this_node(); 1556 dlm->node_num = o2nm_this_node();
1540 1557
1541 ret = dlm_create_debugfs_subroot(dlm); 1558 ret = dlm_create_debugfs_subroot(dlm);
1542 if (ret < 0) { 1559 if (ret < 0) {
1560 dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
1543 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); 1561 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
1544 kfree(dlm->name); 1562 kfree(dlm->name);
1545 kfree(dlm); 1563 kfree(dlm);
@@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1579 init_waitqueue_head(&dlm->reco.event); 1597 init_waitqueue_head(&dlm->reco.event);
1580 init_waitqueue_head(&dlm->ast_wq); 1598 init_waitqueue_head(&dlm->ast_wq);
1581 init_waitqueue_head(&dlm->migration_wq); 1599 init_waitqueue_head(&dlm->migration_wq);
1582 INIT_LIST_HEAD(&dlm->master_list);
1583 INIT_LIST_HEAD(&dlm->mle_hb_events); 1600 INIT_LIST_HEAD(&dlm->mle_hb_events);
1584 1601
1585 dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; 1602 dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1587 1604
1588 dlm->reco.new_master = O2NM_INVALID_NODE_NUM; 1605 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
1589 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; 1606 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
1590 atomic_set(&dlm->local_resources, 0); 1607
1591 atomic_set(&dlm->remote_resources, 0); 1608 atomic_set(&dlm->res_tot_count, 0);
1592 atomic_set(&dlm->unknown_resources, 0); 1609 atomic_set(&dlm->res_cur_count, 0);
1610 for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
1611 atomic_set(&dlm->mle_tot_count[i], 0);
1612 atomic_set(&dlm->mle_cur_count[i], 0);
1613 }
1593 1614
1594 spin_lock_init(&dlm->work_lock); 1615 spin_lock_init(&dlm->work_lock);
1595 INIT_LIST_HEAD(&dlm->work_list); 1616 INIT_LIST_HEAD(&dlm->work_list);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0a2813947853..f8b653fcd4dd 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
73 const char *name, 73 const char *name,
74 unsigned int namelen) 74 unsigned int namelen)
75{ 75{
76 struct dlm_lock_resource *res;
77
78 if (dlm != mle->dlm) 76 if (dlm != mle->dlm)
79 return 0; 77 return 0;
80 78
81 if (mle->type == DLM_MLE_BLOCK || 79 if (namelen != mle->mnamelen ||
82 mle->type == DLM_MLE_MIGRATION) { 80 memcmp(name, mle->mname, namelen) != 0)
83 if (namelen != mle->u.name.len || 81 return 0;
84 memcmp(name, mle->u.name.name, namelen)!=0) 82
85 return 0;
86 } else {
87 res = mle->u.res;
88 if (namelen != res->lockname.len ||
89 memcmp(res->lockname.name, name, namelen) != 0)
90 return 0;
91 }
92 return 1; 83 return 1;
93} 84}
94 85
@@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
283 274
284 mle->dlm = dlm; 275 mle->dlm = dlm;
285 mle->type = type; 276 mle->type = type;
286 INIT_LIST_HEAD(&mle->list); 277 INIT_HLIST_NODE(&mle->master_hash_node);
287 INIT_LIST_HEAD(&mle->hb_events); 278 INIT_LIST_HEAD(&mle->hb_events);
288 memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 279 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
289 spin_lock_init(&mle->spinlock); 280 spin_lock_init(&mle->spinlock);
@@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
295 mle->new_master = O2NM_MAX_NODES; 286 mle->new_master = O2NM_MAX_NODES;
296 mle->inuse = 0; 287 mle->inuse = 0;
297 288
289 BUG_ON(mle->type != DLM_MLE_BLOCK &&
290 mle->type != DLM_MLE_MASTER &&
291 mle->type != DLM_MLE_MIGRATION);
292
298 if (mle->type == DLM_MLE_MASTER) { 293 if (mle->type == DLM_MLE_MASTER) {
299 BUG_ON(!res); 294 BUG_ON(!res);
300 mle->u.res = res; 295 mle->mleres = res;
301 } else if (mle->type == DLM_MLE_BLOCK) { 296 memcpy(mle->mname, res->lockname.name, res->lockname.len);
302 BUG_ON(!name); 297 mle->mnamelen = res->lockname.len;
303 memcpy(mle->u.name.name, name, namelen); 298 mle->mnamehash = res->lockname.hash;
304 mle->u.name.len = namelen; 299 } else {
305 } else /* DLM_MLE_MIGRATION */ {
306 BUG_ON(!name); 300 BUG_ON(!name);
307 memcpy(mle->u.name.name, name, namelen); 301 mle->mleres = NULL;
308 mle->u.name.len = namelen; 302 memcpy(mle->mname, name, namelen);
303 mle->mnamelen = namelen;
304 mle->mnamehash = dlm_lockid_hash(name, namelen);
309 } 305 }
310 306
307 atomic_inc(&dlm->mle_tot_count[mle->type]);
308 atomic_inc(&dlm->mle_cur_count[mle->type]);
309
311 /* copy off the node_map and register hb callbacks on our copy */ 310 /* copy off the node_map and register hb callbacks on our copy */
312 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); 311 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
313 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); 312 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
@@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
318 __dlm_mle_attach_hb_events(dlm, mle); 317 __dlm_mle_attach_hb_events(dlm, mle);
319} 318}
320 319
320void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
321{
322 assert_spin_locked(&dlm->spinlock);
323 assert_spin_locked(&dlm->master_lock);
324
325 if (!hlist_unhashed(&mle->master_hash_node))
326 hlist_del_init(&mle->master_hash_node);
327}
328
329void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
330{
331 struct hlist_head *bucket;
332
333 assert_spin_locked(&dlm->master_lock);
334
335 bucket = dlm_master_hash(dlm, mle->mnamehash);
336 hlist_add_head(&mle->master_hash_node, bucket);
337}
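
Insertion hashes the lock name once (cached in mnamehash at init time) and prepends to the bucket, while unlink is guarded by hlist_unhashed() so it is safe on an entry that was never inserted. A small userspace sketch of the prepend-to-bucket step using a plain singly linked list (types and bucket count are illustrative):

#include <stddef.h>

#define NBUCKETS 16     /* illustrative bucket count */

struct mle {
        struct mle *next;
        unsigned int namehash;  /* computed once, like mnamehash */
};

static struct mle *buckets[NBUCKETS];

/* Prepend to the bucket, as hlist_add_head() does in __dlm_insert_mle(). */
static void insert_mle(struct mle *m)
{
        struct mle **head = &buckets[m->namehash % NBUCKETS];

        m->next = *head;
        *head = m;
}
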
321 338
322/* returns 1 if found, 0 if not */ 339/* returns 1 if found, 0 if not */
323static int dlm_find_mle(struct dlm_ctxt *dlm, 340static int dlm_find_mle(struct dlm_ctxt *dlm,
@@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
325 char *name, unsigned int namelen) 342 char *name, unsigned int namelen)
326{ 343{
327 struct dlm_master_list_entry *tmpmle; 344 struct dlm_master_list_entry *tmpmle;
345 struct hlist_head *bucket;
346 struct hlist_node *list;
347 unsigned int hash;
328 348
329 assert_spin_locked(&dlm->master_lock); 349 assert_spin_locked(&dlm->master_lock);
330 350
331 list_for_each_entry(tmpmle, &dlm->master_list, list) { 351 hash = dlm_lockid_hash(name, namelen);
352 bucket = dlm_master_hash(dlm, hash);
353 hlist_for_each(list, bucket) {
354 tmpmle = hlist_entry(list, struct dlm_master_list_entry,
355 master_hash_node);
332 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 356 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
333 continue; 357 continue;
334 dlm_get_mle(tmpmle); 358 dlm_get_mle(tmpmle);
@@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref)
408 mle = container_of(kref, struct dlm_master_list_entry, mle_refs); 432 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
409 dlm = mle->dlm; 433 dlm = mle->dlm;
410 434
411 if (mle->type != DLM_MLE_MASTER) {
412 mlog(0, "calling mle_release for %.*s, type %d\n",
413 mle->u.name.len, mle->u.name.name, mle->type);
414 } else {
415 mlog(0, "calling mle_release for %.*s, type %d\n",
416 mle->u.res->lockname.len,
417 mle->u.res->lockname.name, mle->type);
418 }
419 assert_spin_locked(&dlm->spinlock); 435 assert_spin_locked(&dlm->spinlock);
420 assert_spin_locked(&dlm->master_lock); 436 assert_spin_locked(&dlm->master_lock);
421 437
438 mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
439 mle->type);
440
422 /* remove from list if not already */ 441 /* remove from list if not already */
423 if (!list_empty(&mle->list)) 442 __dlm_unlink_mle(dlm, mle);
424 list_del_init(&mle->list);
425 443
426 /* detach the mle from the domain node up/down events */ 444 /* detach the mle from the domain node up/down events */
427 __dlm_mle_detach_hb_events(dlm, mle); 445 __dlm_mle_detach_hb_events(dlm, mle);
428 446
447 atomic_dec(&dlm->mle_cur_count[mle->type]);
448
429 /* NOTE: kfree under spinlock here. 449 /* NOTE: kfree under spinlock here.
430 * if this is bad, we can move this to a freelist. */ 450 * if this is bad, we can move this to a freelist. */
431 kmem_cache_free(dlm_mle_cache, mle); 451 kmem_cache_free(dlm_mle_cache, mle);
@@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void)
465 kmem_cache_destroy(dlm_lockres_cache); 485 kmem_cache_destroy(dlm_lockres_cache);
466} 486}
467 487
468static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
469 struct dlm_lock_resource *res,
470 u8 owner)
471{
472 assert_spin_locked(&res->spinlock);
473
474 mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
475
476 if (owner == dlm->node_num)
477 atomic_inc(&dlm->local_resources);
478 else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
479 atomic_inc(&dlm->unknown_resources);
480 else
481 atomic_inc(&dlm->remote_resources);
482
483 res->owner = owner;
484}
485
486void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
487 struct dlm_lock_resource *res, u8 owner)
488{
489 assert_spin_locked(&res->spinlock);
490
491 if (owner == res->owner)
492 return;
493
494 if (res->owner == dlm->node_num)
495 atomic_dec(&dlm->local_resources);
496 else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
497 atomic_dec(&dlm->unknown_resources);
498 else
499 atomic_dec(&dlm->remote_resources);
500
501 dlm_set_lockres_owner(dlm, res, owner);
502}
503
504
505static void dlm_lockres_release(struct kref *kref) 488static void dlm_lockres_release(struct kref *kref)
506{ 489{
507 struct dlm_lock_resource *res; 490 struct dlm_lock_resource *res;
@@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref)
527 } 510 }
528 spin_unlock(&dlm->track_lock); 511 spin_unlock(&dlm->track_lock);
529 512
513 atomic_dec(&dlm->res_cur_count);
514
530 dlm_put(dlm); 515 dlm_put(dlm);
531 516
532 if (!hlist_unhashed(&res->hash_node) || 517 if (!hlist_unhashed(&res->hash_node) ||
@@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
607 592
608 kref_init(&res->refs); 593 kref_init(&res->refs);
609 594
595 atomic_inc(&dlm->res_tot_count);
596 atomic_inc(&dlm->res_cur_count);
597
610 /* just for consistency */ 598 /* just for consistency */
611 spin_lock(&res->spinlock); 599 spin_lock(&res->spinlock);
612 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); 600 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -843,7 +831,7 @@ lookup:
843 alloc_mle = NULL; 831 alloc_mle = NULL;
844 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); 832 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
845 set_bit(dlm->node_num, mle->maybe_map); 833 set_bit(dlm->node_num, mle->maybe_map);
846 list_add(&mle->list, &dlm->master_list); 834 __dlm_insert_mle(dlm, mle);
847 835
848 /* still holding the dlm spinlock, check the recovery map 836 /* still holding the dlm spinlock, check the recovery map
849 * to see if there are any nodes that still need to be 837 * to see if there are any nodes that still need to be
@@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1270 res->lockname.len, 1258 res->lockname.len,
1271 res->lockname.name); 1259 res->lockname.name);
1272 mle->type = DLM_MLE_MASTER; 1260 mle->type = DLM_MLE_MASTER;
1273 mle->u.res = res; 1261 mle->mleres = res;
1274 } 1262 }
1275 } 1263 }
1276 } 1264 }
@@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
1315 1303
1316 BUG_ON(mle->type == DLM_MLE_MIGRATION); 1304 BUG_ON(mle->type == DLM_MLE_MIGRATION);
1317 1305
1318 if (mle->type != DLM_MLE_MASTER) { 1306 request.namelen = (u8)mle->mnamelen;
1319 request.namelen = mle->u.name.len; 1307 memcpy(request.name, mle->mname, request.namelen);
1320 memcpy(request.name, mle->u.name.name, request.namelen);
1321 } else {
1322 request.namelen = mle->u.res->lockname.len;
1323 memcpy(request.name, mle->u.res->lockname.name,
1324 request.namelen);
1325 }
1326 1308
1327again: 1309again:
1328 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, 1310 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
@@ -1575,7 +1557,7 @@ way_up_top:
1575 // "add the block.\n"); 1557 // "add the block.\n");
1576 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); 1558 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
1577 set_bit(request->node_idx, mle->maybe_map); 1559 set_bit(request->node_idx, mle->maybe_map);
1578 list_add(&mle->list, &dlm->master_list); 1560 __dlm_insert_mle(dlm, mle);
1579 response = DLM_MASTER_RESP_NO; 1561 response = DLM_MASTER_RESP_NO;
1580 } else { 1562 } else {
1581 // mlog(0, "mle was found\n"); 1563 // mlog(0, "mle was found\n");
@@ -1967,7 +1949,7 @@ ok:
1967 assert->node_idx, rr, extra_ref, mle->inuse); 1949 assert->node_idx, rr, extra_ref, mle->inuse);
1968 dlm_print_one_mle(mle); 1950 dlm_print_one_mle(mle);
1969 } 1951 }
1970 list_del_init(&mle->list); 1952 __dlm_unlink_mle(dlm, mle);
1971 __dlm_mle_detach_hb_events(dlm, mle); 1953 __dlm_mle_detach_hb_events(dlm, mle);
1972 __dlm_put_mle(mle); 1954 __dlm_put_mle(mle);
1973 if (extra_ref) { 1955 if (extra_ref) {
@@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3159 tmp->master = master; 3141 tmp->master = master;
3160 atomic_set(&tmp->woken, 1); 3142 atomic_set(&tmp->woken, 1);
3161 wake_up(&tmp->wq); 3143 wake_up(&tmp->wq);
3162 /* remove it from the list so that only one 3144 /* remove it so that only one mle will be found */
3163 * mle will be found */ 3145 __dlm_unlink_mle(dlm, tmp);
3164 list_del_init(&tmp->list);
3165 /* this was obviously WRONG. mle is uninited here. should be tmp. */
3166 __dlm_mle_detach_hb_events(dlm, tmp); 3146 __dlm_mle_detach_hb_events(dlm, tmp);
3167 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; 3147 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
3168 mlog(0, "%s:%.*s: master=%u, newmaster=%u, " 3148 mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
@@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3181 mle->master = master; 3161 mle->master = master;
3182 /* do this for consistency with other mle types */ 3162 /* do this for consistency with other mle types */
3183 set_bit(new_master, mle->maybe_map); 3163 set_bit(new_master, mle->maybe_map);
3184 list_add(&mle->list, &dlm->master_list); 3164 __dlm_insert_mle(dlm, mle);
3185 3165
3186 return ret; 3166 return ret;
3187} 3167}
3188 3168
3189 3169/*
3190void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 3170 * Sets the owner of the lockres associated with the mle to UNKNOWN
3171 */
3172static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
3173 struct dlm_master_list_entry *mle)
3191{ 3174{
3192 struct dlm_master_list_entry *mle, *next;
3193 struct dlm_lock_resource *res; 3175 struct dlm_lock_resource *res;
3194 unsigned int hash;
3195 3176
3196 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); 3177 /* Find the lockres associated to the mle and set its owner to UNK */
3197top: 3178 res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
3198 assert_spin_locked(&dlm->spinlock); 3179 mle->mnamehash);
3180 if (res) {
3181 spin_unlock(&dlm->master_lock);
3199 3182
3200 /* clean the master list */ 3183 /* move lockres onto recovery list */
3201 spin_lock(&dlm->master_lock); 3184 spin_lock(&res->spinlock);
3202 list_for_each_entry_safe(mle, next, &dlm->master_list, list) { 3185 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
3203 BUG_ON(mle->type != DLM_MLE_BLOCK && 3186 dlm_move_lockres_to_recovery_list(dlm, res);
3204 mle->type != DLM_MLE_MASTER && 3187 spin_unlock(&res->spinlock);
3205 mle->type != DLM_MLE_MIGRATION); 3188 dlm_lockres_put(res);
3206
3207 /* MASTER mles are initiated locally. the waiting
3208 * process will notice the node map change
3209 * shortly. let that happen as normal. */
3210 if (mle->type == DLM_MLE_MASTER)
3211 continue;
3212 3189
3190 /* about to get rid of mle, detach from heartbeat */
3191 __dlm_mle_detach_hb_events(dlm, mle);
3213 3192
3214 /* BLOCK mles are initiated by other nodes. 3193 /* dump the mle */
3215 * need to clean up if the dead node would have 3194 spin_lock(&dlm->master_lock);
3216 * been the master. */ 3195 __dlm_put_mle(mle);
3217 if (mle->type == DLM_MLE_BLOCK) { 3196 spin_unlock(&dlm->master_lock);
3218 int bit; 3197 }
3219 3198
3220 spin_lock(&mle->spinlock); 3199 return res;
3221 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); 3200}
3222 if (bit != dead_node) {
3223 mlog(0, "mle found, but dead node %u would "
3224 "not have been master\n", dead_node);
3225 spin_unlock(&mle->spinlock);
3226 } else {
3227 /* must drop the refcount by one since the
3228 * assert_master will never arrive. this
3229 * may result in the mle being unlinked and
3230 * freed, but there may still be a process
3231 * waiting in the dlmlock path which is fine. */
3232 mlog(0, "node %u was expected master\n",
3233 dead_node);
3234 atomic_set(&mle->woken, 1);
3235 spin_unlock(&mle->spinlock);
3236 wake_up(&mle->wq);
3237 /* do not need events any longer, so detach
3238 * from heartbeat */
3239 __dlm_mle_detach_hb_events(dlm, mle);
3240 __dlm_put_mle(mle);
3241 }
3242 continue;
3243 }
3244 3201
3245 /* everything else is a MIGRATION mle */ 3202static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
3246 3203 struct dlm_master_list_entry *mle)
3247 /* the rule for MIGRATION mles is that the master 3204{
3248 * becomes UNKNOWN if *either* the original or 3205 __dlm_mle_detach_hb_events(dlm, mle);
3249 * the new master dies. all UNKNOWN lockreses
3250 * are sent to whichever node becomes the recovery
3251 * master. the new master is responsible for
3252 * determining if there is still a master for
3253 * this lockres, or if he needs to take over
3254 * mastery. either way, this node should expect
3255 * another message to resolve this. */
3256 if (mle->master != dead_node &&
3257 mle->new_master != dead_node)
3258 continue;
3259 3206
3260 /* if we have reached this point, this mle needs to 3207 spin_lock(&mle->spinlock);
3261 * be removed from the list and freed. */ 3208 __dlm_unlink_mle(dlm, mle);
3209 atomic_set(&mle->woken, 1);
3210 spin_unlock(&mle->spinlock);
3262 3211
3263 /* remove from the list early. NOTE: unlinking 3212 wake_up(&mle->wq);
3264 * list_head while in list_for_each_safe */ 3213}
3265 __dlm_mle_detach_hb_events(dlm, mle); 3214
3266 spin_lock(&mle->spinlock); 3215static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
3267 list_del_init(&mle->list); 3216 struct dlm_master_list_entry *mle, u8 dead_node)
3217{
3218 int bit;
3219
3220 BUG_ON(mle->type != DLM_MLE_BLOCK);
3221
3222 spin_lock(&mle->spinlock);
3223 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
3224 if (bit != dead_node) {
3225 mlog(0, "mle found, but dead node %u would not have been "
3226 "master\n", dead_node);
3227 spin_unlock(&mle->spinlock);
3228 } else {
3229 /* Must drop the refcount by one since the assert_master will
3230 * never arrive. This may result in the mle being unlinked and
3231 * freed, but there may still be a process waiting in the
3232 * dlmlock path which is fine. */
3233 mlog(0, "node %u was expected master\n", dead_node);
3268 atomic_set(&mle->woken, 1); 3234 atomic_set(&mle->woken, 1);
3269 spin_unlock(&mle->spinlock); 3235 spin_unlock(&mle->spinlock);
3270 wake_up(&mle->wq); 3236 wake_up(&mle->wq);
3271 3237
3272 mlog(0, "%s: node %u died during migration from " 3238 /* Do not need events any longer, so detach from heartbeat */
3273 "%u to %u!\n", dlm->name, dead_node, 3239 __dlm_mle_detach_hb_events(dlm, mle);
3274 mle->master, mle->new_master); 3240 __dlm_put_mle(mle);
3275 /* if there is a lockres associated with this 3241 }
3276 * mle, find it and set its owner to UNKNOWN */ 3242}
3277 hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
3278 res = __dlm_lookup_lockres(dlm, mle->u.name.name,
3279 mle->u.name.len, hash);
3280 if (res) {
3281 /* unfortunately if we hit this rare case, our
3282 * lock ordering is messed. we need to drop
3283 * the master lock so that we can take the
3284 * lockres lock, meaning that we will have to
3285 * restart from the head of list. */
3286 spin_unlock(&dlm->master_lock);
3287 3243
3288 /* move lockres onto recovery list */ 3244void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3289 spin_lock(&res->spinlock); 3245{
3290 dlm_set_lockres_owner(dlm, res, 3246 struct dlm_master_list_entry *mle;
3291 DLM_LOCK_RES_OWNER_UNKNOWN); 3247 struct dlm_lock_resource *res;
3292 dlm_move_lockres_to_recovery_list(dlm, res); 3248 struct hlist_head *bucket;
3293 spin_unlock(&res->spinlock); 3249 struct hlist_node *list;
3294 dlm_lockres_put(res); 3250 unsigned int i;
3295 3251
3296 /* about to get rid of mle, detach from heartbeat */ 3252 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
3297 __dlm_mle_detach_hb_events(dlm, mle); 3253top:
3254 assert_spin_locked(&dlm->spinlock);
3298 3255
3299 /* dump the mle */ 3256 /* clean the master list */
3300 spin_lock(&dlm->master_lock); 3257 spin_lock(&dlm->master_lock);
3301 __dlm_put_mle(mle); 3258 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3302 spin_unlock(&dlm->master_lock); 3259 bucket = dlm_master_hash(dlm, i);
3260 hlist_for_each(list, bucket) {
3261 mle = hlist_entry(list, struct dlm_master_list_entry,
3262 master_hash_node);
3263
3264 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3265 mle->type != DLM_MLE_MASTER &&
3266 mle->type != DLM_MLE_MIGRATION);
3267
3268 /* MASTER mles are initiated locally. The waiting
3269 * process will notice the node map change shortly.
3270 * Let that happen as normal. */
3271 if (mle->type == DLM_MLE_MASTER)
3272 continue;
3273
3274 /* BLOCK mles are initiated by other nodes. Need to
3275 * clean up if the dead node would have been the
3276 * master. */
3277 if (mle->type == DLM_MLE_BLOCK) {
3278 dlm_clean_block_mle(dlm, mle, dead_node);
3279 continue;
3280 }
3303 3281
3304 /* restart */ 3282 /* Everything else is a MIGRATION mle */
3305 goto top; 3283
3306 } 3284 /* The rule for MIGRATION mles is that the master
3285 * becomes UNKNOWN if *either* the original or the new
3286 * master dies. All UNKNOWN lockres' are sent to
3287 * whichever node becomes the recovery master. The new
3288 * master is responsible for determining if there is
3289 * still a master for this lockres, or if he needs to
3290 * take over mastery. Either way, this node should
3291 * expect another message to resolve this. */
3292
3293 if (mle->master != dead_node &&
3294 mle->new_master != dead_node)
3295 continue;
3296
3297 /* If we have reached this point, this mle needs to be
3298 * removed from the list and freed. */
3299 dlm_clean_migration_mle(dlm, mle);
3300
3301 mlog(0, "%s: node %u died during migration from "
3302 "%u to %u!\n", dlm->name, dead_node, mle->master,
3303 mle->new_master);
3304
3305 /* If we find a lockres associated with the mle, we've
3306 * hit this rare case that messes up our lock ordering.
3307 * If so, we need to drop the master lock so that we can
3308 * take the lockres lock, meaning that we will have to
3309 * restart from the head of list. */
3310 res = dlm_reset_mleres_owner(dlm, mle);
3311 if (res)
3312 /* restart */
3313 goto top;
3307 3314
3308 /* this may be the last reference */ 3315 /* This may be the last reference */
3309 __dlm_put_mle(mle); 3316 __dlm_put_mle(mle);
3317 }
3310 } 3318 }
3311 spin_unlock(&dlm->master_lock); 3319 spin_unlock(&dlm->master_lock);
3312} 3320}
3313 3321
3314
3315int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 3322int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3316 u8 old_master) 3323 u8 old_master)
3317{ 3324{
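
dlm_clean_master_list() above ends with a classic lock-ordering workaround: when fixing up a lockres requires res->spinlock, which may not be taken while holding master_lock, the outer lock is dropped and the scan restarts from the top. A minimal sketch of that pattern, with illustrative names only, assuming the kernel list and spinlock APIs:

#include <linux/list.h>
#include <linux/spinlock.h>

struct ex_res {
	spinlock_t lock;
	int needs_fixup;
	struct list_head node;
};

struct ex_ctxt {
	spinlock_t master_lock;
	struct list_head entries;
};

static void ex_fixup(struct ex_res *res)
{
	/* needs res->lock, which must not nest inside master_lock */
	spin_lock(&res->lock);
	res->needs_fixup = 0;
	spin_unlock(&res->lock);
}

static void ex_clean_all(struct ex_ctxt *ctxt)
{
	struct ex_res *res;

top:
	spin_lock(&ctxt->master_lock);
	list_for_each_entry(res, &ctxt->entries, node) {
		if (!res->needs_fixup)
			continue;
		/* drop the outer lock, do the work, rescan from the top:
		 * the list may have changed while it was unlocked */
		spin_unlock(&ctxt->master_lock);
		ex_fixup(res);
		goto top;
	}
	spin_unlock(&ctxt->master_lock);
}
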
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d490b66ad9d7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
162 162
163 spin_lock(&res->spinlock); 163 spin_lock(&res->spinlock);
164 if (!__dlm_lockres_unused(res)) { 164 if (!__dlm_lockres_unused(res)) {
165 spin_unlock(&res->spinlock);
166 mlog(0, "%s:%.*s: tried to purge but not unused\n", 165 mlog(0, "%s:%.*s: tried to purge but not unused\n",
167 dlm->name, res->lockname.len, res->lockname.name); 166 dlm->name, res->lockname.len, res->lockname.name);
168 return -ENOTEMPTY; 167 __dlm_print_one_lock_resource(res);
168 spin_unlock(&res->spinlock);
169 BUG();
169 } 170 }
171
172 if (res->state & DLM_LOCK_RES_MIGRATING) {
173 mlog(0, "%s:%.*s: Delay dropref as this lockres is "
174 "being remastered\n", dlm->name, res->lockname.len,
175 res->lockname.name);
176 /* Re-add the lockres to the end of the purge list */
177 if (!list_empty(&res->purge)) {
178 list_del_init(&res->purge);
179 list_add_tail(&res->purge, &dlm->purge_list);
180 }
181 spin_unlock(&res->spinlock);
182 return 0;
183 }
184
170 master = (res->owner == dlm->node_num); 185 master = (res->owner == dlm->node_num);
186
171 if (!master) 187 if (!master)
172 res->state |= DLM_LOCK_RES_DROPPING_REF; 188 res->state |= DLM_LOCK_RES_DROPPING_REF;
173 spin_unlock(&res->spinlock); 189 spin_unlock(&res->spinlock);
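
The dlmthread.c hunk above defers purging a lockres that is mid-migration by pushing it to the tail of the purge list. A compressed sketch of that behaviour follows; list_move_tail() is the one-call equivalent of the list_del_init()/list_add_tail() pair in the patch, and the struct and flag names here are illustrative, not the ocfs2 ones.

#include <linux/list.h>

#define EX_RES_MIGRATING 0x1	/* stands in for DLM_LOCK_RES_MIGRATING */

struct ex_lockres {
	unsigned long state;
	struct list_head purge;
};

static int ex_purge(struct list_head *purge_list, struct ex_lockres *res)
{
	if (res->state & EX_RES_MIGRATING) {
		/* being remastered: re-queue at the tail and retry later */
		if (!list_empty(&res->purge))
			list_move_tail(&res->purge, purge_list);
		return 0;
	}

	/* ... the real dropref/teardown work happens here ... */
	list_del_init(&res->purge);
	return 0;
}
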
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7219a86d34cc..e15fc7d50827 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
244 .flags = 0, 244 .flags = 0,
245}; 245};
246 246
247static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
248 .flags = 0,
249};
250
247static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { 251static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
248 .get_osb = ocfs2_get_dentry_osb, 252 .get_osb = ocfs2_get_dentry_osb,
249 .post_unlock = ocfs2_dentry_post_unlock, 253 .post_unlock = ocfs2_dentry_post_unlock,
@@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
622 &ocfs2_rename_lops, osb); 626 &ocfs2_rename_lops, osb);
623} 627}
624 628
629static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
630 struct ocfs2_super *osb)
631{
632 /* nfs_sync lockres doesn't come from a slab so we call init
633 * once on it manually. */
634 ocfs2_lock_res_init_once(res);
635 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
636 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
637 &ocfs2_nfs_sync_lops, osb);
638}
639
625void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, 640void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
626 struct ocfs2_file_private *fp) 641 struct ocfs2_file_private *fp)
627{ 642{
@@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
2417 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); 2432 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
2418} 2433}
2419 2434
2435int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
2436{
2437 int status;
2438 struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
2439
2440 if (ocfs2_is_hard_readonly(osb))
2441 return -EROFS;
2442
2443 if (ocfs2_mount_local(osb))
2444 return 0;
2445
2446 status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
2447 0, 0);
2448 if (status < 0)
2449 mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
2450
2451 return status;
2452}
2453
2454void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
2455{
2456 struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
2457
2458 if (!ocfs2_mount_local(osb))
2459 ocfs2_cluster_unlock(osb, lockres,
2460 ex ? LKM_EXMODE : LKM_PRMODE);
2461}
2462
2420int ocfs2_dentry_lock(struct dentry *dentry, int ex) 2463int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2421{ 2464{
2422 int ret; 2465 int ret;
@@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2798local: 2841local:
2799 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); 2842 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2800 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); 2843 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2844 ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
2801 2845
2802 osb->cconn = conn; 2846 osb->cconn = conn;
2803 2847
@@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
2833 2877
2834 ocfs2_lock_res_free(&osb->osb_super_lockres); 2878 ocfs2_lock_res_free(&osb->osb_super_lockres);
2835 ocfs2_lock_res_free(&osb->osb_rename_lockres); 2879 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2880 ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
2836 2881
2837 ocfs2_cluster_disconnect(osb->cconn, hangup_pending); 2882 ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
2838 osb->cconn = NULL; 2883 osb->cconn = NULL;
@@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
3015{ 3060{
3016 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); 3061 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
3017 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); 3062 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
3063 ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
3018} 3064}
3019 3065
3020int ocfs2_drop_inode_locks(struct inode *inode) 3066int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 3f8d9986b8e0..e1fd5721cd7f 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
115 int ex); 115 int ex);
116int ocfs2_rename_lock(struct ocfs2_super *osb); 116int ocfs2_rename_lock(struct ocfs2_super *osb);
117void ocfs2_rename_unlock(struct ocfs2_super *osb); 117void ocfs2_rename_unlock(struct ocfs2_super *osb);
118int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
119void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
118int ocfs2_dentry_lock(struct dentry *dentry, int ex); 120int ocfs2_dentry_lock(struct dentry *dentry, int ex);
119void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 121void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
120int ocfs2_file_lock(struct file *file, int ex, int trylock); 122int ocfs2_file_lock(struct file *file, int ex, int trylock);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 2f27b332d8b3..de3da8eb558c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -31,6 +31,7 @@
31 31
32#include "ocfs2.h" 32#include "ocfs2.h"
33 33
34#include "alloc.h"
34#include "dir.h" 35#include "dir.h"
35#include "dlmglue.h" 36#include "dlmglue.h"
36#include "dcache.h" 37#include "dcache.h"
@@ -38,6 +39,7 @@
38#include "inode.h" 39#include "inode.h"
39 40
40#include "buffer_head_io.h" 41#include "buffer_head_io.h"
42#include "suballoc.h"
41 43
42struct ocfs2_inode_handle 44struct ocfs2_inode_handle
43{ 45{
@@ -49,29 +51,97 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
49 struct ocfs2_inode_handle *handle) 51 struct ocfs2_inode_handle *handle)
50{ 52{
51 struct inode *inode; 53 struct inode *inode;
54 struct ocfs2_super *osb = OCFS2_SB(sb);
55 u64 blkno = handle->ih_blkno;
56 int status, set;
52 struct dentry *result; 57 struct dentry *result;
53 58
54 mlog_entry("(0x%p, 0x%p)\n", sb, handle); 59 mlog_entry("(0x%p, 0x%p)\n", sb, handle);
55 60
56 if (handle->ih_blkno == 0) { 61 if (blkno == 0) {
57 mlog_errno(-ESTALE); 62 mlog(0, "nfs wants inode with blkno: 0\n");
58 return ERR_PTR(-ESTALE); 63 result = ERR_PTR(-ESTALE);
64 goto bail;
65 }
66
67 inode = ocfs2_ilookup(sb, blkno);
68 /*
 69 * If the inode exists in memory, we only need to check its
 70 * generation number.
71 */
72 if (inode)
73 goto check_gen;
74
75 /*
76 * This will synchronize us against ocfs2_delete_inode() on
77 * all nodes
78 */
79 status = ocfs2_nfs_sync_lock(osb, 1);
80 if (status < 0) {
81 mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
82 goto check_err;
83 }
84
85 status = ocfs2_test_inode_bit(osb, blkno, &set);
86 if (status < 0) {
87 if (status == -EINVAL) {
88 /*
89 * The blkno NFS gave us doesn't even show up
90 * as an inode, we return -ESTALE to be
91 * nice
92 */
93 mlog(0, "test inode bit failed %d\n", status);
94 status = -ESTALE;
95 } else {
96 mlog(ML_ERROR, "test inode bit failed %d\n", status);
97 }
98 goto unlock_nfs_sync;
99 }
100
101 /* If the inode allocator bit is clear, this inode must be stale */
102 if (!set) {
103 mlog(0, "inode %llu suballoc bit is clear\n", blkno);
104 status = -ESTALE;
105 goto unlock_nfs_sync;
59 } 106 }
60 107
61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0); 108 inode = ocfs2_iget(osb, blkno, 0, 0);
62 109
63 if (IS_ERR(inode)) 110unlock_nfs_sync:
64 return (void *)inode; 111 ocfs2_nfs_sync_unlock(osb, 1);
65 112
113check_err:
114 if (status < 0) {
115 if (status == -ESTALE) {
116 mlog(0, "stale inode ino: %llu generation: %u\n",
117 blkno, handle->ih_generation);
118 }
119 result = ERR_PTR(status);
120 goto bail;
121 }
122
123 if (IS_ERR(inode)) {
124 mlog_errno(PTR_ERR(inode));
125 result = (void *)inode;
126 goto bail;
127 }
128
129check_gen:
66 if (handle->ih_generation != inode->i_generation) { 130 if (handle->ih_generation != inode->i_generation) {
67 iput(inode); 131 iput(inode);
68 return ERR_PTR(-ESTALE); 132 mlog(0, "stale inode ino: %llu generation: %u\n", blkno,
133 handle->ih_generation);
134 result = ERR_PTR(-ESTALE);
135 goto bail;
69 } 136 }
70 137
71 result = d_obtain_alias(inode); 138 result = d_obtain_alias(inode);
72 if (!IS_ERR(result)) 139 if (!IS_ERR(result))
73 result->d_op = &ocfs2_dentry_ops; 140 result->d_op = &ocfs2_dentry_ops;
141 else
142 mlog_errno(PTR_ERR(result));
74 143
144bail:
75 mlog_exit_ptr(result); 145 mlog_exit_ptr(result);
76 return result; 146 return result;
77} 147}
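
A hedged usage sketch of the new NFS sync lock as the export path uses it: taken exclusive (EX) around the allocator-bit test so that it excludes ocfs2_delete_inode(), which (per the inode.c hunk below) takes the same lock shared (PR), allowing deletes on different nodes to proceed concurrently. Only functions added by this patch are called; error paths and the subsequent iget are trimmed.

#include <linux/errno.h>

#include "ocfs2.h"
#include "dlmglue.h"
#include "suballoc.h"

static int ex_nfs_stale_check(struct ocfs2_super *osb, u64 blkno)
{
	int status, set;

	status = ocfs2_nfs_sync_lock(osb, 1);	/* 1 == EX */
	if (status < 0)
		return status;

	status = ocfs2_test_inode_bit(osb, blkno, &set);
	if (status >= 0 && !set)
		status = -ESTALE;	/* allocator bit clear: inode is gone */

	ocfs2_nfs_sync_unlock(osb, 1);
	return status;
}
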
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 229e707bc050..10e1fa87396a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -38,6 +38,7 @@
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "alloc.h" 40#include "alloc.h"
41#include "dir.h"
41#include "blockcheck.h" 42#include "blockcheck.h"
42#include "dlmglue.h" 43#include "dlmglue.h"
43#include "extent_map.h" 44#include "extent_map.h"
@@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
112 oi->ip_attr |= OCFS2_DIRSYNC_FL; 113 oi->ip_attr |= OCFS2_DIRSYNC_FL;
113} 114}
114 115
116struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
117{
118 struct ocfs2_find_inode_args args;
119
120 args.fi_blkno = blkno;
121 args.fi_flags = 0;
122 args.fi_ino = ino_from_blkno(sb, blkno);
123 args.fi_sysfile_type = 0;
124
125 return ilookup5(sb, blkno, ocfs2_find_actor, &args);
126}
115struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, 127struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
116 int sysfile_type) 128 int sysfile_type)
117{ 129{
@@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
275 (unsigned long long)OCFS2_I(inode)->ip_blkno, 287 (unsigned long long)OCFS2_I(inode)->ip_blkno,
276 (unsigned long long)le64_to_cpu(fe->i_blkno)); 288 (unsigned long long)le64_to_cpu(fe->i_blkno));
277 289
278 inode->i_nlink = le16_to_cpu(fe->i_links_count); 290 inode->i_nlink = ocfs2_read_links_count(fe);
279 291
280 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { 292 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
281 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 293 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
@@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
351 363
352 ocfs2_set_inode_flags(inode); 364 ocfs2_set_inode_flags(inode);
353 365
366 OCFS2_I(inode)->ip_last_used_slot = 0;
367 OCFS2_I(inode)->ip_last_used_group = 0;
354 mlog_exit_void(); 368 mlog_exit_void();
355} 369}
356 370
@@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
606 } 620 }
607 621
608 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + 622 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
609 ocfs2_quota_trans_credits(inode->i_sb)); 623 ocfs2_quota_trans_credits(inode->i_sb));
610 if (IS_ERR(handle)) { 624 if (IS_ERR(handle)) {
611 status = PTR_ERR(handle); 625 status = PTR_ERR(handle);
612 mlog_errno(status); 626 mlog_errno(status);
@@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode,
740 goto bail_unlock_dir; 754 goto bail_unlock_dir;
741 } 755 }
742 756
757 /* Remove any dir index tree */
758 if (S_ISDIR(inode->i_mode)) {
759 status = ocfs2_dx_dir_truncate(inode, di_bh);
760 if (status) {
761 mlog_errno(status);
762 goto bail_unlock_dir;
763 }
764 }
765
743 /*Free extended attribute resources associated with this inode.*/ 766 /*Free extended attribute resources associated with this inode.*/
744 status = ocfs2_xattr_remove(inode, di_bh); 767 status = ocfs2_xattr_remove(inode, di_bh);
745 if (status < 0) { 768 if (status < 0) {
@@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode)
949 goto bail; 972 goto bail;
950 } 973 }
951 974
975 /*
976 * Synchronize us against ocfs2_get_dentry. We take this in
977 * shared mode so that all nodes can still concurrently
978 * process deletes.
979 */
980 status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
981 if (status < 0) {
982 mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
983 ocfs2_cleanup_delete_inode(inode, 0);
984 goto bail_unblock;
985 }
952 /* Lock down the inode. This gives us an up-to-date view of 986
953 * its metadata (for verification), and allows us to 987
954 * serialize delete_inode on multiple nodes. 988 * serialize delete_inode on multiple nodes.
@@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode)
962 if (status != -ENOENT) 996 if (status != -ENOENT)
963 mlog_errno(status); 997 mlog_errno(status);
964 ocfs2_cleanup_delete_inode(inode, 0); 998 ocfs2_cleanup_delete_inode(inode, 0);
965 goto bail_unblock; 999 goto bail_unlock_nfs_sync;
966 } 1000 }
967 1001
968 /* Query the cluster. This will be the final decision made 1002 /* Query the cluster. This will be the final decision made
@@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode)
1005bail_unlock_inode: 1039bail_unlock_inode:
1006 ocfs2_inode_unlock(inode, 1); 1040 ocfs2_inode_unlock(inode, 1);
1007 brelse(di_bh); 1041 brelse(di_bh);
1042
1043bail_unlock_nfs_sync:
1044 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
1045
1008bail_unblock: 1046bail_unblock:
1009 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 1047 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
1010 if (status < 0) 1048 if (status < 0)
@@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1205 spin_unlock(&OCFS2_I(inode)->ip_lock); 1243 spin_unlock(&OCFS2_I(inode)->ip_lock);
1206 1244
1207 fe->i_size = cpu_to_le64(i_size_read(inode)); 1245 fe->i_size = cpu_to_le64(i_size_read(inode));
1208 fe->i_links_count = cpu_to_le16(inode->i_nlink); 1246 ocfs2_set_links_count(fe, inode->i_nlink);
1209 fe->i_uid = cpu_to_le32(inode->i_uid); 1247 fe->i_uid = cpu_to_le32(inode->i_uid);
1210 fe->i_gid = cpu_to_le32(inode->i_gid); 1248 fe->i_gid = cpu_to_le32(inode->i_gid);
1211 fe->i_mode = cpu_to_le16(inode->i_mode); 1249 fe->i_mode = cpu_to_le16(inode->i_mode);
@@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1242 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); 1280 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
1243 ocfs2_set_inode_flags(inode); 1281 ocfs2_set_inode_flags(inode);
1244 i_size_write(inode, le64_to_cpu(fe->i_size)); 1282 i_size_write(inode, le64_to_cpu(fe->i_size));
1245 inode->i_nlink = le16_to_cpu(fe->i_links_count); 1283 inode->i_nlink = ocfs2_read_links_count(fe);
1246 inode->i_uid = le32_to_cpu(fe->i_uid); 1284 inode->i_uid = le32_to_cpu(fe->i_uid);
1247 inode->i_gid = le32_to_cpu(fe->i_gid); 1285 inode->i_gid = le32_to_cpu(fe->i_gid);
1248 inode->i_mode = le16_to_cpu(fe->i_mode); 1286 inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index eb3c302b38d3..ea71525aad41 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -72,6 +72,10 @@ struct ocfs2_inode_info
72 72
73 struct inode vfs_inode; 73 struct inode vfs_inode;
74 struct jbd2_inode ip_jinode; 74 struct jbd2_inode ip_jinode;
75
76 /* Only valid if the inode is the dir. */
77 u32 ip_last_used_slot;
78 u64 ip_last_used_group;
75}; 79};
76 80
77/* 81/*
@@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode);
124/* Flags for ocfs2_iget() */ 128/* Flags for ocfs2_iget() */
125#define OCFS2_FI_FLAG_SYSFILE 0x1 129#define OCFS2_FI_FLAG_SYSFILE 0x1
126#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 130#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
131struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
127struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, 132struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
128 int sysfile_type); 133 int sysfile_type);
129int ocfs2_inode_init_private(struct inode *inode); 134int ocfs2_inode_init_private(struct inode *inode);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 57d7d25a2b9a..a20a0f1e37fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
65static int ocfs2_recover_orphans(struct ocfs2_super *osb, 65static int ocfs2_recover_orphans(struct ocfs2_super *osb,
66 int slot); 66 int slot);
67static int ocfs2_commit_thread(void *arg); 67static int ocfs2_commit_thread(void *arg);
68static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
69 int slot_num,
70 struct ocfs2_dinode *la_dinode,
71 struct ocfs2_dinode *tl_dinode,
72 struct ocfs2_quota_recovery *qrec);
68 73
69static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) 74static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
70{ 75{
@@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
76 return __ocfs2_wait_on_mount(osb, 1); 81 return __ocfs2_wait_on_mount(osb, 1);
77} 82}
78 83
79
80
81/* 84/*
82 * The recovery_list is a simple linked list of node numbers to recover. 85 * This replay_map tracks online/offline slots, so we can recover
83 * It is protected by the recovery_lock. 86 * offline slots during recovery and mount
84 */ 87 */
85 88
86struct ocfs2_recovery_map { 89enum ocfs2_replay_state {
87 unsigned int rm_used; 90 REPLAY_UNNEEDED = 0, /* Replay is not needed, so ignore this map */
88 unsigned int *rm_entries; 91 REPLAY_NEEDED, /* Replay slots marked in rm_replay_slots */
92 REPLAY_DONE /* Replay was already queued */
89}; 93};
90 94
95struct ocfs2_replay_map {
96 unsigned int rm_slots;
97 enum ocfs2_replay_state rm_state;
98 unsigned char rm_replay_slots[0];
99};
100
101void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
102{
103 if (!osb->replay_map)
104 return;
105
106 /* If we've already queued the replay, we don't have any more to do */
107 if (osb->replay_map->rm_state == REPLAY_DONE)
108 return;
109
110 osb->replay_map->rm_state = state;
111}
112
113int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
114{
115 struct ocfs2_replay_map *replay_map;
116 int i, node_num;
117
118 /* If replay map is already set, we don't do it again */
119 if (osb->replay_map)
120 return 0;
121
122 replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
123 (osb->max_slots * sizeof(char)), GFP_KERNEL);
124
125 if (!replay_map) {
126 mlog_errno(-ENOMEM);
127 return -ENOMEM;
128 }
129
130 spin_lock(&osb->osb_lock);
131
132 replay_map->rm_slots = osb->max_slots;
133 replay_map->rm_state = REPLAY_UNNEEDED;
134
135 /* set rm_replay_slots for offline slot(s) */
136 for (i = 0; i < replay_map->rm_slots; i++) {
137 if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
138 replay_map->rm_replay_slots[i] = 1;
139 }
140
141 osb->replay_map = replay_map;
142 spin_unlock(&osb->osb_lock);
143 return 0;
144}
145
146void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
147{
148 struct ocfs2_replay_map *replay_map = osb->replay_map;
149 int i;
150
151 if (!replay_map)
152 return;
153
154 if (replay_map->rm_state != REPLAY_NEEDED)
155 return;
156
157 for (i = 0; i < replay_map->rm_slots; i++)
158 if (replay_map->rm_replay_slots[i])
159 ocfs2_queue_recovery_completion(osb->journal, i, NULL,
160 NULL, NULL);
161 replay_map->rm_state = REPLAY_DONE;
162}
163
164void ocfs2_free_replay_slots(struct ocfs2_super *osb)
165{
166 struct ocfs2_replay_map *replay_map = osb->replay_map;
167
168 if (!osb->replay_map)
169 return;
170
171 kfree(replay_map);
172 osb->replay_map = NULL;
173}
174
91int ocfs2_recovery_init(struct ocfs2_super *osb) 175int ocfs2_recovery_init(struct ocfs2_super *osb)
92{ 176{
93 struct ocfs2_recovery_map *rm; 177 struct ocfs2_recovery_map *rm;
@@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = {
496 }, 580 },
497}; 581};
498 582
583static struct ocfs2_triggers dr_triggers = {
584 .ot_triggers = {
585 .t_commit = ocfs2_commit_trigger,
586 .t_abort = ocfs2_abort_trigger,
587 },
588 .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
589};
590
591static struct ocfs2_triggers dl_triggers = {
592 .ot_triggers = {
593 .t_commit = ocfs2_commit_trigger,
594 .t_abort = ocfs2_abort_trigger,
595 },
596 .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
597};
598
499static int __ocfs2_journal_access(handle_t *handle, 599static int __ocfs2_journal_access(handle_t *handle,
500 struct inode *inode, 600 struct inode *inode,
501 struct buffer_head *bh, 601 struct buffer_head *bh,
@@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
600 type); 700 type);
601} 701}
602 702
703int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
704 struct buffer_head *bh, int type)
705{
706 return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
707 type);
708}
709
710int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
711 struct buffer_head *bh, int type)
712{
713 return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
714 type);
715}
716
603int ocfs2_journal_access(handle_t *handle, struct inode *inode, 717int ocfs2_journal_access(handle_t *handle, struct inode *inode,
604 struct buffer_head *bh, int type) 718 struct buffer_head *bh, int type)
605{ 719{
@@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1176} 1290}
1177 1291
1178/* Called by the mount code to queue the last part of 1292
1179 * recovery for its own slot. */ 1293 * recovery for its own and offline slot(s). */
1180void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) 1294void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1181{ 1295{
1182 struct ocfs2_journal *journal = osb->journal; 1296 struct ocfs2_journal *journal = osb->journal;
1183 1297
1184 if (osb->dirty) { 1298 /* No need to queue up our truncate_log as regular cleanup will catch
1185 /* No need to queue up our truncate_log as regular 1299 * that */
1186 * cleanup will catch that. */ 1300 ocfs2_queue_recovery_completion(journal, osb->slot_num,
1187 ocfs2_queue_recovery_completion(journal, 1301 osb->local_alloc_copy, NULL, NULL);
1188 osb->slot_num, 1302 ocfs2_schedule_truncate_log_flush(osb, 0);
1189 osb->local_alloc_copy,
1190 NULL,
1191 NULL);
1192 ocfs2_schedule_truncate_log_flush(osb, 0);
1193 1303
1194 osb->local_alloc_copy = NULL; 1304 osb->local_alloc_copy = NULL;
1195 osb->dirty = 0; 1305 osb->dirty = 0;
1196 } 1306
1307 /* queue recovery of orphan slots for all offline slots */
1308 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1309 ocfs2_queue_replay_slots(osb);
1310 ocfs2_free_replay_slots(osb);
1197} 1311}
1198 1312
1199void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) 1313void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
@@ -1236,6 +1350,14 @@ restart:
1236 goto bail; 1350 goto bail;
1237 } 1351 }
1238 1352
1353 status = ocfs2_compute_replay_slots(osb);
1354 if (status < 0)
1355 mlog_errno(status);
1356
1357 /* queue recovery for our own slot */
1358 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1359 NULL, NULL);
1360
1239 spin_lock(&osb->osb_lock); 1361 spin_lock(&osb->osb_lock);
1240 while (rm->rm_used) { 1362 while (rm->rm_used) {
1241 /* It's always safe to remove entry zero, as we won't 1363 /* It's always safe to remove entry zero, as we won't
@@ -1301,11 +1423,8 @@ skip_recovery:
1301 1423
1302 ocfs2_super_unlock(osb, 1); 1424 ocfs2_super_unlock(osb, 1);
1303 1425
1304 /* We always run recovery on our own orphan dir - the dead 1426 /* queue recovery for offline slots */
1305 * node(s) may have disallowed a previous inode delete. Re-processing 1427 ocfs2_queue_replay_slots(osb);
1306 * is therefore required. */
1307 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1308 NULL, NULL);
1309 1428
1310bail: 1429bail:
1311 mutex_lock(&osb->recovery_lock); 1430 mutex_lock(&osb->recovery_lock);
@@ -1314,6 +1433,7 @@ bail:
1314 goto restart; 1433 goto restart;
1315 } 1434 }
1316 1435
1436 ocfs2_free_replay_slots(osb);
1317 osb->recovery_thread_task = NULL; 1437 osb->recovery_thread_task = NULL;
1318 mb(); /* sync with ocfs2_recovery_thread_running */ 1438 mb(); /* sync with ocfs2_recovery_thread_running */
1319 wake_up(&osb->recovery_event); 1439 wake_up(&osb->recovery_event);
@@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1465 goto done; 1585 goto done;
1466 } 1586 }
1467 1587
1588 /* we need to run complete recovery for offline orphan slots */
1589 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1590
1468 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", 1591 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
1469 node_num, slot_num, 1592 node_num, slot_num,
1470 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1593 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 172850a9a12a..619dd7f6c053 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -38,6 +38,17 @@ enum ocfs2_journal_state {
38struct ocfs2_super; 38struct ocfs2_super;
39struct ocfs2_dinode; 39struct ocfs2_dinode;
40 40
41/*
42 * The recovery_list is a simple linked list of node numbers to recover.
43 * It is protected by the recovery_lock.
44 */
45
46struct ocfs2_recovery_map {
47 unsigned int rm_used;
48 unsigned int *rm_entries;
49};
50
51
41struct ocfs2_journal { 52struct ocfs2_journal {
42 enum ocfs2_journal_state j_state; /* Journals current state */ 53 enum ocfs2_journal_state j_state; /* Journals current state */
43 54
@@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
139int ocfs2_recovery_init(struct ocfs2_super *osb); 150int ocfs2_recovery_init(struct ocfs2_super *osb);
140void ocfs2_recovery_exit(struct ocfs2_super *osb); 151void ocfs2_recovery_exit(struct ocfs2_super *osb);
141 152
153int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
142/* 154/*
143 * Journal Control: 155 * Journal Control:
144 * Initialize, Load, Shutdown, Wipe a journal. 156 * Initialize, Load, Shutdown, Wipe a journal.
@@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
266/* dirblock */ 278/* dirblock */
267int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, 279int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
268 struct buffer_head *bh, int type); 280 struct buffer_head *bh, int type);
281/* ocfs2_dx_root_block */
282int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
283 struct buffer_head *bh, int type);
284/* ocfs2_dx_leaf */
285int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
286 struct buffer_head *bh, int type);
269/* Anything that has no ecc */ 287/* Anything that has no ecc */
270int ocfs2_journal_access(handle_t *handle, struct inode *inode, 288int ocfs2_journal_access(handle_t *handle, struct inode *inode,
271 struct buffer_head *bh, int type); 289 struct buffer_head *bh, int type);
@@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
368} 386}
369 387
370/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 388/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
371 * bitmap block for the new bit) */ 389 * bitmap block for the new bit) + dx_root update for free list */
372#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 390#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
391
392static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
393{
394 /* 1 block for index, 2 allocs (data, metadata), 1 cluster's
395 * worth of blocks for initial extent. */
396 return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
397 ocfs2_clusters_to_blocks(sb, 1);
398}
373 399
374/* parent fe, parent block, new file entry, inode alloc fe, inode alloc 400/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
375 * group descriptor + mkdir/symlink blocks + quota update */ 401 * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
376static inline int ocfs2_mknod_credits(struct super_block *sb) 402 * blocks + quota update */
403static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
404 int xattr_credits)
377{ 405{
378 return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS + 406 int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
407
408 if (is_dir)
409 dir_credits += ocfs2_add_dir_index_credits(sb);
410
411 return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
379 ocfs2_quota_trans_credits(sb); 412 ocfs2_quota_trans_credits(sb);
380} 413}
381 414
@@ -388,31 +421,31 @@ static inline int ocfs2_mknod_credits(struct super_block *sb)
388#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 421#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
389 422
390/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota 423/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
391 * update on dir */ 424 * update on dir + index leaf + dx root update for free list */
392static inline int ocfs2_link_credits(struct super_block *sb) 425static inline int ocfs2_link_credits(struct super_block *sb)
393{ 426{
394 return 2*OCFS2_INODE_UPDATE_CREDITS + 1 + 427 return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
395 ocfs2_quota_trans_credits(sb); 428 ocfs2_quota_trans_credits(sb);
396} 429}
397 430
398/* inode + dir inode (if we unlink a dir), + dir entry block + orphan 431/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
399 * dir inode link */ 432 * dir inode link + dir inode index leaf + dir index root */
400static inline int ocfs2_unlink_credits(struct super_block *sb) 433static inline int ocfs2_unlink_credits(struct super_block *sb)
401{ 434{
402 /* The quota update from ocfs2_link_credits is unused here... */ 435 /* The quota update from ocfs2_link_credits is unused here... */
403 return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb); 436 return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
404} 437}
405 438
406/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + 439/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
407 * inode alloc group descriptor */ 440 * inode alloc group descriptor + orphan dir index leaf */
408#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1) 441#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3)
409 442
410/* dinode update, old dir dinode update, new dir dinode update, old 443/* dinode update, old dir dinode update, new dir dinode update, old
411 * dir dir entry, new dir dir entry, dir entry update for renaming 444 * dir dir entry, new dir dir entry, dir entry update for renaming
412 * directory + target unlink */ 445 * directory + target unlink + 3 x dir index leaves */
413static inline int ocfs2_rename_credits(struct super_block *sb) 446static inline int ocfs2_rename_credits(struct super_block *sb)
414{ 447{
415 return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb); 448 return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
416} 449}
417 450
418/* global bitmap dinode, group desc., relinked group, 451/* global bitmap dinode, group desc., relinked group,
@@ -422,6 +455,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb)
422 + OCFS2_INODE_UPDATE_CREDITS \ 455 + OCFS2_INODE_UPDATE_CREDITS \
423 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) 456 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
424 457
458/* inode update, removal of dx root block from allocator */
459#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
460 OCFS2_SUBALLOC_FREE)
461
462static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
463{
464 int credits = 1 + OCFS2_SUBALLOC_ALLOC;
465
466 credits += ocfs2_clusters_to_blocks(sb, 1);
467 credits += ocfs2_quota_trans_credits(sb);
468
469 return credits;
470}
471
425/* 472/*
426 * Please note that the caller must make sure that root_el is the root 473 * Please note that the caller must make sure that root_el is the root
427 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise 474 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
@@ -457,7 +504,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
457 504
458static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 505static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
459{ 506{
460 int blocks = ocfs2_mknod_credits(sb); 507 int blocks = ocfs2_mknod_credits(sb, 0, 0);
461 508
462 /* links can be longer than one block so we may update many 509 /* links can be longer than one block so we may update many
463 * within our single allocated extent. */ 510 * within our single allocated extent. */
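
The credit formulas above are easiest to check with concrete numbers. This standalone program mirrors the arithmetic of ocfs2_mknod_credits(sb, is_dir, xattr_credits) using assumed constants; the real OCFS2_SUBALLOC_ALLOC, quota credits, and blocks-per-cluster values depend on the build and filesystem geometry, so treat the output as illustrative only.

#include <stdio.h>

/* Illustrative stand-ins, not the real kernel values. */
#define SUBALLOC_ALLOC		3
#define DIR_LINK_CREDITS	(1 + 2 + 1)	/* data + bitmap + dx_root */
#define QUOTA_CREDITS		2
#define BLOCKS_PER_CLUSTER	8

static int ex_dir_index_credits(void)
{
	/* 1 index block, 2 allocations, 1 cluster's worth of blocks */
	return 1 + 2 * SUBALLOC_ALLOC + BLOCKS_PER_CLUSTER;
}

static int ex_mknod_credits(int is_dir, int xattr_credits)
{
	int dir_credits = DIR_LINK_CREDITS;

	if (is_dir)
		dir_credits += ex_dir_index_credits();

	return 4 + SUBALLOC_ALLOC + dir_credits + xattr_credits +
	       QUOTA_CREDITS;
}

int main(void)
{
	printf("mkdir credits: %d\n", ex_mknod_credits(1, 0));
	printf("mknod credits: %d\n", ex_mknod_credits(0, 0));
	return 0;
}
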
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec70cdbe77fc..bac7e6abaf47 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,7 +28,6 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31#include <linux/debugfs.h>
32 31
33#define MLOG_MASK_PREFIX ML_DISK_ALLOC 32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
34#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 75 struct inode *local_alloc_inode);
77 76
78#ifdef CONFIG_OCFS2_FS_STATS
79
80static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
81{
82 file->private_data = inode->i_private;
83 return 0;
84}
85
86#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE
87#define LA_DEBUG_VER 1
88static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
89 size_t count, loff_t *ppos)
90{
91 static DEFINE_MUTEX(la_debug_mutex);
92 struct ocfs2_super *osb = file->private_data;
93 int written, ret;
94 char *buf = osb->local_alloc_debug_buf;
95
96 mutex_lock(&la_debug_mutex);
97 memset(buf, 0, LA_DEBUG_BUF_SZ);
98
99 written = snprintf(buf, LA_DEBUG_BUF_SZ,
100 "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
101 LA_DEBUG_VER,
102 (unsigned long long)osb->la_last_gd,
103 osb->local_alloc_default_bits,
104 osb->local_alloc_bits, osb->local_alloc_state);
105
106 ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
107
108 mutex_unlock(&la_debug_mutex);
109 return ret;
110}
111
112static const struct file_operations ocfs2_la_debug_fops = {
113 .open = ocfs2_la_debug_open,
114 .read = ocfs2_la_debug_read,
115};
116
117static void ocfs2_init_la_debug(struct ocfs2_super *osb)
118{
119 osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
120 if (!osb->local_alloc_debug_buf)
121 return;
122
123 osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
124 S_IFREG|S_IRUSR,
125 osb->osb_debug_root,
126 osb,
127 &ocfs2_la_debug_fops);
128 if (!osb->local_alloc_debug) {
129 kfree(osb->local_alloc_debug_buf);
130 osb->local_alloc_debug_buf = NULL;
131 }
132}
133
134static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
135{
136 if (osb->local_alloc_debug)
137 debugfs_remove(osb->local_alloc_debug);
138
139 if (osb->local_alloc_debug_buf)
140 kfree(osb->local_alloc_debug_buf);
141
142 osb->local_alloc_debug_buf = NULL;
143 osb->local_alloc_debug = NULL;
144}
145#else /* CONFIG_OCFS2_FS_STATS */
146static void ocfs2_init_la_debug(struct ocfs2_super *osb)
147{
148 return;
149}
150static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
151{
152 return;
153}
154#endif
155
156static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) 77static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
157{ 78{
158 return (osb->local_alloc_state == OCFS2_LA_THROTTLED || 79 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
226 147
227 mlog_entry_void(); 148 mlog_entry_void();
228 149
229 ocfs2_init_la_debug(osb);
230
231 if (osb->local_alloc_bits == 0) 150 if (osb->local_alloc_bits == 0)
232 goto bail; 151 goto bail;
233 152
@@ -299,9 +218,6 @@ bail:
299 if (inode) 218 if (inode)
300 iput(inode); 219 iput(inode);
301 220
302 if (status < 0)
303 ocfs2_shutdown_la_debug(osb);
304
305 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); 221 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
306 222
307 mlog_exit(status); 223 mlog_exit(status);
@@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
331 cancel_delayed_work(&osb->la_enable_wq); 247 cancel_delayed_work(&osb->la_enable_wq);
332 flush_workqueue(ocfs2_wq); 248 flush_workqueue(ocfs2_wq);
333 249
334 ocfs2_shutdown_la_debug(osb);
335
336 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 250 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
337 goto out; 251 goto out;
338 252
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index eea1d24713ea..b606496b72ec 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -154,8 +154,9 @@ out:
154 return ret; 154 return ret;
155} 155}
156 156
157static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) 157static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
158{ 158{
159 struct page *page = vmf->page;
159 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 160 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
160 struct buffer_head *di_bh = NULL; 161 struct buffer_head *di_bh = NULL;
161 sigset_t blocked, oldset; 162 sigset_t blocked, oldset;
@@ -196,7 +197,8 @@ out:
196 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 197 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
197 if (ret2 < 0) 198 if (ret2 < 0)
198 mlog_errno(ret2); 199 mlog_errno(ret2);
199 200 if (ret)
201 ret = VM_FAULT_SIGBUS;
200 return ret; 202 return ret;
201} 203}
202 204
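
The mmap.c hunk above reflects the VFS-wide ->page_mkwrite signature change: the callback now receives a struct vm_fault and must return VM_FAULT_* codes rather than a negative errno. A minimal sketch of the new convention, with a hypothetical helper standing in for the filesystem's actual write preparation:

#include <linux/mm.h>

/* hypothetical helper standing in for the real write preparation */
static int ex_prepare_write(struct file *file, struct page *page)
{
	return 0;
}

static int ex_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;	/* the page now travels in vmf */
	int ret;

	ret = ex_prepare_write(vma->vm_file, page);
	if (ret)
		return VM_FAULT_SIGBUS;	/* map errno to a fault code */
	return 0;
}
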
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 084aba86c3b2..2220f93f668b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
80 struct inode **ret_orphan_dir, 80 struct inode **ret_orphan_dir,
81 struct inode *inode, 81 struct inode *inode,
82 char *name, 82 char *name,
83 struct buffer_head **de_bh); 83 struct ocfs2_dir_lookup_result *lookup);
84 84
85static int ocfs2_orphan_add(struct ocfs2_super *osb, 85static int ocfs2_orphan_add(struct ocfs2_super *osb,
86 handle_t *handle, 86 handle_t *handle,
87 struct inode *inode, 87 struct inode *inode,
88 struct ocfs2_dinode *fe, 88 struct ocfs2_dinode *fe,
89 char *name, 89 char *name,
90 struct buffer_head *de_bh, 90 struct ocfs2_dir_lookup_result *lookup,
91 struct inode *orphan_dir_inode); 91 struct inode *orphan_dir_inode);
92 92
93static int ocfs2_create_symlink_data(struct ocfs2_super *osb, 93static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
@@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir,
228 struct ocfs2_super *osb; 228 struct ocfs2_super *osb;
229 struct ocfs2_dinode *dirfe; 229 struct ocfs2_dinode *dirfe;
230 struct buffer_head *new_fe_bh = NULL; 230 struct buffer_head *new_fe_bh = NULL;
231 struct buffer_head *de_bh = NULL;
232 struct inode *inode = NULL; 231 struct inode *inode = NULL;
233 struct ocfs2_alloc_context *inode_ac = NULL; 232 struct ocfs2_alloc_context *inode_ac = NULL;
234 struct ocfs2_alloc_context *data_ac = NULL; 233 struct ocfs2_alloc_context *data_ac = NULL;
235 struct ocfs2_alloc_context *xattr_ac = NULL; 234 struct ocfs2_alloc_context *meta_ac = NULL;
236 int want_clusters = 0; 235 int want_clusters = 0;
236 int want_meta = 0;
237 int xattr_credits = 0; 237 int xattr_credits = 0;
238 struct ocfs2_security_xattr_info si = { 238 struct ocfs2_security_xattr_info si = {
239 .enable = 1, 239 .enable = 1,
240 }; 240 };
241 int did_quota_inode = 0; 241 int did_quota_inode = 0;
242 struct ocfs2_dir_lookup_result lookup = { NULL, };
242 243
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 244 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 245 (unsigned long)dev, dentry->d_name.len,
@@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir,
254 return status; 255 return status;
255 } 256 }
256 257
257 if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { 258 if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
258 status = -EMLINK; 259 status = -EMLINK;
259 goto leave; 260 goto leave;
260 } 261 }
261 262
262 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 263 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
263 if (!dirfe->i_links_count) { 264 if (!ocfs2_read_links_count(dirfe)) {
264 /* can't make a file in a deleted directory. */ 265 /* can't make a file in a deleted directory. */
265 status = -ENOENT; 266 status = -ENOENT;
266 goto leave; 267 goto leave;
@@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir,
274 /* get a spot inside the dir. */ 275 /* get a spot inside the dir. */
275 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 276 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
276 dentry->d_name.name, 277 dentry->d_name.name,
277 dentry->d_name.len, &de_bh); 278 dentry->d_name.len, &lookup);
278 if (status < 0) { 279 if (status < 0) {
279 mlog_errno(status); 280 mlog_errno(status);
280 goto leave; 281 goto leave;
@@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir,
308 309
309 /* calculate meta data/clusters for setting security and acl xattr */ 310 /* calculate meta data/clusters for setting security and acl xattr */
310 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, 311 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
311 &si, &want_clusters, 312 &si, &want_clusters,
312 &xattr_credits, &xattr_ac); 313 &xattr_credits, &want_meta);
313 if (status < 0) { 314 if (status < 0) {
314 mlog_errno(status); 315 mlog_errno(status);
315 goto leave; 316 goto leave;
316 } 317 }
317 318
318 /* Reserve a cluster if creating an extent based directory. */ 319 /* Reserve a cluster if creating an extent based directory. */
319 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) 320 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
320 want_clusters += 1; 321 want_clusters += 1;
321 322
323 /* Dir indexing requires extra space as well */
324 if (ocfs2_supports_indexed_dirs(osb))
325 want_meta++;
326 }
327
328 status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
329 if (status < 0) {
330 if (status != -ENOSPC)
331 mlog_errno(status);
332 goto leave;
333 }
334
322 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); 335 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
323 if (status < 0) { 336 if (status < 0) {
324 if (status != -ENOSPC) 337 if (status != -ENOSPC)
@@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir,
326 goto leave; 339 goto leave;
327 } 340 }
328 341
329 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) + 342 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
330 xattr_credits); 343 S_ISDIR(mode),
344 xattr_credits));
331 if (IS_ERR(handle)) { 345 if (IS_ERR(handle)) {
332 status = PTR_ERR(handle); 346 status = PTR_ERR(handle);
333 handle = NULL; 347 handle = NULL;
@@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir,
355 369
356 if (S_ISDIR(mode)) { 370 if (S_ISDIR(mode)) {
357 status = ocfs2_fill_new_dir(osb, handle, dir, inode, 371 status = ocfs2_fill_new_dir(osb, handle, dir, inode,
358 new_fe_bh, data_ac); 372 new_fe_bh, data_ac, meta_ac);
359 if (status < 0) { 373 if (status < 0) {
360 mlog_errno(status); 374 mlog_errno(status);
361 goto leave; 375 goto leave;
@@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir,
367 mlog_errno(status); 381 mlog_errno(status);
368 goto leave; 382 goto leave;
369 } 383 }
370 le16_add_cpu(&dirfe->i_links_count, 1); 384 ocfs2_add_links_count(dirfe, 1);
371 status = ocfs2_journal_dirty(handle, parent_fe_bh); 385 status = ocfs2_journal_dirty(handle, parent_fe_bh);
372 if (status < 0) { 386 if (status < 0) {
373 mlog_errno(status); 387 mlog_errno(status);
@@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir,
377 } 391 }
378 392
379 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, 393 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
380 xattr_ac, data_ac); 394 meta_ac, data_ac);
381 if (status < 0) { 395 if (status < 0) {
382 mlog_errno(status); 396 mlog_errno(status);
383 goto leave; 397 goto leave;
@@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir,
385 399
386 if (si.enable) { 400 if (si.enable) {
387 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, 401 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
388 xattr_ac, data_ac); 402 meta_ac, data_ac);
389 if (status < 0) { 403 if (status < 0) {
390 mlog_errno(status); 404 mlog_errno(status);
391 goto leave; 405 goto leave;
@@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir,
394 408
395 status = ocfs2_add_entry(handle, dentry, inode, 409 status = ocfs2_add_entry(handle, dentry, inode,
396 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 410 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
397 de_bh); 411 &lookup);
398 if (status < 0) { 412 if (status < 0) {
399 mlog_errno(status); 413 mlog_errno(status);
400 goto leave; 414 goto leave;
@@ -423,11 +437,12 @@ leave:
423 mlog(0, "Disk is full\n"); 437 mlog(0, "Disk is full\n");
424 438
425 brelse(new_fe_bh); 439 brelse(new_fe_bh);
426 brelse(de_bh);
427 brelse(parent_fe_bh); 440 brelse(parent_fe_bh);
428 kfree(si.name); 441 kfree(si.name);
429 kfree(si.value); 442 kfree(si.value);
430 443
444 ocfs2_free_dir_lookup_result(&lookup);
445
431 if ((status < 0) && inode) { 446 if ((status < 0) && inode) {
432 clear_nlink(inode); 447 clear_nlink(inode);
433 iput(inode); 448 iput(inode);
@@ -439,8 +454,8 @@ leave:
439 if (data_ac) 454 if (data_ac)
440 ocfs2_free_alloc_context(data_ac); 455 ocfs2_free_alloc_context(data_ac);
441 456
442 if (xattr_ac) 457 if (meta_ac)
443 ocfs2_free_alloc_context(xattr_ac); 458 ocfs2_free_alloc_context(meta_ac);
444 459
445 mlog_exit(status); 460 mlog_exit(status);
446 461
@@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
462 struct ocfs2_extent_list *fel; 477 struct ocfs2_extent_list *fel;
463 u64 fe_blkno = 0; 478 u64 fe_blkno = 0;
464 u16 suballoc_bit; 479 u16 suballoc_bit;
480 u16 feat;
465 481
466 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, 482 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
467 inode->i_mode, (unsigned long)dev, dentry->d_name.len, 483 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
@@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
469 485
470 *new_fe_bh = NULL; 486 *new_fe_bh = NULL;
471 487
472 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, 488 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
473 &fe_blkno); 489 inode_ac, &suballoc_bit, &fe_blkno);
474 if (status < 0) { 490 if (status < 0) {
475 mlog_errno(status); 491 mlog_errno(status);
476 goto leave; 492 goto leave;
@@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
513 fe->i_mode = cpu_to_le16(inode->i_mode); 529 fe->i_mode = cpu_to_le16(inode->i_mode);
514 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 530 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
515 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); 531 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
516 fe->i_links_count = cpu_to_le16(inode->i_nlink); 532
533 ocfs2_set_links_count(fe, inode->i_nlink);
517 534
518 fe->i_last_eb_blk = 0; 535 fe->i_last_eb_blk = 0;
519 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); 536 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
@@ -525,14 +542,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
525 fe->i_dtime = 0; 542 fe->i_dtime = 0;
526 543
527 /* 544 /*
528 * If supported, directories start with inline data. 545 * If supported, directories start with inline data. If inline
546 * isn't supported, but indexing is, we start them as indexed.
529 */ 547 */
548 feat = le16_to_cpu(fe->i_dyn_features);
530 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { 549 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
531 u16 feat = le16_to_cpu(fe->i_dyn_features);
532
533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); 550 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
534 551
535 fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb)); 552 fe->id2.i_data.id_count = cpu_to_le16(
553 ocfs2_max_inline_data_with_xattr(osb->sb, fe));
536 } else { 554 } else {
537 fel = &fe->id2.i_list; 555 fel = &fe->id2.i_list;
538 fel->l_tree_depth = 0; 556 fel->l_tree_depth = 0;
@@ -607,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry,
607 int err; 625 int err;
608 struct buffer_head *fe_bh = NULL; 626 struct buffer_head *fe_bh = NULL;
609 struct buffer_head *parent_fe_bh = NULL; 627 struct buffer_head *parent_fe_bh = NULL;
610 struct buffer_head *de_bh = NULL;
611 struct ocfs2_dinode *fe = NULL; 628 struct ocfs2_dinode *fe = NULL;
612 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 629 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
630 struct ocfs2_dir_lookup_result lookup = { NULL, };
613 631
614 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 632 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
615 old_dentry->d_name.len, old_dentry->d_name.name, 633 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -637,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry,
637 655
638 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 656 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
639 dentry->d_name.name, 657 dentry->d_name.name,
640 dentry->d_name.len, &de_bh); 658 dentry->d_name.len, &lookup);
641 if (err < 0) { 659 if (err < 0) {
642 mlog_errno(err); 660 mlog_errno(err);
643 goto out; 661 goto out;
@@ -651,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry,
651 } 669 }
652 670
653 fe = (struct ocfs2_dinode *) fe_bh->b_data; 671 fe = (struct ocfs2_dinode *) fe_bh->b_data;
654 if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { 672 if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
655 err = -EMLINK; 673 err = -EMLINK;
656 goto out_unlock_inode; 674 goto out_unlock_inode;
657 } 675 }
@@ -673,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry,
673 691
674 inc_nlink(inode); 692 inc_nlink(inode);
675 inode->i_ctime = CURRENT_TIME; 693 inode->i_ctime = CURRENT_TIME;
676 fe->i_links_count = cpu_to_le16(inode->i_nlink); 694 ocfs2_set_links_count(fe, inode->i_nlink);
677 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 695 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
678 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 696 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
679 697
680 err = ocfs2_journal_dirty(handle, fe_bh); 698 err = ocfs2_journal_dirty(handle, fe_bh);
681 if (err < 0) { 699 if (err < 0) {
682 le16_add_cpu(&fe->i_links_count, -1); 700 ocfs2_add_links_count(fe, -1);
683 drop_nlink(inode); 701 drop_nlink(inode);
684 mlog_errno(err); 702 mlog_errno(err);
685 goto out_commit; 703 goto out_commit;
@@ -687,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry,
687 705
688 err = ocfs2_add_entry(handle, dentry, inode, 706 err = ocfs2_add_entry(handle, dentry, inode,
689 OCFS2_I(inode)->ip_blkno, 707 OCFS2_I(inode)->ip_blkno,
690 parent_fe_bh, de_bh); 708 parent_fe_bh, &lookup);
691 if (err) { 709 if (err) {
692 le16_add_cpu(&fe->i_links_count, -1); 710 ocfs2_add_links_count(fe, -1);
693 drop_nlink(inode); 711 drop_nlink(inode);
694 mlog_errno(err); 712 mlog_errno(err);
695 goto out_commit; 713 goto out_commit;
@@ -713,10 +731,11 @@ out_unlock_inode:
713out: 731out:
714 ocfs2_inode_unlock(dir, 1); 732 ocfs2_inode_unlock(dir, 1);
715 733
716 brelse(de_bh);
717 brelse(fe_bh); 734 brelse(fe_bh);
718 brelse(parent_fe_bh); 735 brelse(parent_fe_bh);
719 736
737 ocfs2_free_dir_lookup_result(&lookup);
738
720 mlog_exit(err); 739 mlog_exit(err);
721 740
722 return err; 741 return err;
@@ -765,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir,
765 struct buffer_head *fe_bh = NULL; 784 struct buffer_head *fe_bh = NULL;
766 struct buffer_head *parent_node_bh = NULL; 785 struct buffer_head *parent_node_bh = NULL;
767 handle_t *handle = NULL; 786 handle_t *handle = NULL;
768 struct ocfs2_dir_entry *dirent = NULL;
769 struct buffer_head *dirent_bh = NULL;
770 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; 787 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
771 struct buffer_head *orphan_entry_bh = NULL; 788 struct ocfs2_dir_lookup_result lookup = { NULL, };
789 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
772 790
773 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
774 dentry->d_name.len, dentry->d_name.name); 792 dentry->d_name.len, dentry->d_name.name);
@@ -790,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir,
790 } 808 }
791 809
792 status = ocfs2_find_files_on_disk(dentry->d_name.name, 810 status = ocfs2_find_files_on_disk(dentry->d_name.name,
793 dentry->d_name.len, &blkno, 811 dentry->d_name.len, &blkno, dir,
794 dir, &dirent_bh, &dirent); 812 &lookup);
795 if (status < 0) { 813 if (status < 0) {
796 if (status != -ENOENT) 814 if (status != -ENOENT)
797 mlog_errno(status); 815 mlog_errno(status);
@@ -816,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir,
816 child_locked = 1; 834 child_locked = 1;
817 835
818 if (S_ISDIR(inode->i_mode)) { 836 if (S_ISDIR(inode->i_mode)) {
819 if (!ocfs2_empty_dir(inode)) { 837 if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
820 status = -ENOTEMPTY;
821 goto leave;
822 } else if (inode->i_nlink != 2) {
823 status = -ENOTEMPTY; 838 status = -ENOTEMPTY;
824 goto leave; 839 goto leave;
825 } 840 }
@@ -835,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir,
835 850
836 if (inode_is_unlinkable(inode)) { 851 if (inode_is_unlinkable(inode)) {
837 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, 852 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
838 orphan_name, 853 orphan_name, &orphan_insert);
839 &orphan_entry_bh);
840 if (status < 0) { 854 if (status < 0) {
841 mlog_errno(status); 855 mlog_errno(status);
842 goto leave; 856 goto leave;
@@ -862,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir,
862 876
863 if (inode_is_unlinkable(inode)) { 877 if (inode_is_unlinkable(inode)) {
864 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 878 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
865 orphan_entry_bh, orphan_dir); 879 &orphan_insert, orphan_dir);
866 if (status < 0) { 880 if (status < 0) {
867 mlog_errno(status); 881 mlog_errno(status);
868 goto leave; 882 goto leave;
@@ -870,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir,
870 } 884 }
871 885
872 /* delete the name from the parent dir */ 886 /* delete the name from the parent dir */
873 status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh); 887 status = ocfs2_delete_entry(handle, dir, &lookup);
874 if (status < 0) { 888 if (status < 0) {
875 mlog_errno(status); 889 mlog_errno(status);
876 goto leave; 890 goto leave;
@@ -879,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir,
879 if (S_ISDIR(inode->i_mode)) 893 if (S_ISDIR(inode->i_mode))
880 drop_nlink(inode); 894 drop_nlink(inode);
881 drop_nlink(inode); 895 drop_nlink(inode);
882 fe->i_links_count = cpu_to_le16(inode->i_nlink); 896 ocfs2_set_links_count(fe, inode->i_nlink);
883 897
884 status = ocfs2_journal_dirty(handle, fe_bh); 898 status = ocfs2_journal_dirty(handle, fe_bh);
885 if (status < 0) { 899 if (status < 0) {
@@ -915,9 +929,10 @@ leave:
915 } 929 }
916 930
917 brelse(fe_bh); 931 brelse(fe_bh);
918 brelse(dirent_bh);
919 brelse(parent_node_bh); 932 brelse(parent_node_bh);
920 brelse(orphan_entry_bh); 933
934 ocfs2_free_dir_lookup_result(&orphan_insert);
935 ocfs2_free_dir_lookup_result(&lookup);
921 936
922 mlog_exit(status); 937 mlog_exit(status);
923 938
@@ -1003,8 +1018,8 @@ static int ocfs2_rename(struct inode *old_dir,
1003 struct inode *new_dir, 1018 struct inode *new_dir,
1004 struct dentry *new_dentry) 1019 struct dentry *new_dentry)
1005{ 1020{
1006 int status = 0, rename_lock = 0, parents_locked = 0; 1021 int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
1007 int old_child_locked = 0, new_child_locked = 0; 1022 int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
1008 struct inode *old_inode = old_dentry->d_inode; 1023 struct inode *old_inode = old_dentry->d_inode;
1009 struct inode *new_inode = new_dentry->d_inode; 1024 struct inode *new_inode = new_dentry->d_inode;
1010 struct inode *orphan_dir = NULL; 1025 struct inode *orphan_dir = NULL;
@@ -1019,13 +1034,13 @@ static int ocfs2_rename(struct inode *old_dir,
1019 handle_t *handle = NULL; 1034 handle_t *handle = NULL;
1020 struct buffer_head *old_dir_bh = NULL; 1035 struct buffer_head *old_dir_bh = NULL;
1021 struct buffer_head *new_dir_bh = NULL; 1036 struct buffer_head *new_dir_bh = NULL;
1022 struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
1023 *new_de = NULL;
1024 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
1025 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1026 // this is the 1st dirent bh
1027 nlink_t old_dir_nlink = old_dir->i_nlink; 1037 nlink_t old_dir_nlink = old_dir->i_nlink;
1028 struct ocfs2_dinode *old_di; 1038 struct ocfs2_dinode *old_di;
1039 struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
1040 struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
1041 struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
1042 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
1043 struct ocfs2_dir_lookup_result target_insert = { NULL, };
1029 1044
1030 /* At some point it might be nice to break this function up a 1045 /* At some point it might be nice to break this function up a
1031 * bit. */ 1046 * bit. */
@@ -1107,9 +1122,10 @@ static int ocfs2_rename(struct inode *old_dir,
1107 if (S_ISDIR(old_inode->i_mode)) { 1122 if (S_ISDIR(old_inode->i_mode)) {
1108 u64 old_inode_parent; 1123 u64 old_inode_parent;
1109 1124
1125 update_dot_dot = 1;
1110 status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent, 1126 status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
1111 old_inode, &old_inode_de_bh, 1127 old_inode,
1112 &old_inode_dot_dot_de); 1128 &old_inode_dot_dot_res);
1113 if (status) { 1129 if (status) {
1114 status = -EIO; 1130 status = -EIO;
1115 goto bail; 1131 goto bail;
@@ -1121,7 +1137,7 @@ static int ocfs2_rename(struct inode *old_dir,
1121 } 1137 }
1122 1138
1123 if (!new_inode && new_dir != old_dir && 1139 if (!new_inode && new_dir != old_dir &&
1124 new_dir->i_nlink >= OCFS2_LINK_MAX) { 1140 new_dir->i_nlink >= ocfs2_link_max(osb)) {
1125 status = -EMLINK; 1141 status = -EMLINK;
1126 goto bail; 1142 goto bail;
1127 } 1143 }
@@ -1150,8 +1166,8 @@ static int ocfs2_rename(struct inode *old_dir,
1150 * to delete it */ 1166 * to delete it */
1151 status = ocfs2_find_files_on_disk(new_dentry->d_name.name, 1167 status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
1152 new_dentry->d_name.len, 1168 new_dentry->d_name.len,
1153 &newfe_blkno, new_dir, &new_de_bh, 1169 &newfe_blkno, new_dir,
1154 &new_de); 1170 &target_lookup_res);
1155 /* The only error we allow here is -ENOENT because the new 1171 /* The only error we allow here is -ENOENT because the new
1156 * file not existing is perfectly valid. */ 1172 * file not existing is perfectly valid. */
1157 if ((status < 0) && (status != -ENOENT)) { 1173 if ((status < 0) && (status != -ENOENT)) {
@@ -1160,8 +1176,10 @@ static int ocfs2_rename(struct inode *old_dir,
1160 mlog_errno(status); 1176 mlog_errno(status);
1161 goto bail; 1177 goto bail;
1162 } 1178 }
1179 if (status == 0)
1180 target_exists = 1;
1163 1181
1164 if (!new_de && new_inode) { 1182 if (!target_exists && new_inode) {
1165 /* 1183 /*
1166 * Target was unlinked by another node while we were 1184 * Target was unlinked by another node while we were
1167 * waiting to get to ocfs2_rename(). There isn't 1185 * waiting to get to ocfs2_rename(). There isn't
@@ -1174,7 +1192,7 @@ static int ocfs2_rename(struct inode *old_dir,
1174 1192
1175 /* In case we need to overwrite an existing file, we blow it 1193 /* In case we need to overwrite an existing file, we blow it
1176 * away first */ 1194 * away first */
1177 if (new_de) { 1195 if (target_exists) {
1178 /* VFS didn't think there existed an inode here, but 1196 /* VFS didn't think there existed an inode here, but
1179 * someone else in the cluster must have raced our 1197 * someone else in the cluster must have raced our
1180 * rename to create one. Today we error cleanly, in 1198 * rename to create one. Today we error cleanly, in
@@ -1215,8 +1233,8 @@ static int ocfs2_rename(struct inode *old_dir,
1215 1233
1216 newfe = (struct ocfs2_dinode *) newfe_bh->b_data; 1234 newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
1217 1235
1218 mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu " 1236 mlog(0, "aha rename over existing... new_blkno=%llu "
1219 "newfebh=%p bhblocknr=%llu\n", new_de, 1237 "newfebh=%p bhblocknr=%llu\n",
1220 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? 1238 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
1221 (unsigned long long)newfe_bh->b_blocknr : 0ULL); 1239 (unsigned long long)newfe_bh->b_blocknr : 0ULL);
1222 1240
@@ -1224,7 +1242,7 @@ static int ocfs2_rename(struct inode *old_dir,
1224 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1242 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1225 new_inode, 1243 new_inode,
1226 orphan_name, 1244 orphan_name,
1227 &orphan_entry_bh); 1245 &orphan_insert);
1228 if (status < 0) { 1246 if (status < 0) {
1229 mlog_errno(status); 1247 mlog_errno(status);
1230 goto bail; 1248 goto bail;
@@ -1242,7 +1260,7 @@ static int ocfs2_rename(struct inode *old_dir,
1242 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, 1260 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
1243 new_dentry->d_name.name, 1261 new_dentry->d_name.name,
1244 new_dentry->d_name.len, 1262 new_dentry->d_name.len,
1245 &insert_entry_bh); 1263 &target_insert);
1246 if (status < 0) { 1264 if (status < 0) {
1247 mlog_errno(status); 1265 mlog_errno(status);
1248 goto bail; 1266 goto bail;
@@ -1257,10 +1275,10 @@ static int ocfs2_rename(struct inode *old_dir,
1257 goto bail; 1275 goto bail;
1258 } 1276 }
1259 1277
1260 if (new_de) { 1278 if (target_exists) {
1261 if (S_ISDIR(new_inode->i_mode)) { 1279 if (S_ISDIR(new_inode->i_mode)) {
1262 if (!ocfs2_empty_dir(new_inode) || 1280 if (new_inode->i_nlink != 2 ||
1263 new_inode->i_nlink != 2) { 1281 !ocfs2_empty_dir(new_inode)) {
1264 status = -ENOTEMPTY; 1282 status = -ENOTEMPTY;
1265 goto bail; 1283 goto bail;
1266 } 1284 }
@@ -1273,10 +1291,10 @@ static int ocfs2_rename(struct inode *old_dir,
1273 } 1291 }
1274 1292
1275 if (S_ISDIR(new_inode->i_mode) || 1293 if (S_ISDIR(new_inode->i_mode) ||
1276 (newfe->i_links_count == cpu_to_le16(1))){ 1294 (ocfs2_read_links_count(newfe) == 1)) {
1277 status = ocfs2_orphan_add(osb, handle, new_inode, 1295 status = ocfs2_orphan_add(osb, handle, new_inode,
1278 newfe, orphan_name, 1296 newfe, orphan_name,
1279 orphan_entry_bh, orphan_dir); 1297 &orphan_insert, orphan_dir);
1280 if (status < 0) { 1298 if (status < 0) {
1281 mlog_errno(status); 1299 mlog_errno(status);
1282 goto bail; 1300 goto bail;
@@ -1284,8 +1302,8 @@ static int ocfs2_rename(struct inode *old_dir,
1284 } 1302 }
1285 1303
1286 /* change the dirent to point to the correct inode */ 1304 /* change the dirent to point to the correct inode */
1287 status = ocfs2_update_entry(new_dir, handle, new_de_bh, 1305 status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
1288 new_de, old_inode); 1306 old_inode);
1289 if (status < 0) { 1307 if (status < 0) {
1290 mlog_errno(status); 1308 mlog_errno(status);
1291 goto bail; 1309 goto bail;
@@ -1293,9 +1311,9 @@ static int ocfs2_rename(struct inode *old_dir,
1293 new_dir->i_version++; 1311 new_dir->i_version++;
1294 1312
1295 if (S_ISDIR(new_inode->i_mode)) 1313 if (S_ISDIR(new_inode->i_mode))
1296 newfe->i_links_count = 0; 1314 ocfs2_set_links_count(newfe, 0);
1297 else 1315 else
1298 le16_add_cpu(&newfe->i_links_count, -1); 1316 ocfs2_add_links_count(newfe, -1);
1299 1317
1300 status = ocfs2_journal_dirty(handle, newfe_bh); 1318 status = ocfs2_journal_dirty(handle, newfe_bh);
1301 if (status < 0) { 1319 if (status < 0) {
@@ -1306,7 +1324,7 @@ static int ocfs2_rename(struct inode *old_dir,
1306 /* if the name was not found in new_dir, add it now */ 1324 /* if the name was not found in new_dir, add it now */
1307 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1325 status = ocfs2_add_entry(handle, new_dentry, old_inode,
1308 OCFS2_I(old_inode)->ip_blkno, 1326 OCFS2_I(old_inode)->ip_blkno,
1309 new_dir_bh, insert_entry_bh); 1327 new_dir_bh, &target_insert);
1310 } 1328 }
1311 1329
1312 old_inode->i_ctime = CURRENT_TIME; 1330 old_inode->i_ctime = CURRENT_TIME;
@@ -1333,15 +1351,13 @@ static int ocfs2_rename(struct inode *old_dir,
1333 * because the insert might have changed the type of directory 1351 * because the insert might have changed the type of directory
1334 * we're dealing with. 1352 * we're dealing with.
1335 */ 1353 */
1336 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, 1354 status = ocfs2_find_entry(old_dentry->d_name.name,
1337 old_dentry->d_name.len, 1355 old_dentry->d_name.len, old_dir,
1338 old_dir, &old_de); 1356 &old_entry_lookup);
1339 if (!old_de_bh) { 1357 if (status)
1340 status = -EIO;
1341 goto bail; 1358 goto bail;
1342 }
1343 1359
1344 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); 1360 status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
1345 if (status < 0) { 1361 if (status < 0) {
1346 mlog_errno(status); 1362 mlog_errno(status);
1347 goto bail; 1363 goto bail;
@@ -1352,9 +1368,10 @@ static int ocfs2_rename(struct inode *old_dir,
1352 new_inode->i_ctime = CURRENT_TIME; 1368 new_inode->i_ctime = CURRENT_TIME;
1353 } 1369 }
1354 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; 1370 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1355 if (old_inode_de_bh) { 1371
1356 status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh, 1372 if (update_dot_dot) {
1357 old_inode_dot_dot_de, new_dir); 1373 status = ocfs2_update_entry(old_inode, handle,
1374 &old_inode_dot_dot_res, new_dir);
1358 old_dir->i_nlink--; 1375 old_dir->i_nlink--;
1359 if (new_inode) { 1376 if (new_inode) {
1360 new_inode->i_nlink--; 1377 new_inode->i_nlink--;
@@ -1390,14 +1407,13 @@ static int ocfs2_rename(struct inode *old_dir,
1390 } else { 1407 } else {
1391 struct ocfs2_dinode *fe; 1408 struct ocfs2_dinode *fe;
1392 status = ocfs2_journal_access_di(handle, old_dir, 1409 status = ocfs2_journal_access_di(handle, old_dir,
1393 old_dir_bh, 1410 old_dir_bh,
1394 OCFS2_JOURNAL_ACCESS_WRITE); 1411 OCFS2_JOURNAL_ACCESS_WRITE);
1395 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1412 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1396 fe->i_links_count = cpu_to_le16(old_dir->i_nlink); 1413 ocfs2_set_links_count(fe, old_dir->i_nlink);
1397 status = ocfs2_journal_dirty(handle, old_dir_bh); 1414 status = ocfs2_journal_dirty(handle, old_dir_bh);
1398 } 1415 }
1399 } 1416 }
1400
1401 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1417 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
1402 status = 0; 1418 status = 0;
1403bail: 1419bail:
@@ -1428,13 +1444,17 @@ bail:
1428 1444
1429 if (new_inode) 1445 if (new_inode)
1430 iput(new_inode); 1446 iput(new_inode);
1447
1448 ocfs2_free_dir_lookup_result(&target_lookup_res);
1449 ocfs2_free_dir_lookup_result(&old_entry_lookup);
1450 ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
1451 ocfs2_free_dir_lookup_result(&orphan_insert);
1452 ocfs2_free_dir_lookup_result(&target_insert);
1453
1431 brelse(newfe_bh); 1454 brelse(newfe_bh);
1432 brelse(old_inode_bh); 1455 brelse(old_inode_bh);
1433 brelse(old_dir_bh); 1456 brelse(old_dir_bh);
1434 brelse(new_dir_bh); 1457 brelse(new_dir_bh);
1435 brelse(new_de_bh);
1436 brelse(old_de_bh);
1437 brelse(old_inode_de_bh);
1438 brelse(orphan_entry_bh); 1458 brelse(orphan_entry_bh);
1439 brelse(insert_entry_bh); 1459 brelse(insert_entry_bh);
1440 1460
@@ -1557,7 +1577,6 @@ static int ocfs2_symlink(struct inode *dir,
1557 struct inode *inode = NULL; 1577 struct inode *inode = NULL;
1558 struct super_block *sb; 1578 struct super_block *sb;
1559 struct buffer_head *new_fe_bh = NULL; 1579 struct buffer_head *new_fe_bh = NULL;
1560 struct buffer_head *de_bh = NULL;
1561 struct buffer_head *parent_fe_bh = NULL; 1580 struct buffer_head *parent_fe_bh = NULL;
1562 struct ocfs2_dinode *fe = NULL; 1581 struct ocfs2_dinode *fe = NULL;
1563 struct ocfs2_dinode *dirfe; 1582 struct ocfs2_dinode *dirfe;
@@ -1571,6 +1590,7 @@ static int ocfs2_symlink(struct inode *dir,
1571 .enable = 1, 1590 .enable = 1,
1572 }; 1591 };
1573 int did_quota = 0, did_quota_inode = 0; 1592 int did_quota = 0, did_quota_inode = 0;
1593 struct ocfs2_dir_lookup_result lookup = { NULL, };
1574 1594
1575 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1595 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1576 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1596 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1591,7 +1611,7 @@ static int ocfs2_symlink(struct inode *dir,
1591 } 1611 }
1592 1612
1593 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1613 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1594 if (!dirfe->i_links_count) { 1614 if (!ocfs2_read_links_count(dirfe)) {
1595 /* can't make a file in a deleted directory. */ 1615 /* can't make a file in a deleted directory. */
1596 status = -ENOENT; 1616 status = -ENOENT;
1597 goto bail; 1617 goto bail;
@@ -1604,7 +1624,7 @@ static int ocfs2_symlink(struct inode *dir,
1604 1624
1605 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, 1625 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1606 dentry->d_name.name, 1626 dentry->d_name.name,
1607 dentry->d_name.len, &de_bh); 1627 dentry->d_name.len, &lookup);
1608 if (status < 0) { 1628 if (status < 0) {
1609 mlog_errno(status); 1629 mlog_errno(status);
1610 goto bail; 1630 goto bail;
@@ -1743,7 +1763,7 @@ static int ocfs2_symlink(struct inode *dir,
1743 1763
1744 status = ocfs2_add_entry(handle, dentry, inode, 1764 status = ocfs2_add_entry(handle, dentry, inode,
1745 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1765 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1746 de_bh); 1766 &lookup);
1747 if (status < 0) { 1767 if (status < 0) {
1748 mlog_errno(status); 1768 mlog_errno(status);
1749 goto bail; 1769 goto bail;
@@ -1771,9 +1791,9 @@ bail:
1771 1791
1772 brelse(new_fe_bh); 1792 brelse(new_fe_bh);
1773 brelse(parent_fe_bh); 1793 brelse(parent_fe_bh);
1774 brelse(de_bh);
1775 kfree(si.name); 1794 kfree(si.name);
1776 kfree(si.value); 1795 kfree(si.value);
1796 ocfs2_free_dir_lookup_result(&lookup);
1777 if (inode_ac) 1797 if (inode_ac)
1778 ocfs2_free_alloc_context(inode_ac); 1798 ocfs2_free_alloc_context(inode_ac);
1779 if (data_ac) 1799 if (data_ac)
@@ -1825,7 +1845,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1825 struct inode **ret_orphan_dir, 1845 struct inode **ret_orphan_dir,
1826 struct inode *inode, 1846 struct inode *inode,
1827 char *name, 1847 char *name,
1828 struct buffer_head **de_bh) 1848 struct ocfs2_dir_lookup_result *lookup)
1829{ 1849{
1830 struct inode *orphan_dir_inode; 1850 struct inode *orphan_dir_inode;
1831 struct buffer_head *orphan_dir_bh = NULL; 1851 struct buffer_head *orphan_dir_bh = NULL;
@@ -1856,7 +1876,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1856 1876
1857 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1877 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1858 orphan_dir_bh, name, 1878 orphan_dir_bh, name,
1859 OCFS2_ORPHAN_NAMELEN, de_bh); 1879 OCFS2_ORPHAN_NAMELEN, lookup);
1860 if (status < 0) { 1880 if (status < 0) {
1861 ocfs2_inode_unlock(orphan_dir_inode, 1); 1881 ocfs2_inode_unlock(orphan_dir_inode, 1);
1862 1882
@@ -1883,7 +1903,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1883 struct inode *inode, 1903 struct inode *inode,
1884 struct ocfs2_dinode *fe, 1904 struct ocfs2_dinode *fe,
1885 char *name, 1905 char *name,
1886 struct buffer_head *de_bh, 1906 struct ocfs2_dir_lookup_result *lookup,
1887 struct inode *orphan_dir_inode) 1907 struct inode *orphan_dir_inode)
1888{ 1908{
1889 struct buffer_head *orphan_dir_bh = NULL; 1909 struct buffer_head *orphan_dir_bh = NULL;
@@ -1909,8 +1929,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1909 * underneath us... */ 1929 * underneath us... */
1910 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 1930 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1911 if (S_ISDIR(inode->i_mode)) 1931 if (S_ISDIR(inode->i_mode))
1912 le16_add_cpu(&orphan_fe->i_links_count, 1); 1932 ocfs2_add_links_count(orphan_fe, 1);
1913 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 1933 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1914 1934
1915 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 1935 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
1916 if (status < 0) { 1936 if (status < 0) {
@@ -1921,7 +1941,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1921 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1941 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1922 OCFS2_ORPHAN_NAMELEN, inode, 1942 OCFS2_ORPHAN_NAMELEN, inode,
1923 OCFS2_I(inode)->ip_blkno, 1943 OCFS2_I(inode)->ip_blkno,
1924 orphan_dir_bh, de_bh); 1944 orphan_dir_bh, lookup);
1925 if (status < 0) { 1945 if (status < 0) {
1926 mlog_errno(status); 1946 mlog_errno(status);
1927 goto leave; 1947 goto leave;
@@ -1954,8 +1974,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1954 char name[OCFS2_ORPHAN_NAMELEN + 1]; 1974 char name[OCFS2_ORPHAN_NAMELEN + 1];
1955 struct ocfs2_dinode *orphan_fe; 1975 struct ocfs2_dinode *orphan_fe;
1956 int status = 0; 1976 int status = 0;
1957 struct buffer_head *target_de_bh = NULL; 1977 struct ocfs2_dir_lookup_result lookup = { NULL, };
1958 struct ocfs2_dir_entry *target_de = NULL;
1959 1978
1960 mlog_entry_void(); 1979 mlog_entry_void();
1961 1980
@@ -1970,17 +1989,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1970 OCFS2_ORPHAN_NAMELEN); 1989 OCFS2_ORPHAN_NAMELEN);
1971 1990
1972 /* find its spot in the orphan directory */ 1991 /* find its spot in the orphan directory */
1973 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, 1992 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
1974 orphan_dir_inode, &target_de); 1993 &lookup);
1975 if (!target_de_bh) { 1994 if (status) {
1976 status = -ENOENT;
1977 mlog_errno(status); 1995 mlog_errno(status);
1978 goto leave; 1996 goto leave;
1979 } 1997 }
1980 1998
1981 /* remove it from the orphan directory */ 1999 /* remove it from the orphan directory */
1982 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, 2000 status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
1983 target_de_bh);
1984 if (status < 0) { 2001 if (status < 0) {
1985 mlog_errno(status); 2002 mlog_errno(status);
1986 goto leave; 2003 goto leave;
@@ -1996,8 +2013,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1996 /* do the i_nlink dance! :) */ 2013 /* do the i_nlink dance! :) */
1997 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2014 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
1998 if (S_ISDIR(inode->i_mode)) 2015 if (S_ISDIR(inode->i_mode))
1999 le16_add_cpu(&orphan_fe->i_links_count, -1); 2016 ocfs2_add_links_count(orphan_fe, -1);
2000 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); 2017 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2001 2018
2002 status = ocfs2_journal_dirty(handle, orphan_dir_bh); 2019 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2003 if (status < 0) { 2020 if (status < 0) {
@@ -2006,7 +2023,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2006 } 2023 }
2007 2024
2008leave: 2025leave:
2009 brelse(target_de_bh); 2026 ocfs2_free_dir_lookup_result(&lookup);
2010 2027
2011 mlog_exit(status); 2028 mlog_exit(status);
2012 return status; 2029 return status;
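
Editor's note: the common thread in the namei.c changes above is the replacement of loose (struct buffer_head *, struct ocfs2_dir_entry *) pairs with a single struct ocfs2_dir_lookup_result, so indexed and unindexed directories can hide behind one type. Every caller follows the same pattern: zero-initialize on the stack, pass by pointer through prepare/find and add/update/delete, and release exactly once in the exit path. A condensed sketch of that pattern using the signatures from this patch; my_insert() itself is hypothetical and error handling is trimmed:

    static int my_insert(struct ocfs2_super *osb, struct inode *dir,
                         struct buffer_head *dir_bh, struct dentry *dentry,
                         struct inode *inode, handle_t *handle)
    {
            struct ocfs2_dir_lookup_result lookup = { NULL, }; /* always zeroed */
            int status;

            status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
                                                  dentry->d_name.name,
                                                  dentry->d_name.len, &lookup);
            if (status >= 0)
                    status = ocfs2_add_entry(handle, dentry, inode,
                                             OCFS2_I(inode)->ip_blkno,
                                             dir_bh, &lookup);

            /* one release in the exit path, no matter which call failed */
            ocfs2_free_dir_lookup_result(&lookup);
            return status;
    }

Freeing unconditionally in the exit path is what lets the hunks above delete the per-call brelse() bookkeeping.
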
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 946d3c34b90b..1386281950db 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,7 @@ enum ocfs2_mount_options
209struct ocfs2_journal; 209struct ocfs2_journal;
210struct ocfs2_slot_info; 210struct ocfs2_slot_info;
211struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_replay_map;
212struct ocfs2_quota_recovery; 213struct ocfs2_quota_recovery;
213struct ocfs2_dentry_lock; 214struct ocfs2_dentry_lock;
214struct ocfs2_super 215struct ocfs2_super
@@ -264,6 +265,7 @@ struct ocfs2_super
264 atomic_t vol_state; 265 atomic_t vol_state;
265 struct mutex recovery_lock; 266 struct mutex recovery_lock;
266 struct ocfs2_recovery_map *recovery_map; 267 struct ocfs2_recovery_map *recovery_map;
268 struct ocfs2_replay_map *replay_map;
267 struct task_struct *recovery_thread_task; 269 struct task_struct *recovery_thread_task;
268 int disable_recovery; 270 int disable_recovery;
269 wait_queue_head_t checkpoint_event; 271 wait_queue_head_t checkpoint_event;
@@ -287,11 +289,6 @@ struct ocfs2_super
287 289
288 u64 la_last_gd; 290 u64 la_last_gd;
289 291
290#ifdef CONFIG_OCFS2_FS_STATS
291 struct dentry *local_alloc_debug;
292 char *local_alloc_debug_buf;
293#endif
294
295 /* Next three fields are for local node slot recovery during 292 /* Next three fields are for local node slot recovery during
296 * mount. */ 293 * mount. */
297 int dirty; 294 int dirty;
@@ -305,9 +302,11 @@ struct ocfs2_super
305 struct ocfs2_cluster_connection *cconn; 302 struct ocfs2_cluster_connection *cconn;
306 struct ocfs2_lock_res osb_super_lockres; 303 struct ocfs2_lock_res osb_super_lockres;
307 struct ocfs2_lock_res osb_rename_lockres; 304 struct ocfs2_lock_res osb_rename_lockres;
305 struct ocfs2_lock_res osb_nfs_sync_lockres;
308 struct ocfs2_dlm_debug *osb_dlm_debug; 306 struct ocfs2_dlm_debug *osb_dlm_debug;
309 307
310 struct dentry *osb_debug_root; 308 struct dentry *osb_debug_root;
309 struct dentry *osb_ctxt;
311 310
312 wait_queue_head_t recovery_event; 311 wait_queue_head_t recovery_event;
313 312
@@ -344,6 +343,12 @@ struct ocfs2_super
344 343
345 /* used to protect metaecc calculation check of xattr. */ 344 /* used to protect metaecc calculation check of xattr. */
346 spinlock_t osb_xattr_lock; 345 spinlock_t osb_xattr_lock;
346
347 unsigned int osb_dx_mask;
348 u32 osb_dx_seed[4];
349
350 /* the group we used to allocate inodes. */
351 u64 osb_inode_alloc_group;
347}; 352};
348 353
349#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 354#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -402,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
402 return 0; 407 return 0;
403} 408}
404 409
410static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
411{
412 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
413 return 1;
414 return 0;
415}
416
417static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
418{
419 if (ocfs2_supports_indexed_dirs(osb))
420 return OCFS2_DX_LINK_MAX;
421 return OCFS2_LINK_MAX;
422}
423
424static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
425{
426 u32 nlink = le16_to_cpu(di->i_links_count);
427 u32 hi = le16_to_cpu(di->i_links_count_hi);
428
429 if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
430 nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
431
432 return nlink;
433}
434
435static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
436{
437 u16 lo, hi;
438
439 lo = nlink;
440 hi = nlink >> OCFS2_LINKS_HI_SHIFT;
441
442 di->i_links_count = cpu_to_le16(lo);
443 di->i_links_count_hi = cpu_to_le16(hi);
444}
445
446static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
447{
448 u32 links = ocfs2_read_links_count(di);
449
450 links += n;
451
452 ocfs2_set_links_count(di, links);
453}
454
405/* set / clear functions because cluster events can make these happen 455/* set / clear functions because cluster events can make these happen
406 * in parallel so we want the transitions to be atomic. this also 456 * in parallel so we want the transitions to be atomic. this also
407 * means that any future flags osb_flags must be protected by spinlock 457 * means that any future flags osb_flags must be protected by spinlock
@@ -482,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
482#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ 532#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
483 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) 533 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
484 534
535#define OCFS2_IS_VALID_DX_ROOT(ptr) \
536 (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
537
538#define OCFS2_IS_VALID_DX_LEAF(ptr) \
539 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
540
485static inline unsigned long ino_from_blkno(struct super_block *sb, 541static inline unsigned long ino_from_blkno(struct super_block *sb,
486 u64 blkno) 542 u64 blkno)
487{ 543{
@@ -532,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
532 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; 588 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
533} 589}
534 590
591static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
592 u64 blocks)
593{
594 int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
595 unsigned int clusters;
596
597 clusters = ocfs2_blocks_to_clusters(sb, blocks);
598 return (u64)clusters << bits;
599}
600
535static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, 601static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
536 u64 bytes) 602 u64 bytes)
537{ 603{
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c7ae45aaa36c..7ab6e9e5e77c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -66,6 +66,8 @@
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" 68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
69 71
70/* Compatibility flags */ 72/* Compatibility flags */
71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 73#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -95,7 +97,8 @@
95 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 97 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
96 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 98 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
97 | OCFS2_FEATURE_INCOMPAT_XATTR \ 99 | OCFS2_FEATURE_INCOMPAT_XATTR \
98 | OCFS2_FEATURE_INCOMPAT_META_ECC) 100 | OCFS2_FEATURE_INCOMPAT_META_ECC \
101 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
99#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 102#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
100 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 103 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
101 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 104 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -151,6 +154,9 @@
151/* Support for extended attributes */ 154/* Support for extended attributes */
152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 155#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
153 156
 157/* Support for indexed directories */
158#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400
159
154/* Metadata checksum and error correction */ 160/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 161#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156 162
@@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
411#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ 417#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
412 OCFS2_DIR_ROUND) & \ 418 OCFS2_DIR_ROUND) & \
413 ~OCFS2_DIR_ROUND) 419 ~OCFS2_DIR_ROUND)
420#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1)
414 421
415#define OCFS2_LINK_MAX 32000 422#define OCFS2_LINK_MAX 32000
423#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U)
424#define OCFS2_LINKS_HI_SHIFT 16
425#define OCFS2_DX_ENTRIES_MAX (0xffffffffU)
416 426
417#define S_SHIFT 12 427#define S_SHIFT 12
418static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { 428static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -628,8 +638,9 @@ struct ocfs2_super_block {
628/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size 638/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
629 for this fs*/ 639 for this fs*/
630 __le16 s_reserved0; 640 __le16 s_reserved0;
631 __le32 s_reserved1; 641 __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
632/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ 642 * s_uuid_hash serves as seed[3]. */
643/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
633/*140*/ 644/*140*/
634 645
635 /* 646 /*
@@ -679,7 +690,7 @@ struct ocfs2_dinode {
679 belongs to */ 690 belongs to */
680 __le16 i_suballoc_bit; /* Bit offset in suballocator 691 __le16 i_suballoc_bit; /* Bit offset in suballocator
681 block group */ 692 block group */
682/*10*/ __le16 i_reserved0; 693/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */
683 __le16 i_xattr_inline_size; 694 __le16 i_xattr_inline_size;
684 __le32 i_clusters; /* Cluster count */ 695 __le32 i_clusters; /* Cluster count */
685 __le32 i_uid; /* Owner UID */ 696 __le32 i_uid; /* Owner UID */
@@ -705,7 +716,8 @@ struct ocfs2_dinode {
705 __le16 i_dyn_features; 716 __le16 i_dyn_features;
706 __le64 i_xattr_loc; 717 __le64 i_xattr_loc;
707/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 718/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6]; 719/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
720 __le64 i_reserved2[5];
709/*B8*/ union { 721/*B8*/ union {
710 __le64 i_pad1; /* Generic way to refer to this 722 __le64 i_pad1; /* Generic way to refer to this
711 64bit union */ 723 64bit union */
@@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer {
781/*40*/ 793/*40*/
782}; 794};
783 795
796 /*
797 * A directory entry in the indexed tree. We don't store the full name here,
798 * but instead provide a pointer to the full dirent in the unindexed tree.
799 *
800 * We also store name_len here so as to reduce the number of leaf blocks we
801 * need to search in case of collisions.
802 */
803struct ocfs2_dx_entry {
804 __le32 dx_major_hash; /* Used to find logical
805 * cluster in index */
806 __le32 dx_minor_hash; /* Lower bits used to find
807 * block in cluster */
808 __le64 dx_dirent_blk; /* Physical block in unindexed
809 * tree holding this dirent. */
810};
811
812struct ocfs2_dx_entry_list {
813 __le32 de_reserved;
814 __le16 de_count; /* Maximum number of entries
815 * possible in de_entries */
816 __le16 de_num_used; /* Current number of
817 * de_entries entries */
818 struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries
819 * in a packed array of
820 * length de_num_used */
821};
822
823#define OCFS2_DX_FLAG_INLINE 0x01
824
825/*
826 * A directory indexing block. Each indexed directory has one of these,
827 * pointed to by ocfs2_dinode.
828 *
829 * This block stores an indexed btree root, and a set of free space
830 * start-of-list pointers.
831 */
832struct ocfs2_dx_root_block {
833 __u8 dr_signature[8]; /* Signature for verification */
834 struct ocfs2_block_check dr_check; /* Error checking */
835 __le16 dr_suballoc_slot; /* Slot suballocator this
836 * block belongs to. */
837 __le16 dr_suballoc_bit; /* Bit offset in suballocator
838 * block group */
839 __le32 dr_fs_generation; /* Must match super block */
840 __le64 dr_blkno; /* Offset on disk, in blocks */
841 __le64 dr_last_eb_blk; /* Pointer to last
842 * extent block */
843 __le32 dr_clusters; /* Clusters allocated
844 * to the indexed tree. */
845 __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */
846 __u8 dr_reserved0;
847 __le16 dr_reserved1;
848 __le64 dr_dir_blkno; /* Pointer to parent inode */
849 __le32 dr_num_entries; /* Total number of
850 * names stored in
851 * this directory.*/
852 __le32 dr_reserved2;
853 __le64 dr_free_blk; /* Pointer to head of free
854 * unindexed block list. */
855 __le64 dr_reserved3[15];
856 union {
857 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
858 * bits for maximum space
859 * efficiency. */
860 struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
861 * entries. We grow out
862 * to extents if this
863 * gets too big. */
864 };
865};
866
867/*
868 * The header of a leaf block in the indexed tree.
869 */
870struct ocfs2_dx_leaf {
871 __u8 dl_signature[8];/* Signature for verification */
872 struct ocfs2_block_check dl_check; /* Error checking */
873 __le64 dl_blkno; /* Offset on disk, in blocks */
874 __le32 dl_fs_generation;/* Must match super block */
875 __le32 dl_reserved0;
876 __le64 dl_reserved1;
877 struct ocfs2_dx_entry_list dl_list;
878};
879
784/* 880/*
785 * On disk allocator group structure for OCFS2 881 * On disk allocator group structure for OCFS2
786 */ 882 */
@@ -1070,12 +1166,6 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
1070 offsetof(struct ocfs2_dinode, id2.i_symlink); 1166 offsetof(struct ocfs2_dinode, id2.i_symlink);
1071} 1167}
1072 1168
1073static inline int ocfs2_max_inline_data(struct super_block *sb)
1074{
1075 return sb->s_blocksize -
1076 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
1077}
1078
1079static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, 1169static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
1080 struct ocfs2_dinode *di) 1170 struct ocfs2_dinode *di)
1081{ 1171{
@@ -1118,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr(
1118 return size / sizeof(struct ocfs2_extent_rec); 1208 return size / sizeof(struct ocfs2_extent_rec);
1119} 1209}
1120 1210
1211static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
1212{
1213 int size;
1214
1215 size = sb->s_blocksize -
1216 offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
1217
1218 return size / sizeof(struct ocfs2_extent_rec);
1219}
1220
1121static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) 1221static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
1122{ 1222{
1123 int size; 1223 int size;
@@ -1138,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1138 return size / sizeof(struct ocfs2_extent_rec); 1238 return size / sizeof(struct ocfs2_extent_rec);
1139} 1239}
1140 1240
1241static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1242{
1243 int size;
1244
1245 size = sb->s_blocksize -
1246 offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
1247
1248 return size / sizeof(struct ocfs2_dx_entry);
1249}
1250
1251static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
1252{
1253 int size;
1254
1255 size = sb->s_blocksize -
1256 offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
1257
1258 return size / sizeof(struct ocfs2_dx_entry);
1259}
1260
1141static inline u16 ocfs2_local_alloc_size(struct super_block *sb) 1261static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1142{ 1262{
1143 u16 size; 1263 u16 size;
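
Editor's note: the ocfs2_dx_entries_per_leaf()/ocfs2_dx_entries_per_root() helpers added above are plain arithmetic over the new on-disk structs. A back-of-envelope check for a 4 KB block follows; the 16-byte ocfs2_dx_entry size falls straight out of the struct definition earlier in this file, while the 8-byte size of ocfs2_block_check (not shown in this patch) is an assumption:

    #include <stdio.h>

    int main(void)
    {
            int blocksize = 4096;
            int dx_entry = 4 + 4 + 8; /* dx_major_hash + dx_minor_hash + dx_dirent_blk */
            int leaf_hdr = 8          /* dl_signature */
                         + 8          /* dl_check (assumed 8 bytes) */
                         + 8          /* dl_blkno */
                         + 4 + 4      /* dl_fs_generation + dl_reserved0 */
                         + 8          /* dl_reserved1 */
                         + 4 + 2 + 2; /* dl_list: de_reserved + de_count + de_num_used */

            printf("%d entries per leaf\n", (blocksize - leaf_hdr) / dx_entry);
            return 0;
    }

Under those assumptions, (4096 - 48) / 16 = 253, so a 4 KB indexed leaf holds 253 hash entries.
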
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index eb6f50c9ceca..a53ce87481bf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -47,6 +47,7 @@ enum ocfs2_lock_type {
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC,
50 OCFS2_NUM_LOCK_TYPES 51 OCFS2_NUM_LOCK_TYPES
51}; 52};
52 53
@@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
81 case OCFS2_LOCK_TYPE_QINFO: 82 case OCFS2_LOCK_TYPE_QINFO:
82 c = 'Q'; 83 c = 'Q';
83 break; 84 break;
85 case OCFS2_LOCK_TYPE_NFS_SYNC:
86 c = 'Y';
87 break;
84 default: 88 default:
85 c = '\0'; 89 c = '\0';
86 } 90 }
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a69628603e18..b4ca5911caaf 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -48,7 +48,8 @@
48#include "buffer_head_io.h" 48#include "buffer_head_io.h"
49 49
50#define NOT_ALLOC_NEW_GROUP 0 50#define NOT_ALLOC_NEW_GROUP 0
51#define ALLOC_NEW_GROUP 1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
52 53
53#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_INODES_TO_STEAL 1024
54 55
@@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle,
64static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 65static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
65 struct inode *alloc_inode, 66 struct inode *alloc_inode,
66 struct buffer_head *bh, 67 struct buffer_head *bh,
67 u64 max_block); 68 u64 max_block,
69 u64 *last_alloc_group,
70 int flags);
68 71
69static int ocfs2_cluster_group_search(struct inode *inode, 72static int ocfs2_cluster_group_search(struct inode *inode,
70 struct buffer_head *group_bh, 73 struct buffer_head *group_bh,
@@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
116 u16 *bg_bit_off); 119 u16 *bg_bit_off);
117static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 120static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
118 u32 bits_wanted, u64 max_block, 121 u32 bits_wanted, u64 max_block,
122 int flags,
119 struct ocfs2_alloc_context **ac); 123 struct ocfs2_alloc_context **ac);
120 124
121void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 125void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
@@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
403static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 407static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
404 struct inode *alloc_inode, 408 struct inode *alloc_inode,
405 struct buffer_head *bh, 409 struct buffer_head *bh,
406 u64 max_block) 410 u64 max_block,
411 u64 *last_alloc_group,
412 int flags)
407{ 413{
408 int status, credits; 414 int status, credits;
409 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 415 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
423 cl = &fe->id2.i_chain; 429 cl = &fe->id2.i_chain;
424 status = ocfs2_reserve_clusters_with_limit(osb, 430 status = ocfs2_reserve_clusters_with_limit(osb,
425 le16_to_cpu(cl->cl_cpg), 431 le16_to_cpu(cl->cl_cpg),
426 max_block, &ac); 432 max_block, flags, &ac);
427 if (status < 0) { 433 if (status < 0) {
428 if (status != -ENOSPC) 434 if (status != -ENOSPC)
429 mlog_errno(status); 435 mlog_errno(status);
@@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
440 goto bail; 446 goto bail;
441 } 447 }
442 448
449 if (last_alloc_group && *last_alloc_group != 0) {
450 mlog(0, "use old allocation group %llu for block group alloc\n",
451 (unsigned long long)*last_alloc_group);
452 ac->ac_last_group = *last_alloc_group;
453 }
443 status = ocfs2_claim_clusters(osb, 454 status = ocfs2_claim_clusters(osb,
444 handle, 455 handle,
445 ac, 456 ac,
@@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
514 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 525 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
515 526
516 status = 0; 527 status = 0;
528
529 /* save the new last alloc group so that the caller can cache it. */
530 if (last_alloc_group)
531 *last_alloc_group = ac->ac_last_group;
532
517bail: 533bail:
518 if (handle) 534 if (handle)
519 ocfs2_commit_trans(osb, handle); 535 ocfs2_commit_trans(osb, handle);
@@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
531 struct ocfs2_alloc_context *ac, 547 struct ocfs2_alloc_context *ac,
532 int type, 548 int type,
533 u32 slot, 549 u32 slot,
534 int alloc_new_group) 550 u64 *last_alloc_group,
551 int flags)
535{ 552{
536 int status; 553 int status;
537 u32 bits_wanted = ac->ac_bits_wanted; 554 u32 bits_wanted = ac->ac_bits_wanted;
@@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
587 goto bail; 604 goto bail;
588 } 605 }
589 606
590 if (alloc_new_group != ALLOC_NEW_GROUP) { 607 if (!(flags & ALLOC_NEW_GROUP)) {
591 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " 608 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
592 "and we don't alloc a new group for it.\n", 609 "and we don't alloc a new group for it.\n",
593 slot, bits_wanted, free_bits); 610 slot, bits_wanted, free_bits);
@@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
596 } 613 }
597 614
598 status = ocfs2_block_group_alloc(osb, alloc_inode, bh, 615 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
599 ac->ac_max_block); 616 ac->ac_max_block,
617 last_alloc_group, flags);
600 if (status < 0) { 618 if (status < 0) {
601 if (status != -ENOSPC) 619 if (status != -ENOSPC)
602 mlog_errno(status); 620 mlog_errno(status);
@@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
640 658
641 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 659 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
642 EXTENT_ALLOC_SYSTEM_INODE, 660 EXTENT_ALLOC_SYSTEM_INODE,
643 slot, ALLOC_NEW_GROUP); 661 slot, NULL, ALLOC_NEW_GROUP);
644 if (status < 0) { 662 if (status < 0) {
645 if (status != -ENOSPC) 663 if (status != -ENOSPC)
646 mlog_errno(status); 664 mlog_errno(status);
@@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
686 704
687 status = ocfs2_reserve_suballoc_bits(osb, ac, 705 status = ocfs2_reserve_suballoc_bits(osb, ac,
688 INODE_ALLOC_SYSTEM_INODE, 706 INODE_ALLOC_SYSTEM_INODE,
689 slot, NOT_ALLOC_NEW_GROUP); 707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
690 if (status >= 0) { 709 if (status >= 0) {
691 ocfs2_set_inode_steal_slot(osb, slot); 710 ocfs2_set_inode_steal_slot(osb, slot);
692 break; 711 break;
@@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
703{ 722{
704 int status; 723 int status;
705 s16 slot = ocfs2_get_inode_steal_slot(osb); 724 s16 slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group;
706 726
707 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
708 if (!(*ac)) { 728 if (!(*ac)) {
@@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
738 goto inode_steal; 758 goto inode_steal;
739 759
740 atomic_set(&osb->s_num_inodes_stolen, 0); 760 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group;
741 status = ocfs2_reserve_suballoc_bits(osb, *ac, 762 status = ocfs2_reserve_suballoc_bits(osb, *ac,
742 INODE_ALLOC_SYSTEM_INODE, 763 INODE_ALLOC_SYSTEM_INODE,
743 osb->slot_num, ALLOC_NEW_GROUP); 764 osb->slot_num,
765 &alloc_group,
766 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL);
744 if (status >= 0) { 768 if (status >= 0) {
745 status = 0; 769 status = 0;
746 770
771 spin_lock(&osb->osb_lock);
772 osb->osb_inode_alloc_group = alloc_group;
773 spin_unlock(&osb->osb_lock);
774 mlog(0, "after reservation, new allocation group is "
775 "%llu\n", (unsigned long long)alloc_group);
776
747 /* 777 /*
748 * Some inodes must be freed by us, so try to allocate 778 * Some inodes must be freed by us, so try to allocate
749 * from our own next time. 779 * from our own next time.
@@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
790 820
791 status = ocfs2_reserve_suballoc_bits(osb, ac, 821 status = ocfs2_reserve_suballoc_bits(osb, ac,
792 GLOBAL_BITMAP_SYSTEM_INODE, 822 GLOBAL_BITMAP_SYSTEM_INODE,
793 OCFS2_INVALID_SLOT, 823 OCFS2_INVALID_SLOT, NULL,
794 ALLOC_NEW_GROUP); 824 ALLOC_NEW_GROUP);
795 if (status < 0 && status != -ENOSPC) { 825 if (status < 0 && status != -ENOSPC) {
796 mlog_errno(status); 826 mlog_errno(status);
@@ -806,6 +836,7 @@ bail:
806 * things a bit. */ 836 * things a bit. */
807static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, 837static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
808 u32 bits_wanted, u64 max_block, 838 u32 bits_wanted, u64 max_block,
839 int flags,
809 struct ocfs2_alloc_context **ac) 840 struct ocfs2_alloc_context **ac)
810{ 841{
811 int status; 842 int status;
@@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
823 (*ac)->ac_max_block = max_block; 854 (*ac)->ac_max_block = max_block;
824 855
825 status = -ENOSPC; 856 status = -ENOSPC;
826 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { 857 if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
858 ocfs2_alloc_should_use_local(osb, bits_wanted)) {
827 status = ocfs2_reserve_local_alloc_bits(osb, 859 status = ocfs2_reserve_local_alloc_bits(osb,
828 bits_wanted, 860 bits_wanted,
829 *ac); 861 *ac);
@@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
861 u32 bits_wanted, 893 u32 bits_wanted,
862 struct ocfs2_alloc_context **ac) 894 struct ocfs2_alloc_context **ac)
863{ 895{
864 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); 896 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
897 ALLOC_NEW_GROUP, ac);
865} 898}
866 899
867/* 900/*
@@ -1618,8 +1651,41 @@ bail:
1618 return status; 1651 return status;
1619} 1652}
1620 1653
1654static void ocfs2_init_inode_ac_group(struct inode *dir,
1655 struct buffer_head *parent_fe_bh,
1656 struct ocfs2_alloc_context *ac)
1657{
1658 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1659 /*
1660 * Try to allocate inodes from some specific group.
1661 *
1662 * If the parent dir has recorded the last group used in allocation,
1663 * cool, use it. Otherwise if we try to allocate new inode from the
1664 * same slot the parent dir belongs to, use the same chunk.
1665 *
1666 * We are very careful here to avoid the mistake of setting
1667 * ac_last_group to a group descriptor from a different (unlocked) slot.
1668 */
1669 if (OCFS2_I(dir)->ip_last_used_group &&
1670 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1671 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1672 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1673 ac->ac_last_group = ocfs2_which_suballoc_group(
1674 le64_to_cpu(fe->i_blkno),
1675 le16_to_cpu(fe->i_suballoc_bit));
1676}
1677
1678static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1679 struct ocfs2_alloc_context *ac)
1680{
1681 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1682 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1683}
1684
1621int ocfs2_claim_new_inode(struct ocfs2_super *osb, 1685int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1622 handle_t *handle, 1686 handle_t *handle,
1687 struct inode *dir,
1688 struct buffer_head *parent_fe_bh,
1623 struct ocfs2_alloc_context *ac, 1689 struct ocfs2_alloc_context *ac,
1624 u16 *suballoc_bit, 1690 u16 *suballoc_bit,
1625 u64 *fe_blkno) 1691 u64 *fe_blkno)
@@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1635 BUG_ON(ac->ac_bits_wanted != 1); 1701 BUG_ON(ac->ac_bits_wanted != 1);
1636 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); 1702 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1637 1703
1704 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1705
1638 status = ocfs2_claim_suballoc_bits(osb, 1706 status = ocfs2_claim_suballoc_bits(osb,
1639 ac, 1707 ac,
1640 handle, 1708 handle,
@@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1653 1721
1654 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 1722 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1655 ac->ac_bits_given++; 1723 ac->ac_bits_given++;
1724 ocfs2_save_inode_ac_group(dir, ac);
1656 status = 0; 1725 status = 0;
1657bail: 1726bail:
1658 mlog_exit(status); 1727 mlog_exit(status);
@@ -2116,3 +2185,162 @@ out:
2116 2185
2117 return ret; 2186 return ret;
2118} 2187}
2188
2189/*
2190 * Read the inode specified by blkno to get suballoc_slot and
2191 * suballoc_bit.
2192 */
2193static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2194 u16 *suballoc_slot, u16 *suballoc_bit)
2195{
2196 int status;
2197 struct buffer_head *inode_bh = NULL;
2198 struct ocfs2_dinode *inode_fe;
2199
2200 mlog_entry("blkno: %llu\n", blkno);
2201
2202 /* dirty read: fetch the block from disk without cluster locking */
2203 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2204 if (status < 0) {
2205 mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status);
2206 goto bail;
2207 }
2208
2209 inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2210 if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2211 mlog(ML_ERROR, "invalid inode %llu requested\n", blkno);
2212 status = -EINVAL;
2213 goto bail;
2214 }
2215
2216 if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT &&
2217 (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2218 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2219 blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2220 status = -EINVAL;
2221 goto bail;
2222 }
2223
2224 if (suballoc_slot)
2225 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2226 if (suballoc_bit)
2227 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2228
2229bail:
2230 brelse(inode_bh);
2231
2232 mlog_exit(status);
2233 return status;
2234}
2235
2236/*
2237 * Test whether the bit is set in the allocator bitmap. On success, 0
2238 * is returned and *res is 1 if the bit is set, 0 otherwise. On failure,
2239 * a negative errno is returned and *res is meaningless. Call this only
2240 * while holding the cluster lock on the suballocator, or you may get a
2241 * result based on out-of-date contents.
2242 */
2243static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2244 struct inode *suballoc,
2245 struct buffer_head *alloc_bh, u64 blkno,
2246 u16 bit, int *res)
2247{
2248 struct ocfs2_dinode *alloc_fe;
2249 struct ocfs2_group_desc *group;
2250 struct buffer_head *group_bh = NULL;
2251 u64 bg_blkno;
2252 int status;
2253
2254 mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit);
2255
2256 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2257 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2258 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2259 (unsigned int)bit,
2260 ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2261 status = -EINVAL;
2262 goto bail;
2263 }
2264
2265 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2266 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2267 &group_bh);
2268 if (status < 0) {
2269 mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status);
2270 goto bail;
2271 }
2272
2273 group = (struct ocfs2_group_desc *) group_bh->b_data;
2274 *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2275
2276bail:
2277 brelse(group_bh);
2278
2279 mlog_exit(status);
2280 return status;
2281}
2282
2283/*
2284 * Test if the bit representing this inode (blkno) is set in the
2285 * suballocator.
2286 *
2287 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2288 *
2289 * In the event of failure, a negative value is returned and *res is
2290 * meaningless.
2291 *
2292 * Callers must make sure to hold nfs_sync_lock to prevent
2293 * ocfs2_delete_inode() on another node from accessing the same
2294 * suballocator concurrently.
2295 */
2296int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2297{
2298 int status;
2299 u16 suballoc_bit = 0, suballoc_slot = 0;
2300 struct inode *inode_alloc_inode;
2301 struct buffer_head *alloc_bh = NULL;
2302
2303 mlog_entry("blkno: %llu", blkno);
2304
2305 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2306 &suballoc_bit);
2307 if (status < 0) {
2308 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2309 goto bail;
2310 }
2311
2312 inode_alloc_inode =
2313 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2314 suballoc_slot);
2315 if (!inode_alloc_inode) {
2316 /* the error code could be inaccurate, but we are not able to
2317 * get the correct one. */
2318 status = -EINVAL;
2319 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2320 (u32)suballoc_slot);
2321 goto bail;
2322 }
2323
2324 mutex_lock(&inode_alloc_inode->i_mutex);
2325 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2326 if (status < 0) {
2327 mutex_unlock(&inode_alloc_inode->i_mutex);
2328 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2329 (u32)suballoc_slot, status);
2330 goto bail;
2331 }
2332
2333 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2334 blkno, suballoc_bit, res);
2335 if (status < 0)
2336 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2337
2338 ocfs2_inode_unlock(inode_alloc_inode, 0);
2339 mutex_unlock(&inode_alloc_inode->i_mutex);
2340
2341 iput(inode_alloc_inode);
2342 brelse(alloc_bh);
2343bail:
2344 mlog_exit(status);
2345 return status;
2346}
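
Two related changes run through this file: the ALLOC_NEW_GROUP parameter turns from a boolean into one bit of a flags word, making room for ALLOC_GROUPS_FROM_GLOBAL, and ocfs2_reserve_suballoc_bits learns to hand back the last allocation group so callers such as ocfs2_reserve_new_inode can cache it in osb->osb_inode_alloc_group. A minimal standalone model of the flag conversion (hypothetical names; the real tests are the ones in the hunks above):

#include <errno.h>
#include <stdio.h>

#define ALLOC_NEW_GROUP			0x1	/* was the plain value 1 */
#define ALLOC_GROUPS_FROM_GLOBAL	0x2	/* new, independent behavior */

/* hypothetical reserve routine; mirrors the two tests in the hunks above */
static int reserve_bits(int flags, int free_bits, int bits_wanted)
{
	if (free_bits < bits_wanted) {
		/* was: if (alloc_new_group != ALLOC_NEW_GROUP) */
		if (!(flags & ALLOC_NEW_GROUP))
			return -ENOSPC;
		/* ... extend the allocator by one block group ... */
	}
	if (flags & ALLOC_GROUPS_FROM_GLOBAL) {
		/* ... bypass the local alloc, claim from the global bitmap ... */
	}
	return 0;
}

int main(void)
{
	/* inode reservation can now pass both bits in one argument */
	printf("%d\n", reserve_bits(ALLOC_NEW_GROUP | ALLOC_GROUPS_FROM_GLOBAL,
				    0, 1));
	return 0;
}

Testing bits with "flags & ALLOC_NEW_GROUP" rather than comparing for equality is what keeps the old call sites correct once a second flag can be OR-ed in.
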
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e3c13c77f9e8..8c9a78a43164 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb,
88 u64 *blkno_start); 88 u64 *blkno_start);
89int ocfs2_claim_new_inode(struct ocfs2_super *osb, 89int ocfs2_claim_new_inode(struct ocfs2_super *osb,
90 handle_t *handle, 90 handle_t *handle,
91 struct inode *dir,
92 struct buffer_head *parent_fe_bh,
91 struct ocfs2_alloc_context *ac, 93 struct ocfs2_alloc_context *ac,
92 u16 *suballoc_bit, 94 u16 *suballoc_bit,
93 u64 *fe_blkno); 95 u64 *fe_blkno);
@@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
186 u32 clusters_to_add, u32 extents_to_split, 188 u32 clusters_to_add, u32 extents_to_split,
187 struct ocfs2_alloc_context **data_ac, 189 struct ocfs2_alloc_context **data_ac,
188 struct ocfs2_alloc_context **meta_ac); 190 struct ocfs2_alloc_context **meta_ac);
191
192int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
189#endif /* _CHAINALLOC_H_ */ 193#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7ac83a81ee55..79ff8d9d37e0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -201,6 +201,170 @@ static const match_table_t tokens = {
201 {Opt_err, NULL} 201 {Opt_err, NULL}
202}; 202};
203 203
204#ifdef CONFIG_DEBUG_FS
205static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
206{
207 int out = 0;
208 int i;
209 struct ocfs2_cluster_connection *cconn = osb->cconn;
210 struct ocfs2_recovery_map *rm = osb->recovery_map;
211
212 out += snprintf(buf + out, len - out,
213 "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
214 "Device", osb->dev_str, osb->uuid_str,
215 osb->fs_generation, osb->vol_label);
216
217 out += snprintf(buf + out, len - out,
218 "%10s => State: %d Flags: 0x%lX\n", "Volume",
219 atomic_read(&osb->vol_state), osb->osb_flags);
220
221 out += snprintf(buf + out, len - out,
222 "%10s => Block: %lu Cluster: %d\n", "Sizes",
223 osb->sb->s_blocksize, osb->s_clustersize);
224
225 out += snprintf(buf + out, len - out,
226 "%10s => Compat: 0x%X Incompat: 0x%X "
227 "ROcompat: 0x%X\n",
228 "Features", osb->s_feature_compat,
229 osb->s_feature_incompat, osb->s_feature_ro_compat);
230
231 out += snprintf(buf + out, len - out,
232 "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
233 osb->s_mount_opt, osb->s_atime_quantum);
234
235 out += snprintf(buf + out, len - out,
236 "%10s => Stack: %s Name: %*s Version: %d.%d\n",
237 "Cluster",
238 (*osb->osb_cluster_stack == '\0' ?
239 "o2cb" : osb->osb_cluster_stack),
240 cconn->cc_namelen, cconn->cc_name,
241 cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
242
243 spin_lock(&osb->dc_task_lock);
244 out += snprintf(buf + out, len - out,
245 "%10s => Pid: %d Count: %lu WakeSeq: %lu "
246 "WorkSeq: %lu\n", "DownCnvt",
247 task_pid_nr(osb->dc_task), osb->blocked_lock_count,
248 osb->dc_wake_sequence, osb->dc_work_sequence);
249 spin_unlock(&osb->dc_task_lock);
250
251 spin_lock(&osb->osb_lock);
252 out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
253 "Recovery",
254 (osb->recovery_thread_task ?
255 task_pid_nr(osb->recovery_thread_task) : -1));
256 if (rm->rm_used == 0)
257 out += snprintf(buf + out, len - out, " None\n");
258 else {
259 for (i = 0; i < rm->rm_used; i++)
260 out += snprintf(buf + out, len - out, " %d",
261 rm->rm_entries[i]);
262 out += snprintf(buf + out, len - out, "\n");
263 }
264 spin_unlock(&osb->osb_lock);
265
266 out += snprintf(buf + out, len - out,
267 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
268 task_pid_nr(osb->commit_task), osb->osb_commit_interval,
269 atomic_read(&osb->needs_checkpoint));
270
271 out += snprintf(buf + out, len - out,
272 "%10s => State: %d NumTxns: %d TxnId: %lu\n",
273 "Journal", osb->journal->j_state,
274 atomic_read(&osb->journal->j_num_trans),
275 osb->journal->j_trans_id);
276
277 out += snprintf(buf + out, len - out,
278 "%10s => GlobalAllocs: %d LocalAllocs: %d "
279 "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
280 "Stats",
281 atomic_read(&osb->alloc_stats.bitmap_data),
282 atomic_read(&osb->alloc_stats.local_data),
283 atomic_read(&osb->alloc_stats.bg_allocs),
284 atomic_read(&osb->alloc_stats.moves),
285 atomic_read(&osb->alloc_stats.bg_extends));
286
287 out += snprintf(buf + out, len - out,
288 "%10s => State: %u Descriptor: %llu Size: %u bits "
289 "Default: %u bits\n",
290 "LocalAlloc", osb->local_alloc_state,
291 (unsigned long long)osb->la_last_gd,
292 osb->local_alloc_bits, osb->local_alloc_default_bits);
293
294 spin_lock(&osb->osb_lock);
295 out += snprintf(buf + out, len - out,
296 "%10s => Slot: %d NumStolen: %d\n", "Steal",
297 osb->s_inode_steal_slot,
298 atomic_read(&osb->s_num_inodes_stolen));
299 spin_unlock(&osb->osb_lock);
300
301 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
302 "Slots", "Num", "RecoGen");
303
304 for (i = 0; i < osb->max_slots; ++i) {
305 out += snprintf(buf + out, len - out,
306 "%10s %c %3d %10d\n",
307 " ",
308 (i == osb->slot_num ? '*' : ' '),
309 i, osb->slot_recovery_generations[i]);
310 }
311
312 return out;
313}
314
315static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
316{
317 struct ocfs2_super *osb = inode->i_private;
318 char *buf = NULL;
319
320 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
321 if (!buf)
322 goto bail;
323
324 i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
325
326 file->private_data = buf;
327
328 return 0;
329bail:
330 return -ENOMEM;
331}
332
333static int ocfs2_debug_release(struct inode *inode, struct file *file)
334{
335 kfree(file->private_data);
336 return 0;
337}
338
339static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
340 size_t nbytes, loff_t *ppos)
341{
342 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
343 i_size_read(file->f_mapping->host));
344}
345#else
346static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
347{
348 return 0;
349}
350static int ocfs2_debug_release(struct inode *inode, struct file *file)
351{
352 return 0;
353}
354static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
355 size_t nbytes, loff_t *ppos)
356{
357 return 0;
358}
359#endif /* CONFIG_DEBUG_FS */
360
361static struct file_operations ocfs2_osb_debug_fops = {
362 .open = ocfs2_osb_debug_open,
363 .release = ocfs2_debug_release,
364 .read = ocfs2_debug_read,
365 .llseek = generic_file_llseek,
366};
367
204/* 368/*
205 * write_super and sync_fs ripped right out of ext3. 369 * write_super and sync_fs ripped right out of ext3.
206 */ 370 */
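
The fs_state file follows a one-shot dump pattern: open() renders the whole report into a kmalloc'd buffer and records its length as the inode size, read() is simple_read_from_buffer(), and release() frees the buffer. A hedged, minimal module using the same shape (the name demo_state and its contents are invented for illustration):

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/slab.h>

static struct dentry *demo_dentry;

static int demo_open(struct inode *inode, struct file *file)
{
	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	/* render once at open(); readers then see a stable snapshot */
	i_size_write(inode, snprintf(buf, PAGE_SIZE, "answer: %d\n", 42));
	file->private_data = buf;
	return 0;
}

static ssize_t demo_read(struct file *file, char __user *ubuf,
			 size_t nbytes, loff_t *ppos)
{
	return simple_read_from_buffer(ubuf, nbytes, ppos,
				       file->private_data,
				       i_size_read(file->f_mapping->host));
}

static int demo_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}

static const struct file_operations demo_fops = {
	.open	 = demo_open,
	.read	 = demo_read,
	.release = demo_release,
	.llseek	 = generic_file_llseek,
};

static int __init demo_init(void)
{
	demo_dentry = debugfs_create_file("demo_state", S_IRUSR, NULL, NULL,
					  &demo_fops);
	return demo_dentry ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	debugfs_remove(demo_dentry);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
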
@@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
926 goto read_super_error; 1090 goto read_super_error;
927 } 1091 }
928 1092
1093 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
1094 osb->osb_debug_root,
1095 osb,
1096 &ocfs2_osb_debug_fops);
1097 if (!osb->osb_ctxt) {
1098 status = -EINVAL;
1099 mlog_errno(status);
1100 goto read_super_error;
1101 }
1102
929 status = ocfs2_mount_volume(sb); 1103 status = ocfs2_mount_volume(sb);
930 if (osb->root_inode) 1104 if (osb->root_inode)
931 inode = igrab(osb->root_inode); 1105 inode = igrab(osb->root_inode);
@@ -1620,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1620 osb = OCFS2_SB(sb); 1794 osb = OCFS2_SB(sb);
1621 BUG_ON(!osb); 1795 BUG_ON(!osb);
1622 1796
1797 debugfs_remove(osb->osb_ctxt);
1798
1623 ocfs2_disable_quotas(osb); 1799 ocfs2_disable_quotas(osb);
1624 1800
1625 ocfs2_shutdown_local_alloc(osb); 1801 ocfs2_shutdown_local_alloc(osb);
@@ -1742,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
1742 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); 1918 bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
1743 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); 1919 sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
1744 1920
1921 osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
1922
1923 for (i = 0; i < 3; i++)
1924 osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
1925 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
1926
1745 osb->sb = sb; 1927 osb->sb = sb;
1746 /* Save off for ocfs2_rw_direct */ 1928 /* Save off for ocfs2_rw_direct */
1747 osb->s_sectsize_bits = blksize_bits(sector_size); 1929 osb->s_sectsize_bits = blksize_bits(sector_size);
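
osb_dx_mask isolates a block's offset within its cluster: with cbits bits per cluster and bbits per block there are 2^(cbits - bbits) blocks per cluster, so the mask is that power of two minus one. A quick standalone check with typical values (assumed, not taken from the patch):

#include <stdio.h>

int main(void)
{
	unsigned int cbits = 12;	/* 4 KB clusters (assumed) */
	unsigned int bbits = 9;		/* 512 B blocks  (assumed) */
	unsigned int dx_mask = (1 << (cbits - bbits)) - 1;

	/* 8 blocks per cluster -> mask 0x7 */
	printf("0x%x\n", dx_mask);
	return 0;
}
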
@@ -2130,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2130 * lock, and it's marked as dirty, set the bit in the recover 2312 * lock, and it's marked as dirty, set the bit in the recover
2131 * map and launch a recovery thread for it. */ 2313 * map and launch a recovery thread for it. */
2132 status = ocfs2_mark_dead_nodes(osb); 2314 status = ocfs2_mark_dead_nodes(osb);
2315 if (status < 0) {
2316 mlog_errno(status);
2317 goto finally;
2318 }
2319
2320 status = ocfs2_compute_replay_slots(osb);
2133 if (status < 0) 2321 if (status < 0)
2134 mlog_errno(status); 2322 mlog_errno(status);
2135 2323
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 4ddd788add67..15631019dc63 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
512 struct ocfs2_security_xattr_info *si, 512 struct ocfs2_security_xattr_info *si,
513 int *want_clusters, 513 int *want_clusters,
514 int *xattr_credits, 514 int *xattr_credits,
515 struct ocfs2_alloc_context **xattr_ac) 515 int *want_meta)
516{ 516{
517 int ret = 0; 517 int ret = 0;
518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 518 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -547,14 +547,14 @@ int ocfs2_calc_xattr_init(struct inode *dir,
547 * when blocksize = 512, may reserve one more cluster for 547 * when blocksize = 512, may reserve one more cluster for
548 * xattr bucket, otherwise reserve one metadata block 548 * xattr bucket, otherwise reserve one metadata block
549 * for them is ok. 549 * for them is ok.
550 * If this is a new directory with inline data,
551 * we choose to reserve the entire inline area for
552 * directory contents and force an external xattr block.
550 */ 553 */
551 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || 554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
552 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { 556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
553 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); 557 *want_meta = *want_meta + 1;
554 if (ret) {
555 mlog_errno(ret);
556 return ret;
557 }
558 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 558 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
559 } 559 }
560 560
@@ -4791,19 +4791,33 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4791 char *val, 4791 char *val,
4792 int value_len) 4792 int value_len)
4793{ 4793{
4794 int offset; 4794 int ret, offset, block_off;
4795 struct ocfs2_xattr_value_root *xv; 4795 struct ocfs2_xattr_value_root *xv;
4796 struct ocfs2_xattr_entry *xe = xs->here; 4796 struct ocfs2_xattr_entry *xe = xs->here;
4797 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
4798 void *base;
4797 4799
4798 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); 4800 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
4799 4801
4800 offset = le16_to_cpu(xe->xe_name_offset) + 4802 ret = ocfs2_xattr_bucket_get_name_value(inode, xh,
4801 OCFS2_XATTR_SIZE(xe->xe_name_len); 4803 xe - xh->xh_entries,
4804 &block_off,
4805 &offset);
4806 if (ret) {
4807 mlog_errno(ret);
4808 goto out;
4809 }
4802 4810
4803 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); 4811 base = bucket_block(xs->bucket, block_off);
4812 xv = (struct ocfs2_xattr_value_root *)(base + offset +
4813 OCFS2_XATTR_SIZE(xe->xe_name_len));
4804 4814
4805 return __ocfs2_xattr_set_value_outside(inode, handle, 4815 ret = __ocfs2_xattr_set_value_outside(inode, handle,
4806 xv, val, value_len); 4816 xv, val, value_len);
4817 if (ret)
4818 mlog_errno(ret);
4819out:
4820 return ret;
4807} 4821}
4808 4822
4809static int ocfs2_rm_xattr_cluster(struct inode *inode, 4823static int ocfs2_rm_xattr_cluster(struct inode *inode,
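
Rather than reserving a metadata block itself, ocfs2_calc_xattr_init now just bumps *want_meta, letting the caller fold the external xattr block into one combined ocfs2_reserve_new_metadata_blocks() call. A standalone model of that accumulate-then-reserve-once shape (the helper below is a stand-in, not the ocfs2 function):

#include <stdio.h>

/* stand-in for ocfs2_calc_xattr_init(): instead of reserving a block
 * itself, it only records the need in *want_meta */
static void calc_xattr_init(int blocksize, int inline_dir, int xattr_bytes,
			    int *want_meta, int *xattr_credits)
{
	/* mirrors the patch: tiny blocks, an inline-data directory, or
	 * oversized xattrs force an external xattr block */
	if (blocksize == 512 || inline_dir || xattr_bytes > 256) {
		*want_meta += 1;
		*xattr_credits += 1;	/* stand-in for the CREATE credits */
	}
}

int main(void)
{
	int want_meta = 1;	/* the new inode itself */
	int xattr_credits = 0;

	calc_xattr_init(512, 1, 0, &want_meta, &xattr_credits);
	/* one reservation now covers inode and xattr block together */
	printf("reserve %d metadata blocks, %d extra credits\n",
	       want_meta, xattr_credits);
	return 0;
}
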
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 5a1ebc789f7e..1ca7e9a1b7bc 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
68 int *, int *, struct ocfs2_alloc_context **); 68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, 69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *, 70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **); 71 int *, int *, int *);
72 72
73/* 73/*
74 * xattrs can live inside an inode, as part of an external xattr block, 74 * xattrs can live inside an inode, as part of an external xattr block,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 633e9dc972bb..379ae5fb4411 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -262,14 +262,19 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
262{ 262{
263 struct super_block *s = dentry->d_sb; 263 struct super_block *s = dentry->d_sb;
264 struct omfs_sb_info *sbi = OMFS_SB(s); 264 struct omfs_sb_info *sbi = OMFS_SB(s);
265 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
266
265 buf->f_type = OMFS_MAGIC; 267 buf->f_type = OMFS_MAGIC;
266 buf->f_bsize = sbi->s_blocksize; 268 buf->f_bsize = sbi->s_blocksize;
267 buf->f_blocks = sbi->s_num_blocks; 269 buf->f_blocks = sbi->s_num_blocks;
268 buf->f_files = sbi->s_num_blocks; 270 buf->f_files = sbi->s_num_blocks;
269 buf->f_namelen = OMFS_NAMELEN; 271 buf->f_namelen = OMFS_NAMELEN;
272 buf->f_fsid.val[0] = (u32)id;
273 buf->f_fsid.val[1] = (u32)(id >> 32);
270 274
271 buf->f_bfree = buf->f_bavail = buf->f_ffree = 275 buf->f_bfree = buf->f_bavail = buf->f_ffree =
272 omfs_count_free(s); 276 omfs_count_free(s);
277
273 return 0; 278 return 0;
274} 279}
275 280
@@ -421,7 +426,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
421 426
422 sbi->s_uid = current_uid(); 427 sbi->s_uid = current_uid();
423 sbi->s_gid = current_gid(); 428 sbi->s_gid = current_gid();
424 sbi->s_dmask = sbi->s_fmask = current->fs->umask; 429 sbi->s_dmask = sbi->s_fmask = current_umask();
425 430
426 if (!parse_options((char *) data, sbi)) 431 if (!parse_options((char *) data, sbi))
427 goto end; 432 goto end;
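
The new f_fsid follows the usual statfs convention: huge_encode_dev() packs the backing device number into 64 bits, which are then split across the two 32-bit val slots. A standalone check of the packing (the id value is made up; huge_encode_dev itself is a kernel helper):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* stand-in for huge_encode_dev(s->s_bdev->bd_dev) */
	uint64_t id = 0x0000000800000021ULL;
	uint32_t val[2];

	val[0] = (uint32_t)id;		/* low 32 bits  */
	val[1] = (uint32_t)(id >> 32);	/* high 32 bits */
	printf("%08x %08x\n", val[0], val[1]);	/* 00000021 00000008 */
	return 0;
}
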
diff --git a/fs/open.c b/fs/open.c
index a3a78ceb2a2b..377eb25b6abf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/audit.h> 30#include <linux/audit.h>
31#include <linux/falloc.h> 31#include <linux/falloc.h>
32#include <linux/fs_struct.h>
32 33
33int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) 34int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
34{ 35{
@@ -273,7 +274,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
273 if (!error) 274 if (!error)
274 error = security_path_truncate(&path, length, 0); 275 error = security_path_truncate(&path, length, 0);
275 if (!error) { 276 if (!error) {
276 DQUOT_INIT(inode); 277 vfs_dq_init(inode);
277 error = do_truncate(path.dentry, length, 0, NULL); 278 error = do_truncate(path.dentry, length, 0, NULL);
278 } 279 }
279 280
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d720243f5f4..99e33ef40be4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -19,6 +19,7 @@
19#include <linux/kmod.h> 19#include <linux/kmod.h>
20#include <linux/ctype.h> 20#include <linux/ctype.h>
21#include <linux/genhd.h> 21#include <linux/genhd.h>
22#include <linux/blktrace_api.h>
22 23
23#include "check.h" 24#include "check.h"
24 25
@@ -294,6 +295,9 @@ static struct attribute_group part_attr_group = {
294 295
295static struct attribute_group *part_attr_groups[] = { 296static struct attribute_group *part_attr_groups[] = {
296 &part_attr_group, 297 &part_attr_group,
298#ifdef CONFIG_BLK_DEV_IO_TRACE
299 &blk_trace_attr_group,
300#endif
297 NULL 301 NULL
298}; 302};
299 303
@@ -400,7 +404,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
400 pdev->devt = devt; 404 pdev->devt = devt;
401 405
402 /* delay uevent until 'holders' subdir is created */ 406 /* delay uevent until 'holders' subdir is created */
403 pdev->uevent_suppress = 1; 407 dev_set_uevent_suppress(pdev, 1);
404 err = device_add(pdev); 408 err = device_add(pdev);
405 if (err) 409 if (err)
406 goto out_put; 410 goto out_put;
@@ -410,7 +414,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
410 if (!p->holder_dir) 414 if (!p->holder_dir)
411 goto out_del; 415 goto out_del;
412 416
413 pdev->uevent_suppress = 0; 417 dev_set_uevent_suppress(pdev, 0);
414 if (flags & ADDPART_FLAG_WHOLEDISK) { 418 if (flags & ADDPART_FLAG_WHOLEDISK) {
415 err = device_create_file(pdev, &dev_attr_whole_disk); 419 err = device_create_file(pdev, &dev_attr_whole_disk);
416 if (err) 420 if (err)
@@ -422,7 +426,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
422 rcu_assign_pointer(ptbl->part[partno], p); 426 rcu_assign_pointer(ptbl->part[partno], p);
423 427
424 /* suppress uevent if the disk suppresses it */ 428 /* suppress uevent if the disk suppresses it */
425 if (!ddev->uevent_suppress) 429 if (!dev_get_uevent_suppress(pdev))
426 kobject_uevent(&pdev->kobj, KOBJ_ADD); 430 kobject_uevent(&pdev->kobj, KOBJ_ADD);
427 431
428 return p; 432 return p;
@@ -455,7 +459,7 @@ void register_disk(struct gendisk *disk)
455 dev_set_name(ddev, disk->disk_name); 459 dev_set_name(ddev, disk->disk_name);
456 460
457 /* delay uevents, until we scanned partition table */ 461 /* delay uevents, until we scanned partition table */
458 ddev->uevent_suppress = 1; 462 dev_set_uevent_suppress(ddev, 1);
459 463
460 if (device_add(ddev)) 464 if (device_add(ddev))
461 return; 465 return;
@@ -490,7 +494,7 @@ void register_disk(struct gendisk *disk)
490 494
491exit: 495exit:
492 /* announce disk after possible partitions are created */ 496 /* announce disk after possible partitions are created */
493 ddev->uevent_suppress = 0; 497 dev_set_uevent_suppress(ddev, 0);
494 kobject_uevent(&ddev->kobj, KOBJ_ADD); 498 kobject_uevent(&ddev->kobj, KOBJ_ADD);
495 499
496 /* announce possible partitions */ 500 /* announce possible partitions */
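
All of these hunks are the same mechanical swap to the dev_set_uevent_suppress()/dev_get_uevent_suppress() accessors, preserving the existing delayed-announcement pattern: suppress KOBJ_ADD while the device's sub-objects are still being created, then lift the suppression and fire the event once everything is visible. A hedged sketch of that pattern in isolation (teardown paths elided):

#include <linux/device.h>
#include <linux/kobject.h>

/* hedged sketch of the delayed-uevent pattern used by add_partition()
 * and register_disk() above */
static int register_quietly(struct device *dev)
{
	int err;

	dev_set_uevent_suppress(dev, 1);	/* hold back KOBJ_ADD */
	err = device_add(dev);
	if (err)
		return err;

	/* ... create attribute files, holder directories, partitions ... */

	dev_set_uevent_suppress(dev, 0);
	kobject_uevent(&dev->kobj, KOBJ_ADD);	/* announce once complete */
	return 0;
}
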
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 1e064c4a4f86..46297683cd34 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -21,20 +21,38 @@
21 * compute the block number from a 21 * compute the block number from a
22 * cyl-cyl-head-head structure 22 * cyl-cyl-head-head structure
23 */ 23 */
24static inline int 24static sector_t
25cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) { 25cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) {
26 return ptr->cc * geo->heads * geo->sectors + 26
27 ptr->hh * geo->sectors; 27 sector_t cyl;
28 __u16 head;
29
30 /* decode cylinder and heads for large volumes */
31 cyl = ptr->hh & 0xFFF0;
32 cyl <<= 12;
33 cyl |= ptr->cc;
34 head = ptr->hh & 0x000F;
35 return cyl * geo->heads * geo->sectors +
36 head * geo->sectors;
28} 37}
29 38
30/* 39/*
31 * compute the block number from a 40 * compute the block number from a
32 * cyl-cyl-head-head-block structure 41 * cyl-cyl-head-head-block structure
33 */ 42 */
34static inline int 43static sector_t
35cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) { 44cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
36 return ptr->cc * geo->heads * geo->sectors + 45
37 ptr->hh * geo->sectors + 46 sector_t cyl;
47 __u16 head;
48
49 /* decode cylinder and heads for large volumes */
50 cyl = ptr->hh & 0xFFF0;
51 cyl <<= 12;
52 cyl |= ptr->cc;
53 head = ptr->hh & 0x000F;
54 return cyl * geo->heads * geo->sectors +
55 head * geo->sectors +
38 ptr->b; 56 ptr->b;
39} 57}
40 58
@@ -43,14 +61,15 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
43int 61int
44ibm_partition(struct parsed_partitions *state, struct block_device *bdev) 62ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
45{ 63{
46 int blocksize, offset, size,res; 64 int blocksize, res;
47 loff_t i_size; 65 loff_t i_size, offset, size, fmt_size;
48 dasd_information2_t *info; 66 dasd_information2_t *info;
49 struct hd_geometry *geo; 67 struct hd_geometry *geo;
50 char type[5] = {0,}; 68 char type[5] = {0,};
51 char name[7] = {0,}; 69 char name[7] = {0,};
52 union label_t { 70 union label_t {
53 struct vtoc_volume_label vol; 71 struct vtoc_volume_label_cdl vol;
72 struct vtoc_volume_label_ldl lnx;
54 struct vtoc_cms_label cms; 73 struct vtoc_cms_label cms;
55 } *label; 74 } *label;
56 unsigned char *data; 75 unsigned char *data;
@@ -85,14 +104,16 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
85 if (data == NULL) 104 if (data == NULL)
86 goto out_readerr; 105 goto out_readerr;
87 106
88 strncpy (type, data, 4);
89 if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD")))
90 strncpy(name, data + 8, 6);
91 else
92 strncpy(name, data + 4, 6);
93 memcpy(label, data, sizeof(union label_t)); 107 memcpy(label, data, sizeof(union label_t));
94 put_dev_sector(sect); 108 put_dev_sector(sect);
95 109
110 if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) {
111 strncpy(type, label->vol.vollbl, 4);
112 strncpy(name, label->vol.volid, 6);
113 } else {
114 strncpy(type, label->lnx.vollbl, 4);
115 strncpy(name, label->lnx.volid, 6);
116 }
96 EBCASC(type, 4); 117 EBCASC(type, 4);
97 EBCASC(name, 6); 118 EBCASC(name, 6);
98 119
@@ -110,36 +131,54 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
110 /* 131 /*
111 * VM style CMS1 labeled disk 132 * VM style CMS1 labeled disk
112 */ 133 */
134 blocksize = label->cms.block_size;
113 if (label->cms.disk_offset != 0) { 135 if (label->cms.disk_offset != 0) {
114 printk("CMS1/%8s(MDSK):", name); 136 printk("CMS1/%8s(MDSK):", name);
115 /* disk is reserved minidisk */ 137 /* disk is reserved minidisk */
116 blocksize = label->cms.block_size;
117 offset = label->cms.disk_offset; 138 offset = label->cms.disk_offset;
118 size = (label->cms.block_count - 1) 139 size = (label->cms.block_count - 1)
119 * (blocksize >> 9); 140 * (blocksize >> 9);
120 } else { 141 } else {
121 printk("CMS1/%8s:", name); 142 printk("CMS1/%8s:", name);
122 offset = (info->label_block + 1); 143 offset = (info->label_block + 1);
123 size = i_size >> 9; 144 size = label->cms.block_count
145 * (blocksize >> 9);
124 } 146 }
147 put_partition(state, 1, offset*(blocksize >> 9),
148 size-offset*(blocksize >> 9));
125 } else { 149 } else {
126 /* 150 if (strncmp(type, "LNX1", 4) == 0) {
127 * Old style LNX1 or unlabeled disk 151 printk("LNX1/%8s:", name);
128 */ 152 if (label->lnx.ldl_version == 0xf2) {
129 if (strncmp(type, "LNX1", 4) == 0) 153 fmt_size = label->lnx.formatted_blocks
130 printk ("LNX1/%8s:", name); 154 * (blocksize >> 9);
131 else 155 } else if (!strcmp(info->type, "ECKD")) {
156 /* formatted w/o large volume support */
157 fmt_size = geo->cylinders * geo->heads
158 * geo->sectors * (blocksize >> 9);
159 } else {
160 /* old label and no usable disk geometry
161 * (e.g. DIAG) */
162 fmt_size = i_size >> 9;
163 }
164 size = i_size >> 9;
165 if (fmt_size < size)
166 size = fmt_size;
167 offset = (info->label_block + 1);
168 } else {
169 /* unlabeled disk */
132 printk("(nonl)"); 170 printk("(nonl)");
133 offset = (info->label_block + 1); 171 size = i_size >> 9;
134 size = i_size >> 9; 172 offset = (info->label_block + 1);
135 } 173 }
136 put_partition(state, 1, offset*(blocksize >> 9), 174 put_partition(state, 1, offset*(blocksize >> 9),
137 size-offset*(blocksize >> 9)); 175 size-offset*(blocksize >> 9));
176 }
138 } else if (info->format == DASD_FORMAT_CDL) { 177 } else if (info->format == DASD_FORMAT_CDL) {
139 /* 178 /*
140 * New style CDL formatted disk 179 * New style CDL formatted disk
141 */ 180 */
142 unsigned int blk; 181 sector_t blk;
143 int counter; 182 int counter;
144 183
145 /* 184 /*
@@ -166,7 +205,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
166 /* skip FMT4 / FMT5 / FMT7 labels */ 205 /* skip FMT4 / FMT5 / FMT7 labels */
167 if (f1.DS1FMTID == _ascebc['4'] 206 if (f1.DS1FMTID == _ascebc['4']
168 || f1.DS1FMTID == _ascebc['5'] 207 || f1.DS1FMTID == _ascebc['5']
169 || f1.DS1FMTID == _ascebc['7']) { 208 || f1.DS1FMTID == _ascebc['7']
209 || f1.DS1FMTID == _ascebc['9']) {
170 blk++; 210 blk++;
171 data = read_dev_sector(bdev, blk * 211 data = read_dev_sector(bdev, blk *
172 (blocksize/512), 212 (blocksize/512),
@@ -174,8 +214,9 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
174 continue; 214 continue;
175 } 215 }
176 216
177 /* only FMT1 valid at this point */ 217 /* only FMT1 and 8 labels valid at this point */
178 if (f1.DS1FMTID != _ascebc['1']) 218 if (f1.DS1FMTID != _ascebc['1'] &&
219 f1.DS1FMTID != _ascebc['8'])
179 break; 220 break;
180 221
181 /* OK, we got valid partition data */ 222 /* OK, we got valid partition data */
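
The widened helpers recover large-volume addresses by treating the top twelve bits of the hh field as extra cylinder bits: cyl = ((hh & 0xFFF0) << 12) | cc and head = hh & 0x000F, with the result held in sector_t so it no longer truncates to int. A standalone check with invented label fields and geometry:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t cc = 0x1234, hh = 0xABC5;	/* invented label fields */
	uint64_t heads = 15, sectors = 12;	/* invented geometry */

	uint64_t cyl = ((uint64_t)(hh & 0xFFF0) << 12) | cc;
	uint16_t head = hh & 0x000F;
	uint64_t blk = cyl * heads * sectors + head * sectors;

	/* cyl = 0xabc1234, head = 5 */
	printf("cyl=%#llx head=%u blk=%llu\n",
	       (unsigned long long)cyl, head, (unsigned long long)blk);
	return 0;
}
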
diff --git a/fs/pipe.c b/fs/pipe.c
index 3a48ba5179d5..4af7aa521813 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -667,10 +667,7 @@ pipe_read_fasync(int fd, struct file *filp, int on)
667 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); 667 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
668 mutex_unlock(&inode->i_mutex); 668 mutex_unlock(&inode->i_mutex);
669 669
670 if (retval < 0) 670 return retval;
671 return retval;
672
673 return 0;
674} 671}
675 672
676 673
@@ -684,10 +681,7 @@ pipe_write_fasync(int fd, struct file *filp, int on)
684 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); 681 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
685 mutex_unlock(&inode->i_mutex); 682 mutex_unlock(&inode->i_mutex);
686 683
687 if (retval < 0) 684 return retval;
688 return retval;
689
690 return 0;
691} 685}
692 686
693 687
@@ -699,18 +693,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
699 int retval; 693 int retval;
700 694
701 mutex_lock(&inode->i_mutex); 695 mutex_lock(&inode->i_mutex);
702
703 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); 696 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
704 697 if (retval >= 0) {
705 if (retval >= 0)
706 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); 698 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
707 699 if (retval < 0) /* this can happen only if on == T */
700 fasync_helper(-1, filp, 0, &pipe->fasync_readers);
701 }
708 mutex_unlock(&inode->i_mutex); 702 mutex_unlock(&inode->i_mutex);
709 703 return retval;
710 if (retval < 0)
711 return retval;
712
713 return 0;
714} 704}
715 705
716 706
@@ -870,7 +860,7 @@ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
870 dentry->d_inode->i_ino); 860 dentry->d_inode->i_ino);
871} 861}
872 862
873static struct dentry_operations pipefs_dentry_operations = { 863static const struct dentry_operations pipefs_dentry_operations = {
874 .d_delete = pipefs_delete_dentry, 864 .d_delete = pipefs_delete_dentry,
875 .d_dname = pipefs_dname, 865 .d_dname = pipefs_dname,
876}; 866};
@@ -1034,11 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
1034 return error; 1024 return error;
1035} 1025}
1036 1026
1037int do_pipe(int *fd)
1038{
1039 return do_pipe_flags(fd, 0);
1040}
1041
1042/* 1027/*
1043 * sys_pipe() is the normal C calling standard for creating 1028 * sys_pipe() is the normal C calling standard for creating
1044 * a pipe. It's not the way Unix traditionally does this, though. 1029 * a pipe. It's not the way Unix traditionally does this, though.
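
Besides collapsing the redundant retval translation, the rdwr case gains an unwind: if registering on the writers list fails after the readers registration succeeded, the readers entry is removed again (fasync_helper() with fd == -1 and on == 0 is its removal form), so a failed F_SETFL cannot leave the file half-registered. A hedged sketch of just that shape, with the pipe locking elided:

#include <linux/fs.h>

/* hedged sketch of the unwind in pipe_rdwr_fasync(); locking elided */
static int rdwr_fasync(int fd, struct file *filp, int on,
		       struct fasync_struct **readers,
		       struct fasync_struct **writers)
{
	int retval = fasync_helper(fd, filp, on, readers);

	if (retval >= 0) {
		retval = fasync_helper(fd, filp, on, writers);
		if (retval < 0)	/* can only happen when on != 0 */
			fasync_helper(-1, filp, 0, readers);
	}
	return retval;
}
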
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0c9de19a1633..f71559784bfb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@
80#include <linux/oom.h> 80#include <linux/oom.h>
81#include <linux/elf.h> 81#include <linux/elf.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/fs_struct.h>
83#include "internal.h" 84#include "internal.h"
84 85
85/* NOTE: 86/* NOTE:
@@ -146,15 +147,22 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
146 return count; 147 return count;
147} 148}
148 149
149static struct fs_struct *get_fs_struct(struct task_struct *task) 150static int get_fs_path(struct task_struct *task, struct path *path, bool root)
150{ 151{
151 struct fs_struct *fs; 152 struct fs_struct *fs;
153 int result = -ENOENT;
154
152 task_lock(task); 155 task_lock(task);
153 fs = task->fs; 156 fs = task->fs;
154 if(fs) 157 if (fs) {
155 atomic_inc(&fs->count); 158 read_lock(&fs->lock);
159 *path = root ? fs->root : fs->pwd;
160 path_get(path);
161 read_unlock(&fs->lock);
162 result = 0;
163 }
156 task_unlock(task); 164 task_unlock(task);
157 return fs; 165 return result;
158} 166}
159 167
160static int get_nr_threads(struct task_struct *tsk) 168static int get_nr_threads(struct task_struct *tsk)
@@ -172,42 +180,24 @@ static int get_nr_threads(struct task_struct *tsk)
172static int proc_cwd_link(struct inode *inode, struct path *path) 180static int proc_cwd_link(struct inode *inode, struct path *path)
173{ 181{
174 struct task_struct *task = get_proc_task(inode); 182 struct task_struct *task = get_proc_task(inode);
175 struct fs_struct *fs = NULL;
176 int result = -ENOENT; 183 int result = -ENOENT;
177 184
178 if (task) { 185 if (task) {
179 fs = get_fs_struct(task); 186 result = get_fs_path(task, path, 0);
180 put_task_struct(task); 187 put_task_struct(task);
181 } 188 }
182 if (fs) {
183 read_lock(&fs->lock);
184 *path = fs->pwd;
185 path_get(&fs->pwd);
186 read_unlock(&fs->lock);
187 result = 0;
188 put_fs_struct(fs);
189 }
190 return result; 189 return result;
191} 190}
192 191
193static int proc_root_link(struct inode *inode, struct path *path) 192static int proc_root_link(struct inode *inode, struct path *path)
194{ 193{
195 struct task_struct *task = get_proc_task(inode); 194 struct task_struct *task = get_proc_task(inode);
196 struct fs_struct *fs = NULL;
197 int result = -ENOENT; 195 int result = -ENOENT;
198 196
199 if (task) { 197 if (task) {
200 fs = get_fs_struct(task); 198 result = get_fs_path(task, path, 1);
201 put_task_struct(task); 199 put_task_struct(task);
202 } 200 }
203 if (fs) {
204 read_lock(&fs->lock);
205 *path = fs->root;
206 path_get(&fs->root);
207 read_unlock(&fs->lock);
208 result = 0;
209 put_fs_struct(fs);
210 }
211 return result; 201 return result;
212} 202}
213 203
@@ -596,7 +586,6 @@ static int mounts_open_common(struct inode *inode, struct file *file,
596 struct task_struct *task = get_proc_task(inode); 586 struct task_struct *task = get_proc_task(inode);
597 struct nsproxy *nsp; 587 struct nsproxy *nsp;
598 struct mnt_namespace *ns = NULL; 588 struct mnt_namespace *ns = NULL;
599 struct fs_struct *fs = NULL;
600 struct path root; 589 struct path root;
601 struct proc_mounts *p; 590 struct proc_mounts *p;
602 int ret = -EINVAL; 591 int ret = -EINVAL;
@@ -610,22 +599,16 @@ static int mounts_open_common(struct inode *inode, struct file *file,
610 get_mnt_ns(ns); 599 get_mnt_ns(ns);
611 } 600 }
612 rcu_read_unlock(); 601 rcu_read_unlock();
613 if (ns) 602 if (ns && get_fs_path(task, &root, 1) == 0)
614 fs = get_fs_struct(task); 603 ret = 0;
615 put_task_struct(task); 604 put_task_struct(task);
616 } 605 }
617 606
618 if (!ns) 607 if (!ns)
619 goto err; 608 goto err;
620 if (!fs) 609 if (ret)
621 goto err_put_ns; 610 goto err_put_ns;
622 611
623 read_lock(&fs->lock);
624 root = fs->root;
625 path_get(&root);
626 read_unlock(&fs->lock);
627 put_fs_struct(fs);
628
629 ret = -ENOMEM; 612 ret = -ENOMEM;
630 p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); 613 p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
631 if (!p) 614 if (!p)
@@ -1545,7 +1528,7 @@ static int pid_delete_dentry(struct dentry * dentry)
1545 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; 1528 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1546} 1529}
1547 1530
1548static struct dentry_operations pid_dentry_operations = 1531static const struct dentry_operations pid_dentry_operations =
1549{ 1532{
1550 .d_revalidate = pid_revalidate, 1533 .d_revalidate = pid_revalidate,
1551 .d_delete = pid_delete_dentry, 1534 .d_delete = pid_delete_dentry,
@@ -1717,7 +1700,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1717 return 0; 1700 return 0;
1718} 1701}
1719 1702
1720static struct dentry_operations tid_fd_dentry_operations = 1703static const struct dentry_operations tid_fd_dentry_operations =
1721{ 1704{
1722 .d_revalidate = tid_fd_revalidate, 1705 .d_revalidate = tid_fd_revalidate,
1723 .d_delete = pid_delete_dentry, 1706 .d_delete = pid_delete_dentry,
@@ -2339,7 +2322,7 @@ static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2339 return 0; 2322 return 0;
2340} 2323}
2341 2324
2342static struct dentry_operations proc_base_dentry_operations = 2325static const struct dentry_operations proc_base_dentry_operations =
2343{ 2326{
2344 .d_revalidate = proc_base_revalidate, 2327 .d_revalidate = proc_base_revalidate,
2345 .d_delete = pid_delete_dentry, 2328 .d_delete = pid_delete_dentry,
@@ -3066,7 +3049,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3066 int retval = -ENOENT; 3049 int retval = -ENOENT;
3067 ino_t ino; 3050 ino_t ino;
3068 int tid; 3051 int tid;
3069 unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */
3070 struct pid_namespace *ns; 3052 struct pid_namespace *ns;
3071 3053
3072 task = get_proc_task(inode); 3054 task = get_proc_task(inode);
@@ -3083,18 +3065,18 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3083 goto out_no_task; 3065 goto out_no_task;
3084 retval = 0; 3066 retval = 0;
3085 3067
3086 switch (pos) { 3068 switch ((unsigned long)filp->f_pos) {
3087 case 0: 3069 case 0:
3088 ino = inode->i_ino; 3070 ino = inode->i_ino;
3089 if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) 3071 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
3090 goto out; 3072 goto out;
3091 pos++; 3073 filp->f_pos++;
3092 /* fall through */ 3074 /* fall through */
3093 case 1: 3075 case 1:
3094 ino = parent_ino(dentry); 3076 ino = parent_ino(dentry);
3095 if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) 3077 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
3096 goto out; 3078 goto out;
3097 pos++; 3079 filp->f_pos++;
3098 /* fall through */ 3080 /* fall through */
3099 } 3081 }
3100 3082
@@ -3104,9 +3086,9 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3104 ns = filp->f_dentry->d_sb->s_fs_info; 3086 ns = filp->f_dentry->d_sb->s_fs_info;
3105 tid = (int)filp->f_version; 3087 tid = (int)filp->f_version;
3106 filp->f_version = 0; 3088 filp->f_version = 0;
3107 for (task = first_tid(leader, tid, pos - 2, ns); 3089 for (task = first_tid(leader, tid, filp->f_pos - 2, ns);
3108 task; 3090 task;
3109 task = next_tid(task), pos++) { 3091 task = next_tid(task), filp->f_pos++) {
3110 tid = task_pid_nr_ns(task, ns); 3092 tid = task_pid_nr_ns(task, ns);
3111 if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { 3093 if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
3112 /* returning this tgid failed, save it as the first 3094 /* returning this tgid failed, save it as the first
@@ -3117,7 +3099,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3117 } 3099 }
3118 } 3100 }
3119out: 3101out:
3120 filp->f_pos = pos;
3121 put_task_struct(leader); 3102 put_task_struct(leader);
3122out_no_task: 3103out_no_task:
3123 return retval; 3104 return retval;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index db7fa5cab988..fa678abc9db1 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -37,7 +37,7 @@ static int proc_match(int len, const char *name, struct proc_dir_entry *de)
37#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024) 37#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024)
38 38
39static ssize_t 39static ssize_t
40proc_file_read(struct file *file, char __user *buf, size_t nbytes, 40__proc_file_read(struct file *file, char __user *buf, size_t nbytes,
41 loff_t *ppos) 41 loff_t *ppos)
42{ 42{
43 struct inode * inode = file->f_path.dentry->d_inode; 43 struct inode * inode = file->f_path.dentry->d_inode;
@@ -183,19 +183,47 @@ proc_file_read(struct file *file, char __user *buf, size_t nbytes,
183} 183}
184 184
185static ssize_t 185static ssize_t
186proc_file_read(struct file *file, char __user *buf, size_t nbytes,
187 loff_t *ppos)
188{
189 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
190 ssize_t rv = -EIO;
191
192 spin_lock(&pde->pde_unload_lock);
193 if (!pde->proc_fops) {
194 spin_unlock(&pde->pde_unload_lock);
195 return rv;
196 }
197 pde->pde_users++;
198 spin_unlock(&pde->pde_unload_lock);
199
200 rv = __proc_file_read(file, buf, nbytes, ppos);
201
202 pde_users_dec(pde);
203 return rv;
204}
205
206static ssize_t
186proc_file_write(struct file *file, const char __user *buffer, 207proc_file_write(struct file *file, const char __user *buffer,
187 size_t count, loff_t *ppos) 208 size_t count, loff_t *ppos)
188{ 209{
189 struct inode *inode = file->f_path.dentry->d_inode; 210 struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
190 struct proc_dir_entry * dp; 211 ssize_t rv = -EIO;
191 212
192 dp = PDE(inode); 213 if (pde->write_proc) {
193 214 spin_lock(&pde->pde_unload_lock);
194 if (!dp->write_proc) 215 if (!pde->proc_fops) {
195 return -EIO; 216 spin_unlock(&pde->pde_unload_lock);
217 return rv;
218 }
219 pde->pde_users++;
220 spin_unlock(&pde->pde_unload_lock);
196 221
197 /* FIXME: does this routine need ppos? probably... */ 222 /* FIXME: does this routine need ppos? probably... */
198 return dp->write_proc(file, buffer, count, dp->data); 223 rv = pde->write_proc(file, buffer, count, pde->data);
224 pde_users_dec(pde);
225 }
226 return rv;
199} 227}
200 228
201 229
@@ -307,6 +335,21 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
307/* 335/*
308 * Return an inode number between PROC_DYNAMIC_FIRST and 336 * Return an inode number between PROC_DYNAMIC_FIRST and
309 * 0xffffffff, or zero on failure. 337 * 0xffffffff, or zero on failure.
338 *
339 * Current inode allocations in the proc-fs (hex-numbers):
340 *
341 * 00000000 reserved
342 * 00000001-00000fff static entries (goners)
343 * 001 root-ino
344 *
345 * 00001000-00001fff unused
346 * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff
347 * 80000000-efffffff unused
348 * f0000000-ffffffff dynamic entries
349 *
350 * Goal:
351 * Once we split the thing into several virtual filesystems,
352 * we will get rid of magical ranges (and this comment, BTW).
310 */ 353 */
311static unsigned int get_inode_number(void) 354static unsigned int get_inode_number(void)
312{ 355{
@@ -363,7 +406,7 @@ static int proc_delete_dentry(struct dentry * dentry)
363 return 1; 406 return 1;
364} 407}
365 408
366static struct dentry_operations proc_dentry_operations = 409static const struct dentry_operations proc_dentry_operations =
367{ 410{
368 .d_delete = proc_delete_dentry, 411 .d_delete = proc_delete_dentry,
369}; 412};
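
proc_file_read() and proc_file_write() now bracket the legacy handlers with the pde_users use count: take pde_unload_lock, bail out if proc_fops has already been cleared by removal, bump pde_users, drop the lock, call the handler, then pde_users_dec(). That is what keeps remove_proc_entry() from freeing the entry mid-call, and it is why pde_users_dec() goes public in internal.h later in this diff. The guard distilled, hedged to its essentials:

#include <linux/proc_fs.h>
#include <linux/spinlock.h>

/* hedged distillation of the guard now wrapped around ->read_proc
 * and ->write_proc; pair every successful call with pde_users_dec() */
static int pde_guard_enter(struct proc_dir_entry *pde)
{
	spin_lock(&pde->pde_unload_lock);
	if (!pde->proc_fops) {
		/* the entry is mid-removal: don't touch its callbacks */
		spin_unlock(&pde->pde_unload_lock);
		return -EIO;
	}
	pde->pde_users++;	/* remove_proc_entry() now waits for us */
	spin_unlock(&pde->pde_unload_lock);
	return 0;
}
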
diff --git a/fs/proc/inode-alloc.txt b/fs/proc/inode-alloc.txt
deleted file mode 100644
index 77212f938c2c..000000000000
--- a/fs/proc/inode-alloc.txt
+++ /dev/null
@@ -1,14 +0,0 @@
1Current inode allocations in the proc-fs (hex-numbers):
2
3 00000000 reserved
4 00000001-00000fff static entries (goners)
5 001 root-ino
6
7 00001000-00001fff unused
8 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff
9 80000000-efffffff unused
10 f0000000-ffffffff dynamic entries
11
12Goal:
13 a) once we'll split the thing into several virtual filesystems we
14 will get rid of magical ranges (and this file, BTW).
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d8bb5c671f42..d78ade305541 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,11 +58,8 @@ static void proc_delete_inode(struct inode *inode)
58 58
59 /* Let go of any associated proc directory entry */ 59 /* Let go of any associated proc directory entry */
60 de = PROC_I(inode)->pde; 60 de = PROC_I(inode)->pde;
61 if (de) { 61 if (de)
62 if (de->owner)
63 module_put(de->owner);
64 de_put(de); 62 de_put(de);
65 }
66 if (PROC_I(inode)->sysctl) 63 if (PROC_I(inode)->sysctl)
67 sysctl_head_put(PROC_I(inode)->sysctl); 64 sysctl_head_put(PROC_I(inode)->sysctl);
68 clear_inode(inode); 65 clear_inode(inode);
@@ -127,7 +124,7 @@ static void __pde_users_dec(struct proc_dir_entry *pde)
127 complete(pde->pde_unload_completion); 124 complete(pde->pde_unload_completion);
128} 125}
129 126
130static void pde_users_dec(struct proc_dir_entry *pde) 127void pde_users_dec(struct proc_dir_entry *pde)
131{ 128{
132 spin_lock(&pde->pde_unload_lock); 129 spin_lock(&pde->pde_unload_lock);
133 __pde_users_dec(pde); 130 __pde_users_dec(pde);
@@ -449,12 +446,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
449{ 446{
450 struct inode * inode; 447 struct inode * inode;
451 448
452 if (!try_module_get(de->owner))
453 goto out_mod;
454
455 inode = iget_locked(sb, ino); 449 inode = iget_locked(sb, ino);
456 if (!inode) 450 if (!inode)
457 goto out_ino; 451 return NULL;
458 if (inode->i_state & I_NEW) { 452 if (inode->i_state & I_NEW) {
459 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 453 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
460 PROC_I(inode)->fd = 0; 454 PROC_I(inode)->fd = 0;
@@ -485,16 +479,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
485 } 479 }
486 } 480 }
487 unlock_new_inode(inode); 481 unlock_new_inode(inode);
488 } else { 482 } else
489 module_put(de->owner);
490 de_put(de); 483 de_put(de);
491 }
492 return inode; 484 return inode;
493
494out_ino:
495 module_put(de->owner);
496out_mod:
497 return NULL;
498} 485}
499 486
500int proc_fill_super(struct super_block *s) 487int proc_fill_super(struct super_block *s)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cd53ff838498..f6db9618a888 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -91,3 +91,4 @@ struct pde_opener {
91 int (*release)(struct inode *, struct file *); 91 int (*release)(struct inode *, struct file *);
92 struct list_head lh; 92 struct list_head lh;
93}; 93};
94void pde_users_dec(struct proc_dir_entry *pde);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 43d23948384a..74ea974f5ca6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
120 K(i.freeram-i.freehigh), 120 K(i.freeram-i.freehigh),
121#endif 121#endif
122#ifndef CONFIG_MMU 122#ifndef CONFIG_MMU
123 K((unsigned long) atomic_read(&mmap_pages_allocated)), 123 K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
124#endif 124#endif
125 K(i.totalswap), 125 K(i.totalswap),
126 K(i.freeswap), 126 K(i.freeswap),
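
The counter was widened to atomic_long_t elsewhere in this merge, so the read site switches to the matching atomic_long_read() accessor. A minimal sketch of the pattern; example_* names are illustrative:

#include <asm/atomic.h>

static atomic_long_t example_pages_allocated = ATOMIC_LONG_INIT(0);

static unsigned long example_pages(void)
{
        /* atomic_long_* accessors preserve the full long width */
        return (unsigned long)atomic_long_read(&example_pages_allocated);
}
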
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index b446d7ad0b0d..7e14d1a04001 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -76,7 +76,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
76 76
77/* 77/*
78 * display a list of all the REGIONs the kernel knows about 78 * display a list of all the REGIONs the kernel knows about
79 * - nommu kernals have a single flat list 79 * - nommu kernels have a single flat list
80 */ 80 */
81static int nommu_region_list_show(struct seq_file *m, void *_p) 81static int nommu_region_list_show(struct seq_file *m, void *_p)
82{ 82{
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2d1345112a42..e9983837d08d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -80,7 +80,7 @@ static const struct file_operations proc_kpagecount_operations = {
80#define KPF_RECLAIM 9 80#define KPF_RECLAIM 9
81#define KPF_BUDDY 10 81#define KPF_BUDDY 10
82 82
83#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos) 83#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos)
84 84
85static ssize_t kpageflags_read(struct file *file, char __user *buf, 85static ssize_t kpageflags_read(struct file *file, char __user *buf,
86 size_t count, loff_t *ppos) 86 size_t count, loff_t *ppos)
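
Only the macro's parameter list was wrong: the body already shifts from srcpos to dstpos, but the parameters were declared source-first while the callers pass the destination first. With the fix a call site reads naturally, e.g. this illustrative use:

/* copy bit PG_locked of 'flags' into bit KPF_LOCKED of the result */
u64 kflags = kpf_copy_bit(flags, KPF_LOCKED, PG_locked);
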
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 94fcfff6863a..9b1e4e9a16bf 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -7,7 +7,7 @@
7#include <linux/security.h> 7#include <linux/security.h>
8#include "internal.h" 8#include "internal.h"
9 9
10static struct dentry_operations proc_sys_dentry_operations; 10static const struct dentry_operations proc_sys_dentry_operations;
11static const struct file_operations proc_sys_file_operations; 11static const struct file_operations proc_sys_file_operations;
12static const struct inode_operations proc_sys_inode_operations; 12static const struct inode_operations proc_sys_inode_operations;
13static const struct file_operations proc_sys_dir_file_operations; 13static const struct file_operations proc_sys_dir_file_operations;
@@ -396,7 +396,7 @@ static int proc_sys_compare(struct dentry *dir, struct qstr *qstr,
396 return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl); 396 return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl);
397} 397}
398 398
399static struct dentry_operations proc_sys_dentry_operations = { 399static const struct dentry_operations proc_sys_dentry_operations = {
400 .d_revalidate = proc_sys_revalidate, 400 .d_revalidate = proc_sys_revalidate,
401 .d_delete = proc_sys_delete, 401 .d_delete = proc_sys_delete,
402 .d_compare = proc_sys_compare, 402 .d_compare = proc_sys_compare,
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index d153946d6d15..83adcc869437 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -144,17 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver)
144{ 144{
145 struct proc_dir_entry *ent; 145 struct proc_dir_entry *ent;
146 146
147 if (!driver->ops->read_proc || !driver->driver_name || 147 if (!driver->driver_name || driver->proc_entry ||
148 driver->proc_entry) 148 !driver->ops->proc_fops)
149 return; 149 return;
150 150
151 ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); 151 ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
152 if (!ent) 152 driver->ops->proc_fops, driver);
153 return;
154 ent->read_proc = driver->ops->read_proc;
155 ent->owner = driver->owner;
156 ent->data = driver;
157
158 driver->proc_entry = ent; 153 driver->proc_entry = ent;
159} 154}
160 155
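
proc_create_data() replaces the create_proc_entry()-then-fill-fields sequence: the entry is registered with its file_operations and private data in one step, so it is never visible half-initialized. A minimal sketch of the consumer side; example_* names are illustrative:

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_proc_show(struct seq_file *m, void *v)
{
        seq_printf(m, "%s\n", (const char *)m->private);
        return 0;
}

static int example_proc_open(struct inode *inode, struct file *file)
{
        /* PDE(inode)->data is the pointer given to proc_create_data() */
        return single_open(file, example_proc_show, PDE(inode)->data);
}

static const struct file_operations example_proc_fops = {
        .owner   = THIS_MODULE,
        .open    = example_proc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

/* registration: proc_create_data("example", 0, NULL, &example_proc_fops, "hi"); */
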
diff --git a/fs/proc/root.c b/fs/proc/root.c
index f6299a25594e..1e15a2b176e8 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -83,7 +83,8 @@ static int proc_get_sb(struct file_system_type *fs_type,
83 ns->proc_mnt = mnt; 83 ns->proc_mnt = mnt;
84 } 84 }
85 85
86 return simple_set_mnt(mnt, sb); 86 simple_set_mnt(mnt, sb);
87 return 0;
87} 88}
88 89
89static void proc_kill_sb(struct super_block *sb) 90static void proc_kill_sb(struct super_block *sb)
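
simple_set_mnt() now returns void, so get_sb implementations return 0 explicitly instead of forwarding its result. A sketch of the resulting shape, with error handling elided and example_* names hypothetical:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/mount.h>

static int example_get_sb(struct file_system_type *fs_type, int flags,
                          const char *dev_name, void *data,
                          struct vfsmount *mnt)
{
        struct super_block *sb = example_lookup_super(data); /* hypothetical */

        if (IS_ERR(sb))
                return PTR_ERR(sb);
        simple_set_mnt(mnt, sb);        /* returns void now */
        return 0;
}
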
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 94063840832a..b0ae0be4801f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -693,8 +693,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
693 goto out_pages; 693 goto out_pages;
694 } 694 }
695 695
696 pm.out = (u64 *)buf; 696 pm.out = (u64 __user *)buf;
697 pm.end = (u64 *)(buf + count); 697 pm.end = (u64 __user *)(buf + count);
698 698
699 pagemap_walk.pmd_entry = pagemap_pte_range; 699 pagemap_walk.pmd_entry = pagemap_pte_range;
700 pagemap_walk.pte_hole = pagemap_pte_hole; 700 pagemap_walk.pte_hole = pagemap_pte_hole;
@@ -720,9 +720,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
720 if (ret == PM_END_OF_BUFFER) 720 if (ret == PM_END_OF_BUFFER)
721 ret = 0; 721 ret = 0;
722 /* don't need mmap_sem for these, but this looks cleaner */ 722 /* don't need mmap_sem for these, but this looks cleaner */
723 *ppos += (char *)pm.out - buf; 723 *ppos += (char __user *)pm.out - buf;
724 if (!ret) 724 if (!ret)
725 ret = (char *)pm.out - buf; 725 ret = (char __user *)pm.out - buf;
726 726
727out_pages: 727out_pages:
728 for (; pagecount; pagecount--) { 728 for (; pagecount; pagecount--) {
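
pm.out and pm.end point into the caller's buffer, so they carry the __user address-space annotation; sparse (make C=1) then warns about any direct dereference or unannotated cast. A small sketch of the annotation in use:

#include <linux/types.h>
#include <linux/uaccess.h>

static ssize_t example_emit(char __user *buf)
{
        u64 __user *out = (u64 __user *)buf;    /* still a user pointer */

        if (put_user((u64)0, out))              /* checked accessor */
                return -EFAULT;
        return sizeof(u64);
}
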
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 343ea1216bc8..863464d5519c 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -2,6 +2,7 @@
2#include <linux/mm.h> 2#include <linux/mm.h>
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/fdtable.h> 4#include <linux/fdtable.h>
5#include <linux/fs_struct.h>
5#include <linux/mount.h> 6#include <linux/mount.h>
6#include <linux/ptrace.h> 7#include <linux/ptrace.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
@@ -49,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
49 else 50 else
50 bytes += kobjsize(mm); 51 bytes += kobjsize(mm);
51 52
52 if (current->fs && atomic_read(&current->fs->count) > 1) 53 if (current->fs && current->fs->users > 1)
53 sbytes += kobjsize(current->fs); 54 sbytes += kobjsize(current->fs);
54 else 55 else
55 bytes += kobjsize(current->fs); 56 bytes += kobjsize(current->fs);
@@ -136,14 +137,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
136 } 137 }
137 138
138 seq_printf(m, 139 seq_printf(m,
139 "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", 140 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
140 vma->vm_start, 141 vma->vm_start,
141 vma->vm_end, 142 vma->vm_end,
142 flags & VM_READ ? 'r' : '-', 143 flags & VM_READ ? 'r' : '-',
143 flags & VM_WRITE ? 'w' : '-', 144 flags & VM_WRITE ? 'w' : '-',
144 flags & VM_EXEC ? 'x' : '-', 145 flags & VM_EXEC ? 'x' : '-',
145 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 146 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
146 vma->vm_pgoff << PAGE_SHIFT, 147 (unsigned long long) vma->vm_pgoff << PAGE_SHIFT,
147 MAJOR(dev), MINOR(dev), ino, &len); 148 MAJOR(dev), MINOR(dev), ino, &len);
148 149
149 if (file) { 150 if (file) {
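
vm_pgoff is an unsigned long, so shifting it by PAGE_SHIFT can overflow on 32-bit kernels with large file offsets; the value is widened first and printed with %08llx. The cast binds tighter than the shift, so the arithmetic happens in 64 bits:

/* equivalent to the expression in the hunk above */
unsigned long long off = (unsigned long long)vma->vm_pgoff << PAGE_SHIFT;
seq_printf(m, "%08llx", off);
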
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index df26aa88fa47..0c10a0b3f146 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -1,45 +1,43 @@
1#include <linux/fs.h>
1#include <linux/init.h> 2#include <linux/init.h>
2#include <linux/proc_fs.h> 3#include <linux/proc_fs.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/seq_file.h>
4#include <linux/time.h> 6#include <linux/time.h>
5#include <asm/cputime.h> 7#include <asm/cputime.h>
6 8
7static int proc_calc_metrics(char *page, char **start, off_t off, 9static int uptime_proc_show(struct seq_file *m, void *v)
8 int count, int *eof, int len)
9{
10 if (len <= off + count)
11 *eof = 1;
12 *start = page + off;
13 len -= off;
14 if (len > count)
15 len = count;
16 if (len < 0)
17 len = 0;
18 return len;
19}
20
21static int uptime_read_proc(char *page, char **start, off_t off, int count,
22 int *eof, void *data)
23{ 10{
24 struct timespec uptime; 11 struct timespec uptime;
25 struct timespec idle; 12 struct timespec idle;
26 int len;
27 cputime_t idletime = cputime_add(init_task.utime, init_task.stime); 13 cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
28 14
29 do_posix_clock_monotonic_gettime(&uptime); 15 do_posix_clock_monotonic_gettime(&uptime);
30 monotonic_to_bootbased(&uptime); 16 monotonic_to_bootbased(&uptime);
31 cputime_to_timespec(idletime, &idle); 17 cputime_to_timespec(idletime, &idle);
32 len = sprintf(page, "%lu.%02lu %lu.%02lu\n", 18 seq_printf(m, "%lu.%02lu %lu.%02lu\n",
33 (unsigned long) uptime.tv_sec, 19 (unsigned long) uptime.tv_sec,
34 (uptime.tv_nsec / (NSEC_PER_SEC / 100)), 20 (uptime.tv_nsec / (NSEC_PER_SEC / 100)),
35 (unsigned long) idle.tv_sec, 21 (unsigned long) idle.tv_sec,
36 (idle.tv_nsec / (NSEC_PER_SEC / 100))); 22 (idle.tv_nsec / (NSEC_PER_SEC / 100)));
37 return proc_calc_metrics(page, start, off, count, eof, len); 23 return 0;
38} 24}
39 25
26static int uptime_proc_open(struct inode *inode, struct file *file)
27{
28 return single_open(file, uptime_proc_show, NULL);
29}
30
31static const struct file_operations uptime_proc_fops = {
32 .open = uptime_proc_open,
33 .read = seq_read,
34 .llseek = seq_lseek,
35 .release = single_release,
36};
37
40static int __init proc_uptime_init(void) 38static int __init proc_uptime_init(void)
41{ 39{
42 create_proc_read_entry("uptime", 0, NULL, uptime_read_proc, NULL); 40 proc_create("uptime", 0, NULL, &uptime_proc_fops);
43 return 0; 41 return 0;
44} 42}
45module_init(proc_uptime_init); 43module_init(proc_uptime_init);
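
The deleted proc_calc_metrics() was hand-rolled bookkeeping for the old read_proc interface: clamp the length, honour the file offset, set *eof. The seq_file core subsumes all of that, so a converted show routine only prints. A minimal sketch; example_* names are illustrative:

#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
        seq_printf(m, "42\n");  /* no buffer, offset or *eof handling */
        return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_show, NULL);
}

static const struct file_operations example_fops = {
        .open    = example_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};
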
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2aad1044b84c..fe1f0f31d11c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -282,6 +282,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) 282static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
283{ 283{
284 struct super_block *sb = dentry->d_sb; 284 struct super_block *sb = dentry->d_sb;
285 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
285 286
286 lock_kernel(); 287 lock_kernel();
287 288
@@ -291,6 +292,8 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
291 buf->f_bfree = qnx4_count_free_blocks(sb); 292 buf->f_bfree = qnx4_count_free_blocks(sb);
292 buf->f_bavail = buf->f_bfree; 293 buf->f_bavail = buf->f_bfree;
293 buf->f_namelen = QNX4_NAME_MAX; 294 buf->f_namelen = QNX4_NAME_MAX;
295 buf->f_fsid.val[0] = (u32)id;
296 buf->f_fsid.val[1] = (u32)(id >> 32);
294 297
295 unlock_kernel(); 298 unlock_kernel();
296 299
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
new file mode 100644
index 000000000000..8047e01ef46b
--- /dev/null
+++ b/fs/quota/Kconfig
@@ -0,0 +1,59 @@
1#
2# Quota configuration
3#
4
5config QUOTA
6 bool "Quota support"
7 help
8 If you say Y here, you will be able to set per user limits for disk
9 usage (also called disk quotas). Currently, it works for the
10 ext2, ext3, and reiserfs file systems. ext3 also supports journalled
11 quotas for which you don't need to run quotacheck(8) after an unclean
12 shutdown.
13 For further details, read the Quota mini-HOWTO, available from
14 <http://www.tldp.org/docs.html#howto>, or the documentation provided
15 with the quota tools. Quota support is probably only useful for
16 multi-user systems. If unsure, say N.
17
18config QUOTA_NETLINK_INTERFACE
19 bool "Report quota messages through netlink interface"
20 depends on QUOTA && NET
21 help
22 If you say Y here, quota warnings (about exceeding softlimit, reaching
23 hardlimit, etc.) will be reported through netlink interface. If unsure,
24 say Y.
25
26config PRINT_QUOTA_WARNING
27 bool "Print quota warnings to console (OBSOLETE)"
28 depends on QUOTA
29 default y
30 help
31 If you say Y here, quota warnings (about exceeding softlimit, reaching
32 hardlimit, etc.) will be printed to the process' controlling terminal.
33 Note that this behavior is currently deprecated and may go away in
34 the future. Please use notification via the netlink socket instead.
35
36# Generic support for tree structured quota files. Selected when needed.
37config QUOTA_TREE
38 tristate
39
40config QFMT_V1
41 tristate "Old quota format support"
42 depends on QUOTA
43 help
44 This quota format was (is) used by kernels earlier than 2.4.22. If
45 you have quota working and you don't want to convert to the new quota
46 format, say Y here.
47
48config QFMT_V2
49 tristate "Quota format v2 support"
50 depends on QUOTA
51 select QUOTA_TREE
52 help
53 This quota format allows using quotas with 32-bit UIDs/GIDs. If you
54 need this functionality say Y here.
55
56config QUOTACTL
57 bool
58 depends on XFS_QUOTA || QUOTA
59 default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
new file mode 100644
index 000000000000..385a0831cc99
--- /dev/null
+++ b/fs/quota/Makefile
@@ -0,0 +1,14 @@
1#
2# Makefile for the Linux filesystems.
3#
4# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
5# Rewritten to use lists instead of if-statements.
6#
7
8obj-y :=
9
10obj-$(CONFIG_QUOTA) += dquot.o
11obj-$(CONFIG_QFMT_V1) += quota_v1.o
12obj-$(CONFIG_QFMT_V2) += quota_v2.o
13obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
14obj-$(CONFIG_QUOTACTL) += quota.o
diff --git a/fs/dquot.c b/fs/quota/dquot.c
index bca3cac4bee7..607c579e5eca 100644
--- a/fs/dquot.c
+++ b/fs/quota/dquot.c
@@ -129,9 +129,10 @@
129 * i_mutex on quota files is special (it's below dqio_mutex) 129 * i_mutex on quota files is special (it's below dqio_mutex)
130 */ 130 */
131 131
132static DEFINE_SPINLOCK(dq_list_lock); 132static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
133static DEFINE_SPINLOCK(dq_state_lock); 133static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
134DEFINE_SPINLOCK(dq_data_lock); 134__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
135EXPORT_SYMBOL(dq_data_lock);
135 136
136static char *quotatypes[] = INITQFNAMES; 137static char *quotatypes[] = INITQFNAMES;
137static struct quota_format_type *quota_formats; /* List of registered formats */ 138static struct quota_format_type *quota_formats; /* List of registered formats */
@@ -148,35 +149,46 @@ int register_quota_format(struct quota_format_type *fmt)
148 spin_unlock(&dq_list_lock); 149 spin_unlock(&dq_list_lock);
149 return 0; 150 return 0;
150} 151}
152EXPORT_SYMBOL(register_quota_format);
151 153
152void unregister_quota_format(struct quota_format_type *fmt) 154void unregister_quota_format(struct quota_format_type *fmt)
153{ 155{
154 struct quota_format_type **actqf; 156 struct quota_format_type **actqf;
155 157
156 spin_lock(&dq_list_lock); 158 spin_lock(&dq_list_lock);
157 for (actqf = &quota_formats; *actqf && *actqf != fmt; actqf = &(*actqf)->qf_next); 159 for (actqf = &quota_formats; *actqf && *actqf != fmt;
160 actqf = &(*actqf)->qf_next)
161 ;
158 if (*actqf) 162 if (*actqf)
159 *actqf = (*actqf)->qf_next; 163 *actqf = (*actqf)->qf_next;
160 spin_unlock(&dq_list_lock); 164 spin_unlock(&dq_list_lock);
161} 165}
166EXPORT_SYMBOL(unregister_quota_format);
162 167
163static struct quota_format_type *find_quota_format(int id) 168static struct quota_format_type *find_quota_format(int id)
164{ 169{
165 struct quota_format_type *actqf; 170 struct quota_format_type *actqf;
166 171
167 spin_lock(&dq_list_lock); 172 spin_lock(&dq_list_lock);
168 for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next); 173 for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
174 actqf = actqf->qf_next)
175 ;
169 if (!actqf || !try_module_get(actqf->qf_owner)) { 176 if (!actqf || !try_module_get(actqf->qf_owner)) {
170 int qm; 177 int qm;
171 178
172 spin_unlock(&dq_list_lock); 179 spin_unlock(&dq_list_lock);
173 180
174 for (qm = 0; module_names[qm].qm_fmt_id && module_names[qm].qm_fmt_id != id; qm++); 181 for (qm = 0; module_names[qm].qm_fmt_id &&
175 if (!module_names[qm].qm_fmt_id || request_module(module_names[qm].qm_mod_name)) 182 module_names[qm].qm_fmt_id != id; qm++)
183 ;
184 if (!module_names[qm].qm_fmt_id ||
185 request_module(module_names[qm].qm_mod_name))
176 return NULL; 186 return NULL;
177 187
178 spin_lock(&dq_list_lock); 188 spin_lock(&dq_list_lock);
179 for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next); 189 for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
190 actqf = actqf->qf_next)
191 ;
180 if (actqf && !try_module_get(actqf->qf_owner)) 192 if (actqf && !try_module_get(actqf->qf_owner))
181 actqf = NULL; 193 actqf = NULL;
182 } 194 }
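
Several search loops in this file have deliberately empty bodies; the restyle moves the terminating semicolon onto its own line so the emptiness is explicit instead of hiding at the end of the condition. The pattern in isolation:

/* walk to the matching entry; the loop header does all the work */
for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
                actqf = actqf->qf_next)
        ;
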
@@ -215,6 +227,7 @@ static unsigned int dq_hash_bits, dq_hash_mask;
215static struct hlist_head *dquot_hash; 227static struct hlist_head *dquot_hash;
216 228
217struct dqstats dqstats; 229struct dqstats dqstats;
230EXPORT_SYMBOL(dqstats);
218 231
219static inline unsigned int 232static inline unsigned int
220hashfn(const struct super_block *sb, unsigned int id, int type) 233hashfn(const struct super_block *sb, unsigned int id, int type)
@@ -230,7 +243,8 @@ hashfn(const struct super_block *sb, unsigned int id, int type)
230 */ 243 */
231static inline void insert_dquot_hash(struct dquot *dquot) 244static inline void insert_dquot_hash(struct dquot *dquot)
232{ 245{
233 struct hlist_head *head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id, dquot->dq_type); 246 struct hlist_head *head;
247 head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id, dquot->dq_type);
234 hlist_add_head(&dquot->dq_hash, head); 248 hlist_add_head(&dquot->dq_hash, head);
235} 249}
236 250
@@ -239,17 +253,19 @@ static inline void remove_dquot_hash(struct dquot *dquot)
239 hlist_del_init(&dquot->dq_hash); 253 hlist_del_init(&dquot->dq_hash);
240} 254}
241 255
242static inline struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, unsigned int id, int type) 256static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
257 unsigned int id, int type)
243{ 258{
244 struct hlist_node *node; 259 struct hlist_node *node;
245 struct dquot *dquot; 260 struct dquot *dquot;
246 261
247 hlist_for_each (node, dquot_hash+hashent) { 262 hlist_for_each (node, dquot_hash+hashent) {
248 dquot = hlist_entry(node, struct dquot, dq_hash); 263 dquot = hlist_entry(node, struct dquot, dq_hash);
249 if (dquot->dq_sb == sb && dquot->dq_id == id && dquot->dq_type == type) 264 if (dquot->dq_sb == sb && dquot->dq_id == id &&
265 dquot->dq_type == type)
250 return dquot; 266 return dquot;
251 } 267 }
252 return NODQUOT; 268 return NULL;
253} 269}
254 270
255/* Add a dquot to the tail of the free list */ 271/* Add a dquot to the tail of the free list */
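
NODQUOT was just an alias for a NULL dquot pointer; the cleanup drops it throughout this file in favour of plain NULL and boolean pointer tests, as in:

struct dquot *dquot = find_dquot(hashent, sb, id, type);

if (!dquot)                     /* was: dquot == NODQUOT */
        dquot = empty;
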
@@ -309,6 +325,7 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
309 spin_unlock(&dq_list_lock); 325 spin_unlock(&dq_list_lock);
310 return 0; 326 return 0;
311} 327}
328EXPORT_SYMBOL(dquot_mark_dquot_dirty);
312 329
313/* This function needs dq_list_lock */ 330/* This function needs dq_list_lock */
314static inline int clear_dquot_dirty(struct dquot *dquot) 331static inline int clear_dquot_dirty(struct dquot *dquot)
@@ -345,8 +362,10 @@ int dquot_acquire(struct dquot *dquot)
345 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) { 362 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) {
346 ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); 363 ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot);
347 /* Write the info if needed */ 364 /* Write the info if needed */
348 if (info_dirty(&dqopt->info[dquot->dq_type])) 365 if (info_dirty(&dqopt->info[dquot->dq_type])) {
349 ret2 = dqopt->ops[dquot->dq_type]->write_file_info(dquot->dq_sb, dquot->dq_type); 366 ret2 = dqopt->ops[dquot->dq_type]->write_file_info(
367 dquot->dq_sb, dquot->dq_type);
368 }
350 if (ret < 0) 369 if (ret < 0)
351 goto out_iolock; 370 goto out_iolock;
352 if (ret2 < 0) { 371 if (ret2 < 0) {
@@ -360,6 +379,7 @@ out_iolock:
360 mutex_unlock(&dquot->dq_lock); 379 mutex_unlock(&dquot->dq_lock);
361 return ret; 380 return ret;
362} 381}
382EXPORT_SYMBOL(dquot_acquire);
363 383
364/* 384/*
365 * Write dquot to disk 385 * Write dquot to disk
@@ -380,8 +400,10 @@ int dquot_commit(struct dquot *dquot)
380 * => we have better not writing it */ 400 * => we have better not writing it */
381 if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { 401 if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
382 ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); 402 ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot);
383 if (info_dirty(&dqopt->info[dquot->dq_type])) 403 if (info_dirty(&dqopt->info[dquot->dq_type])) {
384 ret2 = dqopt->ops[dquot->dq_type]->write_file_info(dquot->dq_sb, dquot->dq_type); 404 ret2 = dqopt->ops[dquot->dq_type]->write_file_info(
405 dquot->dq_sb, dquot->dq_type);
406 }
385 if (ret >= 0) 407 if (ret >= 0)
386 ret = ret2; 408 ret = ret2;
387 } 409 }
@@ -389,6 +411,7 @@ out_sem:
389 mutex_unlock(&dqopt->dqio_mutex); 411 mutex_unlock(&dqopt->dqio_mutex);
390 return ret; 412 return ret;
391} 413}
414EXPORT_SYMBOL(dquot_commit);
392 415
393/* 416/*
394 * Release dquot 417 * Release dquot
@@ -406,8 +429,10 @@ int dquot_release(struct dquot *dquot)
406 if (dqopt->ops[dquot->dq_type]->release_dqblk) { 429 if (dqopt->ops[dquot->dq_type]->release_dqblk) {
407 ret = dqopt->ops[dquot->dq_type]->release_dqblk(dquot); 430 ret = dqopt->ops[dquot->dq_type]->release_dqblk(dquot);
408 /* Write the info */ 431 /* Write the info */
409 if (info_dirty(&dqopt->info[dquot->dq_type])) 432 if (info_dirty(&dqopt->info[dquot->dq_type])) {
410 ret2 = dqopt->ops[dquot->dq_type]->write_file_info(dquot->dq_sb, dquot->dq_type); 433 ret2 = dqopt->ops[dquot->dq_type]->write_file_info(
434 dquot->dq_sb, dquot->dq_type);
435 }
411 if (ret >= 0) 436 if (ret >= 0)
412 ret = ret2; 437 ret = ret2;
413 } 438 }
@@ -417,6 +442,7 @@ out_dqlock:
417 mutex_unlock(&dquot->dq_lock); 442 mutex_unlock(&dquot->dq_lock);
418 return ret; 443 return ret;
419} 444}
445EXPORT_SYMBOL(dquot_release);
420 446
421void dquot_destroy(struct dquot *dquot) 447void dquot_destroy(struct dquot *dquot)
422{ 448{
@@ -516,6 +542,7 @@ out:
516 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 542 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
517 return ret; 543 return ret;
518} 544}
545EXPORT_SYMBOL(dquot_scan_active);
519 546
520int vfs_quota_sync(struct super_block *sb, int type) 547int vfs_quota_sync(struct super_block *sb, int type)
521{ 548{
@@ -533,7 +560,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
533 spin_lock(&dq_list_lock); 560 spin_lock(&dq_list_lock);
534 dirty = &dqopt->info[cnt].dqi_dirty_list; 561 dirty = &dqopt->info[cnt].dqi_dirty_list;
535 while (!list_empty(dirty)) { 562 while (!list_empty(dirty)) {
536 dquot = list_first_entry(dirty, struct dquot, dq_dirty); 563 dquot = list_first_entry(dirty, struct dquot,
564 dq_dirty);
537 /* Dirty and inactive can be only bad dquot... */ 565 /* Dirty and inactive can be only bad dquot... */
538 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { 566 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
539 clear_dquot_dirty(dquot); 567 clear_dquot_dirty(dquot);
@@ -563,6 +591,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
563 591
564 return 0; 592 return 0;
565} 593}
594EXPORT_SYMBOL(vfs_quota_sync);
566 595
567/* Free unused dquots from cache */ 596/* Free unused dquots from cache */
568static void prune_dqcache(int count) 597static void prune_dqcache(int count)
@@ -672,6 +701,7 @@ we_slept:
672 put_dquot_last(dquot); 701 put_dquot_last(dquot);
673 spin_unlock(&dq_list_lock); 702 spin_unlock(&dq_list_lock);
674} 703}
704EXPORT_SYMBOL(dqput);
675 705
676struct dquot *dquot_alloc(struct super_block *sb, int type) 706struct dquot *dquot_alloc(struct super_block *sb, int type)
677{ 707{
@@ -685,7 +715,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
685 715
686 dquot = sb->dq_op->alloc_dquot(sb, type); 716 dquot = sb->dq_op->alloc_dquot(sb, type);
687 if(!dquot) 717 if(!dquot)
688 return NODQUOT; 718 return NULL;
689 719
690 mutex_init(&dquot->dq_lock); 720 mutex_init(&dquot->dq_lock);
691 INIT_LIST_HEAD(&dquot->dq_free); 721 INIT_LIST_HEAD(&dquot->dq_free);
@@ -711,10 +741,10 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
711struct dquot *dqget(struct super_block *sb, unsigned int id, int type) 741struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
712{ 742{
713 unsigned int hashent = hashfn(sb, id, type); 743 unsigned int hashent = hashfn(sb, id, type);
714 struct dquot *dquot = NODQUOT, *empty = NODQUOT; 744 struct dquot *dquot = NULL, *empty = NULL;
715 745
716 if (!sb_has_quota_active(sb, type)) 746 if (!sb_has_quota_active(sb, type))
717 return NODQUOT; 747 return NULL;
718we_slept: 748we_slept:
719 spin_lock(&dq_list_lock); 749 spin_lock(&dq_list_lock);
720 spin_lock(&dq_state_lock); 750 spin_lock(&dq_state_lock);
@@ -725,15 +755,17 @@ we_slept:
725 } 755 }
726 spin_unlock(&dq_state_lock); 756 spin_unlock(&dq_state_lock);
727 757
728 if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) { 758 dquot = find_dquot(hashent, sb, id, type);
729 if (empty == NODQUOT) { 759 if (!dquot) {
760 if (!empty) {
730 spin_unlock(&dq_list_lock); 761 spin_unlock(&dq_list_lock);
731 if ((empty = get_empty_dquot(sb, type)) == NODQUOT) 762 empty = get_empty_dquot(sb, type);
763 if (!empty)
732 schedule(); /* Try to wait for a moment... */ 764 schedule(); /* Try to wait for a moment... */
733 goto we_slept; 765 goto we_slept;
734 } 766 }
735 dquot = empty; 767 dquot = empty;
736 empty = NODQUOT; 768 empty = NULL;
737 dquot->dq_id = id; 769 dquot->dq_id = id;
738 /* all dquots go on the inuse_list */ 770 /* all dquots go on the inuse_list */
739 put_inuse(dquot); 771 put_inuse(dquot);
@@ -749,13 +781,14 @@ we_slept:
749 dqstats.lookups++; 781 dqstats.lookups++;
750 spin_unlock(&dq_list_lock); 782 spin_unlock(&dq_list_lock);
751 } 783 }
752 /* Wait for dq_lock - after this we know that either dquot_release() is already 784 /* Wait for dq_lock - after this we know that either dquot_release() is
753 * finished or it will be canceled due to dq_count > 1 test */ 785 * already finished or it will be canceled due to dq_count > 1 test */
754 wait_on_dquot(dquot); 786 wait_on_dquot(dquot);
755 /* Read the dquot and instantiate it (everything done only if needed) */ 787 /* Read the dquot / allocate space in quota file */
756 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) { 788 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) &&
789 sb->dq_op->acquire_dquot(dquot) < 0) {
757 dqput(dquot); 790 dqput(dquot);
758 dquot = NODQUOT; 791 dquot = NULL;
759 goto out; 792 goto out;
760 } 793 }
761#ifdef __DQUOT_PARANOIA 794#ifdef __DQUOT_PARANOIA
@@ -767,6 +800,7 @@ out:
767 800
768 return dquot; 801 return dquot;
769} 802}
803EXPORT_SYMBOL(dqget);
770 804
771static int dqinit_needed(struct inode *inode, int type) 805static int dqinit_needed(struct inode *inode, int type)
772{ 806{
@@ -775,9 +809,9 @@ static int dqinit_needed(struct inode *inode, int type)
775 if (IS_NOQUOTA(inode)) 809 if (IS_NOQUOTA(inode))
776 return 0; 810 return 0;
777 if (type != -1) 811 if (type != -1)
778 return inode->i_dquot[type] == NODQUOT; 812 return !inode->i_dquot[type];
779 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 813 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
780 if (inode->i_dquot[cnt] == NODQUOT) 814 if (!inode->i_dquot[cnt])
781 return 1; 815 return 1;
782 return 0; 816 return 0;
783} 817}
@@ -789,12 +823,12 @@ static void add_dquot_ref(struct super_block *sb, int type)
789 823
790 spin_lock(&inode_lock); 824 spin_lock(&inode_lock);
791 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 825 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
826 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
827 continue;
792 if (!atomic_read(&inode->i_writecount)) 828 if (!atomic_read(&inode->i_writecount))
793 continue; 829 continue;
794 if (!dqinit_needed(inode, type)) 830 if (!dqinit_needed(inode, type))
795 continue; 831 continue;
796 if (inode->i_state & (I_FREEING|I_WILL_FREE))
797 continue;
798 832
799 __iget(inode); 833 __iget(inode);
800 spin_unlock(&inode_lock); 834 spin_unlock(&inode_lock);
@@ -813,7 +847,10 @@ static void add_dquot_ref(struct super_block *sb, int type)
813 iput(old_inode); 847 iput(old_inode);
814} 848}
815 849
816/* Return 0 if dqput() won't block (note that 1 doesn't necessarily mean blocking) */ 850/*
851 * Return 0 if dqput() won't block.
852 * (note that 1 doesn't necessarily mean blocking)
853 */
817static inline int dqput_blocks(struct dquot *dquot) 854static inline int dqput_blocks(struct dquot *dquot)
818{ 855{
819 if (atomic_read(&dquot->dq_count) <= 1) 856 if (atomic_read(&dquot->dq_count) <= 1)
@@ -821,22 +858,27 @@ static inline int dqput_blocks(struct dquot *dquot)
821 return 0; 858 return 0;
822} 859}
823 860
824/* Remove references to dquots from inode - add dquot to list for freeing if needed */ 861/*
825/* We can't race with anybody because we hold dqptr_sem for writing... */ 862 * Remove references to dquots from inode and add dquot to list for freeing
863 * if we have the last referece to dquot
864 * We can't race with anybody because we hold dqptr_sem for writing...
865 */
826static int remove_inode_dquot_ref(struct inode *inode, int type, 866static int remove_inode_dquot_ref(struct inode *inode, int type,
827 struct list_head *tofree_head) 867 struct list_head *tofree_head)
828{ 868{
829 struct dquot *dquot = inode->i_dquot[type]; 869 struct dquot *dquot = inode->i_dquot[type];
830 870
831 inode->i_dquot[type] = NODQUOT; 871 inode->i_dquot[type] = NULL;
832 if (dquot != NODQUOT) { 872 if (dquot) {
833 if (dqput_blocks(dquot)) { 873 if (dqput_blocks(dquot)) {
834#ifdef __DQUOT_PARANOIA 874#ifdef __DQUOT_PARANOIA
835 if (atomic_read(&dquot->dq_count) != 1) 875 if (atomic_read(&dquot->dq_count) != 1)
836 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); 876 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
837#endif 877#endif
838 spin_lock(&dq_list_lock); 878 spin_lock(&dq_list_lock);
839 list_add(&dquot->dq_free, tofree_head); /* As dquot must have currently users it can't be on the free list... */ 879 /* As dquot must have currently users it can't be on
880 * the free list... */
881 list_add(&dquot->dq_free, tofree_head);
840 spin_unlock(&dq_list_lock); 882 spin_unlock(&dq_list_lock);
841 return 1; 883 return 1;
842 } 884 }
@@ -846,19 +888,22 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
846 return 0; 888 return 0;
847} 889}
848 890
849/* Free list of dquots - called from inode.c */ 891/*
850/* dquots are removed from inodes, no new references can be got so we are the only ones holding reference */ 892 * Free list of dquots
893 * Dquots are removed from inodes and no new references can be got so we are
894 * the only ones holding reference
895 */
851static void put_dquot_list(struct list_head *tofree_head) 896static void put_dquot_list(struct list_head *tofree_head)
852{ 897{
853 struct list_head *act_head; 898 struct list_head *act_head;
854 struct dquot *dquot; 899 struct dquot *dquot;
855 900
856 act_head = tofree_head->next; 901 act_head = tofree_head->next;
857 /* So now we have dquots on the list... Just free them */
858 while (act_head != tofree_head) { 902 while (act_head != tofree_head) {
859 dquot = list_entry(act_head, struct dquot, dq_free); 903 dquot = list_entry(act_head, struct dquot, dq_free);
860 act_head = act_head->next; 904 act_head = act_head->next;
861 list_del_init(&dquot->dq_free); /* Remove dquot from the list so we won't have problems... */ 905 /* Remove dquot from the list so we won't have problems... */
906 list_del_init(&dquot->dq_free);
862 dqput(dquot); 907 dqput(dquot);
863 } 908 }
864} 909}
@@ -870,6 +915,12 @@ static void remove_dquot_ref(struct super_block *sb, int type,
870 915
871 spin_lock(&inode_lock); 916 spin_lock(&inode_lock);
872 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 917 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
918 /*
919 * We have to scan also I_NEW inodes because they can already
920 * have quota pointer initialized. Luckily, we need to touch
921 * only quota pointers and these have separate locking
922 * (dqptr_sem).
923 */
873 if (!IS_NOQUOTA(inode)) 924 if (!IS_NOQUOTA(inode))
874 remove_inode_dquot_ref(inode, type, tofree_head); 925 remove_inode_dquot_ref(inode, type, tofree_head);
875 } 926 }
@@ -899,7 +950,29 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
899 dquot->dq_dqb.dqb_curspace += number; 950 dquot->dq_dqb.dqb_curspace += number;
900} 951}
901 952
902static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number) 953static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
954{
955 dquot->dq_dqb.dqb_rsvspace += number;
956}
957
958/*
959 * Claim reserved quota space
960 */
961static void dquot_claim_reserved_space(struct dquot *dquot,
962 qsize_t number)
963{
964 WARN_ON(dquot->dq_dqb.dqb_rsvspace < number);
965 dquot->dq_dqb.dqb_curspace += number;
966 dquot->dq_dqb.dqb_rsvspace -= number;
967}
968
969static inline
970void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
971{
972 dquot->dq_dqb.dqb_rsvspace -= number;
973}
974
975static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
903{ 976{
904 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || 977 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
905 dquot->dq_dqb.dqb_curinodes >= number) 978 dquot->dq_dqb.dqb_curinodes >= number)
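
These helpers introduce a second space counter, dqb_rsvspace, next to dqb_curspace: bytes reserved for delayed allocation but not yet written. Limit checks charge the sum of both, as the reworked check_bdq() below computes. Conceptually:

#include <linux/quota.h>

/* illustration only: the total space a dquot is charged for */
static inline qsize_t example_dquot_charged(struct dquot *dquot)
{
        return dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace;
}
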
@@ -911,7 +984,7 @@ static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
911 clear_bit(DQ_INODES_B, &dquot->dq_flags); 984 clear_bit(DQ_INODES_B, &dquot->dq_flags);
912} 985}
913 986
914static inline void dquot_decr_space(struct dquot *dquot, qsize_t number) 987static void dquot_decr_space(struct dquot *dquot, qsize_t number)
915{ 988{
916 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || 989 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
917 dquot->dq_dqb.dqb_curspace >= number) 990 dquot->dq_dqb.dqb_curspace >= number)
@@ -938,7 +1011,7 @@ static int warning_issued(struct dquot *dquot, const int warntype)
938#ifdef CONFIG_PRINT_QUOTA_WARNING 1011#ifdef CONFIG_PRINT_QUOTA_WARNING
939static int flag_print_warnings = 1; 1012static int flag_print_warnings = 1;
940 1013
941static inline int need_print_warning(struct dquot *dquot) 1014static int need_print_warning(struct dquot *dquot)
942{ 1015{
943 if (!flag_print_warnings) 1016 if (!flag_print_warnings)
944 return 0; 1017 return 0;
@@ -1057,10 +1130,7 @@ static void send_warning(const struct dquot *dquot, const char warntype)
1057 goto attr_err_out; 1130 goto attr_err_out;
1058 genlmsg_end(skb, msg_head); 1131 genlmsg_end(skb, msg_head);
1059 1132
1060 ret = genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); 1133 genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
1061 if (ret < 0 && ret != -ESRCH)
1062 printk(KERN_ERR
1063 "VFS: Failed to send notification message: %d\n", ret);
1064 return; 1134 return;
1065attr_err_out: 1135attr_err_out:
1066 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); 1136 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
@@ -1068,13 +1138,17 @@ err_out:
1068 kfree_skb(skb); 1138 kfree_skb(skb);
1069} 1139}
1070#endif 1140#endif
1071 1141/*
1072static inline void flush_warnings(struct dquot * const *dquots, char *warntype) 1142 * Write warnings to the console and send warning messages over netlink.
1143 *
1144 * Note that this function can sleep.
1145 */
1146static void flush_warnings(struct dquot *const *dquots, char *warntype)
1073{ 1147{
1074 int i; 1148 int i;
1075 1149
1076 for (i = 0; i < MAXQUOTAS; i++) 1150 for (i = 0; i < MAXQUOTAS; i++)
1077 if (dquots[i] != NODQUOT && warntype[i] != QUOTA_NL_NOWARN && 1151 if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN &&
1078 !warning_issued(dquots[i], warntype[i])) { 1152 !warning_issued(dquots[i], warntype[i])) {
1079#ifdef CONFIG_PRINT_QUOTA_WARNING 1153#ifdef CONFIG_PRINT_QUOTA_WARNING
1080 print_warning(dquots[i], warntype[i]); 1154 print_warning(dquots[i], warntype[i]);
@@ -1085,42 +1159,47 @@ static inline void flush_warnings(struct dquot * const *dquots, char *warntype)
1085 } 1159 }
1086} 1160}
1087 1161
1088static inline char ignore_hardlimit(struct dquot *dquot) 1162static int ignore_hardlimit(struct dquot *dquot)
1089{ 1163{
1090 struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; 1164 struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
1091 1165
1092 return capable(CAP_SYS_RESOURCE) && 1166 return capable(CAP_SYS_RESOURCE) &&
1093 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || !(info->dqi_flags & V1_DQF_RSQUASH)); 1167 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
1168 !(info->dqi_flags & V1_DQF_RSQUASH));
1094} 1169}
1095 1170
1096/* needs dq_data_lock */ 1171/* needs dq_data_lock */
1097static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) 1172static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1098{ 1173{
1174 qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes;
1175
1099 *warntype = QUOTA_NL_NOWARN; 1176 *warntype = QUOTA_NL_NOWARN;
1100 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || 1177 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1101 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1178 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1102 return QUOTA_OK; 1179 return QUOTA_OK;
1103 1180
1104 if (dquot->dq_dqb.dqb_ihardlimit && 1181 if (dquot->dq_dqb.dqb_ihardlimit &&
1105 (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_ihardlimit && 1182 newinodes > dquot->dq_dqb.dqb_ihardlimit &&
1106 !ignore_hardlimit(dquot)) { 1183 !ignore_hardlimit(dquot)) {
1107 *warntype = QUOTA_NL_IHARDWARN; 1184 *warntype = QUOTA_NL_IHARDWARN;
1108 return NO_QUOTA; 1185 return NO_QUOTA;
1109 } 1186 }
1110 1187
1111 if (dquot->dq_dqb.dqb_isoftlimit && 1188 if (dquot->dq_dqb.dqb_isoftlimit &&
1112 (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_isoftlimit && 1189 newinodes > dquot->dq_dqb.dqb_isoftlimit &&
1113 dquot->dq_dqb.dqb_itime && get_seconds() >= dquot->dq_dqb.dqb_itime && 1190 dquot->dq_dqb.dqb_itime &&
1191 get_seconds() >= dquot->dq_dqb.dqb_itime &&
1114 !ignore_hardlimit(dquot)) { 1192 !ignore_hardlimit(dquot)) {
1115 *warntype = QUOTA_NL_ISOFTLONGWARN; 1193 *warntype = QUOTA_NL_ISOFTLONGWARN;
1116 return NO_QUOTA; 1194 return NO_QUOTA;
1117 } 1195 }
1118 1196
1119 if (dquot->dq_dqb.dqb_isoftlimit && 1197 if (dquot->dq_dqb.dqb_isoftlimit &&
1120 (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_isoftlimit && 1198 newinodes > dquot->dq_dqb.dqb_isoftlimit &&
1121 dquot->dq_dqb.dqb_itime == 0) { 1199 dquot->dq_dqb.dqb_itime == 0) {
1122 *warntype = QUOTA_NL_ISOFTWARN; 1200 *warntype = QUOTA_NL_ISOFTWARN;
1123 dquot->dq_dqb.dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; 1201 dquot->dq_dqb.dqb_itime = get_seconds() +
1202 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
1124 } 1203 }
1125 1204
1126 return QUOTA_OK; 1205 return QUOTA_OK;
@@ -1129,13 +1208,19 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1129/* needs dq_data_lock */ 1208/* needs dq_data_lock */
1130static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) 1209static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
1131{ 1210{
1211 qsize_t tspace;
1212 struct super_block *sb = dquot->dq_sb;
1213
1132 *warntype = QUOTA_NL_NOWARN; 1214 *warntype = QUOTA_NL_NOWARN;
1133 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || 1215 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
1134 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1216 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1135 return QUOTA_OK; 1217 return QUOTA_OK;
1136 1218
1219 tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
1220 + space;
1221
1137 if (dquot->dq_dqb.dqb_bhardlimit && 1222 if (dquot->dq_dqb.dqb_bhardlimit &&
1138 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit && 1223 tspace > dquot->dq_dqb.dqb_bhardlimit &&
1139 !ignore_hardlimit(dquot)) { 1224 !ignore_hardlimit(dquot)) {
1140 if (!prealloc) 1225 if (!prealloc)
1141 *warntype = QUOTA_NL_BHARDWARN; 1226 *warntype = QUOTA_NL_BHARDWARN;
@@ -1143,8 +1228,9 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1143 } 1228 }
1144 1229
1145 if (dquot->dq_dqb.dqb_bsoftlimit && 1230 if (dquot->dq_dqb.dqb_bsoftlimit &&
1146 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit && 1231 tspace > dquot->dq_dqb.dqb_bsoftlimit &&
1147 dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime && 1232 dquot->dq_dqb.dqb_btime &&
1233 get_seconds() >= dquot->dq_dqb.dqb_btime &&
1148 !ignore_hardlimit(dquot)) { 1234 !ignore_hardlimit(dquot)) {
1149 if (!prealloc) 1235 if (!prealloc)
1150 *warntype = QUOTA_NL_BSOFTLONGWARN; 1236 *warntype = QUOTA_NL_BSOFTLONGWARN;
@@ -1152,11 +1238,12 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1152 } 1238 }
1153 1239
1154 if (dquot->dq_dqb.dqb_bsoftlimit && 1240 if (dquot->dq_dqb.dqb_bsoftlimit &&
1155 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit && 1241 tspace > dquot->dq_dqb.dqb_bsoftlimit &&
1156 dquot->dq_dqb.dqb_btime == 0) { 1242 dquot->dq_dqb.dqb_btime == 0) {
1157 if (!prealloc) { 1243 if (!prealloc) {
1158 *warntype = QUOTA_NL_BSOFTWARN; 1244 *warntype = QUOTA_NL_BSOFTWARN;
1159 dquot->dq_dqb.dqb_btime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace; 1245 dquot->dq_dqb.dqb_btime = get_seconds() +
1246 sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace;
1160 } 1247 }
1161 else 1248 else
1162 /* 1249 /*
@@ -1171,15 +1258,18 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1171 1258
1172static int info_idq_free(struct dquot *dquot, qsize_t inodes) 1259static int info_idq_free(struct dquot *dquot, qsize_t inodes)
1173{ 1260{
1261 qsize_t newinodes;
1262
1174 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || 1263 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
1175 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || 1264 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
1176 !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type)) 1265 !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
1177 return QUOTA_NL_NOWARN; 1266 return QUOTA_NL_NOWARN;
1178 1267
1179 if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) 1268 newinodes = dquot->dq_dqb.dqb_curinodes - inodes;
1269 if (newinodes <= dquot->dq_dqb.dqb_isoftlimit)
1180 return QUOTA_NL_ISOFTBELOW; 1270 return QUOTA_NL_ISOFTBELOW;
1181 if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && 1271 if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit &&
1182 dquot->dq_dqb.dqb_curinodes - inodes < dquot->dq_dqb.dqb_ihardlimit) 1272 newinodes < dquot->dq_dqb.dqb_ihardlimit)
1183 return QUOTA_NL_IHARDBELOW; 1273 return QUOTA_NL_IHARDBELOW;
1184 return QUOTA_NL_NOWARN; 1274 return QUOTA_NL_NOWARN;
1185} 1275}
@@ -1206,7 +1296,7 @@ int dquot_initialize(struct inode *inode, int type)
1206{ 1296{
1207 unsigned int id = 0; 1297 unsigned int id = 0;
1208 int cnt, ret = 0; 1298 int cnt, ret = 0;
1209 struct dquot *got[MAXQUOTAS] = { NODQUOT, NODQUOT }; 1299 struct dquot *got[MAXQUOTAS] = { NULL, NULL };
1210 struct super_block *sb = inode->i_sb; 1300 struct super_block *sb = inode->i_sb;
1211 1301
1212 /* First test before acquiring mutex - solves deadlocks when we 1302 /* First test before acquiring mutex - solves deadlocks when we
@@ -1239,9 +1329,9 @@ int dquot_initialize(struct inode *inode, int type)
1239 /* Avoid races with quotaoff() */ 1329 /* Avoid races with quotaoff() */
1240 if (!sb_has_quota_active(sb, cnt)) 1330 if (!sb_has_quota_active(sb, cnt))
1241 continue; 1331 continue;
1242 if (inode->i_dquot[cnt] == NODQUOT) { 1332 if (!inode->i_dquot[cnt]) {
1243 inode->i_dquot[cnt] = got[cnt]; 1333 inode->i_dquot[cnt] = got[cnt];
1244 got[cnt] = NODQUOT; 1334 got[cnt] = NULL;
1245 } 1335 }
1246 } 1336 }
1247out_err: 1337out_err:
@@ -1251,6 +1341,7 @@ out_err:
1251 dqput(got[cnt]); 1341 dqput(got[cnt]);
1252 return ret; 1342 return ret;
1253} 1343}
1344EXPORT_SYMBOL(dquot_initialize);
1254 1345
1255/* 1346/*
1256 * Release all quotas referenced by inode 1347 * Release all quotas referenced by inode
@@ -1263,7 +1354,7 @@ int dquot_drop(struct inode *inode)
1263 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1354 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1264 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1355 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1265 put[cnt] = inode->i_dquot[cnt]; 1356 put[cnt] = inode->i_dquot[cnt];
1266 inode->i_dquot[cnt] = NODQUOT; 1357 inode->i_dquot[cnt] = NULL;
1267 } 1358 }
1268 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1359 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1269 1360
@@ -1271,6 +1362,7 @@ int dquot_drop(struct inode *inode)
1271 dqput(put[cnt]); 1362 dqput(put[cnt]);
1272 return 0; 1363 return 0;
1273} 1364}
1365EXPORT_SYMBOL(dquot_drop);
1274 1366
1275/* Wrapper to remove references to quota structures from inode */ 1367/* Wrapper to remove references to quota structures from inode */
1276void vfs_dq_drop(struct inode *inode) 1368void vfs_dq_drop(struct inode *inode)
@@ -1287,12 +1379,13 @@ void vfs_dq_drop(struct inode *inode)
1287 * must assure that nobody can come after the DQUOT_DROP and 1379 * must assure that nobody can come after the DQUOT_DROP and
1288 * add quota pointers back anyway */ 1380 * add quota pointers back anyway */
1289 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1381 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1290 if (inode->i_dquot[cnt] != NODQUOT) 1382 if (inode->i_dquot[cnt])
1291 break; 1383 break;
1292 if (cnt < MAXQUOTAS) 1384 if (cnt < MAXQUOTAS)
1293 inode->i_sb->dq_op->drop(inode); 1385 inode->i_sb->dq_op->drop(inode);
1294 } 1386 }
1295} 1387}
1388EXPORT_SYMBOL(vfs_dq_drop);
1296 1389
1297/* 1390/*
1298 * Following four functions update i_blocks+i_bytes fields and 1391 * Following four functions update i_blocks+i_bytes fields and
@@ -1306,51 +1399,93 @@ void vfs_dq_drop(struct inode *inode)
1306/* 1399/*
1307 * This operation can block, but only after everything is updated 1400 * This operation can block, but only after everything is updated
1308 */ 1401 */
1309int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) 1402int __dquot_alloc_space(struct inode *inode, qsize_t number,
1403 int warn, int reserve)
1310{ 1404{
1311 int cnt, ret = NO_QUOTA; 1405 int cnt, ret = QUOTA_OK;
1312 char warntype[MAXQUOTAS]; 1406 char warntype[MAXQUOTAS];
1313 1407
1314 /* First test before acquiring mutex - solves deadlocks when we
1315 * re-enter the quota code and are already holding the mutex */
1316 if (IS_NOQUOTA(inode)) {
1317out_add:
1318 inode_add_bytes(inode, number);
1319 return QUOTA_OK;
1320 }
1321 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1408 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1322 warntype[cnt] = QUOTA_NL_NOWARN; 1409 warntype[cnt] = QUOTA_NL_NOWARN;
1323 1410
1324 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1325 if (IS_NOQUOTA(inode)) { /* Now we can do reliable test... */
1326 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1327 goto out_add;
1328 }
1329 spin_lock(&dq_data_lock); 1411 spin_lock(&dq_data_lock);
1330 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1412 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1331 if (inode->i_dquot[cnt] == NODQUOT) 1413 if (!inode->i_dquot[cnt])
1332 continue; 1414 continue;
1333 if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) == NO_QUOTA) 1415 if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt)
1334 goto warn_put_all; 1416 == NO_QUOTA) {
1417 ret = NO_QUOTA;
1418 goto out_unlock;
1419 }
1335 } 1420 }
1336 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1421 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1337 if (inode->i_dquot[cnt] == NODQUOT) 1422 if (!inode->i_dquot[cnt])
1338 continue; 1423 continue;
1339 dquot_incr_space(inode->i_dquot[cnt], number); 1424 if (reserve)
1425 dquot_resv_space(inode->i_dquot[cnt], number);
1426 else
1427 dquot_incr_space(inode->i_dquot[cnt], number);
1340 } 1428 }
1341 inode_add_bytes(inode, number); 1429 if (!reserve)
1342 ret = QUOTA_OK; 1430 inode_add_bytes(inode, number);
1343warn_put_all: 1431out_unlock:
1344 spin_unlock(&dq_data_lock); 1432 spin_unlock(&dq_data_lock);
1345 if (ret == QUOTA_OK)
1346 /* Dirtify all the dquots - this can block when journalling */
1347 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1348 if (inode->i_dquot[cnt])
1349 mark_dquot_dirty(inode->i_dquot[cnt]);
1350 flush_warnings(inode->i_dquot, warntype); 1433 flush_warnings(inode->i_dquot, warntype);
1434 return ret;
1435}
1436
1437int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
1438{
1439 int cnt, ret = QUOTA_OK;
1440
1441 /*
1442 * First test before acquiring mutex - solves deadlocks when we
1443 * re-enter the quota code and are already holding the mutex
1444 */
1445 if (IS_NOQUOTA(inode)) {
1446 inode_add_bytes(inode, number);
1447 goto out;
1448 }
1449
1450 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1451 if (IS_NOQUOTA(inode)) {
1452 inode_add_bytes(inode, number);
1453 goto out_unlock;
1454 }
1455
1456 ret = __dquot_alloc_space(inode, number, warn, 0);
1457 if (ret == NO_QUOTA)
1458 goto out_unlock;
1459
1460 /* Dirtify all the dquots - this can block when journalling */
1461 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1462 if (inode->i_dquot[cnt])
1463 mark_dquot_dirty(inode->i_dquot[cnt]);
1464out_unlock:
1351 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1465 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1466out:
1352 return ret; 1467 return ret;
1353} 1468}
1469EXPORT_SYMBOL(dquot_alloc_space);
1470
1471int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
1472{
1473 int ret = QUOTA_OK;
1474
1475 if (IS_NOQUOTA(inode))
1476 goto out;
1477
1478 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1479 if (IS_NOQUOTA(inode))
1480 goto out_unlock;
1481
1482 ret = __dquot_alloc_space(inode, number, warn, 1);
1483out_unlock:
1484 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1485out:
1486 return ret;
1487}
1488EXPORT_SYMBOL(dquot_reserve_space);
1354 1489
1355/* 1490/*
1356 * This operation can block, but only after everything is updated 1491 * This operation can block, but only after everything is updated
@@ -1373,14 +1508,15 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number)
1373 } 1508 }
1374 spin_lock(&dq_data_lock); 1509 spin_lock(&dq_data_lock);
1375 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1510 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1376 if (inode->i_dquot[cnt] == NODQUOT) 1511 if (!inode->i_dquot[cnt])
1377 continue; 1512 continue;
1378 if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) == NO_QUOTA) 1513 if (check_idq(inode->i_dquot[cnt], number, warntype+cnt)
1514 == NO_QUOTA)
1379 goto warn_put_all; 1515 goto warn_put_all;
1380 } 1516 }
1381 1517
1382 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1518 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1383 if (inode->i_dquot[cnt] == NODQUOT) 1519 if (!inode->i_dquot[cnt])
1384 continue; 1520 continue;
1385 dquot_incr_inodes(inode->i_dquot[cnt], number); 1521 dquot_incr_inodes(inode->i_dquot[cnt], number);
1386 } 1522 }
@@ -1396,6 +1532,73 @@ warn_put_all:
1396 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1532 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1397 return ret; 1533 return ret;
1398} 1534}
1535EXPORT_SYMBOL(dquot_alloc_inode);
1536
1537int dquot_claim_space(struct inode *inode, qsize_t number)
1538{
1539 int cnt;
1540 int ret = QUOTA_OK;
1541
1542 if (IS_NOQUOTA(inode)) {
1543 inode_add_bytes(inode, number);
1544 goto out;
1545 }
1546
1547 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1548 if (IS_NOQUOTA(inode)) {
1549 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1550 inode_add_bytes(inode, number);
1551 goto out;
1552 }
1553
1554 spin_lock(&dq_data_lock);
1555 /* Claim reserved quotas to allocated quotas */
1556 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1557 if (inode->i_dquot[cnt])
1558 dquot_claim_reserved_space(inode->i_dquot[cnt],
1559 number);
1560 }
1561 /* Update inode bytes */
1562 inode_add_bytes(inode, number);
1563 spin_unlock(&dq_data_lock);
1564 /* Dirtify all the dquots - this can block when journalling */
1565 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1566 if (inode->i_dquot[cnt])
1567 mark_dquot_dirty(inode->i_dquot[cnt]);
1568 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1569out:
1570 return ret;
1571}
1572EXPORT_SYMBOL(dquot_claim_space);
1573
1574/*
1575 * Release reserved quota space
1576 */
1577void dquot_release_reserved_space(struct inode *inode, qsize_t number)
1578{
1579 int cnt;
1580
1581 if (IS_NOQUOTA(inode))
1582 goto out;
1583
1584 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1585 if (IS_NOQUOTA(inode))
1586 goto out_unlock;
1587
1588 spin_lock(&dq_data_lock);
1589 /* Release reserved dquots */
1590 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1591 if (inode->i_dquot[cnt])
1592 dquot_free_reserved_space(inode->i_dquot[cnt], number);
1593 }
1594 spin_unlock(&dq_data_lock);
1595
1596out_unlock:
1597 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1598out:
1599 return;
1600}
1601EXPORT_SYMBOL(dquot_release_reserved_space);
1399 1602
1400/* 1603/*
1401 * This operation can block, but only after everything is updated 1604 * This operation can block, but only after everything is updated
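
Together with dquot_reserve_space() above, these entry points give delayed-allocation filesystems a three-step quota protocol: reserve at write time, claim once blocks are actually allocated, release whatever was over-reserved. A hypothetical write-out path:

/* hypothetical caller of the new reservation API */
if (dquot_reserve_space(inode, to_reserve, 1) == NO_QUOTA)
        return -EDQUOT;                 /* nothing charged yet */

/* ... later, when 'allocated' bytes really hit the disk ... */
dquot_claim_space(inode, allocated);    /* rsvspace -> curspace */
dquot_release_reserved_space(inode, to_reserve - allocated);
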
@@ -1421,7 +1624,7 @@ out_sub:
1421 } 1624 }
1422 spin_lock(&dq_data_lock); 1625 spin_lock(&dq_data_lock);
1423 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1626 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1424 if (inode->i_dquot[cnt] == NODQUOT) 1627 if (!inode->i_dquot[cnt])
1425 continue; 1628 continue;
1426 warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); 1629 warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
1427 dquot_decr_space(inode->i_dquot[cnt], number); 1630 dquot_decr_space(inode->i_dquot[cnt], number);
@@ -1436,6 +1639,7 @@ out_sub:
1436 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1639 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1437 return QUOTA_OK; 1640 return QUOTA_OK;
1438} 1641}
1642EXPORT_SYMBOL(dquot_free_space);
1439 1643
1440/* 1644/*
1441 * This operation can block, but only after everything is updated 1645 * This operation can block, but only after everything is updated
@@ -1458,7 +1662,7 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
1458 } 1662 }
1459 spin_lock(&dq_data_lock); 1663 spin_lock(&dq_data_lock);
1460 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1664 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1461 if (inode->i_dquot[cnt] == NODQUOT) 1665 if (!inode->i_dquot[cnt])
1462 continue; 1666 continue;
1463 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); 1667 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number);
1464 dquot_decr_inodes(inode->i_dquot[cnt], number); 1668 dquot_decr_inodes(inode->i_dquot[cnt], number);
@@ -1472,6 +1676,20 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
1472 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1676 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1473 return QUOTA_OK; 1677 return QUOTA_OK;
1474} 1678}
1679EXPORT_SYMBOL(dquot_free_inode);
1680
1681/*
1682 * Callback to get the reserved quota space from the underlying filesystem
1683 */
1684qsize_t dquot_get_reserved_space(struct inode *inode)
1685{
1686 qsize_t reserved_space = 0;
1687
1688 if (sb_any_quota_active(inode->i_sb) &&
1689 inode->i_sb->dq_op->get_reserved_space)
1690 reserved_space = inode->i_sb->dq_op->get_reserved_space(inode);
1691 return reserved_space;
1692}
1475 1693
1476/* 1694/*
1477 * Transfer the number of inodes and blocks from one diskquota to another. 1695 * Transfer the number of inodes and blocks from one diskquota to another.
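
dquot_get_reserved_space() asks the filesystem, through the new get_reserved_space dq_op, how much space an inode currently holds in reservation; dquot_transfer() below uses it so reserved bytes move between quota owners as well. A hypothetical filesystem-side hook; EXAMPLEFS_I() and i_reserved_bytes are made up:

static qsize_t examplefs_get_reserved_space(struct inode *inode)
{
        return EXAMPLEFS_I(inode)->i_reserved_bytes;
}

/* wired into the filesystem's dquot_operations:
 *      .get_reserved_space = examplefs_get_reserved_space,
 */
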
@@ -1481,7 +1699,8 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
1481 */ 1699 */
1482int dquot_transfer(struct inode *inode, struct iattr *iattr) 1700int dquot_transfer(struct inode *inode, struct iattr *iattr)
1483{ 1701{
1484 qsize_t space; 1702 qsize_t space, cur_space;
1703 qsize_t rsv_space = 0;
1485 struct dquot *transfer_from[MAXQUOTAS]; 1704 struct dquot *transfer_from[MAXQUOTAS];
1486 struct dquot *transfer_to[MAXQUOTAS]; 1705 struct dquot *transfer_to[MAXQUOTAS];
1487 int cnt, ret = QUOTA_OK; 1706 int cnt, ret = QUOTA_OK;
@@ -1496,22 +1715,16 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1496 return QUOTA_OK; 1715 return QUOTA_OK;
1497 /* Initialize the arrays */ 1716 /* Initialize the arrays */
1498 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1717 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1499 transfer_from[cnt] = NODQUOT; 1718 transfer_from[cnt] = NULL;
1500 transfer_to[cnt] = NODQUOT; 1719 transfer_to[cnt] = NULL;
1501 warntype_to[cnt] = QUOTA_NL_NOWARN; 1720 warntype_to[cnt] = QUOTA_NL_NOWARN;
1502 switch (cnt) {
1503 case USRQUOTA:
1504 if (!chuid)
1505 continue;
1506 transfer_to[cnt] = dqget(inode->i_sb, iattr->ia_uid, cnt);
1507 break;
1508 case GRPQUOTA:
1509 if (!chgid)
1510 continue;
1511 transfer_to[cnt] = dqget(inode->i_sb, iattr->ia_gid, cnt);
1512 break;
1513 }
1514 } 1721 }
1722 if (chuid)
1723 transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid,
1724 USRQUOTA);
1725 if (chgid)
1726 transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid,
1727 GRPQUOTA);
1515 1728
1516 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1729 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1517 /* Now recheck reliably when holding dqptr_sem */ 1730 /* Now recheck reliably when holding dqptr_sem */
@@ -1520,10 +1733,12 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1520 goto put_all; 1733 goto put_all;
1521 } 1734 }
1522 spin_lock(&dq_data_lock); 1735 spin_lock(&dq_data_lock);
1523 space = inode_get_bytes(inode); 1736 cur_space = inode_get_bytes(inode);
1737 rsv_space = dquot_get_reserved_space(inode);
1738 space = cur_space + rsv_space;
1524 /* Build the transfer_from list and check the limits */ 1739 /* Build the transfer_from list and check the limits */
1525 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1740 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1526 if (transfer_to[cnt] == NODQUOT) 1741 if (!transfer_to[cnt])
1527 continue; 1742 continue;
1528 transfer_from[cnt] = inode->i_dquot[cnt]; 1743 transfer_from[cnt] = inode->i_dquot[cnt];
1529 if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == 1744 if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
@@ -1539,7 +1754,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1539 /* 1754 /*
1540 * Skip changes for same uid or gid or for turned off quota-type. 1755 * Skip changes for same uid or gid or for turned off quota-type.
1541 */ 1756 */
1542 if (transfer_to[cnt] == NODQUOT) 1757 if (!transfer_to[cnt])
1543 continue; 1758 continue;
1544 1759
1545 /* Due to IO error we might not have transfer_from[] structure */ 1760 /* Due to IO error we might not have transfer_from[] structure */
@@ -1549,11 +1764,14 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1549 warntype_from_space[cnt] = 1764 warntype_from_space[cnt] =
1550 info_bdq_free(transfer_from[cnt], space); 1765 info_bdq_free(transfer_from[cnt], space);
1551 dquot_decr_inodes(transfer_from[cnt], 1); 1766 dquot_decr_inodes(transfer_from[cnt], 1);
1552 dquot_decr_space(transfer_from[cnt], space); 1767 dquot_decr_space(transfer_from[cnt], cur_space);
1768 dquot_free_reserved_space(transfer_from[cnt],
1769 rsv_space);
1553 } 1770 }
1554 1771
1555 dquot_incr_inodes(transfer_to[cnt], 1); 1772 dquot_incr_inodes(transfer_to[cnt], 1);
1556 dquot_incr_space(transfer_to[cnt], space); 1773 dquot_incr_space(transfer_to[cnt], cur_space);
1774 dquot_resv_space(transfer_to[cnt], rsv_space);
1557 1775
1558 inode->i_dquot[cnt] = transfer_to[cnt]; 1776 inode->i_dquot[cnt] = transfer_to[cnt];
1559 } 1777 }
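In the transfer hunks above, limits are checked against the sum space = cur_space + rsv_space, but the actual decrements and increments are split per counter, so the old and new owner end up with the same division between allocated and reserved bytes. A toy illustration of that bookkeeping (names invented for illustration):

#include <stdio.h>

/* Toy chown-time transfer of both quota counters; mirrors the
 * cur_space/rsv_space split the hunk introduces. Not kernel code. */
struct toy_dquot {
	unsigned long long cur;
	unsigned long long rsv;
};

static void transfer(struct toy_dquot *from, struct toy_dquot *to,
		     unsigned long long cur_space,
		     unsigned long long rsv_space)
{
	/* old owner: drop allocated and reserved bytes separately */
	from->cur -= cur_space;
	from->rsv -= rsv_space;
	/* new owner: charge them into the matching counters */
	to->cur += cur_space;
	to->rsv += rsv_space;
}

int main(void)
{
	struct toy_dquot uid_a = { 10000, 2000 };
	struct toy_dquot uid_b = { 0, 0 };

	/* limit checks would use cur_space + rsv_space = 12000 */
	transfer(&uid_a, &uid_b, 10000, 2000);
	printf("a: %llu+%llu  b: %llu+%llu\n",
	       uid_a.cur, uid_a.rsv, uid_b.cur, uid_b.rsv);
	return 0;
}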
@@ -1567,7 +1785,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1567 if (transfer_to[cnt]) { 1785 if (transfer_to[cnt]) {
1568 mark_dquot_dirty(transfer_to[cnt]); 1786 mark_dquot_dirty(transfer_to[cnt]);
1569 /* The reference we got is transferred to the inode */ 1787 /* The reference we got is transferred to the inode */
1570 transfer_to[cnt] = NODQUOT; 1788 transfer_to[cnt] = NULL;
1571 } 1789 }
1572 } 1790 }
1573warn_put_all: 1791warn_put_all:
@@ -1585,10 +1803,11 @@ over_quota:
1585 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1803 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1586 /* Clear dquot pointers we don't want to dqput() */ 1804 /* Clear dquot pointers we don't want to dqput() */
1587 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1805 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1588 transfer_from[cnt] = NODQUOT; 1806 transfer_from[cnt] = NULL;
1589 ret = NO_QUOTA; 1807 ret = NO_QUOTA;
1590 goto warn_put_all; 1808 goto warn_put_all;
1591} 1809}
1810EXPORT_SYMBOL(dquot_transfer);
1592 1811
1593/* Wrapper for transferring ownership of an inode */ 1812/* Wrapper for transferring ownership of an inode */
1594int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) 1813int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
@@ -1600,7 +1819,7 @@ int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
1600 } 1819 }
1601 return 0; 1820 return 0;
1602} 1821}
1603 1822EXPORT_SYMBOL(vfs_dq_transfer);
1604 1823
1605/* 1824/*
1606 * Write info of quota file to disk 1825 * Write info of quota file to disk
@@ -1615,6 +1834,7 @@ int dquot_commit_info(struct super_block *sb, int type)
1615 mutex_unlock(&dqopt->dqio_mutex); 1834 mutex_unlock(&dqopt->dqio_mutex);
1616 return ret; 1835 return ret;
1617} 1836}
1837EXPORT_SYMBOL(dquot_commit_info);
1618 1838
1619/* 1839/*
1620 * Definitions of diskquota operations. 1840 * Definitions of diskquota operations.
@@ -1700,8 +1920,8 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
1700 drop_dquot_ref(sb, cnt); 1920 drop_dquot_ref(sb, cnt);
1701 invalidate_dquots(sb, cnt); 1921 invalidate_dquots(sb, cnt);
1702 /* 1922 /*
1703 * Now all dquots should be invalidated, all writes done so we should be only 1923 * Now all dquots should be invalidated, all writes done so we
1704 * users of the info. No locks needed. 1924 * should be only users of the info. No locks needed.
1705 */ 1925 */
1706 if (info_dirty(&dqopt->info[cnt])) 1926 if (info_dirty(&dqopt->info[cnt]))
1707 sb->dq_op->write_info(sb, cnt); 1927 sb->dq_op->write_info(sb, cnt);
@@ -1739,10 +1959,12 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
1739 /* If quota was reenabled in the meantime, we have 1959 /* If quota was reenabled in the meantime, we have
1740 * nothing to do */ 1960 * nothing to do */
1741 if (!sb_has_quota_loaded(sb, cnt)) { 1961 if (!sb_has_quota_loaded(sb, cnt)) {
1742 mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA); 1962 mutex_lock_nested(&toputinode[cnt]->i_mutex,
1963 I_MUTEX_QUOTA);
1743 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | 1964 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
1744 S_NOATIME | S_NOQUOTA); 1965 S_NOATIME | S_NOQUOTA);
1745 truncate_inode_pages(&toputinode[cnt]->i_data, 0); 1966 truncate_inode_pages(&toputinode[cnt]->i_data,
1967 0);
1746 mutex_unlock(&toputinode[cnt]->i_mutex); 1968 mutex_unlock(&toputinode[cnt]->i_mutex);
1747 mark_inode_dirty(toputinode[cnt]); 1969 mark_inode_dirty(toputinode[cnt]);
1748 } 1970 }
@@ -1767,13 +1989,14 @@ put_inodes:
1767 } 1989 }
1768 return ret; 1990 return ret;
1769} 1991}
1992EXPORT_SYMBOL(vfs_quota_disable);
1770 1993
1771int vfs_quota_off(struct super_block *sb, int type, int remount) 1994int vfs_quota_off(struct super_block *sb, int type, int remount)
1772{ 1995{
1773 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED : 1996 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
1774 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); 1997 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
1775} 1998}
1776 1999EXPORT_SYMBOL(vfs_quota_off);
1777/* 2000/*
1778 * Turn quotas on for a device 2001 * Turn quotas on for a device
1779 */ 2002 */
@@ -1831,7 +2054,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
1831 * possible) Also nobody should write to the file - we use 2054 * possible) Also nobody should write to the file - we use
1832 * special IO operations which ignore the immutable bit. */ 2055 * special IO operations which ignore the immutable bit. */
1833 down_write(&dqopt->dqptr_sem); 2056 down_write(&dqopt->dqptr_sem);
1834 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); 2057 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
2058 S_NOQUOTA);
1835 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; 2059 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
1836 up_write(&dqopt->dqptr_sem); 2060 up_write(&dqopt->dqptr_sem);
1837 sb->dq_op->drop(inode); 2061 sb->dq_op->drop(inode);
@@ -1850,7 +2074,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
1850 dqopt->info[type].dqi_fmt_id = format_id; 2074 dqopt->info[type].dqi_fmt_id = format_id;
1851 INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); 2075 INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
1852 mutex_lock(&dqopt->dqio_mutex); 2076 mutex_lock(&dqopt->dqio_mutex);
1853 if ((error = dqopt->ops[type]->read_file_info(sb, type)) < 0) { 2077 error = dqopt->ops[type]->read_file_info(sb, type);
2078 if (error < 0) {
1854 mutex_unlock(&dqopt->dqio_mutex); 2079 mutex_unlock(&dqopt->dqio_mutex);
1855 goto out_file_init; 2080 goto out_file_init;
1856 } 2081 }
@@ -1930,6 +2155,7 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
1930 DQUOT_LIMITS_ENABLED); 2155 DQUOT_LIMITS_ENABLED);
1931 return error; 2156 return error;
1932} 2157}
2158EXPORT_SYMBOL(vfs_quota_on_path);
1933 2159
1934int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, 2160int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
1935 int remount) 2161 int remount)
@@ -1947,6 +2173,7 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
1947 } 2173 }
1948 return error; 2174 return error;
1949} 2175}
2176EXPORT_SYMBOL(vfs_quota_on);
1950 2177
1951/* 2178/*
1952 * More powerful function for turning on quotas allowing setting 2179 * More powerful function for turning on quotas allowing setting
@@ -1993,6 +2220,7 @@ out_lock:
1993load_quota: 2220load_quota:
1994 return vfs_load_quota_inode(inode, type, format_id, flags); 2221 return vfs_load_quota_inode(inode, type, format_id, flags);
1995} 2222}
2223EXPORT_SYMBOL(vfs_quota_enable);
1996 2224
1997/* 2225/*
1998 * This function is used when filesystem needs to initialize quotas 2226 * This function is used when filesystem needs to initialize quotas
@@ -2022,6 +2250,7 @@ out:
2022 dput(dentry); 2250 dput(dentry);
2023 return error; 2251 return error;
2024} 2252}
2253EXPORT_SYMBOL(vfs_quota_on_mount);
2025 2254
2026/* Wrapper to turn on quotas when remounting rw */ 2255/* Wrapper to turn on quotas when remounting rw */
2027int vfs_dq_quota_on_remount(struct super_block *sb) 2256int vfs_dq_quota_on_remount(struct super_block *sb)
@@ -2038,6 +2267,7 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
2038 } 2267 }
2039 return ret; 2268 return ret;
2040} 2269}
2270EXPORT_SYMBOL(vfs_dq_quota_on_remount);
2041 2271
2042static inline qsize_t qbtos(qsize_t blocks) 2272static inline qsize_t qbtos(qsize_t blocks)
2043{ 2273{
@@ -2057,7 +2287,7 @@ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
2057 spin_lock(&dq_data_lock); 2287 spin_lock(&dq_data_lock);
2058 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); 2288 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
2059 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); 2289 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
2060 di->dqb_curspace = dm->dqb_curspace; 2290 di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace;
2061 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2291 di->dqb_ihardlimit = dm->dqb_ihardlimit;
2062 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2292 di->dqb_isoftlimit = dm->dqb_isoftlimit;
2063 di->dqb_curinodes = dm->dqb_curinodes; 2293 di->dqb_curinodes = dm->dqb_curinodes;
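With dqb_rsvspace folded into dqb_curspace in do_get_dqblk(), userspace sees reserved bytes as part of current usage. A minimal probe, assuming a quota-enabled filesystem and the usual privileges; "/dev/sda1" is a placeholder block device:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/quota.h>
#include <unistd.h>

/* After this hunk, dqb_curspace as returned by Q_GETQUOTA includes
 * reserved (delayed-allocation) bytes, not just allocated ones. */
int main(void)
{
	struct dqblk dq;

	memset(&dq, 0, sizeof(dq));
	if (quotactl(QCMD(Q_GETQUOTA, USRQUOTA), "/dev/sda1",
		     getuid(), (caddr_t)&dq) != 0) {
		perror("quotactl");
		return 1;
	}
	printf("current usage (incl. reserved): %llu bytes\n",
	       (unsigned long long)dq.dqb_curspace);
	return 0;
}

This is what makes tools like quota(1) and repquota(8) account for data that is buffered but not yet written out.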
@@ -2067,18 +2297,20 @@ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
2067 spin_unlock(&dq_data_lock); 2297 spin_unlock(&dq_data_lock);
2068} 2298}
2069 2299
2070int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di) 2300int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2301 struct if_dqblk *di)
2071{ 2302{
2072 struct dquot *dquot; 2303 struct dquot *dquot;
2073 2304
2074 dquot = dqget(sb, id, type); 2305 dquot = dqget(sb, id, type);
2075 if (dquot == NODQUOT) 2306 if (!dquot)
2076 return -ESRCH; 2307 return -ESRCH;
2077 do_get_dqblk(dquot, di); 2308 do_get_dqblk(dquot, di);
2078 dqput(dquot); 2309 dqput(dquot);
2079 2310
2080 return 0; 2311 return 0;
2081} 2312}
2313EXPORT_SYMBOL(vfs_get_dqblk);
2082 2314
2083/* Generic routine for setting common part of quota structure */ 2315/* Generic routine for setting common part of quota structure */
2084static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) 2316static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
@@ -2097,7 +2329,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2097 2329
2098 spin_lock(&dq_data_lock); 2330 spin_lock(&dq_data_lock);
2099 if (di->dqb_valid & QIF_SPACE) { 2331 if (di->dqb_valid & QIF_SPACE) {
2100 dm->dqb_curspace = di->dqb_curspace; 2332 dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace;
2101 check_blim = 1; 2333 check_blim = 1;
2102 __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); 2334 __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
2103 } 2335 }
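The subtraction above is the mirror image of the do_get_dqblk() change: since Q_GETQUOTA now reports curspace + rsvspace, Q_SETQUOTA has to strip the reservation back out, or a get/set round trip would inflate usage by dqb_rsvspace every time. The invariant, as a toy check:

#include <assert.h>
#include <stdio.h>

/* Toy arithmetic only: the get side adds rsvspace in, the set side
 * subtracts it back out, so get->set leaves curspace unchanged. */
int main(void)
{
	unsigned long long curspace = 4096, rsvspace = 1024;
	unsigned long long reported, stored;

	reported = curspace + rsvspace;	/* what Q_GETQUOTA now returns */
	stored = reported - rsvspace;	/* what Q_SETQUOTA stores back */
	assert(stored == curspace);
	printf("reported=%llu stored=%llu\n", reported, stored);
	return 0;
}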
@@ -2130,22 +2362,25 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2130 } 2362 }
2131 2363
2132 if (check_blim) { 2364 if (check_blim) {
2133 if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) { 2365 if (!dm->dqb_bsoftlimit ||
2366 dm->dqb_curspace < dm->dqb_bsoftlimit) {
2134 dm->dqb_btime = 0; 2367 dm->dqb_btime = 0;
2135 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2368 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
2136 } 2369 } else if (!(di->dqb_valid & QIF_BTIME))
2137 else if (!(di->dqb_valid & QIF_BTIME)) /* Set grace only if user hasn't provided his own... */ 2370 /* Set grace only if user hasn't provided his own... */
2138 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; 2371 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
2139 } 2372 }
2140 if (check_ilim) { 2373 if (check_ilim) {
2141 if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) { 2374 if (!dm->dqb_isoftlimit ||
2375 dm->dqb_curinodes < dm->dqb_isoftlimit) {
2142 dm->dqb_itime = 0; 2376 dm->dqb_itime = 0;
2143 clear_bit(DQ_INODES_B, &dquot->dq_flags); 2377 clear_bit(DQ_INODES_B, &dquot->dq_flags);
2144 } 2378 } else if (!(di->dqb_valid & QIF_ITIME))
2145 else if (!(di->dqb_valid & QIF_ITIME)) /* Set grace only if user hasn't provided his own... */ 2379 /* Set grace only if user hasn't provided his own... */
2146 dm->dqb_itime = get_seconds() + dqi->dqi_igrace; 2380 dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
2147 } 2381 }
2148 if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit) 2382 if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit ||
2383 dm->dqb_isoftlimit)
2149 clear_bit(DQ_FAKE_B, &dquot->dq_flags); 2384 clear_bit(DQ_FAKE_B, &dquot->dq_flags);
2150 else 2385 else
2151 set_bit(DQ_FAKE_B, &dquot->dq_flags); 2386 set_bit(DQ_FAKE_B, &dquot->dq_flags);
@@ -2155,7 +2390,8 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2155 return 0; 2390 return 0;
2156} 2391}
2157 2392
2158int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di) 2393int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
2394 struct if_dqblk *di)
2159{ 2395{
2160 struct dquot *dquot; 2396 struct dquot *dquot;
2161 int rc; 2397 int rc;
@@ -2170,6 +2406,7 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
2170out: 2406out:
2171 return rc; 2407 return rc;
2172} 2408}
2409EXPORT_SYMBOL(vfs_set_dqblk);
2173 2410
2174/* Generic routine for getting common part of quota file information */ 2411/* Generic routine for getting common part of quota file information */
2175int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2412int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
@@ -2191,6 +2428,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2191 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2428 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2192 return 0; 2429 return 0;
2193} 2430}
2431EXPORT_SYMBOL(vfs_get_dqinfo);
2194 2432
2195/* Generic routine for setting common part of quota file information */ 2433/* Generic routine for setting common part of quota file information */
2196int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2434int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
@@ -2210,7 +2448,8 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2210 if (ii->dqi_valid & IIF_IGRACE) 2448 if (ii->dqi_valid & IIF_IGRACE)
2211 mi->dqi_igrace = ii->dqi_igrace; 2449 mi->dqi_igrace = ii->dqi_igrace;
2212 if (ii->dqi_valid & IIF_FLAGS) 2450 if (ii->dqi_valid & IIF_FLAGS)
2213 mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | (ii->dqi_flags & DQF_MASK); 2451 mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) |
2452 (ii->dqi_flags & DQF_MASK);
2214 spin_unlock(&dq_data_lock); 2453 spin_unlock(&dq_data_lock);
2215 mark_info_dirty(sb, type); 2454 mark_info_dirty(sb, type);
2216 /* Force write to disk */ 2455 /* Force write to disk */
@@ -2219,6 +2458,7 @@ out:
2219 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2458 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2220 return err; 2459 return err;
2221} 2460}
2461EXPORT_SYMBOL(vfs_set_dqinfo);
2222 2462
2223struct quotactl_ops vfs_quotactl_ops = { 2463struct quotactl_ops vfs_quotactl_ops = {
2224 .quota_on = vfs_quota_on, 2464 .quota_on = vfs_quota_on,
@@ -2368,43 +2608,10 @@ static int __init dquot_init(void)
2368 2608
2369#ifdef CONFIG_QUOTA_NETLINK_INTERFACE 2609#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
2370 if (genl_register_family(&quota_genl_family) != 0) 2610 if (genl_register_family(&quota_genl_family) != 0)
2371 printk(KERN_ERR "VFS: Failed to create quota netlink interface.\n"); 2611 printk(KERN_ERR
2612 "VFS: Failed to create quota netlink interface.\n");
2372#endif 2613#endif
2373 2614
2374 return 0; 2615 return 0;
2375} 2616}
2376module_init(dquot_init); 2617module_init(dquot_init);
2377
2378EXPORT_SYMBOL(register_quota_format);
2379EXPORT_SYMBOL(unregister_quota_format);
2380EXPORT_SYMBOL(dqstats);
2381EXPORT_SYMBOL(dq_data_lock);
2382EXPORT_SYMBOL(vfs_quota_enable);
2383EXPORT_SYMBOL(vfs_quota_on);
2384EXPORT_SYMBOL(vfs_quota_on_path);
2385EXPORT_SYMBOL(vfs_quota_on_mount);
2386EXPORT_SYMBOL(vfs_quota_disable);
2387EXPORT_SYMBOL(vfs_quota_off);
2388EXPORT_SYMBOL(dquot_scan_active);
2389EXPORT_SYMBOL(vfs_quota_sync);
2390EXPORT_SYMBOL(vfs_get_dqinfo);
2391EXPORT_SYMBOL(vfs_set_dqinfo);
2392EXPORT_SYMBOL(vfs_get_dqblk);
2393EXPORT_SYMBOL(vfs_set_dqblk);
2394EXPORT_SYMBOL(dquot_commit);
2395EXPORT_SYMBOL(dquot_commit_info);
2396EXPORT_SYMBOL(dquot_acquire);
2397EXPORT_SYMBOL(dquot_release);
2398EXPORT_SYMBOL(dquot_mark_dquot_dirty);
2399EXPORT_SYMBOL(dquot_initialize);
2400EXPORT_SYMBOL(dquot_drop);
2401EXPORT_SYMBOL(vfs_dq_drop);
2402EXPORT_SYMBOL(dqget);
2403EXPORT_SYMBOL(dqput);
2404EXPORT_SYMBOL(dquot_alloc_space);
2405EXPORT_SYMBOL(dquot_alloc_inode);
2406EXPORT_SYMBOL(dquot_free_space);
2407EXPORT_SYMBOL(dquot_free_inode);
2408EXPORT_SYMBOL(dquot_transfer);
2409EXPORT_SYMBOL(vfs_dq_transfer);
2410EXPORT_SYMBOL(vfs_dq_quota_on_remount);
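Dissolving the EXPORT_SYMBOL() block at the bottom of dquot.c follows the usual kernel convention that an export sits directly under the definition it exports, so a symbol's linkage is visible where the code is read. A minimal module sketch of the convention (quota_demo_op is a hypothetical symbol, not part of the quota code):

#include <linux/module.h>

/* Convention this patch adopts: the export immediately follows the
 * definition, instead of living in a list at the end of the file. */
int quota_demo_op(void)
{
	return 0;
}
EXPORT_SYMBOL(quota_demo_op);

static int __init quota_demo_init(void)
{
	return 0;
}

static void __exit quota_demo_exit(void)
{
}

module_init(quota_demo_init);
module_exit(quota_demo_exit);
MODULE_LICENSE("GPL");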
diff --git a/fs/quota.c b/fs/quota/quota.c
index d76ada914f98..b7f5a468f076 100644
--- a/fs/quota.c
+++ b/fs/quota/quota.c
@@ -20,7 +20,8 @@
20#include <linux/types.h> 20#include <linux/types.h>
21 21
22/* Check validity of generic quotactl commands */ 22/* Check validity of generic quotactl commands */
23static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) 23static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
24 qid_t id)
24{ 25{
25 if (type >= MAXQUOTAS) 26 if (type >= MAXQUOTAS)
26 return -EINVAL; 27 return -EINVAL;
@@ -72,7 +73,8 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
72 case Q_SETINFO: 73 case Q_SETINFO:
73 case Q_SETQUOTA: 74 case Q_SETQUOTA:
74 case Q_GETQUOTA: 75 case Q_GETQUOTA:
75 /* This is just informative test so we are satisfied without a lock */ 76 /* This is just an informative test so we are satisfied
77 * without the lock */
76 if (!sb_has_quota_active(sb, type)) 78 if (!sb_has_quota_active(sb, type))
77 return -ESRCH; 79 return -ESRCH;
78 } 80 }
@@ -92,7 +94,8 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
92} 94}
93 95
94/* Check validity of XFS Quota Manager commands */ 96/* Check validity of XFS Quota Manager commands */
95static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) 97static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd,
98 qid_t id)
96{ 99{
97 if (type >= XQM_MAXQUOTAS) 100 if (type >= XQM_MAXQUOTAS)
98 return -EINVAL; 101 return -EINVAL;
@@ -142,7 +145,8 @@ static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t i
142 return 0; 145 return 0;
143} 146}
144 147
145static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) 148static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
149 qid_t id)
146{ 150{
147 int error; 151 int error;
148 152
@@ -180,7 +184,8 @@ static void quota_sync_sb(struct super_block *sb, int type)
180 continue; 184 continue;
181 if (!sb_has_quota_active(sb, cnt)) 185 if (!sb_has_quota_active(sb, cnt))
182 continue; 186 continue;
183 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA); 187 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
188 I_MUTEX_QUOTA);
184 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); 189 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
185 mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); 190 mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
186 } 191 }
@@ -200,14 +205,15 @@ void sync_dquots(struct super_block *sb, int type)
200 spin_lock(&sb_lock); 205 spin_lock(&sb_lock);
201restart: 206restart:
202 list_for_each_entry(sb, &super_blocks, s_list) { 207 list_for_each_entry(sb, &super_blocks, s_list) {
203 /* This test just improves performance so it needn't be reliable... */ 208 /* This test just improves performance so it needn't be
209 * reliable... */
204 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 210 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
205 if (type != -1 && type != cnt) 211 if (type != -1 && type != cnt)
206 continue; 212 continue;
207 if (!sb_has_quota_active(sb, cnt)) 213 if (!sb_has_quota_active(sb, cnt))
208 continue; 214 continue;
209 if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && 215 if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
210 list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) 216 list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
211 continue; 217 continue;
212 break; 218 break;
213 } 219 }
@@ -227,7 +233,8 @@ restart:
227} 233}
228 234
229/* Copy parameters and call proper function */ 235/* Copy parameters and call proper function */
230static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void __user *addr) 236static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
237 void __user *addr)
231{ 238{
232 int ret; 239 int ret;
233 240
@@ -235,7 +242,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
235 case Q_QUOTAON: { 242 case Q_QUOTAON: {
236 char *pathname; 243 char *pathname;
237 244
238 if (IS_ERR(pathname = getname(addr))) 245 pathname = getname(addr);
246 if (IS_ERR(pathname))
239 return PTR_ERR(pathname); 247 return PTR_ERR(pathname);
240 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); 248 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
241 putname(pathname); 249 putname(pathname);
@@ -261,7 +269,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
261 case Q_GETINFO: { 269 case Q_GETINFO: {
262 struct if_dqinfo info; 270 struct if_dqinfo info;
263 271
264 if ((ret = sb->s_qcop->get_info(sb, type, &info))) 272 ret = sb->s_qcop->get_info(sb, type, &info);
273 if (ret)
265 return ret; 274 return ret;
266 if (copy_to_user(addr, &info, sizeof(info))) 275 if (copy_to_user(addr, &info, sizeof(info)))
267 return -EFAULT; 276 return -EFAULT;
@@ -277,7 +286,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
277 case Q_GETQUOTA: { 286 case Q_GETQUOTA: {
278 struct if_dqblk idq; 287 struct if_dqblk idq;
279 288
280 if ((ret = sb->s_qcop->get_dqblk(sb, type, id, &idq))) 289 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
290 if (ret)
281 return ret; 291 return ret;
282 if (copy_to_user(addr, &idq, sizeof(idq))) 292 if (copy_to_user(addr, &idq, sizeof(idq)))
283 return -EFAULT; 293 return -EFAULT;
@@ -322,7 +332,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
322 case Q_XGETQUOTA: { 332 case Q_XGETQUOTA: {
323 struct fs_disk_quota fdq; 333 struct fs_disk_quota fdq;
324 334
325 if ((ret = sb->s_qcop->get_xquota(sb, type, id, &fdq))) 335 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
336 if (ret)
326 return ret; 337 return ret;
327 if (copy_to_user(addr, &fdq, sizeof(fdq))) 338 if (copy_to_user(addr, &fdq, sizeof(fdq)))
328 return -EFAULT; 339 return -EFAULT;
@@ -341,7 +352,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
341 * look up a superblock on which quota ops will be performed 352 * look up a superblock on which quota ops will be performed
342 * - use the name of a block device to find the superblock thereon 353 * - use the name of a block device to find the superblock thereon
343 */ 354 */
344static inline struct super_block *quotactl_block(const char __user *special) 355static struct super_block *quotactl_block(const char __user *special)
345{ 356{
346#ifdef CONFIG_BLOCK 357#ifdef CONFIG_BLOCK
347 struct block_device *bdev; 358 struct block_device *bdev;
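Several quota.c hunks, like the getname() one above, unpack an assignment buried inside an if-condition into a separate statement, per checkpatch style; behaviour is identical but the error path reads linearly. A toy before/after:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative only: might_fail() stands in for getname() or any
 * call whose result must be tested immediately. */
static int might_fail(int fail)
{
	return fail ? -1 : 0;
}

int main(void)
{
	int ret;

	/* old style packed both into one line:
	 *   if ((ret = might_fail(1))) ...
	 * new style separates assignment from test: */
	ret = might_fail(1);
	if (ret)
		fprintf(stderr, "might_fail: %d\n", ret);
	return ret ? EXIT_FAILURE : EXIT_SUCCESS;
}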
diff --git a/fs/quota_tree.c b/fs/quota/quota_tree.c
index 953404c95b17..f81f4bcfb178 100644
--- a/fs/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -22,8 +22,6 @@ MODULE_LICENSE("GPL");
22 22
23#define __QUOTA_QT_PARANOIA 23#define __QUOTA_QT_PARANOIA
24 24
25typedef char *dqbuf_t;
26
27static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) 25static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
28{ 26{
29 unsigned int epb = info->dqi_usable_bs >> 2; 27 unsigned int epb = info->dqi_usable_bs >> 2;
@@ -35,46 +33,42 @@ static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
35} 33}
36 34
37/* Number of entries in one block */ 35/* Number of entries in one block */
38static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) 36static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
39{ 37{
40 return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader)) 38 return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
41 / info->dqi_entry_size; 39 / info->dqi_entry_size;
42} 40}
43 41
44static dqbuf_t getdqbuf(size_t size) 42static char *getdqbuf(size_t size)
45{ 43{
46 dqbuf_t buf = kmalloc(size, GFP_NOFS); 44 char *buf = kmalloc(size, GFP_NOFS);
47 if (!buf) 45 if (!buf)
48 printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n"); 46 printk(KERN_WARNING
47 "VFS: Not enough memory for quota buffers.\n");
49 return buf; 48 return buf;
50} 49}
51 50
52static inline void freedqbuf(dqbuf_t buf) 51static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
53{
54 kfree(buf);
55}
56
57static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
58{ 52{
59 struct super_block *sb = info->dqi_sb; 53 struct super_block *sb = info->dqi_sb;
60 54
61 memset(buf, 0, info->dqi_usable_bs); 55 memset(buf, 0, info->dqi_usable_bs);
62 return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf, 56 return sb->s_op->quota_read(sb, info->dqi_type, buf,
63 info->dqi_usable_bs, blk << info->dqi_blocksize_bits); 57 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
64} 58}
65 59
66static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf) 60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
67{ 61{
68 struct super_block *sb = info->dqi_sb; 62 struct super_block *sb = info->dqi_sb;
69 63
70 return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf, 64 return sb->s_op->quota_write(sb, info->dqi_type, buf,
71 info->dqi_usable_bs, blk << info->dqi_blocksize_bits); 65 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
72} 66}
73 67
74/* Remove empty block from list and return it */ 68/* Remove empty block from list and return it */
75static int get_free_dqblk(struct qtree_mem_dqinfo *info) 69static int get_free_dqblk(struct qtree_mem_dqinfo *info)
76{ 70{
77 dqbuf_t buf = getdqbuf(info->dqi_usable_bs); 71 char *buf = getdqbuf(info->dqi_usable_bs);
78 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; 72 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
79 int ret, blk; 73 int ret, blk;
80 74
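The quota_tree.c cleanup drops the dqbuf_t typedef, which disguised a plain char *, along with the freedqbuf() wrapper around kfree(), in line with the kernel's rule against typedefs that hide pointer types. A userspace analogue of the resulting shape, with malloc()/free() standing in for kmalloc()/kfree():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* The allocation helper may still warn on failure, but the type
 * stays a visible char * and the free side needs no wrapper.
 * Userspace sketch only. */
static char *getbuf(size_t size)
{
	char *buf = malloc(size);

	if (!buf)
		fprintf(stderr, "not enough memory for quota buffer\n");
	return buf;
}

int main(void)
{
	char *buf = getbuf(1024);

	if (!buf)
		return 1;
	memset(buf, 0, 1024);
	free(buf);		/* no freedqbuf() indirection */
	return 0;
}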
@@ -98,12 +92,12 @@ static int get_free_dqblk(struct qtree_mem_dqinfo *info)
98 mark_info_dirty(info->dqi_sb, info->dqi_type); 92 mark_info_dirty(info->dqi_sb, info->dqi_type);
99 ret = blk; 93 ret = blk;
100out_buf: 94out_buf:
101 freedqbuf(buf); 95 kfree(buf);
102 return ret; 96 return ret;
103} 97}
104 98
105/* Insert empty block to the list */ 99/* Insert empty block to the list */
106static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) 100static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk)
107{ 101{
108 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; 102 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
109 int err; 103 int err;
@@ -120,9 +114,10 @@ static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
120} 114}
121 115
122/* Remove given block from the list of blocks with free entries */ 116/* Remove given block from the list of blocks with free entries */
123static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) 117static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
118 uint blk)
124{ 119{
125 dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs); 120 char *tmpbuf = getdqbuf(info->dqi_usable_bs);
126 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; 121 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
127 uint nextblk = le32_to_cpu(dh->dqdh_next_free); 122 uint nextblk = le32_to_cpu(dh->dqdh_next_free);
128 uint prevblk = le32_to_cpu(dh->dqdh_prev_free); 123 uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
@@ -153,21 +148,24 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint
153 info->dqi_free_entry = nextblk; 148 info->dqi_free_entry = nextblk;
154 mark_info_dirty(info->dqi_sb, info->dqi_type); 149 mark_info_dirty(info->dqi_sb, info->dqi_type);
155 } 150 }
156 freedqbuf(tmpbuf); 151 kfree(tmpbuf);
157 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); 152 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
158 /* No matter whether write succeeds block is out of list */ 153 /* No matter whether write succeeds block is out of list */
159 if (write_blk(info, blk, buf) < 0) 154 if (write_blk(info, blk, buf) < 0)
160 printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk); 155 printk(KERN_ERR
156 "VFS: Can't write block (%u) with free entries.\n",
157 blk);
161 return 0; 158 return 0;
162out_buf: 159out_buf:
163 freedqbuf(tmpbuf); 160 kfree(tmpbuf);
164 return err; 161 return err;
165} 162}
166 163
167/* Insert given block to the beginning of list with free entries */ 164/* Insert given block to the beginning of list with free entries */
168static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) 165static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
166 uint blk)
169{ 167{
170 dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs); 168 char *tmpbuf = getdqbuf(info->dqi_usable_bs);
171 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; 169 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
172 int err; 170 int err;
173 171
@@ -188,12 +186,12 @@ static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint
188 if (err < 0) 186 if (err < 0)
189 goto out_buf; 187 goto out_buf;
190 } 188 }
191 freedqbuf(tmpbuf); 189 kfree(tmpbuf);
192 info->dqi_free_entry = blk; 190 info->dqi_free_entry = blk;
193 mark_info_dirty(info->dqi_sb, info->dqi_type); 191 mark_info_dirty(info->dqi_sb, info->dqi_type);
194 return 0; 192 return 0;
195out_buf: 193out_buf:
196 freedqbuf(tmpbuf); 194 kfree(tmpbuf);
197 return err; 195 return err;
198} 196}
199 197
@@ -215,7 +213,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
215{ 213{
216 uint blk, i; 214 uint blk, i;
217 struct qt_disk_dqdbheader *dh; 215 struct qt_disk_dqdbheader *dh;
218 dqbuf_t buf = getdqbuf(info->dqi_usable_bs); 216 char *buf = getdqbuf(info->dqi_usable_bs);
219 char *ddquot; 217 char *ddquot;
220 218
221 *err = 0; 219 *err = 0;
@@ -233,11 +231,12 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
233 blk = get_free_dqblk(info); 231 blk = get_free_dqblk(info);
234 if ((int)blk < 0) { 232 if ((int)blk < 0) {
235 *err = blk; 233 *err = blk;
236 freedqbuf(buf); 234 kfree(buf);
237 return 0; 235 return 0;
238 } 236 }
239 memset(buf, 0, info->dqi_usable_bs); 237 memset(buf, 0, info->dqi_usable_bs);
240 /* This is enough as block is already zeroed and entry list is empty... */ 238 /* This is enough as the block is already zeroed and the entry
239 * list is empty... */
241 info->dqi_free_entry = blk; 240 info->dqi_free_entry = blk;
242 mark_info_dirty(dquot->dq_sb, dquot->dq_type); 241 mark_info_dirty(dquot->dq_sb, dquot->dq_type);
243 } 242 }
@@ -253,9 +252,12 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
253 } 252 }
254 le16_add_cpu(&dh->dqdh_entries, 1); 253 le16_add_cpu(&dh->dqdh_entries, 1);
255 /* Find free structure in block */ 254 /* Find free structure in block */
256 for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader); 255 ddquot = buf + sizeof(struct qt_disk_dqdbheader);
257 i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot); 256 for (i = 0; i < qtree_dqstr_in_blk(info); i++) {
258 i++, ddquot += info->dqi_entry_size); 257 if (qtree_entry_unused(info, ddquot))
258 break;
259 ddquot += info->dqi_entry_size;
260 }
259#ifdef __QUOTA_QT_PARANOIA 261#ifdef __QUOTA_QT_PARANOIA
260 if (i == qtree_dqstr_in_blk(info)) { 262 if (i == qtree_dqstr_in_blk(info)) {
261 printk(KERN_ERR "VFS: find_free_dqentry(): Data block full " 263 printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
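The rewritten scan above replaces a comma-operator for(;;) header with an ordinary loop body and an explicit break, so the entry stride and the termination test are spelled out instead of hidden in the loop header. A toy version of the same scan over fixed-size entries (the sizes and the "unused" test are invented for illustration):

#include <stdio.h>

#define ENTRY_SIZE 8
#define ENTRIES    4

/* Toy "unused" predicate; the real code calls qtree_entry_unused(). */
static int entry_unused(const char *e)
{
	return e[0] == 0;
}

int main(void)
{
	/* entries 0 and 1 in use, 2 free, 3 in use */
	char blk[ENTRY_SIZE * ENTRIES] = { [0] = 1, [8] = 1, [24] = 1 };
	char *p = blk;
	int i;

	for (i = 0; i < ENTRIES; i++) {
		if (entry_unused(p))
			break;		/* stop at first free slot */
		p += ENTRY_SIZE;	/* stride is explicit now */
	}
	printf("first free slot: %d\n", i);	/* prints 2 */
	return 0;
}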
@@ -273,10 +275,10 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
273 dquot->dq_off = (blk << info->dqi_blocksize_bits) + 275 dquot->dq_off = (blk << info->dqi_blocksize_bits) +
274 sizeof(struct qt_disk_dqdbheader) + 276 sizeof(struct qt_disk_dqdbheader) +
275 i * info->dqi_entry_size; 277 i * info->dqi_entry_size;
276 freedqbuf(buf); 278 kfree(buf);
277 return blk; 279 return blk;
278out_buf: 280out_buf:
279 freedqbuf(buf); 281 kfree(buf);
280 return 0; 282 return 0;
281} 283}
282 284
@@ -284,7 +286,7 @@ out_buf:
284static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, 286static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
285 uint *treeblk, int depth) 287 uint *treeblk, int depth)
286{ 288{
287 dqbuf_t buf = getdqbuf(info->dqi_usable_bs); 289 char *buf = getdqbuf(info->dqi_usable_bs);
288 int ret = 0, newson = 0, newact = 0; 290 int ret = 0, newson = 0, newact = 0;
289 __le32 *ref; 291 __le32 *ref;
290 uint newblk; 292 uint newblk;
@@ -333,7 +335,7 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
333 put_free_dqblk(info, buf, *treeblk); 335 put_free_dqblk(info, buf, *treeblk);
334 } 336 }
335out_buf: 337out_buf:
336 freedqbuf(buf); 338 kfree(buf);
337 return ret; 339 return ret;
338} 340}
339 341
@@ -346,14 +348,15 @@ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
346} 348}
347 349
348/* 350/*
349 * We don't have to be afraid of deadlocks as we never have quotas on quota files... 351 * We don't have to be afraid of deadlocks as we never have quotas on quota
352 * files...
350 */ 353 */
351int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) 354int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
352{ 355{
353 int type = dquot->dq_type; 356 int type = dquot->dq_type;
354 struct super_block *sb = dquot->dq_sb; 357 struct super_block *sb = dquot->dq_sb;
355 ssize_t ret; 358 ssize_t ret;
356 dqbuf_t ddquot = getdqbuf(info->dqi_entry_size); 359 char *ddquot = getdqbuf(info->dqi_entry_size);
357 360
358 if (!ddquot) 361 if (!ddquot)
359 return -ENOMEM; 362 return -ENOMEM;
@@ -364,15 +367,15 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
364 if (ret < 0) { 367 if (ret < 0) {
365 printk(KERN_ERR "VFS: Error %zd occurred while " 368 printk(KERN_ERR "VFS: Error %zd occurred while "
366 "creating quota.\n", ret); 369 "creating quota.\n", ret);
367 freedqbuf(ddquot); 370 kfree(ddquot);
368 return ret; 371 return ret;
369 } 372 }
370 } 373 }
371 spin_lock(&dq_data_lock); 374 spin_lock(&dq_data_lock);
372 info->dqi_ops->mem2disk_dqblk(ddquot, dquot); 375 info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
373 spin_unlock(&dq_data_lock); 376 spin_unlock(&dq_data_lock);
374 ret = sb->s_op->quota_write(sb, type, (char *)ddquot, 377 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
375 info->dqi_entry_size, dquot->dq_off); 378 dquot->dq_off);
376 if (ret != info->dqi_entry_size) { 379 if (ret != info->dqi_entry_size) {
377 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 380 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
378 sb->s_id); 381 sb->s_id);
@@ -382,7 +385,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
382 ret = 0; 385 ret = 0;
383 } 386 }
384 dqstats.writes++; 387 dqstats.writes++;
385 freedqbuf(ddquot); 388 kfree(ddquot);
386 389
387 return ret; 390 return ret;
388} 391}
@@ -393,7 +396,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
393 uint blk) 396 uint blk)
394{ 397{
395 struct qt_disk_dqdbheader *dh; 398 struct qt_disk_dqdbheader *dh;
396 dqbuf_t buf = getdqbuf(info->dqi_usable_bs); 399 char *buf = getdqbuf(info->dqi_usable_bs);
397 int ret = 0; 400 int ret = 0;
398 401
399 if (!buf) 402 if (!buf)
@@ -444,7 +447,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
444 } 447 }
445 dquot->dq_off = 0; /* Quota is now unattached */ 448 dquot->dq_off = 0; /* Quota is now unattached */
446out_buf: 449out_buf:
447 freedqbuf(buf); 450 kfree(buf);
448 return ret; 451 return ret;
449} 452}
450 453
@@ -452,7 +455,7 @@ out_buf:
452static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, 455static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
453 uint *blk, int depth) 456 uint *blk, int depth)
454{ 457{
455 dqbuf_t buf = getdqbuf(info->dqi_usable_bs); 458 char *buf = getdqbuf(info->dqi_usable_bs);
456 int ret = 0; 459 int ret = 0;
457 uint newblk; 460 uint newblk;
458 __le32 *ref = (__le32 *)buf; 461 __le32 *ref = (__le32 *)buf;
@@ -475,9 +478,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
475 int i; 478 int i;
476 ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0); 479 ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
477 /* Block got empty? */ 480 /* Block got empty? */
478 for (i = 0; 481 for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++)
479 i < (info->dqi_usable_bs >> 2) && !ref[i]; 482 ;
480 i++);
481 /* Don't put the root block into the free block list */ 483 /* Don't put the root block into the free block list */
482 if (i == (info->dqi_usable_bs >> 2) 484 if (i == (info->dqi_usable_bs >> 2)
483 && *blk != QT_TREEOFF) { 485 && *blk != QT_TREEOFF) {
@@ -491,7 +493,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
491 } 493 }
492 } 494 }
493out_buf: 495out_buf:
494 freedqbuf(buf); 496 kfree(buf);
495 return ret; 497 return ret;
496} 498}
497 499
@@ -510,7 +512,7 @@ EXPORT_SYMBOL(qtree_delete_dquot);
510static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, 512static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
511 struct dquot *dquot, uint blk) 513 struct dquot *dquot, uint blk)
512{ 514{
513 dqbuf_t buf = getdqbuf(info->dqi_usable_bs); 515 char *buf = getdqbuf(info->dqi_usable_bs);
514 loff_t ret = 0; 516 loff_t ret = 0;
515 int i; 517 int i;
516 char *ddquot; 518 char *ddquot;
@@ -522,9 +524,12 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
522 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 524 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
523 goto out_buf; 525 goto out_buf;
524 } 526 }
525 for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader); 527 ddquot = buf + sizeof(struct qt_disk_dqdbheader);
526 i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot); 528 for (i = 0; i < qtree_dqstr_in_blk(info); i++) {
527 i++, ddquot += info->dqi_entry_size); 529 if (info->dqi_ops->is_id(ddquot, dquot))
530 break;
531 ddquot += info->dqi_entry_size;
532 }
528 if (i == qtree_dqstr_in_blk(info)) { 533 if (i == qtree_dqstr_in_blk(info)) {
529 printk(KERN_ERR "VFS: Quota for id %u referenced " 534 printk(KERN_ERR "VFS: Quota for id %u referenced "
530 "but not present.\n", dquot->dq_id); 535 "but not present.\n", dquot->dq_id);
@@ -535,7 +540,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
535 qt_disk_dqdbheader) + i * info->dqi_entry_size; 540 qt_disk_dqdbheader) + i * info->dqi_entry_size;
536 } 541 }
537out_buf: 542out_buf:
538 freedqbuf(buf); 543 kfree(buf);
539 return ret; 544 return ret;
540} 545}
541 546
@@ -543,7 +548,7 @@ out_buf:
543static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, 548static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
544 struct dquot *dquot, uint blk, int depth) 549 struct dquot *dquot, uint blk, int depth)
545{ 550{
546 dqbuf_t buf = getdqbuf(info->dqi_usable_bs); 551 char *buf = getdqbuf(info->dqi_usable_bs);
547 loff_t ret = 0; 552 loff_t ret = 0;
548 __le32 *ref = (__le32 *)buf; 553 __le32 *ref = (__le32 *)buf;
549 554
@@ -563,7 +568,7 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
563 else 568 else
564 ret = find_block_dqentry(info, dquot, blk); 569 ret = find_block_dqentry(info, dquot, blk);
565out_buf: 570out_buf:
566 freedqbuf(buf); 571 kfree(buf);
567 return ret; 572 return ret;
568} 573}
569 574
@@ -579,7 +584,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
579 int type = dquot->dq_type; 584 int type = dquot->dq_type;
580 struct super_block *sb = dquot->dq_sb; 585 struct super_block *sb = dquot->dq_sb;
581 loff_t offset; 586 loff_t offset;
582 dqbuf_t ddquot; 587 char *ddquot;
583 int ret = 0; 588 int ret = 0;
584 589
585#ifdef __QUOTA_QT_PARANOIA 590#ifdef __QUOTA_QT_PARANOIA
@@ -607,8 +612,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
607 ddquot = getdqbuf(info->dqi_entry_size); 612 ddquot = getdqbuf(info->dqi_entry_size);
608 if (!ddquot) 613 if (!ddquot)
609 return -ENOMEM; 614 return -ENOMEM;
610 ret = sb->s_op->quota_read(sb, type, (char *)ddquot, 615 ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size,
611 info->dqi_entry_size, dquot->dq_off); 616 dquot->dq_off);
612 if (ret != info->dqi_entry_size) { 617 if (ret != info->dqi_entry_size) {
613 if (ret >= 0) 618 if (ret >= 0)
614 ret = -EIO; 619 ret = -EIO;
@@ -616,7 +621,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
616 "structure for id %u.\n", dquot->dq_id); 621 "structure for id %u.\n", dquot->dq_id);
617 set_bit(DQ_FAKE_B, &dquot->dq_flags); 622 set_bit(DQ_FAKE_B, &dquot->dq_flags);
618 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 623 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
619 freedqbuf(ddquot); 624 kfree(ddquot);
620 goto out; 625 goto out;
621 } 626 }
622 spin_lock(&dq_data_lock); 627 spin_lock(&dq_data_lock);
@@ -627,7 +632,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
627 !dquot->dq_dqb.dqb_isoftlimit) 632 !dquot->dq_dqb.dqb_isoftlimit)
628 set_bit(DQ_FAKE_B, &dquot->dq_flags); 633 set_bit(DQ_FAKE_B, &dquot->dq_flags);
629 spin_unlock(&dq_data_lock); 634 spin_unlock(&dq_data_lock);
630 freedqbuf(ddquot); 635 kfree(ddquot);
631out: 636out:
632 dqstats.reads++; 637 dqstats.reads++;
633 return ret; 638 return ret;
@@ -638,7 +643,8 @@ EXPORT_SYMBOL(qtree_read_dquot);
638 * the only one operating on dquot (thanks to dq_lock) */ 643 * the only one operating on dquot (thanks to dq_lock) */
639int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) 644int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
640{ 645{
641 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) 646 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) &&
647 !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
642 return qtree_delete_dquot(info, dquot); 648 return qtree_delete_dquot(info, dquot);
643 return 0; 649 return 0;
644} 650}
diff --git a/fs/quota_tree.h b/fs/quota/quota_tree.h
index a1ab8db81a51..a1ab8db81a51 100644
--- a/fs/quota_tree.h
+++ b/fs/quota/quota_tree.h
diff --git a/fs/quota_v1.c b/fs/quota/quota_v1.c
index b4af1c69ad16..0edcf42b1778 100644
--- a/fs/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -62,11 +62,14 @@ static int v1_read_dqblk(struct dquot *dquot)
62 62
63 /* Set structure to 0s in case read fails/is after end of file */ 63 /* Set structure to 0s in case read fails/is after end of file */
64 memset(&dqblk, 0, sizeof(struct v1_disk_dqblk)); 64 memset(&dqblk, 0, sizeof(struct v1_disk_dqblk));
65 dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); 65 dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk,
66 sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id));
66 67
67 v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk); 68 v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk);
68 if (dquot->dq_dqb.dqb_bhardlimit == 0 && dquot->dq_dqb.dqb_bsoftlimit == 0 && 69 if (dquot->dq_dqb.dqb_bhardlimit == 0 &&
69 dquot->dq_dqb.dqb_ihardlimit == 0 && dquot->dq_dqb.dqb_isoftlimit == 0) 70 dquot->dq_dqb.dqb_bsoftlimit == 0 &&
71 dquot->dq_dqb.dqb_ihardlimit == 0 &&
72 dquot->dq_dqb.dqb_isoftlimit == 0)
70 set_bit(DQ_FAKE_B, &dquot->dq_flags); 73 set_bit(DQ_FAKE_B, &dquot->dq_flags);
71 dqstats.reads++; 74 dqstats.reads++;
72 75
@@ -81,13 +84,16 @@ static int v1_commit_dqblk(struct dquot *dquot)
81 84
82 v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb); 85 v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb);
83 if (dquot->dq_id == 0) { 86 if (dquot->dq_id == 0) {
84 dqblk.dqb_btime = sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace; 87 dqblk.dqb_btime =
85 dqblk.dqb_itime = sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace; 88 sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace;
89 dqblk.dqb_itime =
90 sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace;
86 } 91 }
87 ret = 0; 92 ret = 0;
88 if (sb_dqopt(dquot->dq_sb)->files[type]) 93 if (sb_dqopt(dquot->dq_sb)->files[type])
89 ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, (char *)&dqblk, 94 ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
90 sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); 95 (char *)&dqblk, sizeof(struct v1_disk_dqblk),
96 v1_dqoff(dquot->dq_id));
91 if (ret != sizeof(struct v1_disk_dqblk)) { 97 if (ret != sizeof(struct v1_disk_dqblk)) {
92 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 98 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
93 dquot->dq_sb->s_id); 99 dquot->dq_sb->s_id);
@@ -130,15 +136,20 @@ static int v1_check_quota_file(struct super_block *sb, int type)
130 return 0; 136 return 0;
131 blocks = isize >> BLOCK_SIZE_BITS; 137 blocks = isize >> BLOCK_SIZE_BITS;
132 off = isize & (BLOCK_SIZE - 1); 138 off = isize & (BLOCK_SIZE - 1);
133 if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) % sizeof(struct v1_disk_dqblk)) 139 if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) %
140 sizeof(struct v1_disk_dqblk))
134 return 0; 141 return 0;
135 /* Doublecheck whether we didn't get file with new format - with old quotactl() this could happen */ 142 /* Doublecheck whether we didn't get file with new format - with old
136 size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); 143 * quotactl() this could happen */
144 size = sb->s_op->quota_read(sb, type, (char *)&dqhead,
145 sizeof(struct v2_disk_dqheader), 0);
137 if (size != sizeof(struct v2_disk_dqheader)) 146 if (size != sizeof(struct v2_disk_dqheader))
138 return 1; /* Probably not new format */ 147 return 1; /* Probably not new format */
139 if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type]) 148 if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type])
140 return 1; /* Definitely not new format */ 149 return 1; /* Definitely not new format */
141 printk(KERN_INFO "VFS: %s: Refusing to turn on old quota format on given file. It probably contains newer quota format.\n", sb->s_id); 150 printk(KERN_INFO
151 "VFS: %s: Refusing to turn on old quota format on given file."
152 " It probably contains newer quota format.\n", sb->s_id);
142 return 0; /* Seems like a new format file -> refuse it */ 153 return 0; /* Seems like a new format file -> refuse it */
143} 154}
144 155
@@ -148,7 +159,9 @@ static int v1_read_file_info(struct super_block *sb, int type)
148 struct v1_disk_dqblk dqblk; 159 struct v1_disk_dqblk dqblk;
149 int ret; 160 int ret;
150 161
151 if ((ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(0))) != sizeof(struct v1_disk_dqblk)) { 162 ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
163 sizeof(struct v1_disk_dqblk), v1_dqoff(0));
164 if (ret != sizeof(struct v1_disk_dqblk)) {
152 if (ret >= 0) 165 if (ret >= 0)
153 ret = -EIO; 166 ret = -EIO;
154 goto out; 167 goto out;
@@ -157,8 +170,10 @@ static int v1_read_file_info(struct super_block *sb, int type)
157 /* limits are stored as unsigned 32-bit data */ 170 /* limits are stored as unsigned 32-bit data */
158 dqopt->info[type].dqi_maxblimit = 0xffffffff; 171 dqopt->info[type].dqi_maxblimit = 0xffffffff;
159 dqopt->info[type].dqi_maxilimit = 0xffffffff; 172 dqopt->info[type].dqi_maxilimit = 0xffffffff;
160 dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; 173 dqopt->info[type].dqi_igrace =
161 dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME; 174 dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
175 dqopt->info[type].dqi_bgrace =
176 dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
162out: 177out:
163 return ret; 178 return ret;
164} 179}
@@ -170,8 +185,9 @@ static int v1_write_file_info(struct super_block *sb, int type)
170 int ret; 185 int ret;
171 186
172 dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY; 187 dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY;
173 if ((ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, 188 ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
174 sizeof(struct v1_disk_dqblk), v1_dqoff(0))) != sizeof(struct v1_disk_dqblk)) { 189 sizeof(struct v1_disk_dqblk), v1_dqoff(0));
190 if (ret != sizeof(struct v1_disk_dqblk)) {
175 if (ret >= 0) 191 if (ret >= 0)
176 ret = -EIO; 192 ret = -EIO;
177 goto out; 193 goto out;
diff --git a/fs/quota_v2.c b/fs/quota/quota_v2.c
index b618b563635c..a5475fb1ae44 100644
--- a/fs/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -54,7 +54,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
54 static const uint quota_magics[] = V2_INITQMAGICS; 54 static const uint quota_magics[] = V2_INITQMAGICS;
55 static const uint quota_versions[] = V2_INITQVERSIONS; 55 static const uint quota_versions[] = V2_INITQVERSIONS;
56 56
57 size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); 57 size = sb->s_op->quota_read(sb, type, (char *)&dqhead,
58 sizeof(struct v2_disk_dqheader), 0);
58 if (size != sizeof(struct v2_disk_dqheader)) { 59 if (size != sizeof(struct v2_disk_dqheader)) {
59 printk("quota_v2: failed read expected=%zd got=%zd\n", 60 printk("quota_v2: failed read expected=%zd got=%zd\n",
60 sizeof(struct v2_disk_dqheader), size); 61 sizeof(struct v2_disk_dqheader), size);
diff --git a/fs/quotaio_v1.h b/fs/quota/quotaio_v1.h
index 746654b5de70..746654b5de70 100644
--- a/fs/quotaio_v1.h
+++ b/fs/quota/quotaio_v1.h
diff --git a/fs/quotaio_v2.h b/fs/quota/quotaio_v2.h
index 530fe580685c..530fe580685c 100644
--- a/fs/quotaio_v2.h
+++ b/fs/quota/quotaio_v2.h
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index b9b567a28376..ebb2c417912c 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -18,7 +18,6 @@
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/backing-dev.h> 19#include <linux/backing-dev.h>
20#include <linux/ramfs.h> 20#include <linux/ramfs.h>
21#include <linux/quotaops.h>
22#include <linux/pagevec.h> 21#include <linux/pagevec.h>
23#include <linux/mman.h> 22#include <linux/mman.h>
24 23
@@ -60,7 +59,6 @@ const struct inode_operations ramfs_file_inode_operations = {
60 */ 59 */
61int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) 60int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
62{ 61{
63 struct pagevec lru_pvec;
64 unsigned long npages, xpages, loop, limit; 62 unsigned long npages, xpages, loop, limit;
65 struct page *pages; 63 struct page *pages;
66 unsigned order; 64 unsigned order;
@@ -103,21 +101,20 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
103 memset(data, 0, newsize); 101 memset(data, 0, newsize);
104 102
105 /* attach all the pages to the inode's address space */ 103 /* attach all the pages to the inode's address space */
106 pagevec_init(&lru_pvec, 0);
107 for (loop = 0; loop < npages; loop++) { 104 for (loop = 0; loop < npages; loop++) {
108 struct page *page = pages + loop; 105 struct page *page = pages + loop;
109 106
110 ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL); 107 ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
108 GFP_KERNEL);
111 if (ret < 0) 109 if (ret < 0)
112 goto add_error; 110 goto add_error;
113 111
114 if (!pagevec_add(&lru_pvec, page)) 112 /* prevent the page from being discarded on memory pressure */
115 __pagevec_lru_add_file(&lru_pvec); 113 SetPageDirty(page);
116 114
117 unlock_page(page); 115 unlock_page(page);
118 } 116 }
119 117
120 pagevec_lru_add_file(&lru_pvec);
121 return 0; 118 return 0;
122 119
123 fsize_exceeded: 120 fsize_exceeded:
@@ -126,9 +123,8 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
126 return -EFBIG; 123 return -EFBIG;
127 124
128 add_error: 125 add_error:
129 page_cache_release(pages + loop); 126 while (loop < npages)
130 for (loop++; loop < npages; loop++) 127 __free_page(pages + loop++);
131 __free_page(pages + loop);
132 return ret; 128 return ret;
133} 129}
134 130
@@ -201,11 +197,6 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
201 if (ret) 197 if (ret)
202 return ret; 198 return ret;
203 199
204 /* by providing our own setattr() method, we skip this quotaism */
205 if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
206 (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid))
207 ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0;
208
209 /* pick out size-changing events */ 200 /* pick out size-changing events */
210 if (ia->ia_valid & ATTR_SIZE) { 201 if (ia->ia_valid & ATTR_SIZE) {
211 loff_t size = i_size_read(inode); 202 loff_t size = i_size_read(inode);
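Two things change in ramfs_nommu_expand_for_mapping(): add_to_page_cache_lru() takes over the manual pagevec batching, and SetPageDirty() marks each page so reclaim will not silently drop it; the reworked add_error path then releases the failing page and every later one with a single loop instead of special-casing the first. A toy model of that error path (malloc()/free() stand in for page allocation; attach() and the failure index are invented):

#include <stdio.h>
#include <stdlib.h>

#define NPAGES 5

/* Stands in for add_to_page_cache_lru(); fails at page 2 here. */
static int attach(unsigned long i)
{
	return i == 2 ? -1 : 0;
}

int main(void)
{
	void *pages[NPAGES];
	unsigned long loop;

	/* all pages exist up front, like the split high-order block */
	for (loop = 0; loop < NPAGES; loop++)
		pages[loop] = malloc(64);

	for (loop = 0; loop < NPAGES; loop++)
		if (attach(loop) < 0)
			goto add_error;
	return 0;	/* attached pages now owned by the "cache" */

add_error:
	/* mirrors: while (loop < npages) __free_page(pages + loop++);
	 * pages below `loop` were attached and are not touched here */
	while (loop < NPAGES)
		free(pages[loop++]);
	return 1;
}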
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b7e6ac706b87..a404fb88e456 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -33,12 +33,15 @@
 #include <linux/backing-dev.h>
 #include <linux/ramfs.h>
 #include <linux/sched.h>
+#include <linux/parser.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
 /* some random number */
 #define RAMFS_MAGIC	0x858458f6
 
+#define RAMFS_DEFAULT_MODE	0755
+
 static const struct super_operations ramfs_ops;
 static const struct inode_operations ramfs_dir_inode_operations;
 
@@ -158,12 +161,75 @@
 static const struct super_operations ramfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
+	.show_options	= generic_show_options,
+};
+
+struct ramfs_mount_opts {
+	umode_t mode;
+};
+
+enum {
+	Opt_mode,
+	Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_mode, "mode=%o"},
+	{Opt_err, NULL}
+};
+
+struct ramfs_fs_info {
+	struct ramfs_mount_opts mount_opts;
 };
 
+static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
+{
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	int token;
+	char *p;
+
+	opts->mode = RAMFS_DEFAULT_MODE;
+
+	while ((p = strsep(&data, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->mode = option & S_IALLUGO;
+			break;
+		default:
+			printk(KERN_ERR "ramfs: bad mount option: %s\n", p);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 {
-	struct inode * inode;
-	struct dentry * root;
+	struct ramfs_fs_info *fsi;
+	struct inode *inode = NULL;
+	struct dentry *root;
+	int err;
+
+	save_mount_options(sb, data);
+
+	fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
+	if (!fsi) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	sb->s_fs_info = fsi;
+
+	err = ramfs_parse_options(data, &fsi->mount_opts);
+	if (err)
+		goto fail;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -171,17 +237,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	sb->s_magic = RAMFS_MAGIC;
 	sb->s_op = &ramfs_ops;
 	sb->s_time_gran = 1;
-	inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0);
-	if (!inode)
-		return -ENOMEM;
+	inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
+	if (!inode) {
+		err = -ENOMEM;
+		goto fail;
+	}
 
 	root = d_alloc_root(inode);
 	if (!root) {
-		iput(inode);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto fail;
 	}
 	sb->s_root = root;
 	return 0;
+fail:
+	kfree(fsi);
+	iput(inode);
+	return err;
 }
 
 int ramfs_get_sb(struct file_system_type *fs_type,
@@ -197,10 +269,16 @@ static int rootfs_get_sb(struct file_system_type *fs_type,
 			    mnt);
 }
 
+static void ramfs_kill_sb(struct super_block *sb)
+{
+	kfree(sb->s_fs_info);
+	kill_litter_super(sb);
+}
+
 static struct file_system_type ramfs_fs_type = {
 	.name		= "ramfs",
 	.get_sb		= ramfs_get_sb,
-	.kill_sb	= kill_litter_super,
+	.kill_sb	= ramfs_kill_sb,
 };
 static struct file_system_type rootfs_fs_type = {
 	.name		= "rootfs",
diff --git a/fs/read_write.c b/fs/read_write.c
index 400fe81c973e..9d1e76bb9ee1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -731,6 +731,62 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 	return ret;
 }
 
+static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
+{
+#define HALF_LONG_BITS (BITS_PER_LONG / 2)
+	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
+}
+
+SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
+	struct file *file;
+	ssize_t ret = -EBADF;
+	int fput_needed;
+
+	if (pos < 0)
+		return -EINVAL;
+
+	file = fget_light(fd, &fput_needed);
+	if (file) {
+		ret = -ESPIPE;
+		if (file->f_mode & FMODE_PREAD)
+			ret = vfs_readv(file, vec, vlen, &pos);
+		fput_light(file, fput_needed);
+	}
+
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
+	return ret;
+}
+
+SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+	loff_t pos = pos_from_hilo(pos_h, pos_l);
+	struct file *file;
+	ssize_t ret = -EBADF;
+	int fput_needed;
+
+	if (pos < 0)
+		return -EINVAL;
+
+	file = fget_light(fd, &fput_needed);
+	if (file) {
+		ret = -ESPIPE;
+		if (file->f_mode & FMODE_PWRITE)
+			ret = vfs_writev(file, vec, vlen, &pos);
+		fput_light(file, fput_needed);
+	}
+
+	if (ret > 0)
+		add_wchar(current, ret);
+	inc_syscw(current);
+	return ret;
+}
+
 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 			   size_t count, loff_t max)
 {
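
The 64-bit file offset is passed to these syscalls as two unsigned longs; pos_from_hilo() reassembles it with a double shift by HALF_LONG_BITS, which avoids the undefined behaviour of shifting a value by the full BITS_PER_LONG on 64-bit (where the high word is shifted away entirely and pos_l alone carries the offset). A hedged userspace sketch invoking preadv(2) directly via syscall(2), splitting the offset the same way in reverse (assumes a kernel and headers that define SYS_preadv; recent glibc ships a real preadv() wrapper instead):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <sys/uio.h>

    int main(void)
    {
            char a[8], b[8];
            struct iovec iov[2] = {
                    { .iov_base = a, .iov_len = sizeof(a) },
                    { .iov_base = b, .iov_len = sizeof(b) },
            };
            long long pos = 16;
            unsigned long half = 8 * sizeof(long) / 2;   /* HALF_LONG_BITS */
            unsigned long pos_l = (unsigned long)pos;
            unsigned long pos_h = (unsigned long)((pos >> half) >> half);
            int fd = open("/etc/hostname", O_RDONLY);
            long n;

            if (fd == -1)
                    return 1;
            n = syscall(SYS_preadv, fd, iov, 2, pos_l, pos_h);
            printf("read %ld bytes at offset %lld\n", n, pos);
            close(fd);
            return 0;
    }
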
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 949b8c6addc8..513f431038f9 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,5 +1,6 @@
 config REISERFS_FS
 	tristate "Reiserfs support"
+	select CRC32
 	help
 	  Stores not just filenames but the files themselves in a balanced
 	  tree.  Uses journalling.
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 0eb7ac080484..7c5ab6330dd6 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -7,10 +7,10 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
 reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
 		 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
 		 hashes.o tail_conversion.o journal.o resize.o \
-		 item_ops.o ioctl.o procfs.o
+		 item_ops.o ioctl.o procfs.o xattr.o
 
 ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
-reiserfs-objs += xattr.o xattr_user.o xattr_trusted.o
+reiserfs-objs += xattr_user.o xattr_trusted.o
 endif
 
 ifeq ($(CONFIG_REISERFS_FS_SECURITY),y)
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
index 90e1670e4e6f..14e8c9d460e5 100644
--- a/fs/reiserfs/README
+++ b/fs/reiserfs/README
@@ -1,4 +1,4 @@
-[LICENSING] 
+[LICENSING]
 
 ReiserFS is hereby licensed under the GNU General
 Public License version 2.
@@ -31,7 +31,7 @@ the GPL as not allowing those additional licensing options, you read
 it wrongly, and Richard Stallman agrees with me, when carefully read
 you can see that those restrictions on additional terms do not apply
 to the owner of the copyright, and my interpretation of this shall
-govern for this license. 
+govern for this license.
 
 Finally, nothing in this license shall be interpreted to allow you to
 fail to fairly credit me, or to remove my credits, without my
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 4646caa60455..e716161ab325 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -40,8 +40,8 @@
 
 #define SET_OPTION(optname) \
    do { \
-	reiserfs_warning(s, "reiserfs: option \"%s\" is set", #optname); \
+	reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \
 	set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \
     } while(0)
 #define TEST_OPTION(optname, s) \
     test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s))
@@ -64,9 +64,9 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
 	unsigned int bmap_count = reiserfs_bmap_count(s);
 
 	if (block == 0 || block >= SB_BLOCK_COUNT(s)) {
-		reiserfs_warning(s,
-				 "vs-4010: is_reusable: block number is out of range %lu (%u)",
+		reiserfs_error(s, "vs-4010",
+			       "block number is out of range %lu (%u)",
 			       block, SB_BLOCK_COUNT(s));
 		return 0;
 	}
 
@@ -79,31 +79,30 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
 		b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1;
 		if (block >= bmap1 &&
 		    block <= bmap1 + bmap_count) {
-			reiserfs_warning(s, "vs: 4019: is_reusable: "
-					 "bitmap block %lu(%u) can't be freed or reused",
+			reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) "
+				       "can't be freed or reused",
 				       block, bmap_count);
 			return 0;
 		}
 	} else {
 		if (offset == 0) {
-			reiserfs_warning(s, "vs: 4020: is_reusable: "
-					 "bitmap block %lu(%u) can't be freed or reused",
+			reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) "
+				       "can't be freed or reused",
 				       block, bmap_count);
 			return 0;
 		}
 	}
 
 	if (bmap >= bmap_count) {
-		reiserfs_warning(s,
-				 "vs-4030: is_reusable: there is no so many bitmap blocks: "
-				 "block=%lu, bitmap_nr=%u", block, bmap);
+		reiserfs_error(s, "vs-4030", "bitmap for requested block "
+			       "is out of range: block=%lu, bitmap_nr=%u",
+			       block, bmap);
 		return 0;
 	}
 
 	if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) {
-		reiserfs_warning(s,
-				 "vs-4050: is_reusable: this is root block (%u), "
-				 "it must be busy", SB_ROOT_BLOCK(s));
+		reiserfs_error(s, "vs-4050", "this is root block (%u), "
+			       "it must be busy", SB_ROOT_BLOCK(s));
 		return 0;
 	}
 
@@ -154,8 +153,8 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
 /* - I mean `a window of zero bits' as in description of this function - Zam. */
 
 	if (!bi) {
-		reiserfs_warning(s, "NULL bitmap info pointer for bitmap %d",
-				 bmap_n);
+		reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer "
+			       "for bitmap %d", bmap_n);
 		return 0;
 	}
 
@@ -400,11 +399,8 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
 	get_bit_address(s, block, &nr, &offset);
 
 	if (nr >= reiserfs_bmap_count(s)) {
-		reiserfs_warning(s, "vs-4075: reiserfs_free_block: "
-				 "block %lu is out of range on %s "
-				 "(nr=%u,max=%u)", block,
-				 reiserfs_bdevname(s), nr,
-				 reiserfs_bmap_count(s));
+		reiserfs_error(s, "vs-4075", "block %lu is out of range",
+			       block);
 		return;
 	}
 
@@ -416,9 +412,8 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
 
 	/* clear bit for the given block in bit map */
 	if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) {
-		reiserfs_warning(s, "vs-4080: reiserfs_free_block: "
-				 "free_block (%s:%lu)[dev:blocknr]: bit already cleared",
-				 reiserfs_bdevname(s), block);
+		reiserfs_error(s, "vs-4080",
+			       "block %lu: bit already cleared", block);
 	}
 	apbi[nr].free_count++;
 	journal_mark_dirty(th, s, bmbh);
@@ -430,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
 
 	journal_mark_dirty(th, s, sbh);
 	if (for_unformatted)
-		DQUOT_FREE_BLOCK_NODIRTY(inode, 1);
+		vfs_dq_free_block_nodirty(inode, 1);
 }
 
 void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -445,7 +440,7 @@ void reiserfs_free_block(struct reiserfs_transaction_handle *th,
 		return;
 
 	if (block > sb_block_count(REISERFS_SB(s)->s_rs)) {
-		reiserfs_panic(th->t_super, "bitmap-4072",
+		reiserfs_error(th->t_super, "bitmap-4072",
 			       "Trying to free block outside file system "
 			       "boundaries (%lu > %lu)",
 			       block, sb_block_count(REISERFS_SB(s)->s_rs));
@@ -477,9 +472,8 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th,
 	BUG_ON(!th->t_trans_id);
 #ifdef CONFIG_REISERFS_CHECK
 	if (ei->i_prealloc_count < 0)
-		reiserfs_warning(th->t_super,
-				 "zam-4001:%s: inode has negative prealloc blocks count.",
-				 __func__);
+		reiserfs_error(th->t_super, "zam-4001",
+			       "inode has negative prealloc blocks count.");
 #endif
 	while (ei->i_prealloc_count > 0) {
 		reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block);
@@ -515,9 +509,9 @@ void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th)
 				  i_prealloc_list);
 #ifdef CONFIG_REISERFS_CHECK
 		if (!ei->i_prealloc_count) {
-			reiserfs_warning(th->t_super,
-					 "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.",
-					 __func__);
+			reiserfs_error(th->t_super, "zam-4001",
+				       "inode is in prealloc list but has "
+				       "no preallocated blocks.");
 		}
 #endif
 		__discard_prealloc(th, ei);
@@ -631,12 +625,12 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options)
 			continue;
 		}
 
-		reiserfs_warning(s, "zam-4001: %s : unknown option - %s",
-				 __func__, this_char);
+		reiserfs_warning(s, "zam-4001", "unknown option - %s",
+				 this_char);
 		return 1;
 	}
 
-	reiserfs_warning(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
+	reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
 	return 0;
 }
 
@@ -1055,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 			       amount_needed, hint->inode->i_uid);
 #endif
 		quota_ret =
-		    DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed);
+		    vfs_dq_alloc_block_nodirty(hint->inode, amount_needed);
 		if (quota_ret)	/* Quota exceeded? */
 			return QUOTA_EXCEEDED;
 		if (hint->preallocate && hint->prealloc_size) {
@@ -1064,8 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 				       "reiserquota: allocating (prealloc) %d blocks id=%u",
 				       hint->prealloc_size, hint->inode->i_uid);
 #endif
-			quota_ret =
-			    DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode,
+			quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode,
 							 hint->prealloc_size);
 			if (quota_ret)
 				hint->preallocate = hint->prealloc_size = 0;
@@ -1098,7 +1091,10 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 					       nr_allocated,
 					       hint->inode->i_uid);
 #endif
-		DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated);	/* Free not allocated blocks */
+		/* Free not allocated blocks */
+		vfs_dq_free_block_nodirty(hint->inode,
+					  amount_needed + hint->prealloc_size -
+					  nr_allocated);
 	}
 	while (nr_allocated--)
 		reiserfs_free_block(hint->th, hint->inode,
@@ -1129,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 				       REISERFS_I(hint->inode)->i_prealloc_count,
 				       hint->inode->i_uid);
 #endif
-		DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed +
+		vfs_dq_free_block_nodirty(hint->inode, amount_needed +
 					  hint->prealloc_size - nr_allocated -
 					  REISERFS_I(hint->inode)->
 					  i_prealloc_count);
@@ -1219,7 +1215,9 @@ void reiserfs_cache_bitmap_metadata(struct super_block *sb,
 	unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size);
 
 	/* The first bit must ALWAYS be 1 */
-	BUG_ON(!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data));
+	if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data))
+		reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is "
+			       "corrupted: first bit must be 1", bh->b_blocknr);
 
 	info->free_count = 0;
 
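
Two mechanical conversions run through this file: the reiserfs message helpers now take the message ID ("vs-4010" and friends) as a separate argument instead of embedding it in the format string, with hard failures upgraded from reiserfs_warning() to reiserfs_error(), and the DQUOT_* quota macros become vfs_dq_* calls. A hypothetical compatibility shim documenting the quota mapping (these macros are NOT in the tree; the mapping is assumed 1:1 from the hunks above, with the real declarations in quotaops.h):

    /* old name (left) -> new call (right), per the conversions above */
    #define DQUOT_ALLOC_BLOCK_NODIRTY(inode, nr) \
            vfs_dq_alloc_block_nodirty(inode, nr)
    #define DQUOT_PREALLOC_BLOCK_NODIRTY(inode, nr) \
            vfs_dq_prealloc_block_nodirty(inode, nr)
    #define DQUOT_FREE_BLOCK_NODIRTY(inode, nr) \
            vfs_dq_free_block_nodirty(inode, nr)
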
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index e6b03d2020c1..67a80d7e59e2 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -41,10 +41,10 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
 
 #define store_ih(where,what) copy_item_head (where, what)
 
-//
-static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
+			    filldir_t filldir, loff_t *pos)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = dentry->d_inode;
 	struct cpu_key pos_key;	/* key of current position in the directory (key of directory entry) */
 	INITIALIZE_PATH(path_to_entry);
 	struct buffer_head *bh;
@@ -64,13 +64,9 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	/* form key for search the next directory entry using f_pos field of
 	   file structure */
-	make_cpu_key(&pos_key, inode,
-		     (filp->f_pos) ? (filp->f_pos) : DOT_OFFSET, TYPE_DIRENTRY,
-		     3);
+	make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
 	next_pos = cpu_key_k_offset(&pos_key);
 
-	/* reiserfs_warning (inode->i_sb, "reiserfs_readdir 1: f_pos = %Ld", filp->f_pos); */
-
 	path_to_entry.reada = PATH_READA;
 	while (1) {
       research:
@@ -144,7 +140,7 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			/* Ignore the .reiserfs_priv entry */
 			if (reiserfs_xattrs(inode->i_sb) &&
 			    !old_format_only(inode->i_sb) &&
-			    filp->f_path.dentry == inode->i_sb->s_root &&
+			    dentry == inode->i_sb->s_root &&
 			    REISERFS_SB(inode->i_sb)->priv_root &&
 			    REISERFS_SB(inode->i_sb)->priv_root->d_inode
 			    && deh_objectid(deh) ==
@@ -156,7 +152,7 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			}
 
 			d_off = deh_offset(deh);
-			filp->f_pos = d_off;
+			*pos = d_off;
 			d_ino = deh_objectid(deh);
 			if (d_reclen <= 32) {
 				local_buf = small_buf;
@@ -223,15 +219,21 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	}			/* while */
 
-      end:
-	filp->f_pos = next_pos;
+end:
+	*pos = next_pos;
 	pathrelse(&path_to_entry);
 	reiserfs_check_path(&path_to_entry);
-      out:
+out:
 	reiserfs_write_unlock(inode->i_sb);
 	return ret;
 }
 
+static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos);
+}
+
 /* compose directory item containing "." and ".." entries (entries are
    not aligned to 4 byte boundary) */
 /* the last four params are LE */
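
The point of splitting out reiserfs_readdir_dentry() is that code holding only a dentry and no open struct file can now iterate a directory with a private cursor, presumably for the xattr rework elsewhere in this merge (the Makefile change above makes xattr.o unconditional). A hypothetical caller sketch, where 'fill' stands in for any filldir_t callback:

    static int walk_whole_dir(struct dentry *dir, void *buf, filldir_t fill)
    {
            loff_t pos = 0;         /* private cursor replacing file->f_pos */

            return reiserfs_readdir_dentry(dir, buf, fill, &pos);
    }

The old entry point survives as a thin wrapper that simply passes &file->f_pos, so the file_operations behaviour is unchanged.
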
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 2f87f5b14630..4beb964a2a3e 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -29,6 +29,43 @@ struct tree_balance *cur_tb = NULL; /* detects whether more than one
29 is interrupting do_balance */ 29 is interrupting do_balance */
30#endif 30#endif
31 31
32static inline void buffer_info_init_left(struct tree_balance *tb,
33 struct buffer_info *bi)
34{
35 bi->tb = tb;
36 bi->bi_bh = tb->L[0];
37 bi->bi_parent = tb->FL[0];
38 bi->bi_position = get_left_neighbor_position(tb, 0);
39}
40
41static inline void buffer_info_init_right(struct tree_balance *tb,
42 struct buffer_info *bi)
43{
44 bi->tb = tb;
45 bi->bi_bh = tb->R[0];
46 bi->bi_parent = tb->FR[0];
47 bi->bi_position = get_right_neighbor_position(tb, 0);
48}
49
50static inline void buffer_info_init_tbS0(struct tree_balance *tb,
51 struct buffer_info *bi)
52{
53 bi->tb = tb;
54 bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
55 bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
56 bi->bi_position = PATH_H_POSITION(tb->tb_path, 1);
57}
58
59static inline void buffer_info_init_bh(struct tree_balance *tb,
60 struct buffer_info *bi,
61 struct buffer_head *bh)
62{
63 bi->tb = tb;
64 bi->bi_bh = bh;
65 bi->bi_parent = NULL;
66 bi->bi_position = 0;
67}
68
32inline void do_balance_mark_leaf_dirty(struct tree_balance *tb, 69inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
33 struct buffer_head *bh, int flag) 70 struct buffer_head *bh, int flag)
34{ 71{
@@ -39,21 +76,21 @@ inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
39#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty 76#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
40#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty 77#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
41 78
42/* summary: 79/* summary:
43 if deleting something ( tb->insert_size[0] < 0 ) 80 if deleting something ( tb->insert_size[0] < 0 )
44 return(balance_leaf_when_delete()); (flag d handled here) 81 return(balance_leaf_when_delete()); (flag d handled here)
45 else 82 else
46 if lnum is larger than 0 we put items into the left node 83 if lnum is larger than 0 we put items into the left node
47 if rnum is larger than 0 we put items into the right node 84 if rnum is larger than 0 we put items into the right node
48 if snum1 is larger than 0 we put items into the new node s1 85 if snum1 is larger than 0 we put items into the new node s1
49 if snum2 is larger than 0 we put items into the new node s2 86 if snum2 is larger than 0 we put items into the new node s2
50Note that all *num* count new items being created. 87Note that all *num* count new items being created.
51 88
52It would be easier to read balance_leaf() if each of these summary 89It would be easier to read balance_leaf() if each of these summary
53lines was a separate procedure rather than being inlined. I think 90lines was a separate procedure rather than being inlined. I think
54that there are many passages here and in balance_leaf_when_delete() in 91that there are many passages here and in balance_leaf_when_delete() in
55which two calls to one procedure can replace two passages, and it 92which two calls to one procedure can replace two passages, and it
56might save cache space and improve software maintenance costs to do so. 93might save cache space and improve software maintenance costs to do so.
57 94
58Vladimir made the perceptive comment that we should offload most of 95Vladimir made the perceptive comment that we should offload most of
59the decision making in this function into fix_nodes/check_balance, and 96the decision making in this function into fix_nodes/check_balance, and
@@ -86,6 +123,7 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
86 "PAP-12010: tree can not be empty"); 123 "PAP-12010: tree can not be empty");
87 124
88 ih = B_N_PITEM_HEAD(tbS0, item_pos); 125 ih = B_N_PITEM_HEAD(tbS0, item_pos);
126 buffer_info_init_tbS0(tb, &bi);
89 127
90 /* Delete or truncate the item */ 128 /* Delete or truncate the item */
91 129
@@ -96,10 +134,6 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
96 "vs-12013: mode Delete, insert size %d, ih to be deleted %h", 134 "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
97 -tb->insert_size[0], ih); 135 -tb->insert_size[0], ih);
98 136
99 bi.tb = tb;
100 bi.bi_bh = tbS0;
101 bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
102 bi.bi_position = PATH_H_POSITION(tb->tb_path, 1);
103 leaf_delete_items(&bi, 0, item_pos, 1, -1); 137 leaf_delete_items(&bi, 0, item_pos, 1, -1);
104 138
105 if (!item_pos && tb->CFL[0]) { 139 if (!item_pos && tb->CFL[0]) {
@@ -121,10 +155,6 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
121 break; 155 break;
122 156
123 case M_CUT:{ /* cut item in S[0] */ 157 case M_CUT:{ /* cut item in S[0] */
124 bi.tb = tb;
125 bi.bi_bh = tbS0;
126 bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
127 bi.bi_position = PATH_H_POSITION(tb->tb_path, 1);
128 if (is_direntry_le_ih(ih)) { 158 if (is_direntry_le_ih(ih)) {
129 159
130 /* UFS unlink semantics are such that you can only delete one directory entry at a time. */ 160 /* UFS unlink semantics are such that you can only delete one directory entry at a time. */
@@ -153,8 +183,8 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
153 183
154 default: 184 default:
155 print_cur_tb("12040"); 185 print_cur_tb("12040");
156 reiserfs_panic(tb->tb_sb, 186 reiserfs_panic(tb->tb_sb, "PAP-12040",
157 "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)", 187 "unexpected mode: %s(%d)",
158 (flag == 188 (flag ==
159 M_PASTE) ? "PASTE" : ((flag == 189 M_PASTE) ? "PASTE" : ((flag ==
160 M_INSERT) ? "INSERT" : 190 M_INSERT) ? "INSERT" :
@@ -258,15 +288,15 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
258 ) 288 )
259{ 289{
260 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 290 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
261 int item_pos = PATH_LAST_POSITION(tb->tb_path); /* index into the array of item headers in S[0] 291 int item_pos = PATH_LAST_POSITION(tb->tb_path); /* index into the array of item headers in S[0]
262 of the affected item */ 292 of the affected item */
263 struct buffer_info bi; 293 struct buffer_info bi;
264 struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */ 294 struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */
265 int snum[2]; /* number of items that will be placed 295 int snum[2]; /* number of items that will be placed
266 into S_new (includes partially shifted 296 into S_new (includes partially shifted
267 items) */ 297 items) */
268 int sbytes[2]; /* if an item is partially shifted into S_new then 298 int sbytes[2]; /* if an item is partially shifted into S_new then
269 if it is a directory item 299 if it is a directory item
270 it is the number of entries from the item that are shifted into S_new 300 it is the number of entries from the item that are shifted into S_new
271 else 301 else
272 it is the number of bytes from the item that are shifted into S_new 302 it is the number of bytes from the item that are shifted into S_new
@@ -325,11 +355,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
325 ih_item_len(ih)); 355 ih_item_len(ih));
326 356
327 /* Insert new item into L[0] */ 357 /* Insert new item into L[0] */
328 bi.tb = tb; 358 buffer_info_init_left(tb, &bi);
329 bi.bi_bh = tb->L[0];
330 bi.bi_parent = tb->FL[0];
331 bi.bi_position =
332 get_left_neighbor_position(tb, 0);
333 leaf_insert_into_buf(&bi, 359 leaf_insert_into_buf(&bi,
334 n + item_pos - 360 n + item_pos -
335 ret_val, ih, body, 361 ret_val, ih, body,
@@ -369,11 +395,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
369 leaf_shift_left(tb, tb->lnum[0] - 1, 395 leaf_shift_left(tb, tb->lnum[0] - 1,
370 tb->lbytes); 396 tb->lbytes);
371 /* Insert new item into L[0] */ 397 /* Insert new item into L[0] */
372 bi.tb = tb; 398 buffer_info_init_left(tb, &bi);
373 bi.bi_bh = tb->L[0];
374 bi.bi_parent = tb->FL[0];
375 bi.bi_position =
376 get_left_neighbor_position(tb, 0);
377 leaf_insert_into_buf(&bi, 399 leaf_insert_into_buf(&bi,
378 n + item_pos - 400 n + item_pos -
379 ret_val, ih, body, 401 ret_val, ih, body,
@@ -429,13 +451,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
429 } 451 }
430 452
431 /* Append given directory entry to directory item */ 453 /* Append given directory entry to directory item */
432 bi.tb = tb; 454 buffer_info_init_left(tb, &bi);
433 bi.bi_bh = tb->L[0];
434 bi.bi_parent =
435 tb->FL[0];
436 bi.bi_position =
437 get_left_neighbor_position
438 (tb, 0);
439 leaf_paste_in_buffer 455 leaf_paste_in_buffer
440 (&bi, 456 (&bi,
441 n + item_pos - 457 n + item_pos -
@@ -449,8 +465,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
449 /* when we have merge directory item, pos_in_item has been changed too */ 465 /* when we have merge directory item, pos_in_item has been changed too */
450 466
451 /* paste new directory entry. 1 is entry number */ 467 /* paste new directory entry. 1 is entry number */
452 leaf_paste_entries(bi. 468 leaf_paste_entries(&bi,
453 bi_bh,
454 n + 469 n +
455 item_pos 470 item_pos
456 - 471 -
@@ -524,13 +539,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
524 (tbS0, 539 (tbS0,
525 item_pos))); 540 item_pos)));
526 /* Append to body of item in L[0] */ 541 /* Append to body of item in L[0] */
527 bi.tb = tb; 542 buffer_info_init_left(tb, &bi);
528 bi.bi_bh = tb->L[0];
529 bi.bi_parent =
530 tb->FL[0];
531 bi.bi_position =
532 get_left_neighbor_position
533 (tb, 0);
534 leaf_paste_in_buffer 543 leaf_paste_in_buffer
535 (&bi, 544 (&bi,
536 n + item_pos - 545 n + item_pos -
@@ -681,11 +690,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
681 leaf_shift_left(tb, tb->lnum[0], 690 leaf_shift_left(tb, tb->lnum[0],
682 tb->lbytes); 691 tb->lbytes);
683 /* Append to body of item in L[0] */ 692 /* Append to body of item in L[0] */
684 bi.tb = tb; 693 buffer_info_init_left(tb, &bi);
685 bi.bi_bh = tb->L[0];
686 bi.bi_parent = tb->FL[0];
687 bi.bi_position =
688 get_left_neighbor_position(tb, 0);
689 leaf_paste_in_buffer(&bi, 694 leaf_paste_in_buffer(&bi,
690 n + item_pos - 695 n + item_pos -
691 ret_val, 696 ret_val,
@@ -699,7 +704,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
699 n + item_pos - 704 n + item_pos -
700 ret_val); 705 ret_val);
701 if (is_direntry_le_ih(pasted)) 706 if (is_direntry_le_ih(pasted))
702 leaf_paste_entries(bi.bi_bh, 707 leaf_paste_entries(&bi,
703 n + 708 n +
704 item_pos - 709 item_pos -
705 ret_val, 710 ret_val,
@@ -722,8 +727,9 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
722 } 727 }
723 break; 728 break;
724 default: /* cases d and t */ 729 default: /* cases d and t */
725 reiserfs_panic(tb->tb_sb, 730 reiserfs_panic(tb->tb_sb, "PAP-12130",
726 "PAP-12130: balance_leaf: lnum > 0: unexpectable mode: %s(%d)", 731 "lnum > 0: unexpected mode: "
732 " %s(%d)",
727 (flag == 733 (flag ==
728 M_DELETE) ? "DELETE" : ((flag == 734 M_DELETE) ? "DELETE" : ((flag ==
729 M_CUT) 735 M_CUT)
@@ -776,11 +782,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
776 set_le_ih_k_offset(ih, offset); 782 set_le_ih_k_offset(ih, offset);
777 put_ih_item_len(ih, tb->rbytes); 783 put_ih_item_len(ih, tb->rbytes);
778 /* Insert part of the item into R[0] */ 784 /* Insert part of the item into R[0] */
779 bi.tb = tb; 785 buffer_info_init_right(tb, &bi);
780 bi.bi_bh = tb->R[0];
781 bi.bi_parent = tb->FR[0];
782 bi.bi_position =
783 get_right_neighbor_position(tb, 0);
784 if ((old_len - tb->rbytes) > zeros_num) { 786 if ((old_len - tb->rbytes) > zeros_num) {
785 r_zeros_number = 0; 787 r_zeros_number = 0;
786 r_body = 788 r_body =
@@ -817,11 +819,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
817 tb->rnum[0] - 1, 819 tb->rnum[0] - 1,
818 tb->rbytes); 820 tb->rbytes);
819 /* Insert new item into R[0] */ 821 /* Insert new item into R[0] */
820 bi.tb = tb; 822 buffer_info_init_right(tb, &bi);
821 bi.bi_bh = tb->R[0];
822 bi.bi_parent = tb->FR[0];
823 bi.bi_position =
824 get_right_neighbor_position(tb, 0);
825 leaf_insert_into_buf(&bi, 823 leaf_insert_into_buf(&bi,
826 item_pos - n + 824 item_pos - n +
827 tb->rnum[0] - 1, 825 tb->rnum[0] - 1,
@@ -881,21 +879,14 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
881 pos_in_item - 879 pos_in_item -
882 entry_count + 880 entry_count +
883 tb->rbytes - 1; 881 tb->rbytes - 1;
884 bi.tb = tb; 882 buffer_info_init_right(tb, &bi);
885 bi.bi_bh = tb->R[0];
886 bi.bi_parent =
887 tb->FR[0];
888 bi.bi_position =
889 get_right_neighbor_position
890 (tb, 0);
891 leaf_paste_in_buffer 883 leaf_paste_in_buffer
892 (&bi, 0, 884 (&bi, 0,
893 paste_entry_position, 885 paste_entry_position,
894 tb->insert_size[0], 886 tb->insert_size[0],
895 body, zeros_num); 887 body, zeros_num);
896 /* paste entry */ 888 /* paste entry */
897 leaf_paste_entries(bi. 889 leaf_paste_entries(&bi,
898 bi_bh,
899 0, 890 0,
900 paste_entry_position, 891 paste_entry_position,
901 1, 892 1,
@@ -1019,12 +1010,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1019 (tb, tb->CFR[0], 0); 1010 (tb, tb->CFR[0], 0);
1020 1011
1021 /* Append part of body into R[0] */ 1012 /* Append part of body into R[0] */
1022 bi.tb = tb; 1013 buffer_info_init_right(tb, &bi);
1023 bi.bi_bh = tb->R[0];
1024 bi.bi_parent = tb->FR[0];
1025 bi.bi_position =
1026 get_right_neighbor_position
1027 (tb, 0);
1028 if (n_rem > zeros_num) { 1014 if (n_rem > zeros_num) {
1029 r_zeros_number = 0; 1015 r_zeros_number = 0;
1030 r_body = 1016 r_body =
@@ -1071,12 +1057,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1071 tb->rbytes); 1057 tb->rbytes);
1072 /* append item in R[0] */ 1058 /* append item in R[0] */
1073 if (pos_in_item >= 0) { 1059 if (pos_in_item >= 0) {
1074 bi.tb = tb; 1060 buffer_info_init_right(tb, &bi);
1075 bi.bi_bh = tb->R[0];
1076 bi.bi_parent = tb->FR[0];
1077 bi.bi_position =
1078 get_right_neighbor_position
1079 (tb, 0);
1080 leaf_paste_in_buffer(&bi, 1061 leaf_paste_in_buffer(&bi,
1081 item_pos - 1062 item_pos -
1082 n + 1063 n +
@@ -1096,7 +1077,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1096 tb->rnum[0]); 1077 tb->rnum[0]);
1097 if (is_direntry_le_ih(pasted) 1078 if (is_direntry_le_ih(pasted)
1098 && pos_in_item >= 0) { 1079 && pos_in_item >= 0) {
1099 leaf_paste_entries(bi.bi_bh, 1080 leaf_paste_entries(&bi,
1100 item_pos - 1081 item_pos -
1101 n + 1082 n +
1102 tb->rnum[0], 1083 tb->rnum[0],
@@ -1136,8 +1117,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1136 } 1117 }
1137 break; 1118 break;
1138 default: /* cases d and t */ 1119 default: /* cases d and t */
1139 reiserfs_panic(tb->tb_sb, 1120 reiserfs_panic(tb->tb_sb, "PAP-12175",
1140 "PAP-12175: balance_leaf: rnum > 0: unexpectable mode: %s(%d)", 1121 "rnum > 0: unexpected mode: %s(%d)",
1141 (flag == 1122 (flag ==
1142 M_DELETE) ? "DELETE" : ((flag == 1123 M_DELETE) ? "DELETE" : ((flag ==
1143 M_CUT) ? "CUT" 1124 M_CUT) ? "CUT"
@@ -1167,8 +1148,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1167 not set correctly */ 1148 not set correctly */
1168 if (tb->CFL[0]) { 1149 if (tb->CFL[0]) {
1169 if (!tb->CFR[0]) 1150 if (!tb->CFR[0])
1170 reiserfs_panic(tb->tb_sb, 1151 reiserfs_panic(tb->tb_sb, "vs-12195",
1171 "vs-12195: balance_leaf: CFR not initialized"); 1152 "CFR not initialized");
1172 copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), 1153 copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]),
1173 B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])); 1154 B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]));
1174 do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); 1155 do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
@@ -1232,10 +1213,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1232 put_ih_item_len(ih, sbytes[i]); 1213 put_ih_item_len(ih, sbytes[i]);
1233 1214
1234 /* Insert part of the item into S_new[i] before 0-th item */ 1215 /* Insert part of the item into S_new[i] before 0-th item */
1235 bi.tb = tb; 1216 buffer_info_init_bh(tb, &bi, S_new[i]);
1236 bi.bi_bh = S_new[i];
1237 bi.bi_parent = NULL;
1238 bi.bi_position = 0;
1239 1217
1240 if ((old_len - sbytes[i]) > zeros_num) { 1218 if ((old_len - sbytes[i]) > zeros_num) {
1241 r_zeros_number = 0; 1219 r_zeros_number = 0;
@@ -1267,10 +1245,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1267 S_new[i]); 1245 S_new[i]);
1268 1246
1269 /* Insert new item into S_new[i] */ 1247 /* Insert new item into S_new[i] */
1270 bi.tb = tb; 1248 buffer_info_init_bh(tb, &bi, S_new[i]);
1271 bi.bi_bh = S_new[i];
1272 bi.bi_parent = NULL;
1273 bi.bi_position = 0;
1274 leaf_insert_into_buf(&bi, 1249 leaf_insert_into_buf(&bi,
1275 item_pos - n + 1250 item_pos - n +
1276 snum[i] - 1, ih, 1251 snum[i] - 1, ih,
@@ -1327,10 +1302,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1327 sbytes[i] - 1, 1302 sbytes[i] - 1,
1328 S_new[i]); 1303 S_new[i]);
1329 /* Paste given directory entry to directory item */ 1304 /* Paste given directory entry to directory item */
1330 bi.tb = tb; 1305 buffer_info_init_bh(tb, &bi, S_new[i]);
1331 bi.bi_bh = S_new[i];
1332 bi.bi_parent = NULL;
1333 bi.bi_position = 0;
1334 leaf_paste_in_buffer 1306 leaf_paste_in_buffer
1335 (&bi, 0, 1307 (&bi, 0,
1336 pos_in_item - 1308 pos_in_item -
@@ -1339,8 +1311,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1339 tb->insert_size[0], 1311 tb->insert_size[0],
1340 body, zeros_num); 1312 body, zeros_num);
1341 /* paste new directory entry */ 1313 /* paste new directory entry */
1342 leaf_paste_entries(bi. 1314 leaf_paste_entries(&bi,
1343 bi_bh,
1344 0, 1315 0,
1345 pos_in_item 1316 pos_in_item
1346 - 1317 -
@@ -1401,11 +1372,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1401 if (n_rem < 0) 1372 if (n_rem < 0)
1402 n_rem = 0; 1373 n_rem = 0;
1403 /* Append part of body into S_new[0] */ 1374 /* Append part of body into S_new[0] */
1404 bi.tb = tb; 1375 buffer_info_init_bh(tb, &bi, S_new[i]);
1405 bi.bi_bh = S_new[i];
1406 bi.bi_parent = NULL;
1407 bi.bi_position = 0;
1408
1409 if (n_rem > zeros_num) { 1376 if (n_rem > zeros_num) {
1410 r_zeros_number = 0; 1377 r_zeros_number = 0;
1411 r_body = 1378 r_body =
@@ -1475,7 +1442,10 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1475 && (pos_in_item != ih_item_len(ih_check) 1442 && (pos_in_item != ih_item_len(ih_check)
1476 || tb->insert_size[0] <= 0)) 1443 || tb->insert_size[0] <= 0))
1477 reiserfs_panic(tb->tb_sb, 1444 reiserfs_panic(tb->tb_sb,
1478 "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len"); 1445 "PAP-12235",
1446 "pos_in_item "
1447 "must be equal "
1448 "to ih_item_len");
1479#endif /* CONFIG_REISERFS_CHECK */ 1449#endif /* CONFIG_REISERFS_CHECK */
1480 1450
1481 leaf_mi = 1451 leaf_mi =
@@ -1489,10 +1459,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1489 leaf_mi); 1459 leaf_mi);
1490 1460
1491 /* paste into item */ 1461 /* paste into item */
1492 bi.tb = tb; 1462 buffer_info_init_bh(tb, &bi, S_new[i]);
1493 bi.bi_bh = S_new[i];
1494 bi.bi_parent = NULL;
1495 bi.bi_position = 0;
1496 leaf_paste_in_buffer(&bi, 1463 leaf_paste_in_buffer(&bi,
1497 item_pos - n + 1464 item_pos - n +
1498 snum[i], 1465 snum[i],
@@ -1505,7 +1472,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1505 item_pos - n + 1472 item_pos - n +
1506 snum[i]); 1473 snum[i]);
1507 if (is_direntry_le_ih(pasted)) { 1474 if (is_direntry_le_ih(pasted)) {
1508 leaf_paste_entries(bi.bi_bh, 1475 leaf_paste_entries(&bi,
1509 item_pos - 1476 item_pos -
1510 n + snum[i], 1477 n + snum[i],
1511 pos_in_item, 1478 pos_in_item,
@@ -1535,8 +1502,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1535 } 1502 }
1536 break; 1503 break;
1537 default: /* cases d and t */ 1504 default: /* cases d and t */
1538 reiserfs_panic(tb->tb_sb, 1505 reiserfs_panic(tb->tb_sb, "PAP-12245",
1539 "PAP-12245: balance_leaf: blknum > 2: unexpectable mode: %s(%d)", 1506 "blknum > 2: unexpected mode: %s(%d)",
1540 (flag == 1507 (flag ==
1541 M_DELETE) ? "DELETE" : ((flag == 1508 M_DELETE) ? "DELETE" : ((flag ==
1542 M_CUT) ? "CUT" 1509 M_CUT) ? "CUT"
@@ -1559,10 +1526,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1559 1526
1560 switch (flag) { 1527 switch (flag) {
1561 case M_INSERT: /* insert item into S[0] */ 1528 case M_INSERT: /* insert item into S[0] */
1562 bi.tb = tb; 1529 buffer_info_init_tbS0(tb, &bi);
1563 bi.bi_bh = tbS0;
1564 bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
1565 bi.bi_position = PATH_H_POSITION(tb->tb_path, 1);
1566 leaf_insert_into_buf(&bi, item_pos, ih, body, 1530 leaf_insert_into_buf(&bi, item_pos, ih, body,
1567 zeros_num); 1531 zeros_num);
1568 1532
@@ -1589,14 +1553,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1589 "PAP-12260: insert_size is 0 already"); 1553 "PAP-12260: insert_size is 0 already");
1590 1554
1591 /* prepare space */ 1555 /* prepare space */
1592 bi.tb = tb; 1556 buffer_info_init_tbS0(tb, &bi);
1593 bi.bi_bh = tbS0;
1594 bi.bi_parent =
1595 PATH_H_PPARENT(tb->tb_path,
1596 0);
1597 bi.bi_position =
1598 PATH_H_POSITION(tb->tb_path,
1599 1);
1600 leaf_paste_in_buffer(&bi, 1557 leaf_paste_in_buffer(&bi,
1601 item_pos, 1558 item_pos,
1602 pos_in_item, 1559 pos_in_item,
@@ -1606,7 +1563,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1606 zeros_num); 1563 zeros_num);
1607 1564
1608 /* paste entry */ 1565 /* paste entry */
1609 leaf_paste_entries(bi.bi_bh, 1566 leaf_paste_entries(&bi,
1610 item_pos, 1567 item_pos,
1611 pos_in_item, 1568 pos_in_item,
1612 1, 1569 1,
@@ -1644,14 +1601,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1644 RFALSE(tb->insert_size[0] <= 0, 1601 RFALSE(tb->insert_size[0] <= 0,
1645 "PAP-12275: insert size must not be %d", 1602 "PAP-12275: insert size must not be %d",
1646 tb->insert_size[0]); 1603 tb->insert_size[0]);
1647 bi.tb = tb; 1604 buffer_info_init_tbS0(tb, &bi);
1648 bi.bi_bh = tbS0;
1649 bi.bi_parent =
1650 PATH_H_PPARENT(tb->tb_path,
1651 0);
1652 bi.bi_position =
1653 PATH_H_POSITION(tb->tb_path,
1654 1);
1655 leaf_paste_in_buffer(&bi, 1605 leaf_paste_in_buffer(&bi,
1656 item_pos, 1606 item_pos,
1657 pos_in_item, 1607 pos_in_item,
@@ -1681,10 +1631,11 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1681 print_cur_tb("12285"); 1631 print_cur_tb("12285");
1682 reiserfs_panic(tb-> 1632 reiserfs_panic(tb->
1683 tb_sb, 1633 tb_sb,
1684 "PAP-12285: balance_leaf: insert_size must be 0 (%d)", 1634 "PAP-12285",
1685 tb-> 1635 "insert_size "
1686 insert_size 1636 "must be 0 "
1687 [0]); 1637 "(%d)",
1638 tb->insert_size[0]);
1688 } 1639 }
1689 } 1640 }
1690#endif /* CONFIG_REISERFS_CHECK */ 1641#endif /* CONFIG_REISERFS_CHECK */
@@ -1697,11 +1648,10 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1697 if (flag == M_PASTE && tb->insert_size[0]) { 1648 if (flag == M_PASTE && tb->insert_size[0]) {
1698 print_cur_tb("12290"); 1649 print_cur_tb("12290");
1699 reiserfs_panic(tb->tb_sb, 1650 reiserfs_panic(tb->tb_sb,
1700 "PAP-12290: balance_leaf: insert_size is still not 0 (%d)", 1651 "PAP-12290", "insert_size is still not 0 (%d)",
1701 tb->insert_size[0]); 1652 tb->insert_size[0]);
1702 } 1653 }
1703#endif /* CONFIG_REISERFS_CHECK */ 1654#endif /* CONFIG_REISERFS_CHECK */
1704
1705 return 0; 1655 return 0;
1706} /* Leaf level of the tree is balanced (end of balance_leaf) */ 1656} /* Leaf level of the tree is balanced (end of balance_leaf) */
1707 1657
@@ -1724,7 +1674,6 @@ void make_empty_node(struct buffer_info *bi)
1724struct buffer_head *get_FEB(struct tree_balance *tb) 1674struct buffer_head *get_FEB(struct tree_balance *tb)
1725{ 1675{
1726 int i; 1676 int i;
1727 struct buffer_head *first_b;
1728 struct buffer_info bi; 1677 struct buffer_info bi;
1729 1678
1730 for (i = 0; i < MAX_FEB_SIZE; i++) 1679 for (i = 0; i < MAX_FEB_SIZE; i++)
@@ -1732,19 +1681,15 @@ struct buffer_head *get_FEB(struct tree_balance *tb)
1732 break; 1681 break;
1733 1682
1734 if (i == MAX_FEB_SIZE) 1683 if (i == MAX_FEB_SIZE)
1735 reiserfs_panic(tb->tb_sb, 1684 reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty");
1736 "vs-12300: get_FEB: FEB list is empty");
1737 1685
1738 bi.tb = tb; 1686 buffer_info_init_bh(tb, &bi, tb->FEB[i]);
1739 bi.bi_bh = first_b = tb->FEB[i];
1740 bi.bi_parent = NULL;
1741 bi.bi_position = 0;
1742 make_empty_node(&bi); 1687 make_empty_node(&bi);
1743 set_buffer_uptodate(first_b); 1688 set_buffer_uptodate(tb->FEB[i]);
1689 tb->used[i] = tb->FEB[i];
1744 tb->FEB[i] = NULL; 1690 tb->FEB[i] = NULL;
1745 tb->used[i] = first_b;
1746 1691
1747 return (first_b); 1692 return tb->used[i];
1748} 1693}
1749 1694
1750/* This is now used because reiserfs_free_block has to be able to 1695/* This is now used because reiserfs_free_block has to be able to
@@ -1755,15 +1700,16 @@ static void store_thrown(struct tree_balance *tb, struct buffer_head *bh)
1755 int i; 1700 int i;
1756 1701
1757 if (buffer_dirty(bh)) 1702 if (buffer_dirty(bh))
1758 reiserfs_warning(tb->tb_sb, 1703 reiserfs_warning(tb->tb_sb, "reiserfs-12320",
1759 "store_thrown deals with dirty buffer"); 1704 "called with dirty buffer");
1760 for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) 1705 for (i = 0; i < ARRAY_SIZE(tb->thrown); i++)
1761 if (!tb->thrown[i]) { 1706 if (!tb->thrown[i]) {
1762 tb->thrown[i] = bh; 1707 tb->thrown[i] = bh;
1763 get_bh(bh); /* free_thrown puts this */ 1708 get_bh(bh); /* free_thrown puts this */
1764 return; 1709 return;
1765 } 1710 }
1766 reiserfs_warning(tb->tb_sb, "store_thrown: too many thrown buffers"); 1711 reiserfs_warning(tb->tb_sb, "reiserfs-12321",
1712 "too many thrown buffers");
1767} 1713}
1768 1714
1769static void free_thrown(struct tree_balance *tb) 1715static void free_thrown(struct tree_balance *tb)
@@ -1774,8 +1720,8 @@ static void free_thrown(struct tree_balance *tb)
1774 if (tb->thrown[i]) { 1720 if (tb->thrown[i]) {
1775 blocknr = tb->thrown[i]->b_blocknr; 1721 blocknr = tb->thrown[i]->b_blocknr;
1776 if (buffer_dirty(tb->thrown[i])) 1722 if (buffer_dirty(tb->thrown[i]))
1777 reiserfs_warning(tb->tb_sb, 1723 reiserfs_warning(tb->tb_sb, "reiserfs-12322",
1778 "free_thrown deals with dirty buffer %d", 1724 "called with dirty buffer %d",
1779 blocknr); 1725 blocknr);
1780 brelse(tb->thrown[i]); /* incremented in store_thrown */ 1726 brelse(tb->thrown[i]); /* incremented in store_thrown */
1781 reiserfs_free_block(tb->transaction_handle, NULL, 1727 reiserfs_free_block(tb->transaction_handle, NULL,
@@ -1873,20 +1819,19 @@ static void check_internal_node(struct super_block *s, struct buffer_head *bh,
1873 for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) { 1819 for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) {
1874 if (!is_reusable(s, dc_block_number(dc), 1)) { 1820 if (!is_reusable(s, dc_block_number(dc), 1)) {
1875 print_cur_tb(mes); 1821 print_cur_tb(mes);
1876 reiserfs_panic(s, 1822 reiserfs_panic(s, "PAP-12338",
1877 "PAP-12338: check_internal_node: invalid child pointer %y in %b", 1823 "invalid child pointer %y in %b",
1878 dc, bh); 1824 dc, bh);
1879 } 1825 }
1880 } 1826 }
1881} 1827}
1882 1828
1883static int locked_or_not_in_tree(struct buffer_head *bh, char *which) 1829static int locked_or_not_in_tree(struct tree_balance *tb,
1830 struct buffer_head *bh, char *which)
1884{ 1831{
1885 if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) || 1832 if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) ||
1886 !B_IS_IN_TREE(bh)) { 1833 !B_IS_IN_TREE(bh)) {
1887 reiserfs_warning(NULL, 1834 reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh);
1888 "vs-12339: locked_or_not_in_tree: %s (%b)",
1889 which, bh);
1890 return 1; 1835 return 1;
1891 } 1836 }
1892 return 0; 1837 return 0;
@@ -1897,26 +1842,28 @@ static int check_before_balancing(struct tree_balance *tb)
1897 int retval = 0; 1842 int retval = 0;
1898 1843
1899 if (cur_tb) { 1844 if (cur_tb) {
1900 reiserfs_panic(tb->tb_sb, "vs-12335: check_before_balancing: " 1845 reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
1901 "suspect that schedule occurred based on cur_tb not being null at this point in code. " 1846 "occurred based on cur_tb not being null at "
1902 "do_balance cannot properly handle schedule occurring while it runs."); 1847 "this point in code. do_balance cannot properly "
1848 "handle schedule occurring while it runs.");
1903 } 1849 }
1904 1850
1905 /* double check that buffers that we will modify are unlocked. (fix_nodes should already have 1851 /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
1906 prepped all of these for us). */ 1852 prepped all of these for us). */
1907 if (tb->lnum[0]) { 1853 if (tb->lnum[0]) {
1908 retval |= locked_or_not_in_tree(tb->L[0], "L[0]"); 1854 retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]");
1909 retval |= locked_or_not_in_tree(tb->FL[0], "FL[0]"); 1855 retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]");
1910 retval |= locked_or_not_in_tree(tb->CFL[0], "CFL[0]"); 1856 retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]");
1911 check_leaf(tb->L[0]); 1857 check_leaf(tb->L[0]);
1912 } 1858 }
1913 if (tb->rnum[0]) { 1859 if (tb->rnum[0]) {
1914 retval |= locked_or_not_in_tree(tb->R[0], "R[0]"); 1860 retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]");
1915 retval |= locked_or_not_in_tree(tb->FR[0], "FR[0]"); 1861 retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]");
1916 retval |= locked_or_not_in_tree(tb->CFR[0], "CFR[0]"); 1862 retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]");
1917 check_leaf(tb->R[0]); 1863 check_leaf(tb->R[0]);
1918 } 1864 }
1919 retval |= locked_or_not_in_tree(PATH_PLAST_BUFFER(tb->tb_path), "S[0]"); 1865 retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path),
1866 "S[0]");
1920 check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); 1867 check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
1921 1868
1922 return retval; 1869 return retval;
@@ -1930,8 +1877,8 @@ static void check_after_balance_leaf(struct tree_balance *tb)
1930 dc_size(B_N_CHILD 1877 dc_size(B_N_CHILD
1931 (tb->FL[0], get_left_neighbor_position(tb, 0)))) { 1878 (tb->FL[0], get_left_neighbor_position(tb, 0)))) {
1932 print_cur_tb("12221"); 1879 print_cur_tb("12221");
1933 reiserfs_panic(tb->tb_sb, 1880 reiserfs_panic(tb->tb_sb, "PAP-12355",
1934 "PAP-12355: check_after_balance_leaf: shift to left was incorrect"); 1881 "shift to left was incorrect");
1935 } 1882 }
1936 } 1883 }
1937 if (tb->rnum[0]) { 1884 if (tb->rnum[0]) {
@@ -1940,8 +1887,8 @@ static void check_after_balance_leaf(struct tree_balance *tb)
1940 dc_size(B_N_CHILD 1887 dc_size(B_N_CHILD
1941 (tb->FR[0], get_right_neighbor_position(tb, 0)))) { 1888 (tb->FR[0], get_right_neighbor_position(tb, 0)))) {
1942 print_cur_tb("12222"); 1889 print_cur_tb("12222");
1943 reiserfs_panic(tb->tb_sb, 1890 reiserfs_panic(tb->tb_sb, "PAP-12360",
1944 "PAP-12360: check_after_balance_leaf: shift to right was incorrect"); 1891 "shift to right was incorrect");
1945 } 1892 }
1946 } 1893 }
1947 if (PATH_H_PBUFFER(tb->tb_path, 1) && 1894 if (PATH_H_PBUFFER(tb->tb_path, 1) &&
@@ -1955,7 +1902,7 @@ static void check_after_balance_leaf(struct tree_balance *tb)
1955 PATH_H_POSITION(tb->tb_path, 1902 PATH_H_POSITION(tb->tb_path,
1956 1)))); 1903 1))));
1957 print_cur_tb("12223"); 1904 print_cur_tb("12223");
1958 reiserfs_warning(tb->tb_sb, 1905 reiserfs_warning(tb->tb_sb, "reiserfs-12363",
1959 "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; " 1906 "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
1960 "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d", 1907 "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
1961 left, 1908 left,
@@ -1966,8 +1913,7 @@ static void check_after_balance_leaf(struct tree_balance *tb)
1966 (PATH_H_PBUFFER(tb->tb_path, 1), 1913 (PATH_H_PBUFFER(tb->tb_path, 1),
1967 PATH_H_POSITION(tb->tb_path, 1))), 1914 PATH_H_POSITION(tb->tb_path, 1))),
1968 right); 1915 right);
1969 reiserfs_panic(tb->tb_sb, 1916 reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect");
1970 "PAP-12365: check_after_balance_leaf: S is incorrect");
1971 } 1917 }
1972} 1918}
1973 1919
@@ -2037,7 +1983,7 @@ static inline void do_balance_starts(struct tree_balance *tb)
2037 /* store_print_tb (tb); */ 1983 /* store_print_tb (tb); */
2038 1984
2039 /* do not delete, just comment it out */ 1985 /* do not delete, just comment it out */
2040/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, 1986/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb,
2041 "check");*/ 1987 "check");*/
2042 RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); 1988 RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
2043#ifdef CONFIG_REISERFS_CHECK 1989#ifdef CONFIG_REISERFS_CHECK
@@ -2102,14 +2048,13 @@ void do_balance(struct tree_balance *tb, /* tree_balance structure */
2102 tb->need_balance_dirty = 0; 2048 tb->need_balance_dirty = 0;
2103 2049
2104 if (FILESYSTEM_CHANGED_TB(tb)) { 2050 if (FILESYSTEM_CHANGED_TB(tb)) {
2105 reiserfs_panic(tb->tb_sb, 2051 reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has "
2106 "clm-6000: do_balance, fs generation has changed\n"); 2052 "changed");
2107 } 2053 }
2108 /* if we have no real work to do */ 2054 /* if we have no real work to do */
2109 if (!tb->insert_size[0]) { 2055 if (!tb->insert_size[0]) {
2110 reiserfs_warning(tb->tb_sb, 2056 reiserfs_warning(tb->tb_sb, "PAP-12350",
2111 "PAP-12350: do_balance: insert_size == 0, mode == %c", 2057 "insert_size == 0, mode == %c", flag);
2112 flag);
2113 unfix_nodes(tb); 2058 unfix_nodes(tb);
2114 return; 2059 return;
2115 } 2060 }
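
The do_balance.c hunks above all follow one pattern: reiserfs_panic() and reiserfs_warning() call sites stop embedding the message id and function name in the format string ("PAP-12355: check_after_balance_leaf: ...") and instead pass the id ("PAP-12355") as a separate argument after the super_block, as the new call sites show. A minimal userspace sketch of that convention, with the helper name, macro, and output format as illustrative stand-ins rather than the kernel implementation:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static void __demo_panic(const char *id, const char *function,
                         const char *fmt, ...)
{
        va_list args;

        /* The id and the calling function are prepended centrally,
         * so call sites pass only the bare message. */
        fprintf(stderr, "panic (%s) in %s: ", id, function);
        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
        fputc('\n', stderr);
        abort();
}

/* A wrapper macro can supply __func__ automatically. */
#define demo_panic(id, fmt, ...) \
        __demo_panic(id, __func__, fmt, ##__VA_ARGS__)

int main(void)
{
        demo_panic("PAP-12355", "shift to %s was incorrect", "left");
}
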
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 33408417038c..9f436668b7f8 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -20,14 +20,14 @@
20** insertion/balancing, for files that are written in one write. 20** insertion/balancing, for files that are written in one write.
21** It avoids unnecessary tail packings (balances) for files that are written in 21** It avoids unnecessary tail packings (balances) for files that are written in
22** multiple writes and are small enough to have tails. 22** multiple writes and are small enough to have tails.
23** 23**
24** file_release is called by the VFS layer when the file is closed. If 24** file_release is called by the VFS layer when the file is closed. If
25** this is the last open file descriptor, and the file 25** this is the last open file descriptor, and the file
 26** is small enough to have a tail, and the tail is currently in an 26** is small enough to have a tail, and the tail is currently in an
27** unformatted node, the tail is converted back into a direct item. 27** unformatted node, the tail is converted back into a direct item.
28** 28**
29** We use reiserfs_truncate_file to pack the tail, since it already has 29** We use reiserfs_truncate_file to pack the tail, since it already has
30** all the conditions coded. 30** all the conditions coded.
31*/ 31*/
32static int reiserfs_file_release(struct inode *inode, struct file *filp) 32static int reiserfs_file_release(struct inode *inode, struct file *filp)
33{ 33{
@@ -76,7 +76,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
76 * and let the admin know what is going on. 76 * and let the admin know what is going on.
77 */ 77 */
78 igrab(inode); 78 igrab(inode);
79 reiserfs_warning(inode->i_sb, 79 reiserfs_warning(inode->i_sb, "clm-9001",
80 "pinning inode %lu because the " 80 "pinning inode %lu because the "
81 "preallocation can't be freed", 81 "preallocation can't be freed",
82 inode->i_ino); 82 inode->i_ino);
@@ -134,23 +134,23 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
134 * be removed... 134 * be removed...
135 */ 135 */
136 136
137static int reiserfs_sync_file(struct file *p_s_filp, 137static int reiserfs_sync_file(struct file *filp,
138 struct dentry *p_s_dentry, int datasync) 138 struct dentry *dentry, int datasync)
139{ 139{
140 struct inode *p_s_inode = p_s_dentry->d_inode; 140 struct inode *inode = dentry->d_inode;
141 int n_err; 141 int err;
142 int barrier_done; 142 int barrier_done;
143 143
144 BUG_ON(!S_ISREG(p_s_inode->i_mode)); 144 BUG_ON(!S_ISREG(inode->i_mode));
145 n_err = sync_mapping_buffers(p_s_inode->i_mapping); 145 err = sync_mapping_buffers(inode->i_mapping);
146 reiserfs_write_lock(p_s_inode->i_sb); 146 reiserfs_write_lock(inode->i_sb);
147 barrier_done = reiserfs_commit_for_inode(p_s_inode); 147 barrier_done = reiserfs_commit_for_inode(inode);
148 reiserfs_write_unlock(p_s_inode->i_sb); 148 reiserfs_write_unlock(inode->i_sb);
149 if (barrier_done != 1 && reiserfs_barrier_flush(p_s_inode->i_sb)) 149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
150 blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL); 150 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
151 if (barrier_done < 0) 151 if (barrier_done < 0)
152 return barrier_done; 152 return barrier_done;
153 return (n_err < 0) ? -EIO : 0; 153 return (err < 0) ? -EIO : 0;
154} 154}
155 155
156/* taken fs/buffer.c:__block_commit_write */ 156/* taken fs/buffer.c:__block_commit_write */
@@ -223,7 +223,7 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
223} 223}
224 224
225/* Write @count bytes at position @ppos in a file indicated by @file 225/* Write @count bytes at position @ppos in a file indicated by @file
226 from the buffer @buf. 226 from the buffer @buf.
227 227
228 generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want 228 generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
229 something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was 229 something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was
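
Most of the churn in file.c (and in fix_node.c below) is a mechanical removal of reiserfs's Hungarian-style prefixes; the prefixes encoded type information the declarations already carry, so the renames change no behavior. An illustrative mapping inferred from these hunks:

        struct file        *p_s_filp;   /* -> filp  (p_s_: pointer to struct) */
        struct buffer_head *p_s_bh;     /* -> bh                              */
        int                 n_err;      /* -> err   (n_:   numeric)           */
        b_blocknr_t         a_n_blocknrs[MAX_AMOUNT_NEEDED];
                                        /* -> blocknrs (a_: array)            */
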
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 07d05e0842b7..5e5a4e6fbaf8 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -30,8 +30,8 @@
30 ** get_direct_parent 30 ** get_direct_parent
31 ** get_neighbors 31 ** get_neighbors
32 ** fix_nodes 32 ** fix_nodes
33 ** 33 **
34 ** 34 **
35 **/ 35 **/
36 36
37#include <linux/time.h> 37#include <linux/time.h>
@@ -135,8 +135,7 @@ static void create_virtual_node(struct tree_balance *tb, int h)
135 vn->vn_free_ptr += 135 vn->vn_free_ptr +=
136 op_create_vi(vn, vi, is_affected, tb->insert_size[0]); 136 op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
137 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) 137 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
138 reiserfs_panic(tb->tb_sb, 138 reiserfs_panic(tb->tb_sb, "vs-8030",
139 "vs-8030: create_virtual_node: "
140 "virtual node space consumed"); 139 "virtual node space consumed");
141 140
142 if (!is_affected) 141 if (!is_affected)
@@ -186,8 +185,9 @@ static void create_virtual_node(struct tree_balance *tb, int h)
186 && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) { 185 && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) {
187 /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */ 186 /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
188 print_block(Sh, 0, -1, -1); 187 print_block(Sh, 0, -1, -1);
189 reiserfs_panic(tb->tb_sb, 188 reiserfs_panic(tb->tb_sb, "vs-8045",
190 "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c", 189 "rdkey %k, affected item==%d "
190 "(mode==%c) Must be %c",
191 key, vn->vn_affected_item_num, 191 key, vn->vn_affected_item_num,
192 vn->vn_mode, M_DELETE); 192 vn->vn_mode, M_DELETE);
193 } 193 }
@@ -377,9 +377,9 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
377 int needed_nodes; 377 int needed_nodes;
378 int start_item, /* position of item we start filling node from */ 378 int start_item, /* position of item we start filling node from */
379 end_item, /* position of item we finish filling node by */ 379 end_item, /* position of item we finish filling node by */
380 start_bytes, /* number of first bytes (entries for directory) of start_item-th item 380 start_bytes, /* number of first bytes (entries for directory) of start_item-th item
381 we do not include into node that is being filled */ 381 we do not include into node that is being filled */
382 end_bytes; /* number of last bytes (entries for directory) of end_item-th item 382 end_bytes; /* number of last bytes (entries for directory) of end_item-th item
 383 we do not include into node that is being filled */ 383 we do not include into node that is being filled */
384 int split_item_positions[2]; /* these are positions in virtual item of 384 int split_item_positions[2]; /* these are positions in virtual item of
385 items, that are split between S[0] and 385 items, that are split between S[0] and
@@ -496,8 +496,8 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
496 snum012[needed_nodes - 1 + 3] = units; 496 snum012[needed_nodes - 1 + 3] = units;
497 497
498 if (needed_nodes > 2) 498 if (needed_nodes > 2)
499 reiserfs_warning(tb->tb_sb, "vs-8111: get_num_ver: " 499 reiserfs_warning(tb->tb_sb, "vs-8111",
500 "split_item_position is out of boundary"); 500 "split_item_position is out of range");
501 snum012[needed_nodes - 1]++; 501 snum012[needed_nodes - 1]++;
502 split_item_positions[needed_nodes - 1] = i; 502 split_item_positions[needed_nodes - 1] = i;
503 needed_nodes++; 503 needed_nodes++;
@@ -533,8 +533,8 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
533 533
534 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && 534 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
535 vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) 535 vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
536 reiserfs_warning(tb->tb_sb, "vs-8115: get_num_ver: not " 536 reiserfs_warning(tb->tb_sb, "vs-8115",
537 "directory or indirect item"); 537 "not directory or indirect item");
538 } 538 }
539 539
540 /* now we know S2bytes, calculate S1bytes */ 540 /* now we know S2bytes, calculate S1bytes */
@@ -569,7 +569,7 @@ extern struct tree_balance *cur_tb;
569 569
570/* Set parameters for balancing. 570/* Set parameters for balancing.
571 * Performs write of results of analysis of balancing into structure tb, 571 * Performs write of results of analysis of balancing into structure tb,
572 * where it will later be used by the functions that actually do the balancing. 572 * where it will later be used by the functions that actually do the balancing.
573 * Parameters: 573 * Parameters:
574 * tb tree_balance structure; 574 * tb tree_balance structure;
575 * h current level of the node; 575 * h current level of the node;
@@ -749,25 +749,26 @@ else \
749 -1, -1);\ 749 -1, -1);\
750} 750}
751 751
752static void free_buffers_in_tb(struct tree_balance *p_s_tb) 752static void free_buffers_in_tb(struct tree_balance *tb)
753{ 753{
754 int n_counter; 754 int i;
755 755
756 decrement_counters_in_path(p_s_tb->tb_path); 756 pathrelse(tb->tb_path);
757 757
758 for (n_counter = 0; n_counter < MAX_HEIGHT; n_counter++) { 758 for (i = 0; i < MAX_HEIGHT; i++) {
759 decrement_bcount(p_s_tb->L[n_counter]); 759 brelse(tb->L[i]);
760 p_s_tb->L[n_counter] = NULL; 760 brelse(tb->R[i]);
761 decrement_bcount(p_s_tb->R[n_counter]); 761 brelse(tb->FL[i]);
762 p_s_tb->R[n_counter] = NULL; 762 brelse(tb->FR[i]);
763 decrement_bcount(p_s_tb->FL[n_counter]); 763 brelse(tb->CFL[i]);
764 p_s_tb->FL[n_counter] = NULL; 764 brelse(tb->CFR[i]);
765 decrement_bcount(p_s_tb->FR[n_counter]); 765
766 p_s_tb->FR[n_counter] = NULL; 766 tb->L[i] = NULL;
767 decrement_bcount(p_s_tb->CFL[n_counter]); 767 tb->R[i] = NULL;
768 p_s_tb->CFL[n_counter] = NULL; 768 tb->FL[i] = NULL;
769 decrement_bcount(p_s_tb->CFR[n_counter]); 769 tb->FR[i] = NULL;
770 p_s_tb->CFR[n_counter] = NULL; 770 tb->CFL[i] = NULL;
771 tb->CFR[i] = NULL;
771 } 772 }
772} 773}
773 774
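
free_buffers_in_tb() above also swaps the reiserfs-private wrappers decrement_counters_in_path() and decrement_bcount() for the generic pathrelse() and brelse(). The unconditional brelse() calls are safe even for slots that were never filled because brelse() is a no-op on NULL; paraphrasing its definition in include/linux/buffer_head.h of this era:

        static inline void brelse(struct buffer_head *bh)
        {
                if (bh)         /* NULL is tolerated, so call sites need no check */
                        __brelse(bh);
        }
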
@@ -777,14 +778,14 @@ static void free_buffers_in_tb(struct tree_balance *p_s_tb)
777 * NO_DISK_SPACE - no disk space. 778 * NO_DISK_SPACE - no disk space.
778 */ 779 */
779/* The function is NOT SCHEDULE-SAFE! */ 780/* The function is NOT SCHEDULE-SAFE! */
780static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h) 781static int get_empty_nodes(struct tree_balance *tb, int h)
781{ 782{
782 struct buffer_head *p_s_new_bh, 783 struct buffer_head *new_bh,
783 *p_s_Sh = PATH_H_PBUFFER(p_s_tb->tb_path, n_h); 784 *Sh = PATH_H_PBUFFER(tb->tb_path, h);
784 b_blocknr_t *p_n_blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; 785 b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
785 int n_counter, n_number_of_freeblk, n_amount_needed, /* number of needed empty blocks */ 786 int counter, number_of_freeblk, amount_needed, /* number of needed empty blocks */
786 n_retval = CARRY_ON; 787 retval = CARRY_ON;
787 struct super_block *p_s_sb = p_s_tb->tb_sb; 788 struct super_block *sb = tb->tb_sb;
788 789
789 /* number_of_freeblk is the number of empty blocks which have been 790 /* number_of_freeblk is the number of empty blocks which have been
790 acquired for use by the balancing algorithm minus the number of 791 acquired for use by the balancing algorithm minus the number of
@@ -792,7 +793,7 @@ static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h)
792 number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs 793 number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
793 after empty blocks are acquired, and the balancing analysis is 794 after empty blocks are acquired, and the balancing analysis is
794 then restarted, amount_needed is the number needed by this level 795 then restarted, amount_needed is the number needed by this level
795 (n_h) of the balancing analysis. 796 (h) of the balancing analysis.
796 797
797 Note that for systems with many processes writing, it would be 798 Note that for systems with many processes writing, it would be
798 more layout optimal to calculate the total number needed by all 799 more layout optimal to calculate the total number needed by all
@@ -800,54 +801,54 @@ static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h)
800 801
 801 /* Initialize number_of_freeblk to the amount acquired prior to the restart of 802 /* Initialize number_of_freeblk to the amount acquired prior to the restart of
802 the analysis or 0 if not restarted, then subtract the amount needed 803 the analysis or 0 if not restarted, then subtract the amount needed
803 by all of the levels of the tree below n_h. */ 804 by all of the levels of the tree below h. */
804 /* blknum includes S[n_h], so we subtract 1 in this calculation */ 805 /* blknum includes S[h], so we subtract 1 in this calculation */
805 for (n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; 806 for (counter = 0, number_of_freeblk = tb->cur_blknum;
806 n_counter < n_h; n_counter++) 807 counter < h; counter++)
807 n_number_of_freeblk -= 808 number_of_freeblk -=
808 (p_s_tb->blknum[n_counter]) ? (p_s_tb->blknum[n_counter] - 809 (tb->blknum[counter]) ? (tb->blknum[counter] -
809 1) : 0; 810 1) : 0;
810 811
811 /* Allocate missing empty blocks. */ 812 /* Allocate missing empty blocks. */
812 /* if p_s_Sh == 0 then we are getting a new root */ 813 /* if Sh == 0 then we are getting a new root */
813 n_amount_needed = (p_s_Sh) ? (p_s_tb->blknum[n_h] - 1) : 1; 814 amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
814 /* Amount_needed = the amount that we need more than the amount that we have. */ 815 /* Amount_needed = the amount that we need more than the amount that we have. */
815 if (n_amount_needed > n_number_of_freeblk) 816 if (amount_needed > number_of_freeblk)
816 n_amount_needed -= n_number_of_freeblk; 817 amount_needed -= number_of_freeblk;
817 else /* If we have enough already then there is nothing to do. */ 818 else /* If we have enough already then there is nothing to do. */
818 return CARRY_ON; 819 return CARRY_ON;
819 820
820 /* No need to check quota - is not allocated for blocks used for formatted nodes */ 821 /* No need to check quota - is not allocated for blocks used for formatted nodes */
821 if (reiserfs_new_form_blocknrs(p_s_tb, a_n_blocknrs, 822 if (reiserfs_new_form_blocknrs(tb, blocknrs,
822 n_amount_needed) == NO_DISK_SPACE) 823 amount_needed) == NO_DISK_SPACE)
823 return NO_DISK_SPACE; 824 return NO_DISK_SPACE;
824 825
825 /* for each blocknumber we just got, get a buffer and stick it on FEB */ 826 /* for each blocknumber we just got, get a buffer and stick it on FEB */
826 for (p_n_blocknr = a_n_blocknrs, n_counter = 0; 827 for (blocknr = blocknrs, counter = 0;
827 n_counter < n_amount_needed; p_n_blocknr++, n_counter++) { 828 counter < amount_needed; blocknr++, counter++) {
828 829
829 RFALSE(!*p_n_blocknr, 830 RFALSE(!*blocknr,
830 "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); 831 "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
831 832
832 p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr); 833 new_bh = sb_getblk(sb, *blocknr);
833 RFALSE(buffer_dirty(p_s_new_bh) || 834 RFALSE(buffer_dirty(new_bh) ||
834 buffer_journaled(p_s_new_bh) || 835 buffer_journaled(new_bh) ||
835 buffer_journal_dirty(p_s_new_bh), 836 buffer_journal_dirty(new_bh),
836 "PAP-8140: journlaled or dirty buffer %b for the new block", 837 "PAP-8140: journlaled or dirty buffer %b for the new block",
837 p_s_new_bh); 838 new_bh);
838 839
839 /* Put empty buffers into the array. */ 840 /* Put empty buffers into the array. */
840 RFALSE(p_s_tb->FEB[p_s_tb->cur_blknum], 841 RFALSE(tb->FEB[tb->cur_blknum],
841 "PAP-8141: busy slot for new buffer"); 842 "PAP-8141: busy slot for new buffer");
842 843
843 set_buffer_journal_new(p_s_new_bh); 844 set_buffer_journal_new(new_bh);
844 p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh; 845 tb->FEB[tb->cur_blknum++] = new_bh;
845 } 846 }
846 847
847 if (n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB(p_s_tb)) 848 if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
848 n_retval = REPEAT_SEARCH; 849 retval = REPEAT_SEARCH;
849 850
850 return n_retval; 851 return retval;
851} 852}
852 853
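
Note that the preallocation loop in get_empty_nodes() uses sb_getblk() rather than sb_bread(): the block numbers come straight from reiserfs_new_form_blocknrs(), so there is no on-disk content worth reading and attaching a buffer_head is enough. The loop reduced to its shape (same calls as above, with comments added):

        for (blocknr = blocknrs, counter = 0;
             counter < amount_needed; blocknr++, counter++) {
                new_bh = sb_getblk(sb, *blocknr);       /* no read I/O issued */
                set_buffer_journal_new(new_bh);
                tb->FEB[tb->cur_blknum++] = new_bh;     /* park it in the FEB array */
        }
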
853/* Get free space of the left neighbor, which is stored in the parent 854/* Get free space of the left neighbor, which is stored in the parent
@@ -895,35 +896,36 @@ static int get_rfree(struct tree_balance *tb, int h)
895} 896}
896 897
897/* Check whether left neighbor is in memory. */ 898/* Check whether left neighbor is in memory. */
898static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h) 899static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
899{ 900{
900 struct buffer_head *p_s_father, *left; 901 struct buffer_head *father, *left;
901 struct super_block *p_s_sb = p_s_tb->tb_sb; 902 struct super_block *sb = tb->tb_sb;
902 b_blocknr_t n_left_neighbor_blocknr; 903 b_blocknr_t left_neighbor_blocknr;
903 int n_left_neighbor_position; 904 int left_neighbor_position;
904 905
905 if (!p_s_tb->FL[n_h]) /* Father of the left neighbor does not exist. */ 906 /* Father of the left neighbor does not exist. */
907 if (!tb->FL[h])
906 return 0; 908 return 0;
907 909
908 /* Calculate father of the node to be balanced. */ 910 /* Calculate father of the node to be balanced. */
909 p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1); 911 father = PATH_H_PBUFFER(tb->tb_path, h + 1);
910 912
911 RFALSE(!p_s_father || 913 RFALSE(!father ||
912 !B_IS_IN_TREE(p_s_father) || 914 !B_IS_IN_TREE(father) ||
913 !B_IS_IN_TREE(p_s_tb->FL[n_h]) || 915 !B_IS_IN_TREE(tb->FL[h]) ||
914 !buffer_uptodate(p_s_father) || 916 !buffer_uptodate(father) ||
915 !buffer_uptodate(p_s_tb->FL[n_h]), 917 !buffer_uptodate(tb->FL[h]),
916 "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", 918 "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
917 p_s_father, p_s_tb->FL[n_h]); 919 father, tb->FL[h]);
918 920
 919 /* Get position of the pointer to the left neighbor in the left father. */ 921 /* Get position of the pointer to the left neighbor in the left father. */
920 n_left_neighbor_position = (p_s_father == p_s_tb->FL[n_h]) ? 922 left_neighbor_position = (father == tb->FL[h]) ?
921 p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->FL[n_h]); 923 tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
922 /* Get left neighbor block number. */ 924 /* Get left neighbor block number. */
923 n_left_neighbor_blocknr = 925 left_neighbor_blocknr =
924 B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position); 926 B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
925 /* Look for the left neighbor in the cache. */ 927 /* Look for the left neighbor in the cache. */
926 if ((left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr))) { 928 if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
927 929
928 RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left), 930 RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
929 "vs-8170: left neighbor (%b %z) is not in the tree", 931 "vs-8170: left neighbor (%b %z) is not in the tree",
@@ -938,10 +940,10 @@ static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h)
938#define LEFT_PARENTS 'l' 940#define LEFT_PARENTS 'l'
939#define RIGHT_PARENTS 'r' 941#define RIGHT_PARENTS 'r'
940 942
941static void decrement_key(struct cpu_key *p_s_key) 943static void decrement_key(struct cpu_key *key)
942{ 944{
943 // call item specific function for this key 945 // call item specific function for this key
944 item_ops[cpu_key_k_type(p_s_key)]->decrement_key(p_s_key); 946 item_ops[cpu_key_k_type(key)]->decrement_key(key);
945} 947}
946 948
947/* Calculate far left/right parent of the left/right neighbor of the current node, that 949/* Calculate far left/right parent of the left/right neighbor of the current node, that
@@ -952,77 +954,77 @@ static void decrement_key(struct cpu_key *p_s_key)
952 SCHEDULE_OCCURRED - schedule occurred while the function worked; 954 SCHEDULE_OCCURRED - schedule occurred while the function worked;
953 * CARRY_ON - schedule didn't occur while the function worked; 955 * CARRY_ON - schedule didn't occur while the function worked;
954 */ 956 */
955static int get_far_parent(struct tree_balance *p_s_tb, 957static int get_far_parent(struct tree_balance *tb,
956 int n_h, 958 int h,
957 struct buffer_head **pp_s_father, 959 struct buffer_head **pfather,
958 struct buffer_head **pp_s_com_father, char c_lr_par) 960 struct buffer_head **pcom_father, char c_lr_par)
959{ 961{
960 struct buffer_head *p_s_parent; 962 struct buffer_head *parent;
961 INITIALIZE_PATH(s_path_to_neighbor_father); 963 INITIALIZE_PATH(s_path_to_neighbor_father);
962 struct treepath *p_s_path = p_s_tb->tb_path; 964 struct treepath *path = tb->tb_path;
963 struct cpu_key s_lr_father_key; 965 struct cpu_key s_lr_father_key;
964 int n_counter, 966 int counter,
965 n_position = INT_MAX, 967 position = INT_MAX,
966 n_first_last_position = 0, 968 first_last_position = 0,
967 n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h); 969 path_offset = PATH_H_PATH_OFFSET(path, h);
968 970
969 /* Starting from F[n_h] go upwards in the tree, and look for the common 971 /* Starting from F[h] go upwards in the tree, and look for the common
970 ancestor of F[n_h], and its neighbor l/r, that should be obtained. */ 972 ancestor of F[h], and its neighbor l/r, that should be obtained. */
971 973
972 n_counter = n_path_offset; 974 counter = path_offset;
973 975
974 RFALSE(n_counter < FIRST_PATH_ELEMENT_OFFSET, 976 RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
975 "PAP-8180: invalid path length"); 977 "PAP-8180: invalid path length");
976 978
977 for (; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter--) { 979 for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
978 /* Check whether parent of the current buffer in the path is really parent in the tree. */ 980 /* Check whether parent of the current buffer in the path is really parent in the tree. */
979 if (!B_IS_IN_TREE 981 if (!B_IS_IN_TREE
980 (p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1))) 982 (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
981 return REPEAT_SEARCH; 983 return REPEAT_SEARCH;
982 /* Check whether position in the parent is correct. */ 984 /* Check whether position in the parent is correct. */
983 if ((n_position = 985 if ((position =
984 PATH_OFFSET_POSITION(p_s_path, 986 PATH_OFFSET_POSITION(path,
985 n_counter - 1)) > 987 counter - 1)) >
986 B_NR_ITEMS(p_s_parent)) 988 B_NR_ITEMS(parent))
987 return REPEAT_SEARCH; 989 return REPEAT_SEARCH;
988 /* Check whether parent at the path really points to the child. */ 990 /* Check whether parent at the path really points to the child. */
989 if (B_N_CHILD_NUM(p_s_parent, n_position) != 991 if (B_N_CHILD_NUM(parent, position) !=
990 PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr) 992 PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
991 return REPEAT_SEARCH; 993 return REPEAT_SEARCH;
992 /* Return delimiting key if position in the parent is not equal to first/last one. */ 994 /* Return delimiting key if position in the parent is not equal to first/last one. */
993 if (c_lr_par == RIGHT_PARENTS) 995 if (c_lr_par == RIGHT_PARENTS)
994 n_first_last_position = B_NR_ITEMS(p_s_parent); 996 first_last_position = B_NR_ITEMS(parent);
995 if (n_position != n_first_last_position) { 997 if (position != first_last_position) {
996 *pp_s_com_father = p_s_parent; 998 *pcom_father = parent;
997 get_bh(*pp_s_com_father); 999 get_bh(*pcom_father);
998 /*(*pp_s_com_father = p_s_parent)->b_count++; */ 1000 /*(*pcom_father = parent)->b_count++; */
999 break; 1001 break;
1000 } 1002 }
1001 } 1003 }
1002 1004
1003 /* if we are in the root of the tree, then there is no common father */ 1005 /* if we are in the root of the tree, then there is no common father */
1004 if (n_counter == FIRST_PATH_ELEMENT_OFFSET) { 1006 if (counter == FIRST_PATH_ELEMENT_OFFSET) {
1005 /* Check whether first buffer in the path is the root of the tree. */ 1007 /* Check whether first buffer in the path is the root of the tree. */
1006 if (PATH_OFFSET_PBUFFER 1008 if (PATH_OFFSET_PBUFFER
1007 (p_s_tb->tb_path, 1009 (tb->tb_path,
1008 FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == 1010 FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1009 SB_ROOT_BLOCK(p_s_tb->tb_sb)) { 1011 SB_ROOT_BLOCK(tb->tb_sb)) {
1010 *pp_s_father = *pp_s_com_father = NULL; 1012 *pfather = *pcom_father = NULL;
1011 return CARRY_ON; 1013 return CARRY_ON;
1012 } 1014 }
1013 return REPEAT_SEARCH; 1015 return REPEAT_SEARCH;
1014 } 1016 }
1015 1017
1016 RFALSE(B_LEVEL(*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL, 1018 RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
1017 "PAP-8185: (%b %z) level too small", 1019 "PAP-8185: (%b %z) level too small",
1018 *pp_s_com_father, *pp_s_com_father); 1020 *pcom_father, *pcom_father);
1019 1021
1020 /* Check whether the common parent is locked. */ 1022 /* Check whether the common parent is locked. */
1021 1023
1022 if (buffer_locked(*pp_s_com_father)) { 1024 if (buffer_locked(*pcom_father)) {
1023 __wait_on_buffer(*pp_s_com_father); 1025 __wait_on_buffer(*pcom_father);
1024 if (FILESYSTEM_CHANGED_TB(p_s_tb)) { 1026 if (FILESYSTEM_CHANGED_TB(tb)) {
1025 decrement_bcount(*pp_s_com_father); 1027 brelse(*pcom_father);
1026 return REPEAT_SEARCH; 1028 return REPEAT_SEARCH;
1027 } 1029 }
1028 } 1030 }
@@ -1032,128 +1034,131 @@ static int get_far_parent(struct tree_balance *p_s_tb,
1032 1034
1033 /* Form key to get parent of the left/right neighbor. */ 1035 /* Form key to get parent of the left/right neighbor. */
1034 le_key2cpu_key(&s_lr_father_key, 1036 le_key2cpu_key(&s_lr_father_key,
1035 B_N_PDELIM_KEY(*pp_s_com_father, 1037 B_N_PDELIM_KEY(*pcom_father,
1036 (c_lr_par == 1038 (c_lr_par ==
1037 LEFT_PARENTS) ? (p_s_tb->lkey[n_h - 1] = 1039 LEFT_PARENTS) ? (tb->lkey[h - 1] =
1038 n_position - 1040 position -
1039 1) : (p_s_tb->rkey[n_h - 1041 1) : (tb->rkey[h -
1040 1] = 1042 1] =
1041 n_position))); 1043 position)));
1042 1044
1043 if (c_lr_par == LEFT_PARENTS) 1045 if (c_lr_par == LEFT_PARENTS)
1044 decrement_key(&s_lr_father_key); 1046 decrement_key(&s_lr_father_key);
1045 1047
1046 if (search_by_key 1048 if (search_by_key
1047 (p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, 1049 (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
1048 n_h + 1) == IO_ERROR) 1050 h + 1) == IO_ERROR)
1049 // path is released 1051 // path is released
1050 return IO_ERROR; 1052 return IO_ERROR;
1051 1053
1052 if (FILESYSTEM_CHANGED_TB(p_s_tb)) { 1054 if (FILESYSTEM_CHANGED_TB(tb)) {
1053 decrement_counters_in_path(&s_path_to_neighbor_father); 1055 pathrelse(&s_path_to_neighbor_father);
1054 decrement_bcount(*pp_s_com_father); 1056 brelse(*pcom_father);
1055 return REPEAT_SEARCH; 1057 return REPEAT_SEARCH;
1056 } 1058 }
1057 1059
1058 *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); 1060 *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1059 1061
1060 RFALSE(B_LEVEL(*pp_s_father) != n_h + 1, 1062 RFALSE(B_LEVEL(*pfather) != h + 1,
1061 "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father); 1063 "PAP-8190: (%b %z) level too small", *pfather, *pfather);
1062 RFALSE(s_path_to_neighbor_father.path_length < 1064 RFALSE(s_path_to_neighbor_father.path_length <
1063 FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small"); 1065 FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
1064 1066
1065 s_path_to_neighbor_father.path_length--; 1067 s_path_to_neighbor_father.path_length--;
1066 decrement_counters_in_path(&s_path_to_neighbor_father); 1068 pathrelse(&s_path_to_neighbor_father);
1067 return CARRY_ON; 1069 return CARRY_ON;
1068} 1070}
1069 1071
1070/* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of 1072/* Get parents of neighbors of node in the path(S[path_offset]) and common parents of
1071 * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset], 1073 * S[path_offset] and L[path_offset]/R[path_offset]: F[path_offset], FL[path_offset],
1072 * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset]. 1074 * FR[path_offset], CFL[path_offset], CFR[path_offset].
1073 * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset]. 1075 * Calculate numbers of left and right delimiting keys position: lkey[path_offset], rkey[path_offset].
1074 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; 1076 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
1075 * CARRY_ON - schedule didn't occur while the function worked; 1077 * CARRY_ON - schedule didn't occur while the function worked;
1076 */ 1078 */
1077static int get_parents(struct tree_balance *p_s_tb, int n_h) 1079static int get_parents(struct tree_balance *tb, int h)
1078{ 1080{
1079 struct treepath *p_s_path = p_s_tb->tb_path; 1081 struct treepath *path = tb->tb_path;
1080 int n_position, 1082 int position,
1081 n_ret_value, 1083 ret,
1082 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); 1084 path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
1083 struct buffer_head *p_s_curf, *p_s_curcf; 1085 struct buffer_head *curf, *curcf;
1084 1086
1085 /* Current node is the root of the tree or will be root of the tree */ 1087 /* Current node is the root of the tree or will be root of the tree */
1086 if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) { 1088 if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1087 /* The root can not have parents. 1089 /* The root can not have parents.
1088 Release nodes which previously were obtained as parents of the current node neighbors. */ 1090 Release nodes which previously were obtained as parents of the current node neighbors. */
1089 decrement_bcount(p_s_tb->FL[n_h]); 1091 brelse(tb->FL[h]);
1090 decrement_bcount(p_s_tb->CFL[n_h]); 1092 brelse(tb->CFL[h]);
1091 decrement_bcount(p_s_tb->FR[n_h]); 1093 brelse(tb->FR[h]);
1092 decrement_bcount(p_s_tb->CFR[n_h]); 1094 brelse(tb->CFR[h]);
1093 p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = 1095 tb->FL[h] = NULL;
1094 p_s_tb->CFR[n_h] = NULL; 1096 tb->CFL[h] = NULL;
1097 tb->FR[h] = NULL;
1098 tb->CFR[h] = NULL;
1095 return CARRY_ON; 1099 return CARRY_ON;
1096 } 1100 }
1097 1101
1098 /* Get parent FL[n_path_offset] of L[n_path_offset]. */ 1102 /* Get parent FL[path_offset] of L[path_offset]. */
1099 if ((n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1))) { 1103 position = PATH_OFFSET_POSITION(path, path_offset - 1);
1104 if (position) {
1100 /* Current node is not the first child of its parent. */ 1105 /* Current node is not the first child of its parent. */
1101 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */ 1106 curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1102 p_s_curf = p_s_curcf = 1107 curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1103 PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); 1108 get_bh(curf);
1104 get_bh(p_s_curf); 1109 get_bh(curf);
1105 get_bh(p_s_curf); 1110 tb->lkey[h] = position - 1;
1106 p_s_tb->lkey[n_h] = n_position - 1;
1107 } else { 1111 } else {
1108 /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node. 1112 /* Calculate current parent of L[path_offset], which is the left neighbor of the current node.
1109 Calculate current common parent of L[n_path_offset] and the current node. Note that 1113 Calculate current common parent of L[path_offset] and the current node. Note that
1110 CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset]. 1114 CFL[path_offset] not equal FL[path_offset] and CFL[path_offset] not equal F[path_offset].
1111 Calculate lkey[n_path_offset]. */ 1115 Calculate lkey[path_offset]. */
1112 if ((n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, 1116 if ((ret = get_far_parent(tb, h + 1, &curf,
1113 &p_s_curcf, 1117 &curcf,
1114 LEFT_PARENTS)) != CARRY_ON) 1118 LEFT_PARENTS)) != CARRY_ON)
1115 return n_ret_value; 1119 return ret;
1116 } 1120 }
1117 1121
1118 decrement_bcount(p_s_tb->FL[n_h]); 1122 brelse(tb->FL[h]);
1119 p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */ 1123 tb->FL[h] = curf; /* New initialization of FL[h]. */
1120 decrement_bcount(p_s_tb->CFL[n_h]); 1124 brelse(tb->CFL[h]);
1121 p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */ 1125 tb->CFL[h] = curcf; /* New initialization of CFL[h]. */
1122 1126
1123 RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) || 1127 RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1124 (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)), 1128 (curcf && !B_IS_IN_TREE(curcf)),
1125 "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf); 1129 "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
1126 1130
1127/* Get parent FR[n_h] of R[n_h]. */ 1131/* Get parent FR[h] of R[h]. */
1128 1132
1129/* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */ 1133/* Current node is the last child of F[h]. FR[h] != F[h]. */
1130 if (n_position == B_NR_ITEMS(PATH_H_PBUFFER(p_s_path, n_h + 1))) { 1134 if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
1131/* Calculate current parent of R[n_h], which is the right neighbor of F[n_h]. 1135/* Calculate current parent of R[h], which is the right neighbor of F[h].
1132 Calculate current common parent of R[n_h] and current node. Note that CFR[n_h] 1136 Calculate current common parent of R[h] and current node. Note that CFR[h]
1133 not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */ 1137 not equal FR[path_offset] and CFR[h] not equal F[h]. */
1134 if ((n_ret_value = 1138 if ((ret =
1135 get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, 1139 get_far_parent(tb, h + 1, &curf, &curcf,
1136 RIGHT_PARENTS)) != CARRY_ON) 1140 RIGHT_PARENTS)) != CARRY_ON)
1137 return n_ret_value; 1141 return ret;
1138 } else { 1142 } else {
1139/* Current node is not the last child of its parent F[n_h]. */ 1143/* Current node is not the last child of its parent F[h]. */
1140 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */ 1144 curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1141 p_s_curf = p_s_curcf = 1145 curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1142 PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); 1146 get_bh(curf);
1143 get_bh(p_s_curf); 1147 get_bh(curf);
1144 get_bh(p_s_curf); 1148 tb->rkey[h] = position;
1145 p_s_tb->rkey[n_h] = n_position;
1146 } 1149 }
1147 1150
1148 decrement_bcount(p_s_tb->FR[n_h]); 1151 brelse(tb->FR[h]);
1149 p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */ 1152 /* New initialization of FR[path_offset]. */
1153 tb->FR[h] = curf;
1150 1154
1151 decrement_bcount(p_s_tb->CFR[n_h]); 1155 brelse(tb->CFR[h]);
1152 p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */ 1156 /* New initialization of CFR[path_offset]. */
1157 tb->CFR[h] = curcf;
1153 1158
1154 RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) || 1159 RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1155 (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)), 1160 (curcf && !B_IS_IN_TREE(curcf)),
1156 "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf); 1161 "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
1157 1162
1158 return CARRY_ON; 1163 return CARRY_ON;
1159} 1164}
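
A detail worth noting in get_parents() above: when the current node is not an edge child of its parent, one buffer serves as both the father and the common father, so the code deliberately takes two references on the same buffer_head:

        curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
        curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
        get_bh(curf);   /* reference owned by tb->FL[h] (or tb->FR[h]) */
        get_bh(curf);   /* reference owned by tb->CFL[h] (or tb->CFR[h]) */

Each reference is later dropped by its own brelse(), either on the next pass through get_parents() or in free_buffers_in_tb().
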
@@ -1203,7 +1208,7 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
1203 * h current level of the node; 1208 * h current level of the node;
1204 * inum item number in S[h]; 1209 * inum item number in S[h];
1205 * mode i - insert, p - paste; 1210 * mode i - insert, p - paste;
1206 * Returns: 1 - schedule occurred; 1211 * Returns: 1 - schedule occurred;
1207 * 0 - balancing for higher levels needed; 1212 * 0 - balancing for higher levels needed;
1208 * -1 - no balancing for higher levels needed; 1213 * -1 - no balancing for higher levels needed;
1209 * -2 - no disk space. 1214 * -2 - no disk space.
@@ -1217,7 +1222,7 @@ static int ip_check_balance(struct tree_balance *tb, int h)
1217 contains node being balanced. The mnemonic is 1222 contains node being balanced. The mnemonic is
1218 that the attempted change in node space used level 1223 that the attempted change in node space used level
1219 is levbytes bytes. */ 1224 is levbytes bytes. */
1220 n_ret_value; 1225 ret;
1221 1226
1222 int lfree, sfree, rfree /* free space in L, S and R */ ; 1227 int lfree, sfree, rfree /* free space in L, S and R */ ;
1223 1228
@@ -1238,7 +1243,7 @@ static int ip_check_balance(struct tree_balance *tb, int h)
1238 /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters. 1243 /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters.
1239 where 4th parameter is s1bytes and 5th - s2bytes 1244 where 4th parameter is s1bytes and 5th - s2bytes
1240 */ 1245 */
1241 short snum012[40] = { 0, }; /* s0num, s1num, s2num for 8 cases 1246 short snum012[40] = { 0, }; /* s0num, s1num, s2num for 8 cases
1242 0,1 - do not shift and do not shift but bottle 1247 0,1 - do not shift and do not shift but bottle
1243 2 - shift only whole item to left 1248 2 - shift only whole item to left
1244 3 - shift to left and bottle as much as possible 1249 3 - shift to left and bottle as much as possible
@@ -1255,24 +1260,24 @@ static int ip_check_balance(struct tree_balance *tb, int h)
1255 /* Calculate balance parameters for creating new root. */ 1260 /* Calculate balance parameters for creating new root. */
1256 if (!Sh) { 1261 if (!Sh) {
1257 if (!h) 1262 if (!h)
1258 reiserfs_panic(tb->tb_sb, 1263 reiserfs_panic(tb->tb_sb, "vs-8210",
1259 "vs-8210: ip_check_balance: S[0] can not be 0"); 1264 "S[0] can not be 0");
1260 switch (n_ret_value = get_empty_nodes(tb, h)) { 1265 switch (ret = get_empty_nodes(tb, h)) {
1261 case CARRY_ON: 1266 case CARRY_ON:
1262 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); 1267 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1263 return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ 1268 return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
1264 1269
1265 case NO_DISK_SPACE: 1270 case NO_DISK_SPACE:
1266 case REPEAT_SEARCH: 1271 case REPEAT_SEARCH:
1267 return n_ret_value; 1272 return ret;
1268 default: 1273 default:
1269 reiserfs_panic(tb->tb_sb, 1274 reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
1270 "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes"); 1275 "return value of get_empty_nodes");
1271 } 1276 }
1272 } 1277 }
1273 1278
1274 if ((n_ret_value = get_parents(tb, h)) != CARRY_ON) /* get parents of S[h] neighbors. */ 1279 if ((ret = get_parents(tb, h)) != CARRY_ON) /* get parents of S[h] neighbors. */
1275 return n_ret_value; 1280 return ret;
1276 1281
1277 sfree = B_FREE_SPACE(Sh); 1282 sfree = B_FREE_SPACE(Sh);
1278 1283
@@ -1287,7 +1292,7 @@ static int ip_check_balance(struct tree_balance *tb, int h)
1287 1292
1288 create_virtual_node(tb, h); 1293 create_virtual_node(tb, h);
1289 1294
1290 /* 1295 /*
1291 determine maximal number of items we can shift to the left neighbor (in tb structure) 1296 determine maximal number of items we can shift to the left neighbor (in tb structure)
1292 and the maximal number of bytes that can flow to the left neighbor 1297 and the maximal number of bytes that can flow to the left neighbor
1293 from the left most liquid item that cannot be shifted from S[0] entirely (returned value) 1298 from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
@@ -1348,13 +1353,13 @@ static int ip_check_balance(struct tree_balance *tb, int h)
1348 1353
1349 { 1354 {
1350 int lpar, rpar, nset, lset, rset, lrset; 1355 int lpar, rpar, nset, lset, rset, lrset;
1351 /* 1356 /*
1352 * regular overflowing of the node 1357 * regular overflowing of the node
1353 */ 1358 */
1354 1359
1355 /* get_num_ver works in 2 modes (FLOW & NO_FLOW) 1360 /* get_num_ver works in 2 modes (FLOW & NO_FLOW)
1356 lpar, rpar - number of items we can shift to left/right neighbor (including splitting item) 1361 lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
1357 nset, lset, rset, lrset - shows, whether flowing items give better packing 1362 nset, lset, rset, lrset - shows, whether flowing items give better packing
1358 */ 1363 */
1359#define FLOW 1 1364#define FLOW 1
1360#define NO_FLOW 0 /* do not do any splitting */ 1365#define NO_FLOW 0 /* do not do any splitting */
@@ -1544,7 +1549,7 @@ static int ip_check_balance(struct tree_balance *tb, int h)
1544 * h current level of the node; 1549 * h current level of the node;
1545 * inum item number in S[h]; 1550 * inum item number in S[h];
1546 * mode i - insert, p - paste; 1551 * mode i - insert, p - paste;
1547 * Returns: 1 - schedule occurred; 1552 * Returns: 1 - schedule occurred;
1548 * 0 - balancing for higher levels needed; 1553 * 0 - balancing for higher levels needed;
1549 * -1 - no balancing for higher levels needed; 1554 * -1 - no balancing for higher levels needed;
1550 * -2 - no disk space. 1555 * -2 - no disk space.
@@ -1559,7 +1564,7 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
1559 /* Sh is the node whose balance is currently being checked, 1564 /* Sh is the node whose balance is currently being checked,
1560 and Fh is its father. */ 1565 and Fh is its father. */
1561 struct buffer_head *Sh, *Fh; 1566 struct buffer_head *Sh, *Fh;
1562 int maxsize, n_ret_value; 1567 int maxsize, ret;
1563 int lfree, rfree /* free space in L and R */ ; 1568 int lfree, rfree /* free space in L and R */ ;
1564 1569
1565 Sh = PATH_H_PBUFFER(tb->tb_path, h); 1570 Sh = PATH_H_PBUFFER(tb->tb_path, h);
@@ -1584,8 +1589,8 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
1584 return CARRY_ON; 1589 return CARRY_ON;
1585 } 1590 }
1586 1591
1587 if ((n_ret_value = get_parents(tb, h)) != CARRY_ON) 1592 if ((ret = get_parents(tb, h)) != CARRY_ON)
1588 return n_ret_value; 1593 return ret;
1589 1594
1590 /* get free space of neighbors */ 1595 /* get free space of neighbors */
1591 rfree = get_rfree(tb, h); 1596 rfree = get_rfree(tb, h);
@@ -1727,7 +1732,7 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
1727 * h current level of the node; 1732 * h current level of the node;
1728 * inum item number in S[h]; 1733 * inum item number in S[h];
1729 * mode i - insert, p - paste; 1734 * mode i - insert, p - paste;
1730 * Returns: 1 - schedule occurred; 1735 * Returns: 1 - schedule occurred;
1731 * 0 - balancing for higher levels needed; 1736 * 0 - balancing for higher levels needed;
1732 * -1 - no balancing for higher levels needed; 1737 * -1 - no balancing for higher levels needed;
1733 * -2 - no disk space. 1738 * -2 - no disk space.
@@ -1742,7 +1747,7 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
1742 attempted change in node space used level is levbytes bytes. */ 1747 attempted change in node space used level is levbytes bytes. */
1743 int levbytes; 1748 int levbytes;
1744 /* the maximal item size */ 1749 /* the maximal item size */
1745 int maxsize, n_ret_value; 1750 int maxsize, ret;
1746 /* S0 is the node whose balance is currently being checked, 1751 /* S0 is the node whose balance is currently being checked,
1747 and F0 is its father. */ 1752 and F0 is its father. */
1748 struct buffer_head *S0, *F0; 1753 struct buffer_head *S0, *F0;
@@ -1764,8 +1769,8 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
1764 return NO_BALANCING_NEEDED; 1769 return NO_BALANCING_NEEDED;
1765 } 1770 }
1766 1771
1767 if ((n_ret_value = get_parents(tb, h)) != CARRY_ON) 1772 if ((ret = get_parents(tb, h)) != CARRY_ON)
1768 return n_ret_value; 1773 return ret;
1769 1774
1770 /* get free space of neighbors */ 1775 /* get free space of neighbors */
1771 rfree = get_rfree(tb, h); 1776 rfree = get_rfree(tb, h);
@@ -1821,7 +1826,7 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
1821 * h current level of the node; 1826 * h current level of the node;
1822 * inum item number in S[h]; 1827 * inum item number in S[h];
1823 * mode d - delete, c - cut. 1828 * mode d - delete, c - cut.
1824 * Returns: 1 - schedule occurred; 1829 * Returns: 1 - schedule occurred;
1825 * 0 - balancing for higher levels needed; 1830 * 0 - balancing for higher levels needed;
1826 * -1 - no balancing for higher levels needed; 1831 * -1 - no balancing for higher levels needed;
1827 * -2 - no disk space. 1832 * -2 - no disk space.
@@ -1850,7 +1855,7 @@ static int dc_check_balance(struct tree_balance *tb, int h)
1850 * h current level of the node; 1855 * h current level of the node;
1851 * inum item number in S[h]; 1856 * inum item number in S[h];
1852 * mode i - insert, p - paste, d - delete, c - cut. 1857 * mode i - insert, p - paste, d - delete, c - cut.
1853 * Returns: 1 - schedule occurred; 1858 * Returns: 1 - schedule occurred;
1854 * 0 - balancing for higher levels needed; 1859 * 0 - balancing for higher levels needed;
1855 * -1 - no balancing for higher levels needed; 1860 * -1 - no balancing for higher levels needed;
1856 * -2 - no disk space. 1861 * -2 - no disk space.
@@ -1884,137 +1889,138 @@ static int check_balance(int mode,
1884} 1889}
1885 1890
1886/* Check whether parent at the path is really the parent of the current node.*/ 1891/* Check whether parent at the path is really the parent of the current node.*/
1887static int get_direct_parent(struct tree_balance *p_s_tb, int n_h) 1892static int get_direct_parent(struct tree_balance *tb, int h)
1888{ 1893{
1889 struct buffer_head *p_s_bh; 1894 struct buffer_head *bh;
1890 struct treepath *p_s_path = p_s_tb->tb_path; 1895 struct treepath *path = tb->tb_path;
1891 int n_position, 1896 int position,
1892 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); 1897 path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
1893 1898
1894 /* We are in the root or in the new root. */ 1899 /* We are in the root or in the new root. */
1895 if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) { 1900 if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1896 1901
1897 RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1, 1902 RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
1898 "PAP-8260: invalid offset in the path"); 1903 "PAP-8260: invalid offset in the path");
1899 1904
1900 if (PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)-> 1905 if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
1901 b_blocknr == SB_ROOT_BLOCK(p_s_tb->tb_sb)) { 1906 b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
1902 /* Root is not changed. */ 1907 /* Root is not changed. */
1903 PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL; 1908 PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
1904 PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0; 1909 PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
1905 return CARRY_ON; 1910 return CARRY_ON;
1906 } 1911 }
1907 return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */ 1912 return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */
1908 } 1913 }
1909 1914
1910 if (!B_IS_IN_TREE 1915 if (!B_IS_IN_TREE
1911 (p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))) 1916 (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
1912 return REPEAT_SEARCH; /* Parent in the path is not in the tree. */ 1917 return REPEAT_SEARCH; /* Parent in the path is not in the tree. */
1913 1918
1914 if ((n_position = 1919 if ((position =
1915 PATH_OFFSET_POSITION(p_s_path, 1920 PATH_OFFSET_POSITION(path,
1916 n_path_offset - 1)) > B_NR_ITEMS(p_s_bh)) 1921 path_offset - 1)) > B_NR_ITEMS(bh))
1917 return REPEAT_SEARCH; 1922 return REPEAT_SEARCH;
1918 1923
1919 if (B_N_CHILD_NUM(p_s_bh, n_position) != 1924 if (B_N_CHILD_NUM(bh, position) !=
1920 PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr) 1925 PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
1921 /* Parent in the path is not parent of the current node in the tree. */ 1926 /* Parent in the path is not parent of the current node in the tree. */
1922 return REPEAT_SEARCH; 1927 return REPEAT_SEARCH;
1923 1928
1924 if (buffer_locked(p_s_bh)) { 1929 if (buffer_locked(bh)) {
1925 __wait_on_buffer(p_s_bh); 1930 __wait_on_buffer(bh);
1926 if (FILESYSTEM_CHANGED_TB(p_s_tb)) 1931 if (FILESYSTEM_CHANGED_TB(tb))
1927 return REPEAT_SEARCH; 1932 return REPEAT_SEARCH;
1928 } 1933 }
1929 1934
1930 return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */ 1935 return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */
1931} 1936}
1932 1937
1933/* Using lnum[n_h] and rnum[n_h] we should determine what neighbors 1938/* Using lnum[h] and rnum[h] we should determine what neighbors
1934 * of S[n_h] we 1939 * of S[h] we
1935 * need in order to balance S[n_h], and get them if necessary. 1940 * need in order to balance S[h], and get them if necessary.
1936 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; 1941 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
1937 * CARRY_ON - schedule didn't occur while the function worked; 1942 * CARRY_ON - schedule didn't occur while the function worked;
1938 */ 1943 */
1939static int get_neighbors(struct tree_balance *p_s_tb, int n_h) 1944static int get_neighbors(struct tree_balance *tb, int h)
1940{ 1945{
1941 int n_child_position, 1946 int child_position,
1942 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1); 1947 path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
1943 unsigned long n_son_number; 1948 unsigned long son_number;
1944 struct super_block *p_s_sb = p_s_tb->tb_sb; 1949 struct super_block *sb = tb->tb_sb;
1945 struct buffer_head *p_s_bh; 1950 struct buffer_head *bh;
1946 1951
1947 PROC_INFO_INC(p_s_sb, get_neighbors[n_h]); 1952 PROC_INFO_INC(sb, get_neighbors[h]);
1948 1953
1949 if (p_s_tb->lnum[n_h]) { 1954 if (tb->lnum[h]) {
1950 /* We need left neighbor to balance S[n_h]. */ 1955 /* We need left neighbor to balance S[h]. */
1951 PROC_INFO_INC(p_s_sb, need_l_neighbor[n_h]); 1956 PROC_INFO_INC(sb, need_l_neighbor[h]);
1952 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); 1957 bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
1953 1958
1954 RFALSE(p_s_bh == p_s_tb->FL[n_h] && 1959 RFALSE(bh == tb->FL[h] &&
1955 !PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset), 1960 !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
1956 "PAP-8270: invalid position in the parent"); 1961 "PAP-8270: invalid position in the parent");
1957 1962
1958 n_child_position = 1963 child_position =
1959 (p_s_bh == 1964 (bh ==
1960 p_s_tb->FL[n_h]) ? p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb-> 1965 tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
1961 FL[n_h]); 1966 FL[h]);
1962 n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position); 1967 son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
1963 p_s_bh = sb_bread(p_s_sb, n_son_number); 1968 bh = sb_bread(sb, son_number);
1964 if (!p_s_bh) 1969 if (!bh)
1965 return IO_ERROR; 1970 return IO_ERROR;
1966 if (FILESYSTEM_CHANGED_TB(p_s_tb)) { 1971 if (FILESYSTEM_CHANGED_TB(tb)) {
1967 decrement_bcount(p_s_bh); 1972 brelse(bh);
1968 PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]); 1973 PROC_INFO_INC(sb, get_neighbors_restart[h]);
1969 return REPEAT_SEARCH; 1974 return REPEAT_SEARCH;
1970 } 1975 }
1971 1976
1972 RFALSE(!B_IS_IN_TREE(p_s_tb->FL[n_h]) || 1977 RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
1973 n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) || 1978 child_position > B_NR_ITEMS(tb->FL[h]) ||
1974 B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) != 1979 B_N_CHILD_NUM(tb->FL[h], child_position) !=
1975 p_s_bh->b_blocknr, "PAP-8275: invalid parent"); 1980 bh->b_blocknr, "PAP-8275: invalid parent");
1976 RFALSE(!B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child"); 1981 RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
1977 RFALSE(!n_h && 1982 RFALSE(!h &&
1978 B_FREE_SPACE(p_s_bh) != 1983 B_FREE_SPACE(bh) !=
1979 MAX_CHILD_SIZE(p_s_bh) - 1984 MAX_CHILD_SIZE(bh) -
1980 dc_size(B_N_CHILD(p_s_tb->FL[0], n_child_position)), 1985 dc_size(B_N_CHILD(tb->FL[0], child_position)),
1981 "PAP-8290: invalid child size of left neighbor"); 1986 "PAP-8290: invalid child size of left neighbor");
1982 1987
1983 decrement_bcount(p_s_tb->L[n_h]); 1988 brelse(tb->L[h]);
1984 p_s_tb->L[n_h] = p_s_bh; 1989 tb->L[h] = bh;
1985 } 1990 }
1986 1991
1987 if (p_s_tb->rnum[n_h]) { /* We need right neighbor to balance S[n_path_offset]. */ 1992 /* We need right neighbor to balance S[path_offset]. */
1988 PROC_INFO_INC(p_s_sb, need_r_neighbor[n_h]); 1993 if (tb->rnum[h]) { /* We need right neighbor to balance S[path_offset]. */
1989 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); 1994 PROC_INFO_INC(sb, need_r_neighbor[h]);
1995 bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
1990 1996
1991 RFALSE(p_s_bh == p_s_tb->FR[n_h] && 1997 RFALSE(bh == tb->FR[h] &&
1992 PATH_OFFSET_POSITION(p_s_tb->tb_path, 1998 PATH_OFFSET_POSITION(tb->tb_path,
1993 n_path_offset) >= 1999 path_offset) >=
1994 B_NR_ITEMS(p_s_bh), 2000 B_NR_ITEMS(bh),
1995 "PAP-8295: invalid position in the parent"); 2001 "PAP-8295: invalid position in the parent");
1996 2002
1997 n_child_position = 2003 child_position =
1998 (p_s_bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0; 2004 (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
1999 n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position); 2005 son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
2000 p_s_bh = sb_bread(p_s_sb, n_son_number); 2006 bh = sb_bread(sb, son_number);
2001 if (!p_s_bh) 2007 if (!bh)
2002 return IO_ERROR; 2008 return IO_ERROR;
2003 if (FILESYSTEM_CHANGED_TB(p_s_tb)) { 2009 if (FILESYSTEM_CHANGED_TB(tb)) {
2004 decrement_bcount(p_s_bh); 2010 brelse(bh);
2005 PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]); 2011 PROC_INFO_INC(sb, get_neighbors_restart[h]);
2006 return REPEAT_SEARCH; 2012 return REPEAT_SEARCH;
2007 } 2013 }
2008 decrement_bcount(p_s_tb->R[n_h]); 2014 brelse(tb->R[h]);
2009 p_s_tb->R[n_h] = p_s_bh; 2015 tb->R[h] = bh;
2010 2016
2011 RFALSE(!n_h 2017 RFALSE(!h
2012 && B_FREE_SPACE(p_s_bh) != 2018 && B_FREE_SPACE(bh) !=
2013 MAX_CHILD_SIZE(p_s_bh) - 2019 MAX_CHILD_SIZE(bh) -
2014 dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)), 2020 dc_size(B_N_CHILD(tb->FR[0], child_position)),
2015 "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", 2021 "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
2016 B_FREE_SPACE(p_s_bh), MAX_CHILD_SIZE(p_s_bh), 2022 B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
2017 dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position))); 2023 dc_size(B_N_CHILD(tb->FR[0], child_position)));
2018 2024
2019 } 2025 }
2020 return CARRY_ON; 2026 return CARRY_ON;
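
get_neighbors() also shows the schedule-safety idiom used throughout fix_node.c (whose comments flag these paths as NOT SCHEDULE-SAFE): every call that can sleep is followed by a FILESYSTEM_CHANGED_TB() generation check, and the whole balancing analysis is restarted if the tree changed underneath. Reduced to its shape:

        bh = sb_bread(sb, son_number);          /* may sleep on I/O */
        if (!bh)
                return IO_ERROR;
        if (FILESYSTEM_CHANGED_TB(tb)) {        /* tree changed while we slept */
                brelse(bh);
                return REPEAT_SEARCH;           /* caller redoes the search */
        }
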
@@ -2088,52 +2094,46 @@ static int get_mem_for_virtual_node(struct tree_balance *tb)
 }
 
 #ifdef CONFIG_REISERFS_CHECK
-static void tb_buffer_sanity_check(struct super_block *p_s_sb,
-				   struct buffer_head *p_s_bh,
+static void tb_buffer_sanity_check(struct super_block *sb,
+				   struct buffer_head *bh,
 				   const char *descr, int level)
 {
-	if (p_s_bh) {
-		if (atomic_read(&(p_s_bh->b_count)) <= 0) {
-
-			reiserfs_panic(p_s_sb,
-				       "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
-
-		if (!buffer_uptodate(p_s_bh)) {
-			reiserfs_panic(p_s_sb,
-				       "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
-
-		if (!B_IS_IN_TREE(p_s_bh)) {
-			reiserfs_panic(p_s_sb,
-				       "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
-
-		if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
-			reiserfs_panic(p_s_sb,
-				       "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
-
-		if (p_s_bh->b_size != p_s_sb->s_blocksize) {
-			reiserfs_panic(p_s_sb,
-				       "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
-
-		if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
-			reiserfs_panic(p_s_sb,
-				       "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n",
-				       descr, level, p_s_bh);
-		}
+	if (bh) {
+		if (atomic_read(&(bh->b_count)) <= 0)
+			reiserfs_panic(sb, "jmacd-1", "negative or zero "
+				       "reference counter for buffer %s[%d] "
+				       "(%b)", descr, level, bh);
+
+		if (!buffer_uptodate(bh))
+			reiserfs_panic(sb, "jmacd-2", "buffer is not up "
+				       "to date %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (!B_IS_IN_TREE(bh))
+			reiserfs_panic(sb, "jmacd-3", "buffer is not "
+				       "in tree %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (bh->b_bdev != sb->s_bdev)
+			reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
+				       "device %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (bh->b_size != sb->s_blocksize)
+			reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
+				       "blocksize %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
+			reiserfs_panic(sb, "jmacd-6", "buffer block "
+				       "number too high %s[%d] (%b)",
+				       descr, level, bh);
 	}
 }
 #else
-static void tb_buffer_sanity_check(struct super_block *p_s_sb,
-				   struct buffer_head *p_s_bh,
+static void tb_buffer_sanity_check(struct super_block *sb,
+				   struct buffer_head *bh,
 				   const char *descr, int level)
 {;
 }
@@ -2144,7 +2144,7 @@ static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
 	return reiserfs_prepare_for_journal(s, bh, 0);
 }
 
-static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
+static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
 {
 	struct buffer_head *locked;
 #ifdef CONFIG_REISERFS_CHECK
@@ -2156,95 +2156,94 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
 
 	locked = NULL;
 
-	for (i = p_s_tb->tb_path->path_length;
+	for (i = tb->tb_path->path_length;
 	     !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
-		if (PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
+		if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
 			/* if I understand correctly, we can only be sure the last buffer
 			** in the path is in the tree --clm
 			*/
 #ifdef CONFIG_REISERFS_CHECK
-			if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
-			    PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
-				tb_buffer_sanity_check(p_s_tb->tb_sb,
+			if (PATH_PLAST_BUFFER(tb->tb_path) ==
+			    PATH_OFFSET_PBUFFER(tb->tb_path, i))
+				tb_buffer_sanity_check(tb->tb_sb,
 						       PATH_OFFSET_PBUFFER
-						       (p_s_tb->tb_path,
+						       (tb->tb_path,
 							i), "S",
-						       p_s_tb->tb_path->
+						       tb->tb_path->
 						       path_length - i);
-			}
 #endif
-			if (!clear_all_dirty_bits(p_s_tb->tb_sb,
+			if (!clear_all_dirty_bits(tb->tb_sb,
 						  PATH_OFFSET_PBUFFER
-						  (p_s_tb->tb_path,
+						  (tb->tb_path,
 						   i))) {
 				locked =
-				    PATH_OFFSET_PBUFFER(p_s_tb->tb_path,
+				    PATH_OFFSET_PBUFFER(tb->tb_path,
 							i);
 			}
 		}
 	}
 
-	for (i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i];
+	for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
 	     i++) {
 
-		if (p_s_tb->lnum[i]) {
+		if (tb->lnum[i]) {
 
-			if (p_s_tb->L[i]) {
-				tb_buffer_sanity_check(p_s_tb->tb_sb,
-						       p_s_tb->L[i],
+			if (tb->L[i]) {
+				tb_buffer_sanity_check(tb->tb_sb,
+						       tb->L[i],
 						       "L", i);
 				if (!clear_all_dirty_bits
-				    (p_s_tb->tb_sb, p_s_tb->L[i]))
-					locked = p_s_tb->L[i];
+				    (tb->tb_sb, tb->L[i]))
+					locked = tb->L[i];
 			}
 
-			if (!locked && p_s_tb->FL[i]) {
-				tb_buffer_sanity_check(p_s_tb->tb_sb,
-						       p_s_tb->FL[i],
+			if (!locked && tb->FL[i]) {
+				tb_buffer_sanity_check(tb->tb_sb,
+						       tb->FL[i],
 						       "FL", i);
 				if (!clear_all_dirty_bits
-				    (p_s_tb->tb_sb, p_s_tb->FL[i]))
-					locked = p_s_tb->FL[i];
+				    (tb->tb_sb, tb->FL[i]))
+					locked = tb->FL[i];
 			}
 
-			if (!locked && p_s_tb->CFL[i]) {
-				tb_buffer_sanity_check(p_s_tb->tb_sb,
-						       p_s_tb->CFL[i],
+			if (!locked && tb->CFL[i]) {
+				tb_buffer_sanity_check(tb->tb_sb,
+						       tb->CFL[i],
 						       "CFL", i);
 				if (!clear_all_dirty_bits
-				    (p_s_tb->tb_sb, p_s_tb->CFL[i]))
-					locked = p_s_tb->CFL[i];
+				    (tb->tb_sb, tb->CFL[i]))
+					locked = tb->CFL[i];
 			}
 
 		}
 
-		if (!locked && (p_s_tb->rnum[i])) {
+		if (!locked && (tb->rnum[i])) {
 
-			if (p_s_tb->R[i]) {
-				tb_buffer_sanity_check(p_s_tb->tb_sb,
-						       p_s_tb->R[i],
+			if (tb->R[i]) {
+				tb_buffer_sanity_check(tb->tb_sb,
+						       tb->R[i],
 						       "R", i);
 				if (!clear_all_dirty_bits
-				    (p_s_tb->tb_sb, p_s_tb->R[i]))
-					locked = p_s_tb->R[i];
+				    (tb->tb_sb, tb->R[i]))
+					locked = tb->R[i];
 			}
 
-			if (!locked && p_s_tb->FR[i]) {
-				tb_buffer_sanity_check(p_s_tb->tb_sb,
-						       p_s_tb->FR[i],
+			if (!locked && tb->FR[i]) {
+				tb_buffer_sanity_check(tb->tb_sb,
+						       tb->FR[i],
 						       "FR", i);
 				if (!clear_all_dirty_bits
-				    (p_s_tb->tb_sb, p_s_tb->FR[i]))
-					locked = p_s_tb->FR[i];
+				    (tb->tb_sb, tb->FR[i]))
+					locked = tb->FR[i];
 			}
 
-			if (!locked && p_s_tb->CFR[i]) {
-				tb_buffer_sanity_check(p_s_tb->tb_sb,
-						       p_s_tb->CFR[i],
+			if (!locked && tb->CFR[i]) {
+				tb_buffer_sanity_check(tb->tb_sb,
+						       tb->CFR[i],
 						       "CFR", i);
 				if (!clear_all_dirty_bits
-				    (p_s_tb->tb_sb, p_s_tb->CFR[i]))
-					locked = p_s_tb->CFR[i];
+				    (tb->tb_sb, tb->CFR[i]))
+					locked = tb->CFR[i];
 			}
 		}
 	}
@@ -2257,10 +2256,10 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
 	 ** --clm
 	 */
 	for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
-		if (p_s_tb->FEB[i]) {
+		if (tb->FEB[i]) {
 			if (!clear_all_dirty_bits
-			    (p_s_tb->tb_sb, p_s_tb->FEB[i]))
-				locked = p_s_tb->FEB[i];
+			    (tb->tb_sb, tb->FEB[i]))
+				locked = tb->FEB[i];
 		}
 	}
 
@@ -2268,21 +2267,20 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
 #ifdef CONFIG_REISERFS_CHECK
 			repeat_counter++;
 			if ((repeat_counter % 10000) == 0) {
-				reiserfs_warning(p_s_tb->tb_sb,
-						 "wait_tb_buffers_until_released(): too many "
-						 "iterations waiting for buffer to unlock "
+				reiserfs_warning(tb->tb_sb, "reiserfs-8200",
+						 "too many iterations waiting "
+						 "for buffer to unlock "
 						 "(%b)", locked);
 
 				/* Don't loop forever.  Try to recover from possible error. */
 
-				return (FILESYSTEM_CHANGED_TB(p_s_tb)) ?
+				return (FILESYSTEM_CHANGED_TB(tb)) ?
 				    REPEAT_SEARCH : CARRY_ON;
 			}
 #endif
 			__wait_on_buffer(locked);
-			if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
+			if (FILESYSTEM_CHANGED_TB(tb))
 				return REPEAT_SEARCH;
-			}
 		}
 
 	} while (locked);
@@ -2295,15 +2293,15 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
  * analyze what and where should be moved;
  * get sufficient number of new nodes;
  * Balancing will start only after all resources will be collected at a time.
  *
  * When ported to SMP kernels, only at the last moment after all needed nodes
  * are collected in cache, will the resources be locked using the usual
  * textbook ordered lock acquisition algorithms.  Note that ensuring that
  * this code neither write locks what it does not need to write lock nor locks out of order
  * will be a pain in the butt that could have been avoided.  Grumble grumble. -Hans
  *
  * fix is meant in the sense of render unchanging
  *
  * Latency might be improved by first gathering a list of what buffers are needed
  * and then getting as many of them in parallel as possible? -Hans
  *
@@ -2312,159 +2310,160 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
  *	tb	tree_balance structure;
  *	inum	item number in S[h];
  *	pos_in_item - comment this if you can
- *	ins_ih & ins_sd are used when inserting
+ *	ins_ih	item head of item being inserted
+ *	data	inserted item or data to be pasted
  * Returns:	1 - schedule occurred while the function worked;
  *	        0 - schedule didn't occur while the function worked;
  *             -1 - if no_disk_space
  */
 
-int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ins_ih,	// item head of item being inserted
-	      const void *data	// inserted item or data to be pasted
-    )
+int fix_nodes(int op_mode, struct tree_balance *tb,
+	      struct item_head *ins_ih, const void *data)
 {
-	int n_ret_value, n_h, n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
-	int n_pos_in_item;
+	int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
+	int pos_in_item;
 
 	/* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
 	** during wait_tb_buffers_run
 	*/
 	int wait_tb_buffers_run = 0;
-	struct buffer_head *p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 
-	++REISERFS_SB(p_s_tb->tb_sb)->s_fix_nodes;
+	++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
 
-	n_pos_in_item = p_s_tb->tb_path->pos_in_item;
+	pos_in_item = tb->tb_path->pos_in_item;
 
-	p_s_tb->fs_gen = get_generation(p_s_tb->tb_sb);
+	tb->fs_gen = get_generation(tb->tb_sb);
 
 	/* we prepare and log the super here so it will already be in the
 	** transaction when do_balance needs to change it.
 	** This way do_balance won't have to schedule when trying to prepare
 	** the super for logging
 	*/
-	reiserfs_prepare_for_journal(p_s_tb->tb_sb,
-				     SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1);
-	journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
-			   SB_BUFFER_WITH_SB(p_s_tb->tb_sb));
-	if (FILESYSTEM_CHANGED_TB(p_s_tb))
+	reiserfs_prepare_for_journal(tb->tb_sb,
+				     SB_BUFFER_WITH_SB(tb->tb_sb), 1);
+	journal_mark_dirty(tb->transaction_handle, tb->tb_sb,
+			   SB_BUFFER_WITH_SB(tb->tb_sb));
+	if (FILESYSTEM_CHANGED_TB(tb))
 		return REPEAT_SEARCH;
 
 	/* if it possible in indirect_to_direct conversion */
-	if (buffer_locked(p_s_tbS0)) {
-		__wait_on_buffer(p_s_tbS0);
-		if (FILESYSTEM_CHANGED_TB(p_s_tb))
+	if (buffer_locked(tbS0)) {
+		__wait_on_buffer(tbS0);
+		if (FILESYSTEM_CHANGED_TB(tb))
 			return REPEAT_SEARCH;
 	}
 #ifdef CONFIG_REISERFS_CHECK
 	if (cur_tb) {
 		print_cur_tb("fix_nodes");
-		reiserfs_panic(p_s_tb->tb_sb,
-			       "PAP-8305: fix_nodes: there is pending do_balance");
+		reiserfs_panic(tb->tb_sb, "PAP-8305",
+			       "there is pending do_balance");
 	}
 
-	if (!buffer_uptodate(p_s_tbS0) || !B_IS_IN_TREE(p_s_tbS0)) {
-		reiserfs_panic(p_s_tb->tb_sb,
-			       "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
-			       "at the beginning of fix_nodes or not in tree (mode %c)",
-			       p_s_tbS0, p_s_tbS0, n_op_mode);
-	}
+	if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
+		reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
+			       "not uptodate at the beginning of fix_nodes "
+			       "or not in tree (mode %c)",
+			       tbS0, tbS0, op_mode);
 
 	/* Check parameters. */
-	switch (n_op_mode) {
+	switch (op_mode) {
 	case M_INSERT:
-		if (n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0))
-			reiserfs_panic(p_s_tb->tb_sb,
-				       "PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
-				       n_item_num, B_NR_ITEMS(p_s_tbS0));
+		if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
+			reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
+				       "item number %d (in S0 - %d) in case "
+				       "of insert", item_num,
+				       B_NR_ITEMS(tbS0));
 		break;
 	case M_PASTE:
 	case M_DELETE:
 	case M_CUT:
-		if (n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0)) {
-			print_block(p_s_tbS0, 0, -1, -1);
-			reiserfs_panic(p_s_tb->tb_sb,
-				       "PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n",
-				       n_item_num, n_op_mode,
-				       p_s_tb->insert_size[0]);
+		if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
+			print_block(tbS0, 0, -1, -1);
+			reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
+				       "item number(%d); mode = %c "
+				       "insert_size = %d",
+				       item_num, op_mode,
+				       tb->insert_size[0]);
 		}
 		break;
 	default:
-		reiserfs_panic(p_s_tb->tb_sb,
-			       "PAP-8340: fix_nodes: Incorrect mode of operation");
+		reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
+			       "of operation");
 	}
 #endif
 
-	if (get_mem_for_virtual_node(p_s_tb) == REPEAT_SEARCH)
+	if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
 		// FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
 		return REPEAT_SEARCH;
 
-	/* Starting from the leaf level; for all levels n_h of the tree. */
-	for (n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++) {
-		if ((n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON) {
+	/* Starting from the leaf level; for all levels h of the tree. */
+	for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
+		ret = get_direct_parent(tb, h);
+		if (ret != CARRY_ON)
 			goto repeat;
-		}
 
-		if ((n_ret_value =
-		     check_balance(n_op_mode, p_s_tb, n_h, n_item_num,
-				   n_pos_in_item, p_s_ins_ih,
-				   data)) != CARRY_ON) {
-			if (n_ret_value == NO_BALANCING_NEEDED) {
+		ret = check_balance(op_mode, tb, h, item_num,
+				    pos_in_item, ins_ih, data);
+		if (ret != CARRY_ON) {
+			if (ret == NO_BALANCING_NEEDED) {
 				/* No balancing for higher levels needed. */
-				if ((n_ret_value =
-				     get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
+				ret = get_neighbors(tb, h);
+				if (ret != CARRY_ON)
 					goto repeat;
-				}
-				if (n_h != MAX_HEIGHT - 1)
-					p_s_tb->insert_size[n_h + 1] = 0;
+				if (h != MAX_HEIGHT - 1)
+					tb->insert_size[h + 1] = 0;
 				/* ok, analysis and resource gathering are complete */
 				break;
 			}
 			goto repeat;
 		}
 
-		if ((n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
+		ret = get_neighbors(tb, h);
+		if (ret != CARRY_ON)
 			goto repeat;
-		}
 
-		if ((n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON) {
-			goto repeat;	/* No disk space, or schedule occurred and
-					   analysis may be invalid and needs to be redone. */
-		}
+		/* No disk space, or schedule occurred and analysis may be
+		 * invalid and needs to be redone. */
+		ret = get_empty_nodes(tb, h);
+		if (ret != CARRY_ON)
+			goto repeat;
 
-		if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h)) {
+		if (!PATH_H_PBUFFER(tb->tb_path, h)) {
 			/* We have a positive insert size but no nodes exist on this
 			   level, this means that we are creating a new root. */
 
-			RFALSE(p_s_tb->blknum[n_h] != 1,
+			RFALSE(tb->blknum[h] != 1,
 			       "PAP-8350: creating new empty root");
 
-			if (n_h < MAX_HEIGHT - 1)
-				p_s_tb->insert_size[n_h + 1] = 0;
-		} else if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1)) {
-			if (p_s_tb->blknum[n_h] > 1) {
-				/* The tree needs to be grown, so this node S[n_h]
+			if (h < MAX_HEIGHT - 1)
+				tb->insert_size[h + 1] = 0;
+		} else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
+			if (tb->blknum[h] > 1) {
+				/* The tree needs to be grown, so this node S[h]
 				   which is the root node is split into two nodes,
-				   and a new node (S[n_h+1]) will be created to
+				   and a new node (S[h+1]) will be created to
 				   become the root node. */
 
-				RFALSE(n_h == MAX_HEIGHT - 1,
+				RFALSE(h == MAX_HEIGHT - 1,
 				       "PAP-8355: attempt to create too high of a tree");
 
-				p_s_tb->insert_size[n_h + 1] =
+				tb->insert_size[h + 1] =
 				    (DC_SIZE +
-				     KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) +
+				     KEY_SIZE) * (tb->blknum[h] - 1) +
 				    DC_SIZE;
-			} else if (n_h < MAX_HEIGHT - 1)
-				p_s_tb->insert_size[n_h + 1] = 0;
+			} else if (h < MAX_HEIGHT - 1)
+				tb->insert_size[h + 1] = 0;
 		} else
-			p_s_tb->insert_size[n_h + 1] =
-			    (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
+			tb->insert_size[h + 1] =
+			    (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
 	}
 
-	if ((n_ret_value = wait_tb_buffers_until_unlocked(p_s_tb)) == CARRY_ON) {
-		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
+	ret = wait_tb_buffers_until_unlocked(tb);
+	if (ret == CARRY_ON) {
+		if (FILESYSTEM_CHANGED_TB(tb)) {
 			wait_tb_buffers_run = 1;
-			n_ret_value = REPEAT_SEARCH;
+			ret = REPEAT_SEARCH;
 			goto repeat;
 		} else {
 			return CARRY_ON;
@@ -2485,57 +2484,57 @@ int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_
 
 	/* Release path buffers. */
 	if (wait_tb_buffers_run) {
-		pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path);
+		pathrelse_and_restore(tb->tb_sb, tb->tb_path);
 	} else {
-		pathrelse(p_s_tb->tb_path);
+		pathrelse(tb->tb_path);
 	}
 	/* brelse all resources collected for balancing */
 	for (i = 0; i < MAX_HEIGHT; i++) {
 		if (wait_tb_buffers_run) {
-			reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-							 p_s_tb->L[i]);
-			reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-							 p_s_tb->R[i]);
-			reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-							 p_s_tb->FL[i]);
-			reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-							 p_s_tb->FR[i]);
-			reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-							 p_s_tb->
-							 CFL[i]);
-			reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
-							 p_s_tb->
-							 CFR[i]);
+			reiserfs_restore_prepared_buffer(tb->tb_sb,
+							 tb->L[i]);
+			reiserfs_restore_prepared_buffer(tb->tb_sb,
+							 tb->R[i]);
+			reiserfs_restore_prepared_buffer(tb->tb_sb,
+							 tb->FL[i]);
+			reiserfs_restore_prepared_buffer(tb->tb_sb,
+							 tb->FR[i]);
+			reiserfs_restore_prepared_buffer(tb->tb_sb,
+							 tb->
+							 CFL[i]);
+			reiserfs_restore_prepared_buffer(tb->tb_sb,
+							 tb->
+							 CFR[i]);
 		}
 
-		brelse(p_s_tb->L[i]);
-		p_s_tb->L[i] = NULL;
-		brelse(p_s_tb->R[i]);
-		p_s_tb->R[i] = NULL;
-		brelse(p_s_tb->FL[i]);
-		p_s_tb->FL[i] = NULL;
-		brelse(p_s_tb->FR[i]);
-		p_s_tb->FR[i] = NULL;
-		brelse(p_s_tb->CFL[i]);
-		p_s_tb->CFL[i] = NULL;
-		brelse(p_s_tb->CFR[i]);
-		p_s_tb->CFR[i] = NULL;
+		brelse(tb->L[i]);
+		brelse(tb->R[i]);
+		brelse(tb->FL[i]);
+		brelse(tb->FR[i]);
+		brelse(tb->CFL[i]);
+		brelse(tb->CFR[i]);
+
+		tb->L[i] = NULL;
+		tb->R[i] = NULL;
+		tb->FL[i] = NULL;
+		tb->FR[i] = NULL;
+		tb->CFL[i] = NULL;
+		tb->CFR[i] = NULL;
 	}
 
 	if (wait_tb_buffers_run) {
 		for (i = 0; i < MAX_FEB_SIZE; i++) {
-			if (p_s_tb->FEB[i]) {
+			if (tb->FEB[i])
 				reiserfs_restore_prepared_buffer
-				    (p_s_tb->tb_sb, p_s_tb->FEB[i]);
-			}
+				    (tb->tb_sb, tb->FEB[i]);
 		}
 	}
-	return n_ret_value;
+	return ret;
 	}
 
 }
 
-/* Anatoly will probably forgive me renaming p_s_tb to tb. I just
+/* Anatoly will probably forgive me renaming tb to tb. I just
    wanted to make lines shorter */
 void unfix_nodes(struct tree_balance *tb)
 {
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
index e664ac16fad9..6471c670743e 100644
--- a/fs/reiserfs/hashes.c
+++ b/fs/reiserfs/hashes.c
@@ -7,7 +7,7 @@
  * (see Applied Cryptography, 2nd edition, p448).
  *
  * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
  *
  * Jeremy has agreed to the contents of reiserfs/README. -Hans
  * Yura's function is added (04/07/2000)
  */
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index de391a82b999..2074fd95046b 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -105,8 +105,8 @@ static void internal_define_dest_src_infos(int shift_mode,
 		break;
 
 	default:
-		reiserfs_panic(tb->tb_sb,
-			       "internal_define_dest_src_infos: shift type is unknown (%d)",
+		reiserfs_panic(tb->tb_sb, "ibalance-1",
+			       "shift type is unknown (%d)",
 			       shift_mode);
 	}
 }
@@ -278,7 +278,7 @@ static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n)
 
 /* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest
 * last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest
  * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest
  */
 static void internal_copy_pointers_items(struct buffer_info *dest_bi,
 					 struct buffer_head *src,
@@ -385,7 +385,7 @@ static void internal_move_pointers_items(struct buffer_info *dest_bi,
 	if (last_first == FIRST_TO_LAST) {	/* shift_left occurs */
 		first_pointer = 0;
 		first_item = 0;
 		/* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer,
 		   for key - with first_item */
 		internal_delete_pointers_items(src_bi, first_pointer,
 					       first_item, cpy_num - del_par);
@@ -453,7 +453,7 @@ static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_b
 	}
 }
 
 /* Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
  * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest.
  * Replace d_key'th key in buffer cfl.
  * Delete pointer_amount items and node pointers from buffer src.
@@ -518,7 +518,7 @@ static void internal_shift1_left(struct tree_balance *tb,
 	/* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1); */
 }
 
 /* Insert d_key'th (delimiting) key from buffer cfr to head of dest.
  * Copy n node pointers and n - 1 items from buffer src to buffer dest.
  * Replace d_key'th key in buffer cfr.
  * Delete n items and node pointers from buffer src.
@@ -702,8 +702,8 @@ static void balance_internal_when_delete(struct tree_balance *tb,
 
 		return;
 	}
-	reiserfs_panic(tb->tb_sb,
-		       "balance_internal_when_delete: unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
+	reiserfs_panic(tb->tb_sb, "ibalance-2",
+		       "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
 		       h, tb->lnum[h], h, tb->rnum[h]);
 }
 
@@ -749,7 +749,7 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
 		   this means that new pointers and items must be inserted AFTER *
 		   child_pos
 		   }
 		   else
 		   {
 		   it is the position of the leftmost pointer that must be deleted (together with
 		   its corresponding key to the left of the pointer)
@@ -940,8 +940,8 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
 		struct block_head *blkh;
 
 		if (tb->blknum[h] != 1)
-			reiserfs_panic(NULL,
-				       "balance_internal: One new node required for creating the new root");
+			reiserfs_panic(NULL, "ibalance-3", "One new node "
+				       "required for creating the new root");
 		/* S[h] = empty buffer from the list FEB. */
 		tbSh = get_FEB(tb);
 		blkh = B_BLK_HEAD(tbSh);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 55fce92cdf18..6fd0f47e45db 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -53,7 +53,7 @@ void reiserfs_delete_inode(struct inode *inode)
 	 * after delete_object so that quota updates go into the same transaction as
 	 * stat data deletion */
 	if (!err)
-		DQUOT_FREE_INODE(inode);
+		vfs_dq_free_inode(inode);
 
 	if (journal_end(&th, inode->i_sb, jbegin_count))
 		goto out;
@@ -363,7 +363,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
 	}
 	/* make sure we don't read more bytes than actually exist in
 	 ** the file.  This can happen in odd cases where i_size isn't
 	 ** correct, and when direct item padding results in a few
 	 ** extra bytes at the end of the direct item
 	 */
 	if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
@@ -438,15 +438,15 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
 ** -ENOENT instead of a valid buffer.  block_prepare_write expects to
 ** be able to do i/o on the buffers returned, unless an error value
 ** is also returned.
 **
 ** So, this allows block_prepare_write to be used for reading a single block
 ** in a page.  Where it does not produce a valid page for holes, or past the
 ** end of the file.  This turns out to be exactly what we need for reading
 ** tails for conversion.
 **
 ** The point of the wrapper is forcing a certain value for create, even
 ** though the VFS layer is calling this function with create==1.  If you
 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
 ** don't use this function.
 */
 static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
@@ -602,7 +602,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 	int done;
 	int fs_gen;
 	struct reiserfs_transaction_handle *th = NULL;
 	/* space reserved in transaction batch:
 	   . 3 balancings in direct->indirect conversion
 	   . 1 block involved into reiserfs_update_sd()
 	   XXX in practically impossible worst case direct2indirect()
@@ -754,7 +754,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 		reiserfs_write_unlock(inode->i_sb);
 
 		/* the item was found, so new blocks were not added to the file
 		 ** there is no need to make sure the inode is updated with this
 		 ** transaction
 		 */
 		return retval;
@@ -841,10 +841,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 							    tail_offset);
 		if (retval) {
 			if (retval != -ENOSPC)
-				reiserfs_warning(inode->i_sb,
-						 "clm-6004: convert tail failed inode %lu, error %d",
-						 inode->i_ino,
-						 retval);
+				reiserfs_error(inode->i_sb,
+					       "clm-6004",
+					       "convert tail failed "
+					       "inode %lu, error %d",
+					       inode->i_ino,
+					       retval);
 			if (allocated_block_nr) {
 				/* the bitmap, the super, and the stat data == 3 */
 				if (!th)
@@ -984,7 +986,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 
 	/* this loop could log more blocks than we had originally asked
 	 ** for.  So, we have to allow the transaction to end if it is
 	 ** too big or too full.  Update the inode so things are
 	 ** consistent if we crash before the function returns
 	 **
 	 ** release the path so that anybody waiting on the path before
@@ -995,7 +997,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 			if (retval)
 				goto failure;
 		}
 		/* inserting indirect pointers for a hole can take a
 		 ** long time.  reschedule if needed
 		 */
 		cond_resched();
@@ -1006,8 +1008,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 			goto failure;
 		}
 		if (retval == POSITION_FOUND) {
-			reiserfs_warning(inode->i_sb,
-					 "vs-825: reiserfs_get_block: "
+			reiserfs_warning(inode->i_sb, "vs-825",
 					 "%K should not be found", &key);
 			retval = -EEXIST;
 			if (allocated_block_nr)
@@ -1299,8 +1300,7 @@ static void update_stat_data(struct treepath *path, struct inode *inode,
 	ih = PATH_PITEM_HEAD(path);
 
 	if (!is_statdata_le_ih(ih))
-		reiserfs_panic(inode->i_sb,
-			       "vs-13065: update_stat_data: key %k, found item %h",
+		reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
 			       INODE_PKEY(inode), ih);
 
 	if (stat_data_v1(ih)) {
@@ -1332,10 +1332,9 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
 	/* look for the object's stat data */
 	retval = search_item(inode->i_sb, &key, &path);
 	if (retval == IO_ERROR) {
-		reiserfs_warning(inode->i_sb,
-				 "vs-13050: reiserfs_update_sd: "
-				 "i/o failure occurred trying to update %K stat data",
-				 &key);
+		reiserfs_error(inode->i_sb, "vs-13050",
+			       "i/o failure occurred trying to "
+			       "update %K stat data", &key);
 		return;
 	}
 	if (retval == ITEM_NOT_FOUND) {
@@ -1345,9 +1344,9 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
 			/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
 			return;
 		}
-		reiserfs_warning(inode->i_sb,
-				 "vs-13060: reiserfs_update_sd: "
-				 "stat data of object %k (nlink == %d) not found (pos %d)",
+		reiserfs_warning(inode->i_sb, "vs-13060",
+				 "stat data of object %k (nlink == %d) "
+				 "not found (pos %d)",
 				 INODE_PKEY(inode), inode->i_nlink,
 				 pos);
 		reiserfs_check_path(&path);
@@ -1424,10 +1423,9 @@ void reiserfs_read_locked_inode(struct inode *inode,
 	/* look for the object's stat data */
 	retval = search_item(inode->i_sb, &key, &path_to_sd);
 	if (retval == IO_ERROR) {
-		reiserfs_warning(inode->i_sb,
-				 "vs-13070: reiserfs_read_locked_inode: "
-				 "i/o failure occurred trying to find stat data of %K",
-				 &key);
+		reiserfs_error(inode->i_sb, "vs-13070",
+			       "i/o failure occurred trying to find "
+			       "stat data of %K", &key);
 		reiserfs_make_bad_inode(inode);
 		return;
 	}
@@ -1446,7 +1444,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
 	   update sd on unlink all that is required is to check for nlink
 	   here. This bug was first found by Sizif when debugging
 	   SquidNG/Butterfly, forgotten, and found again after Philippe
 	   Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
 
 	   More logical fix would require changes in fs/inode.c:iput() to
 	   remove inode from hash-table _after_ fs cleaned disk stuff up and
@@ -1457,8 +1455,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
 	   during mount (fs/reiserfs/super.c:finish_unfinished()). */
 	if ((inode->i_nlink == 0) &&
 	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
-		reiserfs_warning(inode->i_sb,
-				 "vs-13075: reiserfs_read_locked_inode: "
+		reiserfs_warning(inode->i_sb, "vs-13075",
 				 "dead inode read from disk %K. "
 				 "This is likely to be race with knfsd. Ignore",
 				 &key);
@@ -1555,7 +1552,7 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
 	 */
 	if (fh_type > fh_len) {
 		if (fh_type != 6 || fh_len != 5)
-			reiserfs_warning(sb,
+			reiserfs_warning(sb, "reiserfs-13077",
 					 "nfsd/reiserfs, fhtype=%d, len=%d - odd",
 					 fh_type, fh_len);
 		fh_type = 5;
@@ -1622,7 +1619,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync)
 	if (inode->i_sb->s_flags & MS_RDONLY)
 		return -EROFS;
 	/* memory pressure can sometimes initiate write_inode calls with sync == 1,
 	 ** these cases are just when the system needs ram, not when the
 	 ** inode needs to reach disk for safety, and they can safely be
 	 ** ignored because the altered inode has already been logged.
 	 */
@@ -1680,13 +1677,13 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
 	/* look for place in the tree for new item */
 	retval = search_item(sb, &key, path);
 	if (retval == IO_ERROR) {
-		reiserfs_warning(sb, "vs-13080: reiserfs_new_directory: "
-				 "i/o failure occurred creating new directory");
+		reiserfs_error(sb, "vs-13080",
+			       "i/o failure occurred creating new directory");
 		return -EIO;
 	}
 	if (retval == ITEM_FOUND) {
 		pathrelse(path);
-		reiserfs_warning(sb, "vs-13070: reiserfs_new_directory: "
+		reiserfs_warning(sb, "vs-13070",
 				 "object with this key exists (%k)",
 				 &(ih->ih_key));
 		return -EEXIST;
@@ -1720,13 +1717,13 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i
 	/* look for place in the tree for new item */
 	retval = search_item(sb, &key, path);
 	if (retval == IO_ERROR) {
-		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlinik: "
-				 "i/o failure occurred creating new symlink");
+		reiserfs_error(sb, "vs-13080",
+			       "i/o failure occurred creating new symlink");
 		return -EIO;
 	}
 	if (retval == ITEM_FOUND) {
 		pathrelse(path);
-		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlink: "
+		reiserfs_warning(sb, "vs-13080",
 				 "object with this key exists (%k)",
 				 &(ih->ih_key));
 		return -EEXIST;
@@ -1739,7 +1736,7 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i
 /* inserts the stat data into the tree, and then calls
    reiserfs_new_directory (to insert ".", ".." item if new object is
    directory) or reiserfs_new_symlink (to insert symlink body if new
    object is symlink) or nothing (if new object is regular file)
 
    NOTE! uid and gid must already be set in the inode.  If we return
    non-zero due to an error, we have to drop the quota previously allocated
@@ -1747,10 +1744,11 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i
    if we return non-zero, we also end the transaction.  */
 int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		       struct inode *dir, int mode, const char *symname,
 		       /* 0 for regular, EMTRY_DIR_SIZE for dirs,
 		          strlen (symname) for symlinks) */
 		       loff_t i_size, struct dentry *dentry,
-		       struct inode *inode)
+		       struct inode *inode,
+		       struct reiserfs_security_handle *security)
 {
 	struct super_block *sb;
 	struct reiserfs_iget_args args;
@@ -1763,7 +1761,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
 	BUG_ON(!th->t_trans_id);
 
-	if (DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		err = -EDQUOT;
 		goto out_end_trans;
 	}
@@ -1796,7 +1794,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		goto out_bad_inode;
 	}
 	if (old_format_only(sb))
 		/* not a perfect generation count, as object ids can be reused, but
 		 ** this is as good as reiserfs can do right now.
 		 ** note that the private part of inode isn't filled in yet, we have
 		 ** to use the directory.
@@ -1917,9 +1915,8 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		goto out_inserted_sd;
 	}
 
-	/* XXX CHECK THIS */
 	if (reiserfs_posixacl(inode->i_sb)) {
-		retval = reiserfs_inherit_default_acl(dir, dentry, inode);
+		retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
 		if (retval) {
 			err = retval;
 			reiserfs_check_path(&path_to_key);
@@ -1927,10 +1924,23 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 			goto out_inserted_sd;
 		}
 	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
-		reiserfs_warning(inode->i_sb, "ACLs aren't enabled in the fs, "
+		reiserfs_warning(inode->i_sb, "jdm-13090",
+				 "ACLs aren't enabled in the fs, "
 				 "but vfs thinks they are!");
-	} else if (is_reiserfs_priv_object(dir)) {
-		reiserfs_mark_inode_private(inode);
+	} else if (IS_PRIVATE(dir))
+		inode->i_flags |= S_PRIVATE;
+
+	if (security->name) {
+		retval = reiserfs_security_write(th, inode, security);
+		if (retval) {
+			err = retval;
+			reiserfs_check_path(&path_to_key);
+			retval = journal_end(th, th->t_super,
+					     th->t_blocks_allocated);
+			if (retval)
+				err = retval;
+			goto out_inserted_sd;
+		}
 	}
 
 	reiserfs_update_sd(th, inode);
@@ -1947,12 +1957,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	INODE_PKEY(inode)->k_objectid = 0;
 
 	/* Quota change must be inside a transaction for journaling */
-	DQUOT_FREE_INODE(inode);
+	vfs_dq_free_inode(inode);
 
       out_end_trans:
 	journal_end(th, th->t_super, th->t_blocks_allocated);
 	/* Drop can be outside and it needs more credits so it's better to have it outside */
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	make_bad_inode(inode);
 
@@ -1960,19 +1970,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	inode->i_nlink = 0;
 	th->t_trans_id = 0;	/* so the caller can't use this handle later */
 	unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
-
-	/* If we were inheriting an ACL, we need to release the lock so that
-	 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
-	 * code really needs to be reworked, but this will take care of it
-	 * for now. -jeffm */
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-	if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) {
-		reiserfs_write_unlock_xattrs(dir->i_sb);
-		iput(inode);
-		reiserfs_write_lock_xattrs(dir->i_sb);
-	} else
-#endif
-		iput(inode);
+	iput(inode);
 	return err;
 }
 
@@ -1989,7 +1987,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 **
 ** on failure, nonzero is returned, page_result and bh_result are untouched.
 */
-static int grab_tail_page(struct inode *p_s_inode,
+static int grab_tail_page(struct inode *inode,
 			  struct page **page_result,
 			  struct buffer_head **bh_result)
 {
@@ -1997,11 +1995,11 @@ static int grab_tail_page(struct inode *p_s_inode,
 	/* we want the page with the last byte in the file,
 	 ** not the page that will hold the next byte for appending
 	 */
-	unsigned long index = (p_s_inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+	unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
 	unsigned long pos = 0;
 	unsigned long start = 0;
-	unsigned long blocksize = p_s_inode->i_sb->s_blocksize;
-	unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1);
+	unsigned long blocksize = inode->i_sb->s_blocksize;
+	unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1);
 	struct buffer_head *bh;
 	struct buffer_head *head;
 	struct page *page;
@@ -2015,7 +2013,7 @@ static int grab_tail_page(struct inode *p_s_inode,
 	if ((offset & (blocksize - 1)) == 0) {
 		return -ENOENT;
 	}
-	page = grab_cache_page(p_s_inode->i_mapping, index);
+	page = grab_cache_page(inode->i_mapping, index);
 	error = -ENOMEM;
 	if (!page) {
 		goto out;
@@ -2044,10 +2042,8 @@ static int grab_tail_page(struct inode *p_s_inode,
 		 ** I've screwed up the code to find the buffer, or the code to
 		 ** call prepare_write
 		 */
-		reiserfs_warning(p_s_inode->i_sb,
-				 "clm-6000: error reading block %lu on dev %s",
-				 bh->b_blocknr,
-				 reiserfs_bdevname(p_s_inode->i_sb));
+		reiserfs_error(inode->i_sb, "clm-6000",
+			       "error reading block %lu", bh->b_blocknr);
 		error = -EIO;
 		goto unlock;
 	}
@@ -2069,57 +2065,58 @@ static int grab_tail_page(struct inode *p_s_inode,
 **
 ** some code taken from block_truncate_page
 */
-int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
+int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
 {
 	struct reiserfs_transaction_handle th;
 	/* we want the offset for the first byte after the end of the file */
-	unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1);
-	unsigned blocksize = p_s_inode->i_sb->s_blocksize;
+	unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+	unsigned blocksize = inode->i_sb->s_blocksize;
 	unsigned length;
 	struct page *page = NULL;
 	int error;
 	struct buffer_head *bh = NULL;
 	int err2;
 
-	reiserfs_write_lock(p_s_inode->i_sb);
+	reiserfs_write_lock(inode->i_sb);
 
-	if (p_s_inode->i_size > 0) {
-		if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
-			// -ENOENT means we truncated past the end of the file,
+	if (inode->i_size > 0) {
+		error = grab_tail_page(inode, &page, &bh);
+		if (error) {
+			// -ENOENT means we truncated past the end of the file,
 			// and get_block_create_0 could not find a block to read in,
 			// which is ok.
 			if (error != -ENOENT)
-				reiserfs_warning(p_s_inode->i_sb,
-						 "clm-6001: grab_tail_page failed %d",
-						 error);
+				reiserfs_error(inode->i_sb, "clm-6001",
+					       "grab_tail_page failed %d",
+					       error);
 			page = NULL;
 			bh = NULL;
 		}
 	}
 
 	/* so, if page != NULL, we have a buffer head for the offset at
 	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
 	 ** then we have an unformatted node.  Otherwise, we have a direct item,
 	 ** and no zeroing is required on disk.  We zero after the truncate,
 	 ** because the truncate might pack the item anyway
 	 ** (it will unmap bh if it packs).
 	 */
 	/* it is enough to reserve space in transaction for 2 balancings:
 	   one for "save" link adding and another for the first
 	   cut_from_item. 1 is for update_sd */
-	error = journal_begin(&th, p_s_inode->i_sb,
+	error = journal_begin(&th, inode->i_sb,
 			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
 	if (error)
 		goto out;
-	reiserfs_update_inode_transaction(p_s_inode);
+	reiserfs_update_inode_transaction(inode);
 	if (update_timestamps)
 		/* we are doing real truncate: if the system crashes before the last
 		   transaction of truncating gets committed - on reboot the file
 		   either appears truncated properly or not truncated at all */
-		add_save_link(&th, p_s_inode, 1);
-	err2 = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps);
+		add_save_link(&th, inode, 1);
+	err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
 	error =
-	    journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
+	    journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
 	if (error)
 		goto out;
 
@@ -2130,7 +2127,7 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
2130 } 2127 }
2131 2128
2132 if (update_timestamps) { 2129 if (update_timestamps) {
2133 error = remove_save_link(p_s_inode, 1 /* truncate */ ); 2130 error = remove_save_link(inode, 1 /* truncate */);
2134 if (error) 2131 if (error)
2135 goto out; 2132 goto out;
2136 } 2133 }
@@ -2149,14 +2146,14 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
2149 page_cache_release(page); 2146 page_cache_release(page);
2150 } 2147 }
2151 2148
2152 reiserfs_write_unlock(p_s_inode->i_sb); 2149 reiserfs_write_unlock(inode->i_sb);
2153 return 0; 2150 return 0;
2154 out: 2151 out:
2155 if (page) { 2152 if (page) {
2156 unlock_page(page); 2153 unlock_page(page);
2157 page_cache_release(page); 2154 page_cache_release(page);
2158 } 2155 }
2159 reiserfs_write_unlock(p_s_inode->i_sb); 2156 reiserfs_write_unlock(inode->i_sb);
2160 return error; 2157 return error;
2161} 2158}
2162 2159
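The truncate path above hinges on one piece of arithmetic: offset = i_size & (PAGE_CACHE_SIZE - 1) locates the first byte past EOF within its page, and grab_tail_page() pins the page and buffer head holding it. Below is a minimal user-space sketch of that arithmetic, not kernel code: the page size and block size are hypothetical stand-ins, and since the hunk above only declares `length`, its computation here is an assumed completion (zeroing from EOF to the end of its block).

/* sketch of the tail-page math, assuming 4 KiB pages */
#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL	/* stand-in for the mm constant */

int main(void)
{
	unsigned long i_size    = 10000; /* hypothetical inode->i_size */
	unsigned long blocksize = 1024;  /* hypothetical sb->s_blocksize */

	/* first byte past EOF, within its page */
	unsigned long offset = i_size & (PAGE_CACHE_SIZE - 1);

	/* assumed completion: zero from EOF to the end of its block */
	unsigned long length = blocksize - (offset & (blocksize - 1));

	printf("offset %lu in tail page, %lu bytes to zero\n",
	       offset, length);
	return 0;
}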
@@ -2208,9 +2205,8 @@ static int map_block_for_writepage(struct inode *inode,
2208 /* we've found an unformatted node */ 2205 /* we've found an unformatted node */
2209 if (indirect_item_found(retval, ih)) { 2206 if (indirect_item_found(retval, ih)) {
2210 if (bytes_copied > 0) { 2207 if (bytes_copied > 0) {
2211 reiserfs_warning(inode->i_sb, 2208 reiserfs_warning(inode->i_sb, "clm-6002",
2212 "clm-6002: bytes_copied %d", 2209 "bytes_copied %d", bytes_copied);
2213 bytes_copied);
2214 } 2210 }
2215 if (!get_block_num(item, pos_in_item)) { 2211 if (!get_block_num(item, pos_in_item)) {
2216 /* crap, we are writing to a hole */ 2212 /* crap, we are writing to a hole */
@@ -2267,9 +2263,8 @@ static int map_block_for_writepage(struct inode *inode,
2267 goto research; 2263 goto research;
2268 } 2264 }
2269 } else { 2265 } else {
2270 reiserfs_warning(inode->i_sb, 2266 reiserfs_warning(inode->i_sb, "clm-6003",
2271 "clm-6003: bad item inode %lu, device %s", 2267 "bad item inode %lu", inode->i_ino);
2272 inode->i_ino, reiserfs_bdevname(inode->i_sb));
2273 retval = -EIO; 2268 retval = -EIO;
2274 goto out; 2269 goto out;
2275 } 2270 }
@@ -2312,8 +2307,8 @@ static int map_block_for_writepage(struct inode *inode,
2312 return retval; 2307 return retval;
2313} 2308}
2314 2309
2315/* 2310/*
2316 * mason@suse.com: updated in 2.5.54 to follow the same general io 2311 * mason@suse.com: updated in 2.5.54 to follow the same general io
2317 * start/recovery path as __block_write_full_page, along with special 2312 * start/recovery path as __block_write_full_page, along with special
2318 * code to handle reiserfs tails. 2313 * code to handle reiserfs tails.
2319 */ 2314 */
@@ -2453,7 +2448,7 @@ static int reiserfs_write_full_page(struct page *page,
2453 unlock_page(page); 2448 unlock_page(page);
2454 2449
2455 /* 2450 /*
2456 * since any buffer might be the only dirty buffer on the page, 2451 * since any buffer might be the only dirty buffer on the page,
2457 * the first submit_bh can bring the page out of writeback. 2452 * the first submit_bh can bring the page out of writeback.
2458 * be careful with the buffers. 2453 * be careful with the buffers.
2459 */ 2454 */
@@ -2472,8 +2467,8 @@ static int reiserfs_write_full_page(struct page *page,
2472 if (nr == 0) { 2467 if (nr == 0) {
2473 /* 2468 /*
2474 * if this page only had a direct item, it is very possible for 2469 * if this page only had a direct item, it is very possible for
2475 * no io to be required without there being an error. Or, 2470 * no io to be required without there being an error. Or,
2476 * someone else could have locked them and sent them down the 2471 * someone else could have locked them and sent them down the
2477 * pipe without locking the page 2472 * pipe without locking the page
2478 */ 2473 */
2479 bh = head; 2474 bh = head;
@@ -2492,7 +2487,7 @@ static int reiserfs_write_full_page(struct page *page,
2492 2487
2493 fail: 2488 fail:
2494 /* catches various errors, we need to make sure any valid dirty blocks 2489 /* catches various errors, we need to make sure any valid dirty blocks
2495 * get to the media. The page is currently locked and not marked for 2490 * get to the media. The page is currently locked and not marked for
2496 * writeback 2491 * writeback
2497 */ 2492 */
2498 ClearPageUptodate(page); 2493 ClearPageUptodate(page);
@@ -3119,7 +3114,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3119 if (error) 3114 if (error)
3120 goto out; 3115 goto out;
3121 error = 3116 error =
3122 DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; 3117 vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
3123 if (error) { 3118 if (error) {
3124 journal_end(&th, inode->i_sb, 3119 journal_end(&th, inode->i_sb,
3125 jbegin_count); 3120 jbegin_count);
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 830332021ed4..0ccc3fdda7bf 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -189,7 +189,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
189 } 189 }
190 190
191 /* we unpack by finding the page with the tail, and calling 191 /* we unpack by finding the page with the tail, and calling
192 ** reiserfs_prepare_write on that page. This will force a 192 ** reiserfs_prepare_write on that page. This will force a
193 ** reiserfs_get_block to unpack the tail for us. 193 ** reiserfs_get_block to unpack the tail for us.
194 */ 194 */
195 index = inode->i_size >> PAGE_CACHE_SHIFT; 195 index = inode->i_size >> PAGE_CACHE_SHIFT;
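The unpack path works a page at a time, so the shift on the last line is the page-cache index of the page that holds EOF, i.e. the page whose write-prepare forces the tail to be converted. A short sketch of that index math (illustrative values, assuming 4 KiB pages):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* 4 KiB pages, as on common configs */

int main(void)
{
	unsigned long i_size = 10000; /* hypothetical inode->i_size */

	/* page-cache index of the page containing the tail */
	unsigned long index = i_size >> PAGE_CACHE_SHIFT;

	printf("tail page index %lu\n", index);
	return 0;
}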
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index 9475557ab499..72cb1cc51b87 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -97,7 +97,8 @@ static int sd_unit_num(struct virtual_item *vi)
97 97
98static void sd_print_vi(struct virtual_item *vi) 98static void sd_print_vi(struct virtual_item *vi)
99{ 99{
100 reiserfs_warning(NULL, "STATDATA, index %d, type 0x%x, %h", 100 reiserfs_warning(NULL, "reiserfs-16100",
101 "STATDATA, index %d, type 0x%x, %h",
101 vi->vi_index, vi->vi_type, vi->vi_ih); 102 vi->vi_index, vi->vi_type, vi->vi_ih);
102} 103}
103 104
@@ -190,7 +191,8 @@ static int direct_unit_num(struct virtual_item *vi)
190 191
191static void direct_print_vi(struct virtual_item *vi) 192static void direct_print_vi(struct virtual_item *vi)
192{ 193{
193 reiserfs_warning(NULL, "DIRECT, index %d, type 0x%x, %h", 194 reiserfs_warning(NULL, "reiserfs-16101",
195 "DIRECT, index %d, type 0x%x, %h",
194 vi->vi_index, vi->vi_type, vi->vi_ih); 196 vi->vi_index, vi->vi_type, vi->vi_ih);
195} 197}
196 198
@@ -278,7 +280,7 @@ static void indirect_print_item(struct item_head *ih, char *item)
278 unp = (__le32 *) item; 280 unp = (__le32 *) item;
279 281
280 if (ih_item_len(ih) % UNFM_P_SIZE) 282 if (ih_item_len(ih) % UNFM_P_SIZE)
281 reiserfs_warning(NULL, "indirect_print_item: invalid item len"); 283 reiserfs_warning(NULL, "reiserfs-16102", "invalid item len");
282 284
283 printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih)); 285 printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih));
284 for (j = 0; j < I_UNFM_NUM(ih); j++) { 286 for (j = 0; j < I_UNFM_NUM(ih); j++) {
@@ -334,7 +336,8 @@ static int indirect_unit_num(struct virtual_item *vi)
334 336
335static void indirect_print_vi(struct virtual_item *vi) 337static void indirect_print_vi(struct virtual_item *vi)
336{ 338{
337 reiserfs_warning(NULL, "INDIRECT, index %d, type 0x%x, %h", 339 reiserfs_warning(NULL, "reiserfs-16103",
340 "INDIRECT, index %d, type 0x%x, %h",
338 vi->vi_index, vi->vi_type, vi->vi_ih); 341 vi->vi_index, vi->vi_type, vi->vi_ih);
339} 342}
340 343
@@ -359,7 +362,7 @@ static struct item_operations indirect_ops = {
359 362
360static int direntry_bytes_number(struct item_head *ih, int block_size) 363static int direntry_bytes_number(struct item_head *ih, int block_size)
361{ 364{
362 reiserfs_warning(NULL, "vs-16090: direntry_bytes_number: " 365 reiserfs_warning(NULL, "vs-16090",
363 "bytes number is asked for direntry"); 366 "bytes number is asked for direntry");
364 return 0; 367 return 0;
365} 368}
@@ -514,8 +517,9 @@ static int direntry_create_vi(struct virtual_node *vn,
514 ((is_affected 517 ((is_affected
515 && (vn->vn_mode == M_PASTE 518 && (vn->vn_mode == M_PASTE
516 || vn->vn_mode == M_CUT)) ? insert_size : 0)) { 519 || vn->vn_mode == M_CUT)) ? insert_size : 0)) {
517 reiserfs_panic(NULL, 520 reiserfs_panic(NULL, "vs-8025", "(mode==%c, "
518 "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item", 521 "insert_size==%d), invalid length of "
522 "directory item",
519 vn->vn_mode, insert_size); 523 vn->vn_mode, insert_size);
520 } 524 }
521 } 525 }
@@ -546,7 +550,8 @@ static int direntry_check_left(struct virtual_item *vi, int free,
546 } 550 }
547 551
548 if (entries == dir_u->entry_count) { 552 if (entries == dir_u->entry_count) {
549 reiserfs_panic(NULL, "free space %d, entry_count %d\n", free, 553 reiserfs_panic(NULL, "item_ops-1",
554 "free space %d, entry_count %d", free,
550 dir_u->entry_count); 555 dir_u->entry_count);
551 } 556 }
552 557
@@ -614,7 +619,8 @@ static void direntry_print_vi(struct virtual_item *vi)
614 int i; 619 int i;
615 struct direntry_uarea *dir_u = vi->vi_uarea; 620 struct direntry_uarea *dir_u = vi->vi_uarea;
616 621
617 reiserfs_warning(NULL, "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", 622 reiserfs_warning(NULL, "reiserfs-16104",
623 "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
618 vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); 624 vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
619 printk("%d entries: ", dir_u->entry_count); 625 printk("%d entries: ", dir_u->entry_count);
620 for (i = 0; i < dir_u->entry_count; i++) 626 for (i = 0; i < dir_u->entry_count; i++)
@@ -642,43 +648,43 @@ static struct item_operations direntry_ops = {
642// 648//
643static int errcatch_bytes_number(struct item_head *ih, int block_size) 649static int errcatch_bytes_number(struct item_head *ih, int block_size)
644{ 650{
645 reiserfs_warning(NULL, 651 reiserfs_warning(NULL, "green-16001",
646 "green-16001: Invalid item type observed, run fsck ASAP"); 652 "Invalid item type observed, run fsck ASAP");
647 return 0; 653 return 0;
648} 654}
649 655
650static void errcatch_decrement_key(struct cpu_key *key) 656static void errcatch_decrement_key(struct cpu_key *key)
651{ 657{
652 reiserfs_warning(NULL, 658 reiserfs_warning(NULL, "green-16002",
653 "green-16002: Invalid item type observed, run fsck ASAP"); 659 "Invalid item type observed, run fsck ASAP");
654} 660}
655 661
656static int errcatch_is_left_mergeable(struct reiserfs_key *key, 662static int errcatch_is_left_mergeable(struct reiserfs_key *key,
657 unsigned long bsize) 663 unsigned long bsize)
658{ 664{
659 reiserfs_warning(NULL, 665 reiserfs_warning(NULL, "green-16003",
660 "green-16003: Invalid item type observed, run fsck ASAP"); 666 "Invalid item type observed, run fsck ASAP");
661 return 0; 667 return 0;
662} 668}
663 669
664static void errcatch_print_item(struct item_head *ih, char *item) 670static void errcatch_print_item(struct item_head *ih, char *item)
665{ 671{
666 reiserfs_warning(NULL, 672 reiserfs_warning(NULL, "green-16004",
667 "green-16004: Invalid item type observed, run fsck ASAP"); 673 "Invalid item type observed, run fsck ASAP");
668} 674}
669 675
670static void errcatch_check_item(struct item_head *ih, char *item) 676static void errcatch_check_item(struct item_head *ih, char *item)
671{ 677{
672 reiserfs_warning(NULL, 678 reiserfs_warning(NULL, "green-16005",
673 "green-16005: Invalid item type observed, run fsck ASAP"); 679 "Invalid item type observed, run fsck ASAP");
674} 680}
675 681
676static int errcatch_create_vi(struct virtual_node *vn, 682static int errcatch_create_vi(struct virtual_node *vn,
677 struct virtual_item *vi, 683 struct virtual_item *vi,
678 int is_affected, int insert_size) 684 int is_affected, int insert_size)
679{ 685{
680 reiserfs_warning(NULL, 686 reiserfs_warning(NULL, "green-16006",
681 "green-16006: Invalid item type observed, run fsck ASAP"); 687 "Invalid item type observed, run fsck ASAP");
 682 	return 0;		// We might return -1 here as well, but it won't help: 688 	return 0;		// We might return -1 here as well, but it won't help:
 683 	// create_virtual_node(), which calls this operation, returns void. 689 	// create_virtual_node(), which calls this operation, returns void.
684} 690}
@@ -686,36 +692,36 @@ static int errcatch_create_vi(struct virtual_node *vn,
686static int errcatch_check_left(struct virtual_item *vi, int free, 692static int errcatch_check_left(struct virtual_item *vi, int free,
687 int start_skip, int end_skip) 693 int start_skip, int end_skip)
688{ 694{
689 reiserfs_warning(NULL, 695 reiserfs_warning(NULL, "green-16007",
690 "green-16007: Invalid item type observed, run fsck ASAP"); 696 "Invalid item type observed, run fsck ASAP");
691 return -1; 697 return -1;
692} 698}
693 699
694static int errcatch_check_right(struct virtual_item *vi, int free) 700static int errcatch_check_right(struct virtual_item *vi, int free)
695{ 701{
696 reiserfs_warning(NULL, 702 reiserfs_warning(NULL, "green-16008",
697 "green-16008: Invalid item type observed, run fsck ASAP"); 703 "Invalid item type observed, run fsck ASAP");
698 return -1; 704 return -1;
699} 705}
700 706
701static int errcatch_part_size(struct virtual_item *vi, int first, int count) 707static int errcatch_part_size(struct virtual_item *vi, int first, int count)
702{ 708{
703 reiserfs_warning(NULL, 709 reiserfs_warning(NULL, "green-16009",
704 "green-16009: Invalid item type observed, run fsck ASAP"); 710 "Invalid item type observed, run fsck ASAP");
705 return 0; 711 return 0;
706} 712}
707 713
708static int errcatch_unit_num(struct virtual_item *vi) 714static int errcatch_unit_num(struct virtual_item *vi)
709{ 715{
710 reiserfs_warning(NULL, 716 reiserfs_warning(NULL, "green-16010",
711 "green-16010: Invalid item type observed, run fsck ASAP"); 717 "Invalid item type observed, run fsck ASAP");
712 return 0; 718 return 0;
713} 719}
714 720
715static void errcatch_print_vi(struct virtual_item *vi) 721static void errcatch_print_vi(struct virtual_item *vi)
716{ 722{
717 reiserfs_warning(NULL, 723 reiserfs_warning(NULL, "green-16011",
718 "green-16011: Invalid item type observed, run fsck ASAP"); 724 "Invalid item type observed, run fsck ASAP");
719} 725}
720 726
721static struct item_operations errcatch_ops = { 727static struct item_operations errcatch_ops = {
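All of the errcatch_* stubs above exist to back a single dispatch-table slot: if on-disk corruption yields an item type with no valid operations vector, every method lands on a loud warner instead of a wild pointer. A toy version of that pattern follows; the names, table layout, and bounds check are illustrative, not the kernel's.

#include <stdio.h>

struct item_ops {
	int (*bytes_number)(int len);
};

static int sd_bytes_number(int len)
{
	return len;	/* a "real" handler just computes its answer */
}

static int errcatch_bytes_number(int len)
{
	(void)len;
	fprintf(stderr, "invalid item type observed, run fsck ASAP\n");
	return 0;
}

static const struct item_ops sd_ops       = { sd_bytes_number };
static const struct item_ops errcatch_ops = { errcatch_bytes_number };

/* one slot per known type, plus the error catcher in the last slot */
static const struct item_ops *ops_table[] = { &sd_ops, &errcatch_ops };

#define KNOWN_TYPES 1

static const struct item_ops *ops_for(unsigned int type)
{
	/* corrupt or unknown type ids all land on errcatch_ops */
	return ops_table[type < KNOWN_TYPES ? type : KNOWN_TYPES];
}

int main(void)
{
	printf("%d\n", ops_for(0)->bytes_number(42)); /* valid item */
	ops_for(7)->bytes_number(42);                 /* corrupt item */
	return 0;
}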
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 9643c3bbeb3b..77f5bb746bf0 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1,36 +1,36 @@
1/* 1/*
2** Write ahead logging implementation copyright Chris Mason 2000 2** Write ahead logging implementation copyright Chris Mason 2000
3** 3**
 4** The background commits make this code very interrelated, and 4** The background commits make this code very interrelated, and
 5** overly complex. I need to rethink things a bit.... The major players: 5** overly complex. I need to rethink things a bit.... The major players:
6** 6**
7** journal_begin -- call with the number of blocks you expect to log. 7** journal_begin -- call with the number of blocks you expect to log.
8** If the current transaction is too 8** If the current transaction is too
9** old, it will block until the current transaction is 9** old, it will block until the current transaction is
10** finished, and then start a new one. 10** finished, and then start a new one.
11** Usually, your transaction will get joined in with 11** Usually, your transaction will get joined in with
12** previous ones for speed. 12** previous ones for speed.
13** 13**
14** journal_join -- same as journal_begin, but won't block on the current 14** journal_join -- same as journal_begin, but won't block on the current
15** transaction regardless of age. Don't ever call 15** transaction regardless of age. Don't ever call
16** this. Ever. There are only two places it should be 16** this. Ever. There are only two places it should be
17** called from, and they are both inside this file. 17** called from, and they are both inside this file.
18** 18**
19** journal_mark_dirty -- adds blocks into this transaction. clears any flags 19** journal_mark_dirty -- adds blocks into this transaction. clears any flags
20** that might make them get sent to disk 20** that might make them get sent to disk
21** and then marks them BH_JDirty. Puts the buffer head 21** and then marks them BH_JDirty. Puts the buffer head
22** into the current transaction hash. 22** into the current transaction hash.
23** 23**
24** journal_end -- if the current transaction is batchable, it does nothing 24** journal_end -- if the current transaction is batchable, it does nothing
25** otherwise, it could do an async/synchronous commit, or 25** otherwise, it could do an async/synchronous commit, or
26** a full flush of all log and real blocks in the 26** a full flush of all log and real blocks in the
27** transaction. 27** transaction.
28** 28**
29** flush_old_commits -- if the current transaction is too old, it is ended and 29** flush_old_commits -- if the current transaction is too old, it is ended and
30** commit blocks are sent to disk. Forces commit blocks 30** commit blocks are sent to disk. Forces commit blocks
31** to disk for all backgrounded commits that have been 31** to disk for all backgrounded commits that have been
32** around too long. 32** around too long.
 33** -- Note, if you call this as an immediate flush 33** -- Note, if you call this as an immediate flush
34** from within kupdate, it will ignore the immediate flag 34** from within kupdate, it will ignore the immediate flag
35*/ 35*/
36 36
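The comment block above is the whole contract: reserve log space with journal_begin, log modified buffers with journal_mark_dirty, and close with a matching journal_end. Below is a compilable sketch of that call pattern with stub bodies; the real functions take reiserfs-specific handle and super_block types, so these stand-ins only show the flow, not the kernel signatures.

#include <stdio.h>

struct super_block { int dummy; };
struct buffer_head { int dummy; };
struct txn_handle  { int blocks; };

static int journal_begin(struct txn_handle *th, struct super_block *sb,
			 unsigned long nblocks)
{
	(void)sb;
	th->blocks = (int)nblocks;	/* reserve log space */
	return 0;
}

static int journal_mark_dirty(struct txn_handle *th, struct super_block *sb,
			      struct buffer_head *bh)
{
	(void)th; (void)sb; (void)bh;	/* add bh to the transaction hash */
	return 0;
}

static int journal_end(struct txn_handle *th, struct super_block *sb,
		       unsigned long nblocks)
{
	(void)sb; (void)nblocks;
	th->blocks = 0;			/* commit (or batch) the transaction */
	return 0;
}

int main(void)
{
	struct super_block sb;
	struct buffer_head bh;
	struct txn_handle th;

	if (journal_begin(&th, &sb, 2))		/* blocks we expect to log */
		return 1;
	journal_mark_dirty(&th, &sb, &bh);	/* log the modified buffer */
	return journal_end(&th, &sb, 2);	/* matching nblocks on end */
}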
@@ -97,7 +97,7 @@ static int flush_commit_list(struct super_block *s,
97 struct reiserfs_journal_list *jl, int flushall); 97 struct reiserfs_journal_list *jl, int flushall);
98static int can_dirty(struct reiserfs_journal_cnode *cn); 98static int can_dirty(struct reiserfs_journal_cnode *cn);
99static int journal_join(struct reiserfs_transaction_handle *th, 99static int journal_join(struct reiserfs_transaction_handle *th,
100 struct super_block *p_s_sb, unsigned long nblocks); 100 struct super_block *sb, unsigned long nblocks);
101static int release_journal_dev(struct super_block *super, 101static int release_journal_dev(struct super_block *super,
102 struct reiserfs_journal *journal); 102 struct reiserfs_journal *journal);
103static int dirty_one_transaction(struct super_block *s, 103static int dirty_one_transaction(struct super_block *s,
@@ -113,12 +113,12 @@ enum {
113}; 113};
114 114
115static int do_journal_begin_r(struct reiserfs_transaction_handle *th, 115static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
116 struct super_block *p_s_sb, 116 struct super_block *sb,
117 unsigned long nblocks, int join); 117 unsigned long nblocks, int join);
118 118
119static void init_journal_hash(struct super_block *p_s_sb) 119static void init_journal_hash(struct super_block *sb)
120{ 120{
121 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 121 struct reiserfs_journal *journal = SB_JOURNAL(sb);
122 memset(journal->j_hash_table, 0, 122 memset(journal->j_hash_table, 0,
123 JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); 123 JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
124} 124}
@@ -145,7 +145,7 @@ static void disable_barrier(struct super_block *s)
145} 145}
146 146
147static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block 147static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
148 *p_s_sb) 148 *sb)
149{ 149{
150 struct reiserfs_bitmap_node *bn; 150 struct reiserfs_bitmap_node *bn;
151 static int id; 151 static int id;
@@ -154,7 +154,7 @@ static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
154 if (!bn) { 154 if (!bn) {
155 return NULL; 155 return NULL;
156 } 156 }
157 bn->data = kzalloc(p_s_sb->s_blocksize, GFP_NOFS); 157 bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
158 if (!bn->data) { 158 if (!bn->data) {
159 kfree(bn); 159 kfree(bn);
160 return NULL; 160 return NULL;
@@ -164,9 +164,9 @@ static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
164 return bn; 164 return bn;
165} 165}
166 166
167static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *p_s_sb) 167static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
168{ 168{
169 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 169 struct reiserfs_journal *journal = SB_JOURNAL(sb);
170 struct reiserfs_bitmap_node *bn = NULL; 170 struct reiserfs_bitmap_node *bn = NULL;
171 struct list_head *entry = journal->j_bitmap_nodes.next; 171 struct list_head *entry = journal->j_bitmap_nodes.next;
172 172
@@ -176,21 +176,21 @@ static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *p_s_sb)
176 if (entry != &journal->j_bitmap_nodes) { 176 if (entry != &journal->j_bitmap_nodes) {
177 bn = list_entry(entry, struct reiserfs_bitmap_node, list); 177 bn = list_entry(entry, struct reiserfs_bitmap_node, list);
178 list_del(entry); 178 list_del(entry);
179 memset(bn->data, 0, p_s_sb->s_blocksize); 179 memset(bn->data, 0, sb->s_blocksize);
180 journal->j_free_bitmap_nodes--; 180 journal->j_free_bitmap_nodes--;
181 return bn; 181 return bn;
182 } 182 }
183 bn = allocate_bitmap_node(p_s_sb); 183 bn = allocate_bitmap_node(sb);
184 if (!bn) { 184 if (!bn) {
185 yield(); 185 yield();
186 goto repeat; 186 goto repeat;
187 } 187 }
188 return bn; 188 return bn;
189} 189}
190static inline void free_bitmap_node(struct super_block *p_s_sb, 190static inline void free_bitmap_node(struct super_block *sb,
191 struct reiserfs_bitmap_node *bn) 191 struct reiserfs_bitmap_node *bn)
192{ 192{
193 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 193 struct reiserfs_journal *journal = SB_JOURNAL(sb);
194 journal->j_used_bitmap_nodes--; 194 journal->j_used_bitmap_nodes--;
195 if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { 195 if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
196 kfree(bn->data); 196 kfree(bn->data);
@@ -201,46 +201,46 @@ static inline void free_bitmap_node(struct super_block *p_s_sb,
201 } 201 }
202} 202}
203 203
204static void allocate_bitmap_nodes(struct super_block *p_s_sb) 204static void allocate_bitmap_nodes(struct super_block *sb)
205{ 205{
206 int i; 206 int i;
207 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 207 struct reiserfs_journal *journal = SB_JOURNAL(sb);
208 struct reiserfs_bitmap_node *bn = NULL; 208 struct reiserfs_bitmap_node *bn = NULL;
209 for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) { 209 for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
210 bn = allocate_bitmap_node(p_s_sb); 210 bn = allocate_bitmap_node(sb);
211 if (bn) { 211 if (bn) {
212 list_add(&bn->list, &journal->j_bitmap_nodes); 212 list_add(&bn->list, &journal->j_bitmap_nodes);
213 journal->j_free_bitmap_nodes++; 213 journal->j_free_bitmap_nodes++;
214 } else { 214 } else {
215 break; // this is ok, we'll try again when more are needed 215 break; /* this is ok, we'll try again when more are needed */
216 } 216 }
217 } 217 }
218} 218}
219 219
220static int set_bit_in_list_bitmap(struct super_block *p_s_sb, 220static int set_bit_in_list_bitmap(struct super_block *sb,
221 b_blocknr_t block, 221 b_blocknr_t block,
222 struct reiserfs_list_bitmap *jb) 222 struct reiserfs_list_bitmap *jb)
223{ 223{
224 unsigned int bmap_nr = block / (p_s_sb->s_blocksize << 3); 224 unsigned int bmap_nr = block / (sb->s_blocksize << 3);
225 unsigned int bit_nr = block % (p_s_sb->s_blocksize << 3); 225 unsigned int bit_nr = block % (sb->s_blocksize << 3);
226 226
227 if (!jb->bitmaps[bmap_nr]) { 227 if (!jb->bitmaps[bmap_nr]) {
228 jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb); 228 jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
229 } 229 }
230 set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data); 230 set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
231 return 0; 231 return 0;
232} 232}
233 233
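set_bit_in_list_bitmap() above spreads a block number over bitmap nodes of blocksize << 3 bits each (one byte holds eight bits). A sketch of that split, together with the inverse that reiserfs_in_journal() uses further down (bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr); the block and blocksize values are hypothetical:

#include <stdio.h>

int main(void)
{
	unsigned long blocksize     = 4096;           /* hypothetical sb->s_blocksize */
	unsigned long bits_per_node = blocksize << 3; /* 8 bits per byte */
	unsigned long block         = 100000;         /* hypothetical block number */

	/* split, as in set_bit_in_list_bitmap() */
	unsigned long bmap_nr = block / bits_per_node;
	unsigned long bit_nr  = block % bits_per_node;

	/* rejoin, as in reiserfs_in_journal() */
	unsigned long bl = bmap_nr * bits_per_node + bit_nr;

	printf("block %lu -> node %lu bit %lu -> %lu\n",
	       block, bmap_nr, bit_nr, bl);
	return 0;
}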
234static void cleanup_bitmap_list(struct super_block *p_s_sb, 234static void cleanup_bitmap_list(struct super_block *sb,
235 struct reiserfs_list_bitmap *jb) 235 struct reiserfs_list_bitmap *jb)
236{ 236{
237 int i; 237 int i;
238 if (jb->bitmaps == NULL) 238 if (jb->bitmaps == NULL)
239 return; 239 return;
240 240
241 for (i = 0; i < reiserfs_bmap_count(p_s_sb); i++) { 241 for (i = 0; i < reiserfs_bmap_count(sb); i++) {
242 if (jb->bitmaps[i]) { 242 if (jb->bitmaps[i]) {
243 free_bitmap_node(p_s_sb, jb->bitmaps[i]); 243 free_bitmap_node(sb, jb->bitmaps[i]);
244 jb->bitmaps[i] = NULL; 244 jb->bitmaps[i] = NULL;
245 } 245 }
246 } 246 }
@@ -249,7 +249,7 @@ static void cleanup_bitmap_list(struct super_block *p_s_sb,
249/* 249/*
250** only call this on FS unmount. 250** only call this on FS unmount.
251*/ 251*/
252static int free_list_bitmaps(struct super_block *p_s_sb, 252static int free_list_bitmaps(struct super_block *sb,
253 struct reiserfs_list_bitmap *jb_array) 253 struct reiserfs_list_bitmap *jb_array)
254{ 254{
255 int i; 255 int i;
@@ -257,16 +257,16 @@ static int free_list_bitmaps(struct super_block *p_s_sb,
257 for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { 257 for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
258 jb = jb_array + i; 258 jb = jb_array + i;
259 jb->journal_list = NULL; 259 jb->journal_list = NULL;
260 cleanup_bitmap_list(p_s_sb, jb); 260 cleanup_bitmap_list(sb, jb);
261 vfree(jb->bitmaps); 261 vfree(jb->bitmaps);
262 jb->bitmaps = NULL; 262 jb->bitmaps = NULL;
263 } 263 }
264 return 0; 264 return 0;
265} 265}
266 266
267static int free_bitmap_nodes(struct super_block *p_s_sb) 267static int free_bitmap_nodes(struct super_block *sb)
268{ 268{
269 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 269 struct reiserfs_journal *journal = SB_JOURNAL(sb);
270 struct list_head *next = journal->j_bitmap_nodes.next; 270 struct list_head *next = journal->j_bitmap_nodes.next;
271 struct reiserfs_bitmap_node *bn; 271 struct reiserfs_bitmap_node *bn;
272 272
@@ -283,10 +283,10 @@ static int free_bitmap_nodes(struct super_block *p_s_sb)
283} 283}
284 284
285/* 285/*
286** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. 286** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
287** jb_array is the array to be filled in. 287** jb_array is the array to be filled in.
288*/ 288*/
289int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, 289int reiserfs_allocate_list_bitmaps(struct super_block *sb,
290 struct reiserfs_list_bitmap *jb_array, 290 struct reiserfs_list_bitmap *jb_array,
291 unsigned int bmap_nr) 291 unsigned int bmap_nr)
292{ 292{
@@ -300,30 +300,30 @@ int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb,
300 jb->journal_list = NULL; 300 jb->journal_list = NULL;
301 jb->bitmaps = vmalloc(mem); 301 jb->bitmaps = vmalloc(mem);
302 if (!jb->bitmaps) { 302 if (!jb->bitmaps) {
303 reiserfs_warning(p_s_sb, 303 reiserfs_warning(sb, "clm-2000", "unable to "
304 "clm-2000, unable to allocate bitmaps for journal lists"); 304 "allocate bitmaps for journal lists");
305 failed = 1; 305 failed = 1;
306 break; 306 break;
307 } 307 }
308 memset(jb->bitmaps, 0, mem); 308 memset(jb->bitmaps, 0, mem);
309 } 309 }
310 if (failed) { 310 if (failed) {
311 free_list_bitmaps(p_s_sb, jb_array); 311 free_list_bitmaps(sb, jb_array);
312 return -1; 312 return -1;
313 } 313 }
314 return 0; 314 return 0;
315} 315}
316 316
317/* 317/*
318** find an available list bitmap. If you can't find one, flush a commit list 318** find an available list bitmap. If you can't find one, flush a commit list
319** and try again 319** and try again
320*/ 320*/
321static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb, 321static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
322 struct reiserfs_journal_list 322 struct reiserfs_journal_list
323 *jl) 323 *jl)
324{ 324{
325 int i, j; 325 int i, j;
326 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 326 struct reiserfs_journal *journal = SB_JOURNAL(sb);
327 struct reiserfs_list_bitmap *jb = NULL; 327 struct reiserfs_list_bitmap *jb = NULL;
328 328
329 for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) { 329 for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
@@ -331,7 +331,7 @@ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb,
331 journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS; 331 journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
332 jb = journal->j_list_bitmap + i; 332 jb = journal->j_list_bitmap + i;
333 if (journal->j_list_bitmap[i].journal_list) { 333 if (journal->j_list_bitmap[i].journal_list) {
334 flush_commit_list(p_s_sb, 334 flush_commit_list(sb,
335 journal->j_list_bitmap[i]. 335 journal->j_list_bitmap[i].
336 journal_list, 1); 336 journal_list, 1);
337 if (!journal->j_list_bitmap[i].journal_list) { 337 if (!journal->j_list_bitmap[i].journal_list) {
@@ -348,7 +348,7 @@ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb,
348 return jb; 348 return jb;
349} 349}
350 350
351/* 351/*
352** allocates a new chunk of X nodes, and links them all together as a list. 352** allocates a new chunk of X nodes, and links them all together as a list.
353** Uses the cnode->next and cnode->prev pointers 353** Uses the cnode->next and cnode->prev pointers
354** returns NULL on failure 354** returns NULL on failure
@@ -376,14 +376,14 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
376} 376}
377 377
378/* 378/*
379** pulls a cnode off the free list, or returns NULL on failure 379** pulls a cnode off the free list, or returns NULL on failure
380*/ 380*/
381static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) 381static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
382{ 382{
383 struct reiserfs_journal_cnode *cn; 383 struct reiserfs_journal_cnode *cn;
384 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 384 struct reiserfs_journal *journal = SB_JOURNAL(sb);
385 385
386 reiserfs_check_lock_depth(p_s_sb, "get_cnode"); 386 reiserfs_check_lock_depth(sb, "get_cnode");
387 387
388 if (journal->j_cnode_free <= 0) { 388 if (journal->j_cnode_free <= 0) {
389 return NULL; 389 return NULL;
@@ -403,14 +403,14 @@ static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb)
403} 403}
404 404
405/* 405/*
406** returns a cnode to the free list 406** returns a cnode to the free list
407*/ 407*/
408static void free_cnode(struct super_block *p_s_sb, 408static void free_cnode(struct super_block *sb,
409 struct reiserfs_journal_cnode *cn) 409 struct reiserfs_journal_cnode *cn)
410{ 410{
411 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 411 struct reiserfs_journal *journal = SB_JOURNAL(sb);
412 412
413 reiserfs_check_lock_depth(p_s_sb, "free_cnode"); 413 reiserfs_check_lock_depth(sb, "free_cnode");
414 414
415 journal->j_cnode_used--; 415 journal->j_cnode_used--;
416 journal->j_cnode_free++; 416 journal->j_cnode_free++;
@@ -436,8 +436,8 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
436{ 436{
437#ifdef CONFIG_SMP 437#ifdef CONFIG_SMP
438 if (current->lock_depth < 0) { 438 if (current->lock_depth < 0) {
439 reiserfs_panic(sb, "%s called without kernel lock held", 439 reiserfs_panic(sb, "journal-1", "%s called without kernel "
440 caller); 440 "lock held", caller);
441 } 441 }
442#else 442#else
443 ; 443 ;
@@ -481,11 +481,11 @@ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
481** reject it on the next call to reiserfs_in_journal 481** reject it on the next call to reiserfs_in_journal
482** 482**
483*/ 483*/
484int reiserfs_in_journal(struct super_block *p_s_sb, 484int reiserfs_in_journal(struct super_block *sb,
485 unsigned int bmap_nr, int bit_nr, int search_all, 485 unsigned int bmap_nr, int bit_nr, int search_all,
486 b_blocknr_t * next_zero_bit) 486 b_blocknr_t * next_zero_bit)
487{ 487{
488 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 488 struct reiserfs_journal *journal = SB_JOURNAL(sb);
489 struct reiserfs_journal_cnode *cn; 489 struct reiserfs_journal_cnode *cn;
490 struct reiserfs_list_bitmap *jb; 490 struct reiserfs_list_bitmap *jb;
491 int i; 491 int i;
@@ -493,14 +493,14 @@ int reiserfs_in_journal(struct super_block *p_s_sb,
493 493
494 *next_zero_bit = 0; /* always start this at zero. */ 494 *next_zero_bit = 0; /* always start this at zero. */
495 495
496 PROC_INFO_INC(p_s_sb, journal.in_journal); 496 PROC_INFO_INC(sb, journal.in_journal);
497 /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. 497 /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
498 ** if we crash before the transaction that freed it commits, this transaction won't 498 ** if we crash before the transaction that freed it commits, this transaction won't
499 ** have committed either, and the block will never be written 499 ** have committed either, and the block will never be written
500 */ 500 */
501 if (search_all) { 501 if (search_all) {
502 for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { 502 for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
503 PROC_INFO_INC(p_s_sb, journal.in_journal_bitmap); 503 PROC_INFO_INC(sb, journal.in_journal_bitmap);
504 jb = journal->j_list_bitmap + i; 504 jb = journal->j_list_bitmap + i;
505 if (jb->journal_list && jb->bitmaps[bmap_nr] && 505 if (jb->journal_list && jb->bitmaps[bmap_nr] &&
506 test_bit(bit_nr, 506 test_bit(bit_nr,
@@ -510,28 +510,28 @@ int reiserfs_in_journal(struct super_block *p_s_sb,
510 find_next_zero_bit((unsigned long *) 510 find_next_zero_bit((unsigned long *)
511 (jb->bitmaps[bmap_nr]-> 511 (jb->bitmaps[bmap_nr]->
512 data), 512 data),
513 p_s_sb->s_blocksize << 3, 513 sb->s_blocksize << 3,
514 bit_nr + 1); 514 bit_nr + 1);
515 return 1; 515 return 1;
516 } 516 }
517 } 517 }
518 } 518 }
519 519
520 bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr; 520 bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
521 /* is it in any old transactions? */ 521 /* is it in any old transactions? */
522 if (search_all 522 if (search_all
523 && (cn = 523 && (cn =
524 get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))) { 524 get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
525 return 1; 525 return 1;
526 } 526 }
527 527
528 /* is it in the current transaction. This should never happen */ 528 /* is it in the current transaction. This should never happen */
529 if ((cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl))) { 529 if ((cn = get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
530 BUG(); 530 BUG();
531 return 1; 531 return 1;
532 } 532 }
533 533
534 PROC_INFO_INC(p_s_sb, journal.in_journal_reusable); 534 PROC_INFO_INC(sb, journal.in_journal_reusable);
535 /* safe for reuse */ 535 /* safe for reuse */
536 return 0; 536 return 0;
537} 537}
@@ -553,16 +553,16 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
553} 553}
554 554
555/* lock the current transaction */ 555/* lock the current transaction */
556static inline void lock_journal(struct super_block *p_s_sb) 556static inline void lock_journal(struct super_block *sb)
557{ 557{
558 PROC_INFO_INC(p_s_sb, journal.lock_journal); 558 PROC_INFO_INC(sb, journal.lock_journal);
559 mutex_lock(&SB_JOURNAL(p_s_sb)->j_mutex); 559 mutex_lock(&SB_JOURNAL(sb)->j_mutex);
560} 560}
561 561
562/* unlock the current transaction */ 562/* unlock the current transaction */
563static inline void unlock_journal(struct super_block *p_s_sb) 563static inline void unlock_journal(struct super_block *sb)
564{ 564{
565 mutex_unlock(&SB_JOURNAL(p_s_sb)->j_mutex); 565 mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
566} 566}
567 567
568static inline void get_journal_list(struct reiserfs_journal_list *jl) 568static inline void get_journal_list(struct reiserfs_journal_list *jl)
@@ -574,7 +574,7 @@ static inline void put_journal_list(struct super_block *s,
574 struct reiserfs_journal_list *jl) 574 struct reiserfs_journal_list *jl)
575{ 575{
576 if (jl->j_refcount < 1) { 576 if (jl->j_refcount < 1) {
577 reiserfs_panic(s, "trans id %lu, refcount at %d", 577 reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
578 jl->j_trans_id, jl->j_refcount); 578 jl->j_trans_id, jl->j_refcount);
579 } 579 }
580 if (--jl->j_refcount == 0) 580 if (--jl->j_refcount == 0)
@@ -586,20 +586,20 @@ static inline void put_journal_list(struct super_block *s,
586** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a 586** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a
587** transaction. 587** transaction.
588*/ 588*/
589static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, 589static void cleanup_freed_for_journal_list(struct super_block *sb,
590 struct reiserfs_journal_list *jl) 590 struct reiserfs_journal_list *jl)
591{ 591{
592 592
593 struct reiserfs_list_bitmap *jb = jl->j_list_bitmap; 593 struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
594 if (jb) { 594 if (jb) {
595 cleanup_bitmap_list(p_s_sb, jb); 595 cleanup_bitmap_list(sb, jb);
596 } 596 }
597 jl->j_list_bitmap->journal_list = NULL; 597 jl->j_list_bitmap->journal_list = NULL;
598 jl->j_list_bitmap = NULL; 598 jl->j_list_bitmap = NULL;
599} 599}
600 600
601static int journal_list_still_alive(struct super_block *s, 601static int journal_list_still_alive(struct super_block *s,
602 unsigned long trans_id) 602 unsigned int trans_id)
603{ 603{
604 struct reiserfs_journal *journal = SB_JOURNAL(s); 604 struct reiserfs_journal *journal = SB_JOURNAL(s);
605 struct list_head *entry = &journal->j_journal_list; 605 struct list_head *entry = &journal->j_journal_list;
@@ -644,8 +644,8 @@ static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
644 char b[BDEVNAME_SIZE]; 644 char b[BDEVNAME_SIZE];
645 645
646 if (buffer_journaled(bh)) { 646 if (buffer_journaled(bh)) {
647 reiserfs_warning(NULL, 647 reiserfs_warning(NULL, "clm-2084",
648 "clm-2084: pinned buffer %lu:%s sent to disk", 648 "pinned buffer %lu:%s sent to disk",
649 bh->b_blocknr, bdevname(bh->b_bdev, b)); 649 bh->b_blocknr, bdevname(bh->b_bdev, b));
650 } 650 }
651 if (uptodate) 651 if (uptodate)
@@ -933,9 +933,9 @@ static int flush_older_commits(struct super_block *s,
933 struct reiserfs_journal_list *other_jl; 933 struct reiserfs_journal_list *other_jl;
934 struct reiserfs_journal_list *first_jl; 934 struct reiserfs_journal_list *first_jl;
935 struct list_head *entry; 935 struct list_head *entry;
936 unsigned long trans_id = jl->j_trans_id; 936 unsigned int trans_id = jl->j_trans_id;
937 unsigned long other_trans_id; 937 unsigned int other_trans_id;
938 unsigned long first_trans_id; 938 unsigned int first_trans_id;
939 939
940 find_first: 940 find_first:
941 /* 941 /*
@@ -1014,7 +1014,7 @@ static int flush_commit_list(struct super_block *s,
1014 int i; 1014 int i;
1015 b_blocknr_t bn; 1015 b_blocknr_t bn;
1016 struct buffer_head *tbh = NULL; 1016 struct buffer_head *tbh = NULL;
1017 unsigned long trans_id = jl->j_trans_id; 1017 unsigned int trans_id = jl->j_trans_id;
1018 struct reiserfs_journal *journal = SB_JOURNAL(s); 1018 struct reiserfs_journal *journal = SB_JOURNAL(s);
1019 int barrier = 0; 1019 int barrier = 0;
1020 int retval = 0; 1020 int retval = 0;
@@ -1122,7 +1122,8 @@ static int flush_commit_list(struct super_block *s,
1122 sync_dirty_buffer(tbh); 1122 sync_dirty_buffer(tbh);
1123 if (unlikely(!buffer_uptodate(tbh))) { 1123 if (unlikely(!buffer_uptodate(tbh))) {
1124#ifdef CONFIG_REISERFS_CHECK 1124#ifdef CONFIG_REISERFS_CHECK
1125 reiserfs_warning(s, "journal-601, buffer write failed"); 1125 reiserfs_warning(s, "journal-601",
1126 "buffer write failed");
1126#endif 1127#endif
1127 retval = -EIO; 1128 retval = -EIO;
1128 } 1129 }
@@ -1154,14 +1155,14 @@ static int flush_commit_list(struct super_block *s,
1154 * up propagating the write error out to the filesystem. */ 1155 * up propagating the write error out to the filesystem. */
1155 if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { 1156 if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
1156#ifdef CONFIG_REISERFS_CHECK 1157#ifdef CONFIG_REISERFS_CHECK
1157 reiserfs_warning(s, "journal-615: buffer write failed"); 1158 reiserfs_warning(s, "journal-615", "buffer write failed");
1158#endif 1159#endif
1159 retval = -EIO; 1160 retval = -EIO;
1160 } 1161 }
1161 bforget(jl->j_commit_bh); 1162 bforget(jl->j_commit_bh);
1162 if (journal->j_last_commit_id != 0 && 1163 if (journal->j_last_commit_id != 0 &&
1163 (jl->j_trans_id - journal->j_last_commit_id) != 1) { 1164 (jl->j_trans_id - journal->j_last_commit_id) != 1) {
1164 reiserfs_warning(s, "clm-2200: last commit %lu, current %lu", 1165 reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
1165 journal->j_last_commit_id, jl->j_trans_id); 1166 journal->j_last_commit_id, jl->j_trans_id);
1166 } 1167 }
1167 journal->j_last_commit_id = jl->j_trans_id; 1168 journal->j_last_commit_id = jl->j_trans_id;
@@ -1191,8 +1192,8 @@ static int flush_commit_list(struct super_block *s,
1191} 1192}
1192 1193
1193/* 1194/*
1194** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or 1195** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or
1195** returns NULL if it can't find anything 1196** returns NULL if it can't find anything
1196*/ 1197*/
1197static struct reiserfs_journal_list *find_newer_jl_for_cn(struct 1198static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
1198 reiserfs_journal_cnode 1199 reiserfs_journal_cnode
@@ -1236,11 +1237,11 @@ static void remove_journal_hash(struct super_block *,
1236** journal list for this transaction. Aside from freeing the cnode, this also allows the 1237** journal list for this transaction. Aside from freeing the cnode, this also allows the
1237** block to be reallocated for data blocks if it had been deleted. 1238** block to be reallocated for data blocks if it had been deleted.
1238*/ 1239*/
1239static void remove_all_from_journal_list(struct super_block *p_s_sb, 1240static void remove_all_from_journal_list(struct super_block *sb,
1240 struct reiserfs_journal_list *jl, 1241 struct reiserfs_journal_list *jl,
1241 int debug) 1242 int debug)
1242{ 1243{
1243 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 1244 struct reiserfs_journal *journal = SB_JOURNAL(sb);
1244 struct reiserfs_journal_cnode *cn, *last; 1245 struct reiserfs_journal_cnode *cn, *last;
1245 cn = jl->j_realblock; 1246 cn = jl->j_realblock;
1246 1247
@@ -1250,18 +1251,18 @@ static void remove_all_from_journal_list(struct super_block *p_s_sb,
1250 while (cn) { 1251 while (cn) {
1251 if (cn->blocknr != 0) { 1252 if (cn->blocknr != 0) {
1252 if (debug) { 1253 if (debug) {
1253 reiserfs_warning(p_s_sb, 1254 reiserfs_warning(sb, "reiserfs-2201",
1254 "block %u, bh is %d, state %ld", 1255 "block %u, bh is %d, state %ld",
1255 cn->blocknr, cn->bh ? 1 : 0, 1256 cn->blocknr, cn->bh ? 1 : 0,
1256 cn->state); 1257 cn->state);
1257 } 1258 }
1258 cn->state = 0; 1259 cn->state = 0;
1259 remove_journal_hash(p_s_sb, journal->j_list_hash_table, 1260 remove_journal_hash(sb, journal->j_list_hash_table,
1260 jl, cn->blocknr, 1); 1261 jl, cn->blocknr, 1);
1261 } 1262 }
1262 last = cn; 1263 last = cn;
1263 cn = cn->next; 1264 cn = cn->next;
1264 free_cnode(p_s_sb, last); 1265 free_cnode(sb, last);
1265 } 1266 }
1266 jl->j_realblock = NULL; 1267 jl->j_realblock = NULL;
1267} 1268}
@@ -1273,12 +1274,12 @@ static void remove_all_from_journal_list(struct super_block *p_s_sb,
1273** called by flush_journal_list, before it calls remove_all_from_journal_list 1274** called by flush_journal_list, before it calls remove_all_from_journal_list
1274** 1275**
1275*/ 1276*/
1276static int _update_journal_header_block(struct super_block *p_s_sb, 1277static int _update_journal_header_block(struct super_block *sb,
1277 unsigned long offset, 1278 unsigned long offset,
1278 unsigned long trans_id) 1279 unsigned int trans_id)
1279{ 1280{
1280 struct reiserfs_journal_header *jh; 1281 struct reiserfs_journal_header *jh;
1281 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 1282 struct reiserfs_journal *journal = SB_JOURNAL(sb);
1282 1283
1283 if (reiserfs_is_journal_aborted(journal)) 1284 if (reiserfs_is_journal_aborted(journal))
1284 return -EIO; 1285 return -EIO;
@@ -1288,8 +1289,8 @@ static int _update_journal_header_block(struct super_block *p_s_sb,
1288 wait_on_buffer((journal->j_header_bh)); 1289 wait_on_buffer((journal->j_header_bh));
1289 if (unlikely(!buffer_uptodate(journal->j_header_bh))) { 1290 if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
1290#ifdef CONFIG_REISERFS_CHECK 1291#ifdef CONFIG_REISERFS_CHECK
1291 reiserfs_warning(p_s_sb, 1292 reiserfs_warning(sb, "journal-699",
1292 "journal-699: buffer write failed"); 1293 "buffer write failed");
1293#endif 1294#endif
1294 return -EIO; 1295 return -EIO;
1295 } 1296 }
@@ -1302,49 +1303,49 @@ static int _update_journal_header_block(struct super_block *p_s_sb,
1302 jh->j_first_unflushed_offset = cpu_to_le32(offset); 1303 jh->j_first_unflushed_offset = cpu_to_le32(offset);
1303 jh->j_mount_id = cpu_to_le32(journal->j_mount_id); 1304 jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
1304 1305
1305 if (reiserfs_barrier_flush(p_s_sb)) { 1306 if (reiserfs_barrier_flush(sb)) {
1306 int ret; 1307 int ret;
1307 lock_buffer(journal->j_header_bh); 1308 lock_buffer(journal->j_header_bh);
1308 ret = submit_barrier_buffer(journal->j_header_bh); 1309 ret = submit_barrier_buffer(journal->j_header_bh);
1309 if (ret == -EOPNOTSUPP) { 1310 if (ret == -EOPNOTSUPP) {
1310 set_buffer_uptodate(journal->j_header_bh); 1311 set_buffer_uptodate(journal->j_header_bh);
1311 disable_barrier(p_s_sb); 1312 disable_barrier(sb);
1312 goto sync; 1313 goto sync;
1313 } 1314 }
1314 wait_on_buffer(journal->j_header_bh); 1315 wait_on_buffer(journal->j_header_bh);
1315 check_barrier_completion(p_s_sb, journal->j_header_bh); 1316 check_barrier_completion(sb, journal->j_header_bh);
1316 } else { 1317 } else {
1317 sync: 1318 sync:
1318 set_buffer_dirty(journal->j_header_bh); 1319 set_buffer_dirty(journal->j_header_bh);
1319 sync_dirty_buffer(journal->j_header_bh); 1320 sync_dirty_buffer(journal->j_header_bh);
1320 } 1321 }
1321 if (!buffer_uptodate(journal->j_header_bh)) { 1322 if (!buffer_uptodate(journal->j_header_bh)) {
1322 reiserfs_warning(p_s_sb, 1323 reiserfs_warning(sb, "journal-837",
1323 "journal-837: IO error during journal replay"); 1324 "IO error during journal replay");
1324 return -EIO; 1325 return -EIO;
1325 } 1326 }
1326 } 1327 }
1327 return 0; 1328 return 0;
1328} 1329}
1329 1330
1330static int update_journal_header_block(struct super_block *p_s_sb, 1331static int update_journal_header_block(struct super_block *sb,
1331 unsigned long offset, 1332 unsigned long offset,
1332 unsigned long trans_id) 1333 unsigned int trans_id)
1333{ 1334{
1334 return _update_journal_header_block(p_s_sb, offset, trans_id); 1335 return _update_journal_header_block(sb, offset, trans_id);
1335} 1336}
1336 1337
1337/* 1338/*
1338** flush any and all journal lists older than you are 1339** flush any and all journal lists older than you are
1339** can only be called from flush_journal_list 1340** can only be called from flush_journal_list
1340*/ 1341*/
1341static int flush_older_journal_lists(struct super_block *p_s_sb, 1342static int flush_older_journal_lists(struct super_block *sb,
1342 struct reiserfs_journal_list *jl) 1343 struct reiserfs_journal_list *jl)
1343{ 1344{
1344 struct list_head *entry; 1345 struct list_head *entry;
1345 struct reiserfs_journal_list *other_jl; 1346 struct reiserfs_journal_list *other_jl;
1346 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 1347 struct reiserfs_journal *journal = SB_JOURNAL(sb);
1347 unsigned long trans_id = jl->j_trans_id; 1348 unsigned int trans_id = jl->j_trans_id;
1348 1349
1349 /* we know we are the only ones flushing things, no extra race 1350 /* we know we are the only ones flushing things, no extra race
1350 * protection is required. 1351 * protection is required.
@@ -1358,7 +1359,7 @@ static int flush_older_journal_lists(struct super_block *p_s_sb,
1358 if (other_jl->j_trans_id < trans_id) { 1359 if (other_jl->j_trans_id < trans_id) {
1359 BUG_ON(other_jl->j_refcount <= 0); 1360 BUG_ON(other_jl->j_refcount <= 0);
1360 /* do not flush all */ 1361 /* do not flush all */
1361 flush_journal_list(p_s_sb, other_jl, 0); 1362 flush_journal_list(sb, other_jl, 0);
1362 1363
1363 /* other_jl is now deleted from the list */ 1364 /* other_jl is now deleted from the list */
1364 goto restart; 1365 goto restart;
@@ -1381,8 +1382,8 @@ static void del_from_work_list(struct super_block *s,
1381** always set flushall to 1, unless you are calling from inside 1382** always set flushall to 1, unless you are calling from inside
1382** flush_journal_list 1383** flush_journal_list
1383** 1384**
1384** IMPORTANT. This can only be called while there are no journal writers, 1385** IMPORTANT. This can only be called while there are no journal writers,
1385** and the journal is locked. That means it can only be called from 1386** and the journal is locked. That means it can only be called from
1386** do_journal_end, or by journal_release 1387** do_journal_end, or by journal_release
1387*/ 1388*/
1388static int flush_journal_list(struct super_block *s, 1389static int flush_journal_list(struct super_block *s,
@@ -1401,8 +1402,7 @@ static int flush_journal_list(struct super_block *s,
1401 BUG_ON(j_len_saved <= 0); 1402 BUG_ON(j_len_saved <= 0);
1402 1403
1403 if (atomic_read(&journal->j_wcount) != 0) { 1404 if (atomic_read(&journal->j_wcount) != 0) {
1404 reiserfs_warning(s, 1405 reiserfs_warning(s, "clm-2048", "called with wcount %d",
1405 "clm-2048: flush_journal_list called with wcount %d",
1406 atomic_read(&journal->j_wcount)); 1406 atomic_read(&journal->j_wcount));
1407 } 1407 }
1408 BUG_ON(jl->j_trans_id == 0); 1408 BUG_ON(jl->j_trans_id == 0);
@@ -1416,8 +1416,7 @@ static int flush_journal_list(struct super_block *s,
1416 1416
1417 count = 0; 1417 count = 0;
1418 if (j_len_saved > journal->j_trans_max) { 1418 if (j_len_saved > journal->j_trans_max) {
1419 reiserfs_panic(s, 1419 reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu",
1420 "journal-715: flush_journal_list, length is %lu, trans id %lu\n",
1421 j_len_saved, jl->j_trans_id); 1420 j_len_saved, jl->j_trans_id);
1422 return 0; 1421 return 0;
1423 } 1422 }
@@ -1430,7 +1429,7 @@ static int flush_journal_list(struct super_block *s,
1430 goto flush_older_and_return; 1429 goto flush_older_and_return;
1431 } 1430 }
1432 1431
1433 /* start by putting the commit list on disk. This will also flush 1432 /* start by putting the commit list on disk. This will also flush
 1434 ** the commit lists of any older transactions 1433 ** the commit lists of any older transactions
1435 */ 1434 */
1436 flush_commit_list(s, jl, 1); 1435 flush_commit_list(s, jl, 1);
@@ -1445,12 +1444,12 @@ static int flush_journal_list(struct super_block *s,
1445 goto flush_older_and_return; 1444 goto flush_older_and_return;
1446 } 1445 }
1447 1446
1448 /* loop through each cnode, see if we need to write it, 1447 /* loop through each cnode, see if we need to write it,
1449 ** or wait on a more recent transaction, or just ignore it 1448 ** or wait on a more recent transaction, or just ignore it
1450 */ 1449 */
1451 if (atomic_read(&(journal->j_wcount)) != 0) { 1450 if (atomic_read(&(journal->j_wcount)) != 0) {
1452 reiserfs_panic(s, 1451 reiserfs_panic(s, "journal-844", "journal list is flushing, "
1453 "journal-844: panic journal list is flushing, wcount is not 0\n"); 1452 "wcount is not 0");
1454 } 1453 }
1455 cn = jl->j_realblock; 1454 cn = jl->j_realblock;
1456 while (cn) { 1455 while (cn) {
@@ -1474,8 +1473,8 @@ static int flush_journal_list(struct super_block *s,
1474 if (!pjl && cn->bh) { 1473 if (!pjl && cn->bh) {
1475 saved_bh = cn->bh; 1474 saved_bh = cn->bh;
1476 1475
1477 /* we do this to make sure nobody releases the buffer while 1476 /* we do this to make sure nobody releases the buffer while
1478 ** we are working with it 1477 ** we are working with it
1479 */ 1478 */
1480 get_bh(saved_bh); 1479 get_bh(saved_bh);
1481 1480
@@ -1498,8 +1497,8 @@ static int flush_journal_list(struct super_block *s,
1498 goto free_cnode; 1497 goto free_cnode;
1499 } 1498 }
1500 1499
1501 /* bh == NULL when the block got to disk on its own, OR, 1500 /* bh == NULL when the block got to disk on its own, OR,
1502 ** the block got freed in a future transaction 1501 ** the block got freed in a future transaction
1503 */ 1502 */
1504 if (saved_bh == NULL) { 1503 if (saved_bh == NULL) {
1505 goto free_cnode; 1504 goto free_cnode;
@@ -1510,8 +1509,8 @@ static int flush_journal_list(struct super_block *s,
1510 ** is not marked JDirty_wait 1509 ** is not marked JDirty_wait
1511 */ 1510 */
1512 if ((!was_jwait) && !buffer_locked(saved_bh)) { 1511 if ((!was_jwait) && !buffer_locked(saved_bh)) {
1513 reiserfs_warning(s, 1512 reiserfs_warning(s, "journal-813",
1514 "journal-813: BAD! buffer %llu %cdirty %cjwait, " 1513 "BAD! buffer %llu %cdirty %cjwait, "
1515 "not in a newer tranasction", 1514 "not in a newer tranasction",
1516 (unsigned long long)saved_bh-> 1515 (unsigned long long)saved_bh->
1517 b_blocknr, was_dirty ? ' ' : '!', 1516 b_blocknr, was_dirty ? ' ' : '!',
@@ -1529,8 +1528,8 @@ static int flush_journal_list(struct super_block *s,
1529 unlock_buffer(saved_bh); 1528 unlock_buffer(saved_bh);
1530 count++; 1529 count++;
1531 } else { 1530 } else {
1532 reiserfs_warning(s, 1531 reiserfs_warning(s, "clm-2082",
1533 "clm-2082: Unable to flush buffer %llu in %s", 1532 "Unable to flush buffer %llu in %s",
1534 (unsigned long long)saved_bh-> 1533 (unsigned long long)saved_bh->
1535 b_blocknr, __func__); 1534 b_blocknr, __func__);
1536 } 1535 }
@@ -1541,8 +1540,8 @@ static int flush_journal_list(struct super_block *s,
1541 /* we incremented this to keep others from taking the buffer head away */ 1540 /* we incremented this to keep others from taking the buffer head away */
1542 put_bh(saved_bh); 1541 put_bh(saved_bh);
1543 if (atomic_read(&(saved_bh->b_count)) < 0) { 1542 if (atomic_read(&(saved_bh->b_count)) < 0) {
1544 reiserfs_warning(s, 1543 reiserfs_warning(s, "journal-945",
1545 "journal-945: saved_bh->b_count < 0"); 1544 "saved_bh->b_count < 0");
1546 } 1545 }
1547 } 1546 }
1548 } 1547 }
@@ -1551,18 +1550,18 @@ static int flush_journal_list(struct super_block *s,
1551 while (cn) { 1550 while (cn) {
1552 if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { 1551 if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
1553 if (!cn->bh) { 1552 if (!cn->bh) {
1554 reiserfs_panic(s, 1553 reiserfs_panic(s, "journal-1011",
1555 "journal-1011: cn->bh is NULL\n"); 1554 "cn->bh is NULL");
1556 } 1555 }
1557 wait_on_buffer(cn->bh); 1556 wait_on_buffer(cn->bh);
1558 if (!cn->bh) { 1557 if (!cn->bh) {
1559 reiserfs_panic(s, 1558 reiserfs_panic(s, "journal-1012",
1560 "journal-1012: cn->bh is NULL\n"); 1559 "cn->bh is NULL");
1561 } 1560 }
1562 if (unlikely(!buffer_uptodate(cn->bh))) { 1561 if (unlikely(!buffer_uptodate(cn->bh))) {
1563#ifdef CONFIG_REISERFS_CHECK 1562#ifdef CONFIG_REISERFS_CHECK
1564 reiserfs_warning(s, 1563 reiserfs_warning(s, "journal-949",
1565 "journal-949: buffer write failed\n"); 1564 "buffer write failed");
1566#endif 1565#endif
1567 err = -EIO; 1566 err = -EIO;
1568 } 1567 }
@@ -1587,7 +1586,7 @@ static int flush_journal_list(struct super_block *s,
1587 __func__); 1586 __func__);
1588 flush_older_and_return: 1587 flush_older_and_return:
1589 1588
1590 /* before we can update the journal header block, we _must_ flush all 1589 /* before we can update the journal header block, we _must_ flush all
1591 ** real blocks from all older transactions to disk. This is because 1590 ** real blocks from all older transactions to disk. This is because
1592 ** once the header block is updated, this transaction will not be 1591 ** once the header block is updated, this transaction will not be
1593 ** replayed after a crash 1592 ** replayed after a crash
@@ -1597,7 +1596,7 @@ static int flush_journal_list(struct super_block *s,
1597 } 1596 }
1598 1597
1599 err = journal->j_errno; 1598 err = journal->j_errno;
1600 /* before we can remove everything from the hash tables for this 1599 /* before we can remove everything from the hash tables for this
1601 ** transaction, we must make sure it can never be replayed 1600 ** transaction, we must make sure it can never be replayed
1602 ** 1601 **
1603 ** since we are only called from do_journal_end, we know for sure there 1602 ** since we are only called from do_journal_end, we know for sure there
@@ -1623,7 +1622,7 @@ static int flush_journal_list(struct super_block *s,
1623 1622
1624 if (journal->j_last_flush_id != 0 && 1623 if (journal->j_last_flush_id != 0 &&
1625 (jl->j_trans_id - journal->j_last_flush_id) != 1) { 1624 (jl->j_trans_id - journal->j_last_flush_id) != 1) {
1626 reiserfs_warning(s, "clm-2201: last flush %lu, current %lu", 1625 reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu",
1627 journal->j_last_flush_id, jl->j_trans_id); 1626 journal->j_last_flush_id, jl->j_trans_id);
1628 } 1627 }
1629 journal->j_last_flush_id = jl->j_trans_id; 1628 journal->j_last_flush_id = jl->j_trans_id;
@@ -1758,13 +1757,13 @@ static int dirty_one_transaction(struct super_block *s,
1758static int kupdate_transactions(struct super_block *s, 1757static int kupdate_transactions(struct super_block *s,
1759 struct reiserfs_journal_list *jl, 1758 struct reiserfs_journal_list *jl,
1760 struct reiserfs_journal_list **next_jl, 1759 struct reiserfs_journal_list **next_jl,
1761 unsigned long *next_trans_id, 1760 unsigned int *next_trans_id,
1762 int num_blocks, int num_trans) 1761 int num_blocks, int num_trans)
1763{ 1762{
1764 int ret = 0; 1763 int ret = 0;
1765 int written = 0; 1764 int written = 0;
1766 int transactions_flushed = 0; 1765 int transactions_flushed = 0;
1767 unsigned long orig_trans_id = jl->j_trans_id; 1766 unsigned int orig_trans_id = jl->j_trans_id;
1768 struct buffer_chunk chunk; 1767 struct buffer_chunk chunk;
1769 struct list_head *entry; 1768 struct list_head *entry;
1770 struct reiserfs_journal *journal = SB_JOURNAL(s); 1769 struct reiserfs_journal *journal = SB_JOURNAL(s);
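The other change threaded through these hunks narrows journal transaction ids from unsigned long to unsigned int: the on-disk ids are 32-bit little-endian fields (see the le32_to_cpu() calls on jh->j_last_flush_trans_id later in this diff), so a 64-bit in-memory type only hid the wrap-around the replay code must cope with anyway. A wrap-tolerant ordering test in the usual jiffies style, shown here only to motivate the type change (this helper is not part of the patch):

    /* "a is newer than b" for 32-bit trans ids, valid while the two
     * ids are less than 2^31 apart */
    static int trans_id_after(unsigned int a, unsigned int b)
    {
            return (int)(a - b) > 0;
    }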
@@ -1833,7 +1832,7 @@ static int flush_used_journal_lists(struct super_block *s,
1833 int limit = 256; 1832 int limit = 256;
1834 struct reiserfs_journal_list *tjl; 1833 struct reiserfs_journal_list *tjl;
1835 struct reiserfs_journal_list *flush_jl; 1834 struct reiserfs_journal_list *flush_jl;
1836 unsigned long trans_id; 1835 unsigned int trans_id;
1837 struct reiserfs_journal *journal = SB_JOURNAL(s); 1836 struct reiserfs_journal *journal = SB_JOURNAL(s);
1838 1837
1839 flush_jl = tjl = jl; 1838 flush_jl = tjl = jl;
@@ -1909,22 +1908,22 @@ void remove_journal_hash(struct super_block *sb,
1909 } 1908 }
1910} 1909}
1911 1910
1912static void free_journal_ram(struct super_block *p_s_sb) 1911static void free_journal_ram(struct super_block *sb)
1913{ 1912{
1914 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 1913 struct reiserfs_journal *journal = SB_JOURNAL(sb);
1915 kfree(journal->j_current_jl); 1914 kfree(journal->j_current_jl);
1916 journal->j_num_lists--; 1915 journal->j_num_lists--;
1917 1916
1918 vfree(journal->j_cnode_free_orig); 1917 vfree(journal->j_cnode_free_orig);
1919 free_list_bitmaps(p_s_sb, journal->j_list_bitmap); 1918 free_list_bitmaps(sb, journal->j_list_bitmap);
1920 free_bitmap_nodes(p_s_sb); /* must be after free_list_bitmaps */ 1919 free_bitmap_nodes(sb); /* must be after free_list_bitmaps */
1921 if (journal->j_header_bh) { 1920 if (journal->j_header_bh) {
1922 brelse(journal->j_header_bh); 1921 brelse(journal->j_header_bh);
1923 } 1922 }
1924 /* j_header_bh is on the journal dev, make sure not to release the journal 1923 /* j_header_bh is on the journal dev, make sure not to release the journal
1925 * dev until we brelse j_header_bh 1924 * dev until we brelse j_header_bh
1926 */ 1925 */
1927 release_journal_dev(p_s_sb, journal); 1926 release_journal_dev(sb, journal);
1928 vfree(journal); 1927 vfree(journal);
1929} 1928}
1930 1929
@@ -1933,27 +1932,27 @@ static void free_journal_ram(struct super_block *p_s_sb)
1933** of read_super() yet. Any other caller must keep error at 0. 1932** of read_super() yet. Any other caller must keep error at 0.
1934*/ 1933*/
1935static int do_journal_release(struct reiserfs_transaction_handle *th, 1934static int do_journal_release(struct reiserfs_transaction_handle *th,
1936 struct super_block *p_s_sb, int error) 1935 struct super_block *sb, int error)
1937{ 1936{
1938 struct reiserfs_transaction_handle myth; 1937 struct reiserfs_transaction_handle myth;
1939 int flushed = 0; 1938 int flushed = 0;
1940 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 1939 struct reiserfs_journal *journal = SB_JOURNAL(sb);
1941 1940
1942 /* we only want to flush out transactions if we were called with error == 0 1941 /* we only want to flush out transactions if we were called with error == 0
1943 */ 1942 */
1944 if (!error && !(p_s_sb->s_flags & MS_RDONLY)) { 1943 if (!error && !(sb->s_flags & MS_RDONLY)) {
1945 /* end the current trans */ 1944 /* end the current trans */
1946 BUG_ON(!th->t_trans_id); 1945 BUG_ON(!th->t_trans_id);
1947 do_journal_end(th, p_s_sb, 10, FLUSH_ALL); 1946 do_journal_end(th, sb, 10, FLUSH_ALL);
1948 1947
1949 /* make sure something gets logged to force our way into the flush code */ 1948 /* make sure something gets logged to force our way into the flush code */
1950 if (!journal_join(&myth, p_s_sb, 1)) { 1949 if (!journal_join(&myth, sb, 1)) {
1951 reiserfs_prepare_for_journal(p_s_sb, 1950 reiserfs_prepare_for_journal(sb,
1952 SB_BUFFER_WITH_SB(p_s_sb), 1951 SB_BUFFER_WITH_SB(sb),
1953 1); 1952 1);
1954 journal_mark_dirty(&myth, p_s_sb, 1953 journal_mark_dirty(&myth, sb,
1955 SB_BUFFER_WITH_SB(p_s_sb)); 1954 SB_BUFFER_WITH_SB(sb));
1956 do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL); 1955 do_journal_end(&myth, sb, 1, FLUSH_ALL);
1957 flushed = 1; 1956 flushed = 1;
1958 } 1957 }
1959 } 1958 }
@@ -1961,26 +1960,26 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1961 /* this also catches errors during the do_journal_end above */ 1960 /* this also catches errors during the do_journal_end above */
1962 if (!error && reiserfs_is_journal_aborted(journal)) { 1961 if (!error && reiserfs_is_journal_aborted(journal)) {
1963 memset(&myth, 0, sizeof(myth)); 1962 memset(&myth, 0, sizeof(myth));
1964 if (!journal_join_abort(&myth, p_s_sb, 1)) { 1963 if (!journal_join_abort(&myth, sb, 1)) {
1965 reiserfs_prepare_for_journal(p_s_sb, 1964 reiserfs_prepare_for_journal(sb,
1966 SB_BUFFER_WITH_SB(p_s_sb), 1965 SB_BUFFER_WITH_SB(sb),
1967 1); 1966 1);
1968 journal_mark_dirty(&myth, p_s_sb, 1967 journal_mark_dirty(&myth, sb,
1969 SB_BUFFER_WITH_SB(p_s_sb)); 1968 SB_BUFFER_WITH_SB(sb));
1970 do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL); 1969 do_journal_end(&myth, sb, 1, FLUSH_ALL);
1971 } 1970 }
1972 } 1971 }
1973 1972
1974 reiserfs_mounted_fs_count--; 1973 reiserfs_mounted_fs_count--;
1975 /* wait for all commits to finish */ 1974 /* wait for all commits to finish */
1976 cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work); 1975 cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
1977 flush_workqueue(commit_wq); 1976 flush_workqueue(commit_wq);
1978 if (!reiserfs_mounted_fs_count) { 1977 if (!reiserfs_mounted_fs_count) {
1979 destroy_workqueue(commit_wq); 1978 destroy_workqueue(commit_wq);
1980 commit_wq = NULL; 1979 commit_wq = NULL;
1981 } 1980 }
1982 1981
1983 free_journal_ram(p_s_sb); 1982 free_journal_ram(sb);
1984 1983
1985 return 0; 1984 return 0;
1986} 1985}
@@ -1989,41 +1988,41 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1989** call on unmount. flush all journal trans, release all alloc'd ram 1988** call on unmount. flush all journal trans, release all alloc'd ram
1990*/ 1989*/
1991int journal_release(struct reiserfs_transaction_handle *th, 1990int journal_release(struct reiserfs_transaction_handle *th,
1992 struct super_block *p_s_sb) 1991 struct super_block *sb)
1993{ 1992{
1994 return do_journal_release(th, p_s_sb, 0); 1993 return do_journal_release(th, sb, 0);
1995} 1994}
1996 1995
1997/* 1996/*
1998** only call from an error condition inside reiserfs_read_super! 1997** only call from an error condition inside reiserfs_read_super!
1999*/ 1998*/
2000int journal_release_error(struct reiserfs_transaction_handle *th, 1999int journal_release_error(struct reiserfs_transaction_handle *th,
2001 struct super_block *p_s_sb) 2000 struct super_block *sb)
2002{ 2001{
2003 return do_journal_release(th, p_s_sb, 1); 2002 return do_journal_release(th, sb, 1);
2004} 2003}
2005 2004
2006/* compares description block with commit block. returns 1 if they differ, 0 if they are the same */ 2005/* compares description block with commit block. returns 1 if they differ, 0 if they are the same */
2007static int journal_compare_desc_commit(struct super_block *p_s_sb, 2006static int journal_compare_desc_commit(struct super_block *sb,
2008 struct reiserfs_journal_desc *desc, 2007 struct reiserfs_journal_desc *desc,
2009 struct reiserfs_journal_commit *commit) 2008 struct reiserfs_journal_commit *commit)
2010{ 2009{
2011 if (get_commit_trans_id(commit) != get_desc_trans_id(desc) || 2010 if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
2012 get_commit_trans_len(commit) != get_desc_trans_len(desc) || 2011 get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
2013 get_commit_trans_len(commit) > SB_JOURNAL(p_s_sb)->j_trans_max || 2012 get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
2014 get_commit_trans_len(commit) <= 0) { 2013 get_commit_trans_len(commit) <= 0) {
2015 return 1; 2014 return 1;
2016 } 2015 }
2017 return 0; 2016 return 0;
2018} 2017}
2019 2018
2020/* returns 0 if it did not find a description block 2019/* returns 0 if it did not find a description block
2021** returns -1 if it found a corrupt commit block 2020** returns -1 if it found a corrupt commit block
2022** returns 1 if both desc and commit were valid 2021** returns 1 if both desc and commit were valid
2023*/ 2022*/
2024static int journal_transaction_is_valid(struct super_block *p_s_sb, 2023static int journal_transaction_is_valid(struct super_block *sb,
2025 struct buffer_head *d_bh, 2024 struct buffer_head *d_bh,
2026 unsigned long *oldest_invalid_trans_id, 2025 unsigned int *oldest_invalid_trans_id,
2027 unsigned long *newest_mount_id) 2026 unsigned long *newest_mount_id)
2028{ 2027{
2029 struct reiserfs_journal_desc *desc; 2028 struct reiserfs_journal_desc *desc;
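The block comment above states the return contract of journal_transaction_is_valid(): 0 means no descriptor block at this offset, -1 means a corrupt commit block, 1 means descriptor and commit agree. The scan loop in journal_read() further down leans on exactly this contract; in outline (a simplified paraphrase, not a verbatim quote of that loop):

    ret = journal_transaction_is_valid(sb, d_bh,
                                       &oldest_invalid_trans_id,
                                       &newest_mount_id);
    if (ret == 1) {
            /* replayable: candidate for oldest_start/oldest_trans_id */
    } else if (ret == -1) {
            /* corrupt commit block: oldest_invalid_trans_id was updated */
    } else {
            /* ret == 0: no descriptor here, keep walking the log */
    }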
@@ -2039,7 +2038,7 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb,
2039 && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) { 2038 && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
2040 if (oldest_invalid_trans_id && *oldest_invalid_trans_id 2039 if (oldest_invalid_trans_id && *oldest_invalid_trans_id
2041 && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { 2040 && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
2042 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2041 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2043 "journal-986: transaction " 2042 "journal-986: transaction "
2044 "is valid returning because trans_id %d is greater than " 2043 "is valid returning because trans_id %d is greater than "
2045 "oldest_invalid %lu", 2044 "oldest_invalid %lu",
@@ -2049,7 +2048,7 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb,
2049 } 2048 }
2050 if (newest_mount_id 2049 if (newest_mount_id
2051 && *newest_mount_id > get_desc_mount_id(desc)) { 2050 && *newest_mount_id > get_desc_mount_id(desc)) {
2052 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2051 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2053 "journal-1087: transaction " 2052 "journal-1087: transaction "
2054 "is valid returning because mount_id %d is less than " 2053 "is valid returning because mount_id %d is less than "
2055 "newest_mount_id %lu", 2054 "newest_mount_id %lu",
@@ -2057,36 +2056,37 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb,
2057 *newest_mount_id); 2056 *newest_mount_id);
2058 return -1; 2057 return -1;
2059 } 2058 }
2060 if (get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max) { 2059 if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
2061 reiserfs_warning(p_s_sb, 2060 reiserfs_warning(sb, "journal-2018",
2062 "journal-2018: Bad transaction length %d encountered, ignoring transaction", 2061 "Bad transaction length %d "
2062 "encountered, ignoring transaction",
2063 get_desc_trans_len(desc)); 2063 get_desc_trans_len(desc));
2064 return -1; 2064 return -1;
2065 } 2065 }
2066 offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); 2066 offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2067 2067
2068 /* ok, we have a journal description block, let's see if the transaction was valid */ 2068 /* ok, we have a journal description block, let's see if the transaction was valid */
2069 c_bh = 2069 c_bh =
2070 journal_bread(p_s_sb, 2070 journal_bread(sb,
2071 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 2071 SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2072 ((offset + get_desc_trans_len(desc) + 2072 ((offset + get_desc_trans_len(desc) +
2073 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))); 2073 1) % SB_ONDISK_JOURNAL_SIZE(sb)));
2074 if (!c_bh) 2074 if (!c_bh)
2075 return 0; 2075 return 0;
2076 commit = (struct reiserfs_journal_commit *)c_bh->b_data; 2076 commit = (struct reiserfs_journal_commit *)c_bh->b_data;
2077 if (journal_compare_desc_commit(p_s_sb, desc, commit)) { 2077 if (journal_compare_desc_commit(sb, desc, commit)) {
2078 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2078 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2079 "journal_transaction_is_valid, commit offset %ld had bad " 2079 "journal_transaction_is_valid, commit offset %ld had bad "
2080 "time %d or length %d", 2080 "time %d or length %d",
2081 c_bh->b_blocknr - 2081 c_bh->b_blocknr -
2082 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 2082 SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2083 get_commit_trans_id(commit), 2083 get_commit_trans_id(commit),
2084 get_commit_trans_len(commit)); 2084 get_commit_trans_len(commit));
2085 brelse(c_bh); 2085 brelse(c_bh);
2086 if (oldest_invalid_trans_id) { 2086 if (oldest_invalid_trans_id) {
2087 *oldest_invalid_trans_id = 2087 *oldest_invalid_trans_id =
2088 get_desc_trans_id(desc); 2088 get_desc_trans_id(desc);
2089 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2089 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2090 "journal-1004: " 2090 "journal-1004: "
2091 "transaction_is_valid setting oldest invalid trans_id " 2091 "transaction_is_valid setting oldest invalid trans_id "
2092 "to %d", 2092 "to %d",
@@ -2095,11 +2095,11 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb,
2095 return -1; 2095 return -1;
2096 } 2096 }
2097 brelse(c_bh); 2097 brelse(c_bh);
2098 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2098 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2099 "journal-1006: found valid " 2099 "journal-1006: found valid "
2100 "transaction start offset %llu, len %d id %d", 2100 "transaction start offset %llu, len %d id %d",
2101 d_bh->b_blocknr - 2101 d_bh->b_blocknr -
2102 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 2102 SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2103 get_desc_trans_len(desc), 2103 get_desc_trans_len(desc),
2104 get_desc_trans_id(desc)); 2104 get_desc_trans_id(desc));
2105 return 1; 2105 return 1;
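Both the validity check above and journal_read_transaction() below locate a transaction's commit block with the same circular-log arithmetic: the descriptor sits at 'offset', its data blocks follow, and the commit block comes one past them, all modulo the on-disk journal size. Restated as a standalone helper (name hypothetical; the inputs correspond to the SB_ONDISK_JOURNAL_* macros used in the hunks):

    /* commit block of a transaction whose descriptor sits at 'offset'
     * inside a circular journal of 'size' blocks starting at 'first' */
    static unsigned long commit_blocknr(unsigned long first,
                                        unsigned long offset,
                                        unsigned long trans_len,
                                        unsigned long size)
    {
            return first + ((offset + trans_len + 1) % size);
    }

After a successful replay the log head advances one block further still, past the commit block itself: first + ((offset + trans_len + 2) % size), which is what the cur_dblock assignment near journal-1095 computes.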
@@ -2121,63 +2121,63 @@ static void brelse_array(struct buffer_head **heads, int num)
2121** this either reads in and replays a transaction, or returns because the transaction 2121** this either reads in and replays a transaction, or returns because the transaction
2122** is invalid, or too old. 2122** is invalid, or too old.
2123*/ 2123*/
2124static int journal_read_transaction(struct super_block *p_s_sb, 2124static int journal_read_transaction(struct super_block *sb,
2125 unsigned long cur_dblock, 2125 unsigned long cur_dblock,
2126 unsigned long oldest_start, 2126 unsigned long oldest_start,
2127 unsigned long oldest_trans_id, 2127 unsigned int oldest_trans_id,
2128 unsigned long newest_mount_id) 2128 unsigned long newest_mount_id)
2129{ 2129{
2130 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 2130 struct reiserfs_journal *journal = SB_JOURNAL(sb);
2131 struct reiserfs_journal_desc *desc; 2131 struct reiserfs_journal_desc *desc;
2132 struct reiserfs_journal_commit *commit; 2132 struct reiserfs_journal_commit *commit;
2133 unsigned long trans_id = 0; 2133 unsigned int trans_id = 0;
2134 struct buffer_head *c_bh; 2134 struct buffer_head *c_bh;
2135 struct buffer_head *d_bh; 2135 struct buffer_head *d_bh;
2136 struct buffer_head **log_blocks = NULL; 2136 struct buffer_head **log_blocks = NULL;
2137 struct buffer_head **real_blocks = NULL; 2137 struct buffer_head **real_blocks = NULL;
2138 unsigned long trans_offset; 2138 unsigned int trans_offset;
2139 int i; 2139 int i;
2140 int trans_half; 2140 int trans_half;
2141 2141
2142 d_bh = journal_bread(p_s_sb, cur_dblock); 2142 d_bh = journal_bread(sb, cur_dblock);
2143 if (!d_bh) 2143 if (!d_bh)
2144 return 1; 2144 return 1;
2145 desc = (struct reiserfs_journal_desc *)d_bh->b_data; 2145 desc = (struct reiserfs_journal_desc *)d_bh->b_data;
2146 trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); 2146 trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2147 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: " 2147 reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
2148 "journal_read_transaction, offset %llu, len %d mount_id %d", 2148 "journal_read_transaction, offset %llu, len %d mount_id %d",
2149 d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 2149 d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2150 get_desc_trans_len(desc), get_desc_mount_id(desc)); 2150 get_desc_trans_len(desc), get_desc_mount_id(desc));
2151 if (get_desc_trans_id(desc) < oldest_trans_id) { 2151 if (get_desc_trans_id(desc) < oldest_trans_id) {
2152 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: " 2152 reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
2153 "journal_read_trans skipping because %lu is too old", 2153 "journal_read_trans skipping because %lu is too old",
2154 cur_dblock - 2154 cur_dblock -
2155 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)); 2155 SB_ONDISK_JOURNAL_1st_BLOCK(sb));
2156 brelse(d_bh); 2156 brelse(d_bh);
2157 return 1; 2157 return 1;
2158 } 2158 }
2159 if (get_desc_mount_id(desc) != newest_mount_id) { 2159 if (get_desc_mount_id(desc) != newest_mount_id) {
2160 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: " 2160 reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
2161 "journal_read_trans skipping because %d is != " 2161 "journal_read_trans skipping because %d is != "
2162 "newest_mount_id %lu", get_desc_mount_id(desc), 2162 "newest_mount_id %lu", get_desc_mount_id(desc),
2163 newest_mount_id); 2163 newest_mount_id);
2164 brelse(d_bh); 2164 brelse(d_bh);
2165 return 1; 2165 return 1;
2166 } 2166 }
2167 c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 2167 c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2168 ((trans_offset + get_desc_trans_len(desc) + 1) % 2168 ((trans_offset + get_desc_trans_len(desc) + 1) %
2169 SB_ONDISK_JOURNAL_SIZE(p_s_sb))); 2169 SB_ONDISK_JOURNAL_SIZE(sb)));
2170 if (!c_bh) { 2170 if (!c_bh) {
2171 brelse(d_bh); 2171 brelse(d_bh);
2172 return 1; 2172 return 1;
2173 } 2173 }
2174 commit = (struct reiserfs_journal_commit *)c_bh->b_data; 2174 commit = (struct reiserfs_journal_commit *)c_bh->b_data;
2175 if (journal_compare_desc_commit(p_s_sb, desc, commit)) { 2175 if (journal_compare_desc_commit(sb, desc, commit)) {
2176 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2176 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2177 "journal_read_transaction, " 2177 "journal_read_transaction, "
2178 "commit offset %llu had bad time %d or length %d", 2178 "commit offset %llu had bad time %d or length %d",
2179 c_bh->b_blocknr - 2179 c_bh->b_blocknr -
2180 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 2180 SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2181 get_commit_trans_id(commit), 2181 get_commit_trans_id(commit),
2182 get_commit_trans_len(commit)); 2182 get_commit_trans_len(commit));
2183 brelse(c_bh); 2183 brelse(c_bh);
@@ -2195,38 +2195,41 @@ static int journal_read_transaction(struct super_block *p_s_sb,
2195 brelse(d_bh); 2195 brelse(d_bh);
2196 kfree(log_blocks); 2196 kfree(log_blocks);
2197 kfree(real_blocks); 2197 kfree(real_blocks);
2198 reiserfs_warning(p_s_sb, 2198 reiserfs_warning(sb, "journal-1169",
2199 "journal-1169: kmalloc failed, unable to mount FS"); 2199 "kmalloc failed, unable to mount FS");
2200 return -1; 2200 return -1;
2201 } 2201 }
2202 /* get all the buffer heads */ 2202 /* get all the buffer heads */
2203 trans_half = journal_trans_half(p_s_sb->s_blocksize); 2203 trans_half = journal_trans_half(sb->s_blocksize);
2204 for (i = 0; i < get_desc_trans_len(desc); i++) { 2204 for (i = 0; i < get_desc_trans_len(desc); i++) {
2205 log_blocks[i] = 2205 log_blocks[i] =
2206 journal_getblk(p_s_sb, 2206 journal_getblk(sb,
2207 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 2207 SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2208 (trans_offset + 1 + 2208 (trans_offset + 1 +
2209 i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)); 2209 i) % SB_ONDISK_JOURNAL_SIZE(sb));
2210 if (i < trans_half) { 2210 if (i < trans_half) {
2211 real_blocks[i] = 2211 real_blocks[i] =
2212 sb_getblk(p_s_sb, 2212 sb_getblk(sb,
2213 le32_to_cpu(desc->j_realblock[i])); 2213 le32_to_cpu(desc->j_realblock[i]));
2214 } else { 2214 } else {
2215 real_blocks[i] = 2215 real_blocks[i] =
2216 sb_getblk(p_s_sb, 2216 sb_getblk(sb,
2217 le32_to_cpu(commit-> 2217 le32_to_cpu(commit->
2218 j_realblock[i - trans_half])); 2218 j_realblock[i - trans_half]));
2219 } 2219 }
2220 if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) { 2220 if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
2221 reiserfs_warning(p_s_sb, 2221 reiserfs_warning(sb, "journal-1207",
2222 "journal-1207: REPLAY FAILURE fsck required! Block to replay is outside of filesystem"); 2222 "REPLAY FAILURE fsck required! "
2223 "Block to replay is outside of "
2224 "filesystem");
2223 goto abort_replay; 2225 goto abort_replay;
2224 } 2226 }
2225 /* make sure we don't try to replay onto log or reserved area */ 2227 /* make sure we don't try to replay onto log or reserved area */
2226 if (is_block_in_log_or_reserved_area 2228 if (is_block_in_log_or_reserved_area
2227 (p_s_sb, real_blocks[i]->b_blocknr)) { 2229 (sb, real_blocks[i]->b_blocknr)) {
2228 reiserfs_warning(p_s_sb, 2230 reiserfs_warning(sb, "journal-1204",
2229 "journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block"); 2231 "REPLAY FAILURE fsck required! "
2232 "Trying to replay onto a log block");
2230 abort_replay: 2233 abort_replay:
2231 brelse_array(log_blocks, i); 2234 brelse_array(log_blocks, i);
2232 brelse_array(real_blocks, i); 2235 brelse_array(real_blocks, i);
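The allocation loop above also shows how a transaction's list of target ("real") block numbers is split across the two metadata blocks: the first trans_half entries live in the descriptor's j_realblock array and the remainder in the commit block's. journal_trans_half() is outside this diff, so the helper below only restates the indexing rule visible in the hunk (helper name hypothetical):

    /* i-th real (target) block number of a transaction, given the
     * descriptor/commit split point */
    static __le32 nth_real_block(struct reiserfs_journal_desc *desc,
                                 struct reiserfs_journal_commit *commit,
                                 int i, int trans_half)
    {
            return (i < trans_half)
                    ? desc->j_realblock[i]
                    : commit->j_realblock[i - trans_half];
    }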
@@ -2242,8 +2245,9 @@ static int journal_read_transaction(struct super_block *p_s_sb,
2242 for (i = 0; i < get_desc_trans_len(desc); i++) { 2245 for (i = 0; i < get_desc_trans_len(desc); i++) {
2243 wait_on_buffer(log_blocks[i]); 2246 wait_on_buffer(log_blocks[i]);
2244 if (!buffer_uptodate(log_blocks[i])) { 2247 if (!buffer_uptodate(log_blocks[i])) {
2245 reiserfs_warning(p_s_sb, 2248 reiserfs_warning(sb, "journal-1212",
2246 "journal-1212: REPLAY FAILURE fsck required! buffer write failed"); 2249 "REPLAY FAILURE fsck required! "
2250 "buffer write failed");
2247 brelse_array(log_blocks + i, 2251 brelse_array(log_blocks + i,
2248 get_desc_trans_len(desc) - i); 2252 get_desc_trans_len(desc) - i);
2249 brelse_array(real_blocks, get_desc_trans_len(desc)); 2253 brelse_array(real_blocks, get_desc_trans_len(desc));
@@ -2266,8 +2270,9 @@ static int journal_read_transaction(struct super_block *p_s_sb,
2266 for (i = 0; i < get_desc_trans_len(desc); i++) { 2270 for (i = 0; i < get_desc_trans_len(desc); i++) {
2267 wait_on_buffer(real_blocks[i]); 2271 wait_on_buffer(real_blocks[i]);
2268 if (!buffer_uptodate(real_blocks[i])) { 2272 if (!buffer_uptodate(real_blocks[i])) {
2269 reiserfs_warning(p_s_sb, 2273 reiserfs_warning(sb, "journal-1226",
2270 "journal-1226: REPLAY FAILURE, fsck required! buffer write failed"); 2274 "REPLAY FAILURE, fsck required! "
2275 "buffer write failed");
2271 brelse_array(real_blocks + i, 2276 brelse_array(real_blocks + i,
2272 get_desc_trans_len(desc) - i); 2277 get_desc_trans_len(desc) - i);
2273 brelse(c_bh); 2278 brelse(c_bh);
@@ -2279,15 +2284,15 @@ static int journal_read_transaction(struct super_block *p_s_sb,
2279 brelse(real_blocks[i]); 2284 brelse(real_blocks[i]);
2280 } 2285 }
2281 cur_dblock = 2286 cur_dblock =
2282 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 2287 SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2283 ((trans_offset + get_desc_trans_len(desc) + 2288 ((trans_offset + get_desc_trans_len(desc) +
2284 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)); 2289 2) % SB_ONDISK_JOURNAL_SIZE(sb));
2285 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2290 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2286 "journal-1095: setting journal " "start to offset %ld", 2291 "journal-1095: setting journal " "start to offset %ld",
2287 cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)); 2292 cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
2288 2293
2289 /* init starting values for the first transaction, in case this is the last transaction to be replayed. */ 2294 /* init starting values for the first transaction, in case this is the last transaction to be replayed. */
2290 journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); 2295 journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2291 journal->j_last_flush_trans_id = trans_id; 2296 journal->j_last_flush_trans_id = trans_id;
2292 journal->j_trans_id = trans_id + 1; 2297 journal->j_trans_id = trans_id + 1;
2293 /* check for trans_id overflow */ 2298 /* check for trans_id overflow */
@@ -2352,12 +2357,12 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
2352** 2357**
2353** On exit, it sets things up so the first transaction will work correctly. 2358** On exit, it sets things up so the first transaction will work correctly.
2354*/ 2359*/
2355static int journal_read(struct super_block *p_s_sb) 2360static int journal_read(struct super_block *sb)
2356{ 2361{
2357 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 2362 struct reiserfs_journal *journal = SB_JOURNAL(sb);
2358 struct reiserfs_journal_desc *desc; 2363 struct reiserfs_journal_desc *desc;
2359 unsigned long oldest_trans_id = 0; 2364 unsigned int oldest_trans_id = 0;
2360 unsigned long oldest_invalid_trans_id = 0; 2365 unsigned int oldest_invalid_trans_id = 0;
2361 time_t start; 2366 time_t start;
2362 unsigned long oldest_start = 0; 2367 unsigned long oldest_start = 0;
2363 unsigned long cur_dblock = 0; 2368 unsigned long cur_dblock = 0;
@@ -2370,46 +2375,46 @@ static int journal_read(struct super_block *p_s_sb)
2370 int ret; 2375 int ret;
2371 char b[BDEVNAME_SIZE]; 2376 char b[BDEVNAME_SIZE];
2372 2377
2373 cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); 2378 cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
2374 reiserfs_info(p_s_sb, "checking transaction log (%s)\n", 2379 reiserfs_info(sb, "checking transaction log (%s)\n",
2375 bdevname(journal->j_dev_bd, b)); 2380 bdevname(journal->j_dev_bd, b));
2376 start = get_seconds(); 2381 start = get_seconds();
2377 2382
2378 /* step 1, read in the journal header block. Check the transaction it says 2383 /* step 1, read in the journal header block. Check the transaction it says
2379 ** is the first unflushed, and if that transaction is not valid, 2384 ** is the first unflushed, and if that transaction is not valid,
2380 ** replay is done 2385 ** replay is done
2381 */ 2386 */
2382 journal->j_header_bh = journal_bread(p_s_sb, 2387 journal->j_header_bh = journal_bread(sb,
2383 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) 2388 SB_ONDISK_JOURNAL_1st_BLOCK(sb)
2384 + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); 2389 + SB_ONDISK_JOURNAL_SIZE(sb));
2385 if (!journal->j_header_bh) { 2390 if (!journal->j_header_bh) {
2386 return 1; 2391 return 1;
2387 } 2392 }
2388 jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); 2393 jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
2389 if (le32_to_cpu(jh->j_first_unflushed_offset) < 2394 if (le32_to_cpu(jh->j_first_unflushed_offset) <
2390 SB_ONDISK_JOURNAL_SIZE(p_s_sb) 2395 SB_ONDISK_JOURNAL_SIZE(sb)
2391 && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { 2396 && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
2392 oldest_start = 2397 oldest_start =
2393 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 2398 SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2394 le32_to_cpu(jh->j_first_unflushed_offset); 2399 le32_to_cpu(jh->j_first_unflushed_offset);
2395 oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; 2400 oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
2396 newest_mount_id = le32_to_cpu(jh->j_mount_id); 2401 newest_mount_id = le32_to_cpu(jh->j_mount_id);
2397 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2402 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2398 "journal-1153: found in " 2403 "journal-1153: found in "
2399 "header: first_unflushed_offset %d, last_flushed_trans_id " 2404 "header: first_unflushed_offset %d, last_flushed_trans_id "
2400 "%lu", le32_to_cpu(jh->j_first_unflushed_offset), 2405 "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
2401 le32_to_cpu(jh->j_last_flush_trans_id)); 2406 le32_to_cpu(jh->j_last_flush_trans_id));
2402 valid_journal_header = 1; 2407 valid_journal_header = 1;
2403 2408
2404 /* now, we try to read the first unflushed offset. If it is not valid, 2409 /* now, we try to read the first unflushed offset. If it is not valid,
2405 ** there is nothing more we can do, and it makes no sense to read 2410 ** there is nothing more we can do, and it makes no sense to read
2406 ** through the whole log. 2411 ** through the whole log.
2407 */ 2412 */
2408 d_bh = 2413 d_bh =
2409 journal_bread(p_s_sb, 2414 journal_bread(sb,
2410 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 2415 SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2411 le32_to_cpu(jh->j_first_unflushed_offset)); 2416 le32_to_cpu(jh->j_first_unflushed_offset));
2412 ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL); 2417 ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
2413 if (!ret) { 2418 if (!ret) {
2414 continue_replay = 0; 2419 continue_replay = 0;
2415 } 2420 }
@@ -2417,9 +2422,9 @@ static int journal_read(struct super_block *p_s_sb)
2417 goto start_log_replay; 2422 goto start_log_replay;
2418 } 2423 }
2419 2424
2420 if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) { 2425 if (continue_replay && bdev_read_only(sb->s_bdev)) {
2421 reiserfs_warning(p_s_sb, 2426 reiserfs_warning(sb, "clm-2076",
2422 "clm-2076: device is readonly, unable to replay log"); 2427 "device is readonly, unable to replay log");
2423 return -1; 2428 return -1;
2424 } 2429 }
2425 2430
@@ -2428,17 +2433,17 @@ static int journal_read(struct super_block *p_s_sb)
2428 */ 2433 */
2429 while (continue_replay 2434 while (continue_replay
2430 && cur_dblock < 2435 && cur_dblock <
2431 (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 2436 (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2432 SB_ONDISK_JOURNAL_SIZE(p_s_sb))) { 2437 SB_ONDISK_JOURNAL_SIZE(sb))) {
2433 /* Note that the blocksize of the primary fs device and the journal 2438 /* Note that the blocksize of the primary fs device and the journal
2434 device must be the same */ 2439 device must be the same */
2435 d_bh = 2440 d_bh =
2436 reiserfs_breada(journal->j_dev_bd, cur_dblock, 2441 reiserfs_breada(journal->j_dev_bd, cur_dblock,
2437 p_s_sb->s_blocksize, 2442 sb->s_blocksize,
2438 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 2443 SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2439 SB_ONDISK_JOURNAL_SIZE(p_s_sb)); 2444 SB_ONDISK_JOURNAL_SIZE(sb));
2440 ret = 2445 ret =
2441 journal_transaction_is_valid(p_s_sb, d_bh, 2446 journal_transaction_is_valid(sb, d_bh,
2442 &oldest_invalid_trans_id, 2447 &oldest_invalid_trans_id,
2443 &newest_mount_id); 2448 &newest_mount_id);
2444 if (ret == 1) { 2449 if (ret == 1) {
@@ -2447,26 +2452,26 @@ static int journal_read(struct super_block *p_s_sb)
2447 oldest_trans_id = get_desc_trans_id(desc); 2452 oldest_trans_id = get_desc_trans_id(desc);
2448 oldest_start = d_bh->b_blocknr; 2453 oldest_start = d_bh->b_blocknr;
2449 newest_mount_id = get_desc_mount_id(desc); 2454 newest_mount_id = get_desc_mount_id(desc);
2450 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2455 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2451 "journal-1179: Setting " 2456 "journal-1179: Setting "
2452 "oldest_start to offset %llu, trans_id %lu", 2457 "oldest_start to offset %llu, trans_id %lu",
2453 oldest_start - 2458 oldest_start -
2454 SB_ONDISK_JOURNAL_1st_BLOCK 2459 SB_ONDISK_JOURNAL_1st_BLOCK
2455 (p_s_sb), oldest_trans_id); 2460 (sb), oldest_trans_id);
2456 } else if (oldest_trans_id > get_desc_trans_id(desc)) { 2461 } else if (oldest_trans_id > get_desc_trans_id(desc)) {
2457 /* one we just read was older */ 2462 /* one we just read was older */
2458 oldest_trans_id = get_desc_trans_id(desc); 2463 oldest_trans_id = get_desc_trans_id(desc);
2459 oldest_start = d_bh->b_blocknr; 2464 oldest_start = d_bh->b_blocknr;
2460 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2465 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2461 "journal-1180: Resetting " 2466 "journal-1180: Resetting "
2462 "oldest_start to offset %lu, trans_id %lu", 2467 "oldest_start to offset %lu, trans_id %lu",
2463 oldest_start - 2468 oldest_start -
2464 SB_ONDISK_JOURNAL_1st_BLOCK 2469 SB_ONDISK_JOURNAL_1st_BLOCK
2465 (p_s_sb), oldest_trans_id); 2470 (sb), oldest_trans_id);
2466 } 2471 }
2467 if (newest_mount_id < get_desc_mount_id(desc)) { 2472 if (newest_mount_id < get_desc_mount_id(desc)) {
2468 newest_mount_id = get_desc_mount_id(desc); 2473 newest_mount_id = get_desc_mount_id(desc);
2469 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2474 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2470 "journal-1299: Setting " 2475 "journal-1299: Setting "
2471 "newest_mount_id to %d", 2476 "newest_mount_id to %d",
2472 get_desc_mount_id(desc)); 2477 get_desc_mount_id(desc));
@@ -2481,17 +2486,17 @@ static int journal_read(struct super_block *p_s_sb)
2481 start_log_replay: 2486 start_log_replay:
2482 cur_dblock = oldest_start; 2487 cur_dblock = oldest_start;
2483 if (oldest_trans_id) { 2488 if (oldest_trans_id) {
2484 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2489 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2485 "journal-1206: Starting replay " 2490 "journal-1206: Starting replay "
2486 "from offset %llu, trans_id %lu", 2491 "from offset %llu, trans_id %lu",
2487 cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 2492 cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2488 oldest_trans_id); 2493 oldest_trans_id);
2489 2494
2490 } 2495 }
2491 replay_count = 0; 2496 replay_count = 0;
2492 while (continue_replay && oldest_trans_id > 0) { 2497 while (continue_replay && oldest_trans_id > 0) {
2493 ret = 2498 ret =
2494 journal_read_transaction(p_s_sb, cur_dblock, oldest_start, 2499 journal_read_transaction(sb, cur_dblock, oldest_start,
2495 oldest_trans_id, newest_mount_id); 2500 oldest_trans_id, newest_mount_id);
2496 if (ret < 0) { 2501 if (ret < 0) {
2497 return ret; 2502 return ret;
@@ -2499,14 +2504,14 @@ static int journal_read(struct super_block *p_s_sb)
2499 break; 2504 break;
2500 } 2505 }
2501 cur_dblock = 2506 cur_dblock =
2502 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start; 2507 SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
2503 replay_count++; 2508 replay_count++;
2504 if (cur_dblock == oldest_start) 2509 if (cur_dblock == oldest_start)
2505 break; 2510 break;
2506 } 2511 }
2507 2512
2508 if (oldest_trans_id == 0) { 2513 if (oldest_trans_id == 0) {
2509 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 2514 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
2510 "journal-1225: No valid " "transactions found"); 2515 "journal-1225: No valid " "transactions found");
2511 } 2516 }
2512 /* j_start does not get set correctly if we don't replay any transactions. 2517 /* j_start does not get set correctly if we don't replay any transactions.
@@ -2526,16 +2531,16 @@ static int journal_read(struct super_block *p_s_sb)
2526 } else { 2531 } else {
2527 journal->j_mount_id = newest_mount_id + 1; 2532 journal->j_mount_id = newest_mount_id + 1;
2528 } 2533 }
2529 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " 2534 reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
2530 "newest_mount_id to %lu", journal->j_mount_id); 2535 "newest_mount_id to %lu", journal->j_mount_id);
2531 journal->j_first_unflushed_offset = journal->j_start; 2536 journal->j_first_unflushed_offset = journal->j_start;
2532 if (replay_count > 0) { 2537 if (replay_count > 0) {
2533 reiserfs_info(p_s_sb, 2538 reiserfs_info(sb,
2534 "replayed %d transactions in %lu seconds\n", 2539 "replayed %d transactions in %lu seconds\n",
2535 replay_count, get_seconds() - start); 2540 replay_count, get_seconds() - start);
2536 } 2541 }
2537 if (!bdev_read_only(p_s_sb->s_bdev) && 2542 if (!bdev_read_only(sb->s_bdev) &&
2538 _update_journal_header_block(p_s_sb, journal->j_start, 2543 _update_journal_header_block(sb, journal->j_start,
2539 journal->j_last_flush_trans_id)) { 2544 journal->j_last_flush_trans_id)) {
2540 /* replay failed, caller must call free_journal_ram and abort 2545 /* replay failed, caller must call free_journal_ram and abort
2541 ** the mount 2546 ** the mount
@@ -2560,9 +2565,9 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
2560 return jl; 2565 return jl;
2561} 2566}
2562 2567
2563static void journal_list_init(struct super_block *p_s_sb) 2568static void journal_list_init(struct super_block *sb)
2564{ 2569{
2565 SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); 2570 SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
2566} 2571}
2567 2572
2568static int release_journal_dev(struct super_block *super, 2573static int release_journal_dev(struct super_block *super,
@@ -2580,9 +2585,8 @@ static int release_journal_dev(struct super_block *super,
2580 } 2585 }
2581 2586
2582 if (result != 0) { 2587 if (result != 0) {
2583 reiserfs_warning(super, 2588 reiserfs_warning(super, "sh-457",
2584 "sh-457: release_journal_dev: Cannot release journal device: %i", 2589 "Cannot release journal device: %i", result);
2585 result);
2586 } 2590 }
2587 return result; 2591 return result;
2588} 2592}
@@ -2612,7 +2616,7 @@ static int journal_init_dev(struct super_block *super,
2612 if (IS_ERR(journal->j_dev_bd)) { 2616 if (IS_ERR(journal->j_dev_bd)) {
2613 result = PTR_ERR(journal->j_dev_bd); 2617 result = PTR_ERR(journal->j_dev_bd);
2614 journal->j_dev_bd = NULL; 2618 journal->j_dev_bd = NULL;
2615 reiserfs_warning(super, "sh-458: journal_init_dev: " 2619 reiserfs_warning(super, "sh-458",
2616 "cannot init journal device '%s': %i", 2620 "cannot init journal device '%s': %i",
2617 __bdevname(jdev, b), result); 2621 __bdevname(jdev, b), result);
2618 return result; 2622 return result;
@@ -2662,30 +2666,30 @@ static int journal_init_dev(struct super_block *super,
2662 */ 2666 */
2663#define REISERFS_STANDARD_BLKSIZE (4096) 2667#define REISERFS_STANDARD_BLKSIZE (4096)
2664 2668
2665static int check_advise_trans_params(struct super_block *p_s_sb, 2669static int check_advise_trans_params(struct super_block *sb,
2666 struct reiserfs_journal *journal) 2670 struct reiserfs_journal *journal)
2667{ 2671{
2668 if (journal->j_trans_max) { 2672 if (journal->j_trans_max) {
2669 /* Non-default journal params. 2673 /* Non-default journal params.
2670 Do sanity check for them. */ 2674 Do sanity check for them. */
2671 int ratio = 1; 2675 int ratio = 1;
2672 if (p_s_sb->s_blocksize < REISERFS_STANDARD_BLKSIZE) 2676 if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
2673 ratio = REISERFS_STANDARD_BLKSIZE / p_s_sb->s_blocksize; 2677 ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
2674 2678
2675 if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio || 2679 if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
2676 journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio || 2680 journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
2677 SB_ONDISK_JOURNAL_SIZE(p_s_sb) / journal->j_trans_max < 2681 SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
2678 JOURNAL_MIN_RATIO) { 2682 JOURNAL_MIN_RATIO) {
2679 reiserfs_warning(p_s_sb, 2683 reiserfs_warning(sb, "sh-462",
2680 "sh-462: bad transaction max size (%u). FSCK?", 2684 "bad transaction max size (%u). "
2681 journal->j_trans_max); 2685 "FSCK?", journal->j_trans_max);
2682 return 1; 2686 return 1;
2683 } 2687 }
2684 if (journal->j_max_batch != (journal->j_trans_max) * 2688 if (journal->j_max_batch != (journal->j_trans_max) *
2685 JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) { 2689 JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) {
2686 reiserfs_warning(p_s_sb, 2690 reiserfs_warning(sb, "sh-463",
2687 "sh-463: bad transaction max batch (%u). FSCK?", 2691 "bad transaction max batch (%u). "
2688 journal->j_max_batch); 2692 "FSCK?", journal->j_max_batch);
2689 return 1; 2693 return 1;
2690 } 2694 }
2691 } else { 2695 } else {
@@ -2693,9 +2697,11 @@ static int check_advise_trans_params(struct super_block *p_s_sb,
2693 The file system was created by old version 2697 The file system was created by old version
2694 of mkreiserfs, so some fields contain zeros, 2698 of mkreiserfs, so some fields contain zeros,
2695 and we need to advise proper values for them */ 2699 and we need to advise proper values for them */
2696 if (p_s_sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) 2700 if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
2697 reiserfs_panic(p_s_sb, "sh-464: bad blocksize (%u)", 2701 reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
2698 p_s_sb->s_blocksize); 2702 sb->s_blocksize);
2703 return 1;
2704 }
2699 journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT; 2705 journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
2700 journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT; 2706 journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
2701 journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE; 2707 journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
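Two things happen in this hunk. First, the sanity bounds for user-supplied journal parameters scale with blocksize: at the standard 4096-byte block the ratio is 1, while a 1024-byte block gives ratio 4, shrinking the allowed j_trans_max window by the same factor, and the journal must still hold at least JOURNAL_MIN_RATIO transactions. Second, the default branch no longer panics on a non-standard blocksize with zeroed journal params; it warns ("sh-464") and fails the mount by returning 1. The bound check, restated as a single predicate (the macros are defined elsewhere in the reiserfs headers; the helper name is hypothetical):

    static int trans_max_ok(unsigned int trans_max, unsigned int blocksize,
                            unsigned int journal_size)
    {
            int ratio = 1;

            if (blocksize < REISERFS_STANDARD_BLKSIZE)
                    ratio = REISERFS_STANDARD_BLKSIZE / blocksize;

            return trans_max <= JOURNAL_TRANS_MAX_DEFAULT / ratio &&
                   trans_max >= JOURNAL_TRANS_MIN_DEFAULT / ratio &&
                   journal_size / trans_max >= JOURNAL_MIN_RATIO;
    }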
@@ -2706,10 +2712,10 @@ static int check_advise_trans_params(struct super_block *p_s_sb,
2706/* 2712/*
2707** must be called once on fs mount. calls journal_read for you 2713** must be called once on fs mount. calls journal_read for you
2708*/ 2714*/
2709int journal_init(struct super_block *p_s_sb, const char *j_dev_name, 2715int journal_init(struct super_block *sb, const char *j_dev_name,
2710 int old_format, unsigned int commit_max_age) 2716 int old_format, unsigned int commit_max_age)
2711{ 2717{
2712 int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2; 2718 int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
2713 struct buffer_head *bhjh; 2719 struct buffer_head *bhjh;
2714 struct reiserfs_super_block *rs; 2720 struct reiserfs_super_block *rs;
2715 struct reiserfs_journal_header *jh; 2721 struct reiserfs_journal_header *jh;
@@ -2717,10 +2723,10 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
2717 struct reiserfs_journal_list *jl; 2723 struct reiserfs_journal_list *jl;
2718 char b[BDEVNAME_SIZE]; 2724 char b[BDEVNAME_SIZE];
2719 2725
2720 journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof(struct reiserfs_journal)); 2726 journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal));
2721 if (!journal) { 2727 if (!journal) {
2722 reiserfs_warning(p_s_sb, 2728 reiserfs_warning(sb, "journal-1256",
2723 "journal-1256: unable to get memory for journal structure"); 2729 "unable to get memory for journal structure");
2724 return 1; 2730 return 1;
2725 } 2731 }
2726 memset(journal, 0, sizeof(struct reiserfs_journal)); 2732 memset(journal, 0, sizeof(struct reiserfs_journal));
@@ -2729,51 +2735,51 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
2729 INIT_LIST_HEAD(&journal->j_working_list); 2735 INIT_LIST_HEAD(&journal->j_working_list);
2730 INIT_LIST_HEAD(&journal->j_journal_list); 2736 INIT_LIST_HEAD(&journal->j_journal_list);
2731 journal->j_persistent_trans = 0; 2737 journal->j_persistent_trans = 0;
2732 if (reiserfs_allocate_list_bitmaps(p_s_sb, 2738 if (reiserfs_allocate_list_bitmaps(sb,
2733 journal->j_list_bitmap, 2739 journal->j_list_bitmap,
2734 reiserfs_bmap_count(p_s_sb))) 2740 reiserfs_bmap_count(sb)))
2735 goto free_and_return; 2741 goto free_and_return;
2736 allocate_bitmap_nodes(p_s_sb); 2742 allocate_bitmap_nodes(sb);
2737 2743
2738 /* reserved for journal area support */ 2744 /* reserved for journal area support */
2739 SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ? 2745 SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
2740 REISERFS_OLD_DISK_OFFSET_IN_BYTES 2746 REISERFS_OLD_DISK_OFFSET_IN_BYTES
2741 / p_s_sb->s_blocksize + 2747 / sb->s_blocksize +
2742 reiserfs_bmap_count(p_s_sb) + 2748 reiserfs_bmap_count(sb) +
2743 1 : 2749 1 :
2744 REISERFS_DISK_OFFSET_IN_BYTES / 2750 REISERFS_DISK_OFFSET_IN_BYTES /
2745 p_s_sb->s_blocksize + 2); 2751 sb->s_blocksize + 2);
2746 2752
2747 /* Sanity check to see if the standard journal fits within the first bitmap 2753 /* Sanity check to see if the standard journal fits within the first bitmap
2748 (relevant for small blocksizes) */ 2754 (relevant for small blocksizes) */
2749 if (!SB_ONDISK_JOURNAL_DEVICE(p_s_sb) && 2755 if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
2750 (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + 2756 (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
2751 SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8)) { 2757 SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
2752 reiserfs_warning(p_s_sb, 2758 reiserfs_warning(sb, "journal-1393",
2753 "journal-1393: journal does not fit for area " 2759 "journal does not fit for area addressed "
2754 "addressed by first of bitmap blocks. It starts at " 2760 "by first of bitmap blocks. It starts at "
2755 "%u and its size is %u. Block size %ld", 2761 "%u and its size is %u. Block size %ld",
2756 SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb), 2762 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
2757 SB_ONDISK_JOURNAL_SIZE(p_s_sb), 2763 SB_ONDISK_JOURNAL_SIZE(sb),
2758 p_s_sb->s_blocksize); 2764 sb->s_blocksize);
2759 goto free_and_return; 2765 goto free_and_return;
2760 } 2766 }
2761 2767
2762 if (journal_init_dev(p_s_sb, journal, j_dev_name) != 0) { 2768 if (journal_init_dev(sb, journal, j_dev_name) != 0) {
2763 reiserfs_warning(p_s_sb, 2769 reiserfs_warning(sb, "sh-462",
2764 "sh-462: unable to initialize jornal device"); 2770 "unable to initialize jornal device");
2765 goto free_and_return; 2771 goto free_and_return;
2766 } 2772 }
2767 2773
2768 rs = SB_DISK_SUPER_BLOCK(p_s_sb); 2774 rs = SB_DISK_SUPER_BLOCK(sb);
2769 2775
2770 /* read journal header */ 2776 /* read journal header */
2771 bhjh = journal_bread(p_s_sb, 2777 bhjh = journal_bread(sb,
2772 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 2778 SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
2773 SB_ONDISK_JOURNAL_SIZE(p_s_sb)); 2779 SB_ONDISK_JOURNAL_SIZE(sb));
2774 if (!bhjh) { 2780 if (!bhjh) {
2775 reiserfs_warning(p_s_sb, 2781 reiserfs_warning(sb, "sh-459",
2776 "sh-459: unable to read journal header"); 2782 "unable to read journal header");
2777 goto free_and_return; 2783 goto free_and_return;
2778 } 2784 }
2779 jh = (struct reiserfs_journal_header *)(bhjh->b_data); 2785 jh = (struct reiserfs_journal_header *)(bhjh->b_data);
@@ -2782,10 +2788,10 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
2782 if (is_reiserfs_jr(rs) 2788 if (is_reiserfs_jr(rs)
2783 && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != 2789 && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
2784 sb_jp_journal_magic(rs))) { 2790 sb_jp_journal_magic(rs))) {
2785 reiserfs_warning(p_s_sb, 2791 reiserfs_warning(sb, "sh-460",
2786 "sh-460: journal header magic %x " 2792 "journal header magic %x (device %s) does "
2787 "(device %s) does not match to magic found in super " 2793 "not match to magic found in super block %x",
2788 "block %x", jh->jh_journal.jp_journal_magic, 2794 jh->jh_journal.jp_journal_magic,
2789 bdevname(journal->j_dev_bd, b), 2795 bdevname(journal->j_dev_bd, b),
2790 sb_jp_journal_magic(rs)); 2796 sb_jp_journal_magic(rs));
2791 brelse(bhjh); 2797 brelse(bhjh);
@@ -2798,7 +2804,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
2798 le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age); 2804 le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
2799 journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; 2805 journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
2800 2806
2801 if (check_advise_trans_params(p_s_sb, journal) != 0) 2807 if (check_advise_trans_params(sb, journal) != 0)
2802 goto free_and_return; 2808 goto free_and_return;
2803 journal->j_default_max_commit_age = journal->j_max_commit_age; 2809 journal->j_default_max_commit_age = journal->j_max_commit_age;
2804 2810
@@ -2807,12 +2813,12 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
2807 journal->j_max_trans_age = commit_max_age; 2813 journal->j_max_trans_age = commit_max_age;
2808 } 2814 }
2809 2815
2810 reiserfs_info(p_s_sb, "journal params: device %s, size %u, " 2816 reiserfs_info(sb, "journal params: device %s, size %u, "
2811 "journal first block %u, max trans len %u, max batch %u, " 2817 "journal first block %u, max trans len %u, max batch %u, "
2812 "max commit age %u, max trans age %u\n", 2818 "max commit age %u, max trans age %u\n",
2813 bdevname(journal->j_dev_bd, b), 2819 bdevname(journal->j_dev_bd, b),
2814 SB_ONDISK_JOURNAL_SIZE(p_s_sb), 2820 SB_ONDISK_JOURNAL_SIZE(sb),
2815 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 2821 SB_ONDISK_JOURNAL_1st_BLOCK(sb),
2816 journal->j_trans_max, 2822 journal->j_trans_max,
2817 journal->j_max_batch, 2823 journal->j_max_batch,
2818 journal->j_max_commit_age, journal->j_max_trans_age); 2824 journal->j_max_commit_age, journal->j_max_trans_age);
@@ -2820,7 +2826,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
2820 brelse(bhjh); 2826 brelse(bhjh);
2821 2827
2822 journal->j_list_bitmap_index = 0; 2828 journal->j_list_bitmap_index = 0;
2823 journal_list_init(p_s_sb); 2829 journal_list_init(sb);
2824 2830
2825 memset(journal->j_list_hash_table, 0, 2831 memset(journal->j_list_hash_table, 0,
2826 JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); 2832 JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
@@ -2852,7 +2858,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
2852 journal->j_must_wait = 0; 2858 journal->j_must_wait = 0;
2853 2859
2854 if (journal->j_cnode_free == 0) { 2860 if (journal->j_cnode_free == 0) {
2855 reiserfs_warning(p_s_sb, "journal-2004: Journal cnode memory " 2861 reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
2856 "allocation failed (%ld bytes). Journal is " 2862 "allocation failed (%ld bytes). Journal is "
2857 "too large for available memory. Usually " 2863 "too large for available memory. Usually "
2858 "this is due to a journal that is too large.", 2864 "this is due to a journal that is too large.",
@@ -2860,16 +2866,17 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
2860 goto free_and_return; 2866 goto free_and_return;
2861 } 2867 }
2862 2868
2863 init_journal_hash(p_s_sb); 2869 init_journal_hash(sb);
2864 jl = journal->j_current_jl; 2870 jl = journal->j_current_jl;
2865 jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); 2871 jl->j_list_bitmap = get_list_bitmap(sb, jl);
2866 if (!jl->j_list_bitmap) { 2872 if (!jl->j_list_bitmap) {
2867 reiserfs_warning(p_s_sb, 2873 reiserfs_warning(sb, "journal-2005",
2868 "journal-2005, get_list_bitmap failed for journal list 0"); 2874 "get_list_bitmap failed for journal list 0");
2869 goto free_and_return; 2875 goto free_and_return;
2870 } 2876 }
2871 if (journal_read(p_s_sb) < 0) { 2877 if (journal_read(sb) < 0) {
2872 reiserfs_warning(p_s_sb, "Replay Failure, unable to mount"); 2878 reiserfs_warning(sb, "reiserfs-2006",
2879 "Replay Failure, unable to mount");
2873 goto free_and_return; 2880 goto free_and_return;
2874 } 2881 }
2875 2882
@@ -2878,10 +2885,10 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
2878 commit_wq = create_workqueue("reiserfs"); 2885 commit_wq = create_workqueue("reiserfs");
2879 2886
2880 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); 2887 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
2881 journal->j_work_sb = p_s_sb; 2888 journal->j_work_sb = sb;
2882 return 0; 2889 return 0;
2883 free_and_return: 2890 free_and_return:
2884 free_journal_ram(p_s_sb); 2891 free_journal_ram(sb);
2885 return 1; 2892 return 1;
2886} 2893}
2887 2894
@@ -2912,7 +2919,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
2912 return 0; 2919 return 0;
2913} 2920}
2914 2921
2915/* this must be called inside a transaction, and requires the 2922/* this must be called inside a transaction, and requires the
2916** kernel_lock to be held 2923** kernel_lock to be held
2917*/ 2924*/
2918void reiserfs_block_writes(struct reiserfs_transaction_handle *th) 2925void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
@@ -2970,7 +2977,7 @@ static void wake_queued_writers(struct super_block *s)
2970 wake_up(&journal->j_join_wait); 2977 wake_up(&journal->j_join_wait);
2971} 2978}
2972 2979
2973static void let_transaction_grow(struct super_block *sb, unsigned long trans_id) 2980static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
2974{ 2981{
2975 struct reiserfs_journal *journal = SB_JOURNAL(sb); 2982 struct reiserfs_journal *journal = SB_JOURNAL(sb);
2976 unsigned long bcount = journal->j_bcount; 2983 unsigned long bcount = journal->j_bcount;
@@ -2997,43 +3004,43 @@ static void let_transaction_grow(struct super_block *sb, unsigned long trans_id)
2997** expect to use in nblocks. 3004** expect to use in nblocks.
2998*/ 3005*/
2999static int do_journal_begin_r(struct reiserfs_transaction_handle *th, 3006static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3000 struct super_block *p_s_sb, unsigned long nblocks, 3007 struct super_block *sb, unsigned long nblocks,
3001 int join) 3008 int join)
3002{ 3009{
3003 time_t now = get_seconds(); 3010 time_t now = get_seconds();
3004 int old_trans_id; 3011 unsigned int old_trans_id;
3005 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 3012 struct reiserfs_journal *journal = SB_JOURNAL(sb);
3006 struct reiserfs_transaction_handle myth; 3013 struct reiserfs_transaction_handle myth;
3007 int sched_count = 0; 3014 int sched_count = 0;
3008 int retval; 3015 int retval;
3009 3016
3010 reiserfs_check_lock_depth(p_s_sb, "journal_begin"); 3017 reiserfs_check_lock_depth(sb, "journal_begin");
3011 BUG_ON(nblocks > journal->j_trans_max); 3018 BUG_ON(nblocks > journal->j_trans_max);
3012 3019
3013 PROC_INFO_INC(p_s_sb, journal.journal_being); 3020 PROC_INFO_INC(sb, journal.journal_being);
3014 /* set here for journal_join */ 3021 /* set here for journal_join */
3015 th->t_refcount = 1; 3022 th->t_refcount = 1;
3016 th->t_super = p_s_sb; 3023 th->t_super = sb;
3017 3024
3018 relock: 3025 relock:
3019 lock_journal(p_s_sb); 3026 lock_journal(sb);
3020 if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) { 3027 if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
3021 unlock_journal(p_s_sb); 3028 unlock_journal(sb);
3022 retval = journal->j_errno; 3029 retval = journal->j_errno;
3023 goto out_fail; 3030 goto out_fail;
3024 } 3031 }
3025 journal->j_bcount++; 3032 journal->j_bcount++;
3026 3033
3027 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { 3034 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
3028 unlock_journal(p_s_sb); 3035 unlock_journal(sb);
3029 reiserfs_wait_on_write_block(p_s_sb); 3036 reiserfs_wait_on_write_block(sb);
3030 PROC_INFO_INC(p_s_sb, journal.journal_relock_writers); 3037 PROC_INFO_INC(sb, journal.journal_relock_writers);
3031 goto relock; 3038 goto relock;
3032 } 3039 }
3033 now = get_seconds(); 3040 now = get_seconds();
3034 3041
3035 /* if there is no room in the journal OR 3042 /* if there is no room in the journal OR
3036 ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning 3043 ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning
3037 ** we don't sleep if there aren't other writers 3044 ** we don't sleep if there aren't other writers
3038 */ 3045 */
3039 3046
@@ -3048,7 +3055,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3048 || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { 3055 || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
3049 3056
3050 old_trans_id = journal->j_trans_id; 3057 old_trans_id = journal->j_trans_id;
3051 unlock_journal(p_s_sb); /* allow others to finish this transaction */ 3058 unlock_journal(sb); /* allow others to finish this transaction */
3052 3059
3053 if (!join && (journal->j_len_alloc + nblocks + 2) >= 3060 if (!join && (journal->j_len_alloc + nblocks + 2) >=
3054 journal->j_max_batch && 3061 journal->j_max_batch &&
@@ -3056,7 +3063,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3056 (journal->j_len_alloc * 75)) { 3063 (journal->j_len_alloc * 75)) {
3057 if (atomic_read(&journal->j_wcount) > 10) { 3064 if (atomic_read(&journal->j_wcount) > 10) {
3058 sched_count++; 3065 sched_count++;
3059 queue_log_writer(p_s_sb); 3066 queue_log_writer(sb);
3060 goto relock; 3067 goto relock;
3061 } 3068 }
3062 } 3069 }
@@ -3066,25 +3073,25 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3066 if (atomic_read(&journal->j_jlock)) { 3073 if (atomic_read(&journal->j_jlock)) {
3067 while (journal->j_trans_id == old_trans_id && 3074 while (journal->j_trans_id == old_trans_id &&
3068 atomic_read(&journal->j_jlock)) { 3075 atomic_read(&journal->j_jlock)) {
3069 queue_log_writer(p_s_sb); 3076 queue_log_writer(sb);
3070 } 3077 }
3071 goto relock; 3078 goto relock;
3072 } 3079 }
3073 retval = journal_join(&myth, p_s_sb, 1); 3080 retval = journal_join(&myth, sb, 1);
3074 if (retval) 3081 if (retval)
3075 goto out_fail; 3082 goto out_fail;
3076 3083
3077 /* someone might have ended the transaction while we joined */ 3084 /* someone might have ended the transaction while we joined */
3078 if (old_trans_id != journal->j_trans_id) { 3085 if (old_trans_id != journal->j_trans_id) {
3079 retval = do_journal_end(&myth, p_s_sb, 1, 0); 3086 retval = do_journal_end(&myth, sb, 1, 0);
3080 } else { 3087 } else {
3081 retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW); 3088 retval = do_journal_end(&myth, sb, 1, COMMIT_NOW);
3082 } 3089 }
3083 3090
3084 if (retval) 3091 if (retval)
3085 goto out_fail; 3092 goto out_fail;
3086 3093
3087 PROC_INFO_INC(p_s_sb, journal.journal_relock_wcount); 3094 PROC_INFO_INC(sb, journal.journal_relock_wcount);
3088 goto relock; 3095 goto relock;
3089 } 3096 }
3090 /* we are the first writer, set trans_id */ 3097 /* we are the first writer, set trans_id */
@@ -3096,7 +3103,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3096 th->t_blocks_logged = 0; 3103 th->t_blocks_logged = 0;
3097 th->t_blocks_allocated = nblocks; 3104 th->t_blocks_allocated = nblocks;
3098 th->t_trans_id = journal->j_trans_id; 3105 th->t_trans_id = journal->j_trans_id;
3099 unlock_journal(p_s_sb); 3106 unlock_journal(sb);
3100 INIT_LIST_HEAD(&th->t_list); 3107 INIT_LIST_HEAD(&th->t_list);
3101 get_fs_excl(); 3108 get_fs_excl();
3102 return 0; 3109 return 0;
@@ -3106,7 +3113,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3106 /* Re-set th->t_super, so we can properly keep track of how many 3113 /* Re-set th->t_super, so we can properly keep track of how many
3107 * persistent transactions there are. We need to do this so if this 3114 * persistent transactions there are. We need to do this so if this
3108 * call is part of a failed restart_transaction, we can free it later */ 3115 * call is part of a failed restart_transaction, we can free it later */
3109 th->t_super = p_s_sb; 3116 th->t_super = sb;
3110 return retval; 3117 return retval;
3111} 3118}
3112 3119
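The relock loop above is the heart of do_journal_begin_r(): take the journal lock, re-check the blocking conditions, and go back to sleep if another writer invalidated them. A minimal userspace sketch of that retry shape; every type and helper here is a stand-in, not the reiserfs API:

#include <stdbool.h>

struct fake_journal {
	bool writers_blocked;		/* models J_WRITERS_BLOCKED */
	unsigned long free_blocks;	/* models j_cnode_free and friends */
	bool locked;
};

/* Loop until the lock is held with the blocking conditions clear,
 * mirroring the relock: label in do_journal_begin_r() above. */
static void begin_with_retry(struct fake_journal *j, unsigned long nblocks)
{
	for (;;) {
		j->locked = true;		/* lock_journal(sb) */
		if (!j->writers_blocked && j->free_blocks >= nblocks)
			return;			/* proceed, lock still held */
		j->locked = false;		/* unlock_journal(sb) */
		/* the real code sleeps here (reiserfs_wait_on_write_block) */
		j->writers_blocked = false;	/* pretend we were woken up */
	}
}

int main(void)
{
	struct fake_journal j = { .writers_blocked = true, .free_blocks = 64 };
	begin_with_retry(&j, 8);
	return j.locked ? 0 : 1;
}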
@@ -3157,7 +3164,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
3157} 3164}
3158 3165
3159static int journal_join(struct reiserfs_transaction_handle *th, 3166static int journal_join(struct reiserfs_transaction_handle *th,
3160 struct super_block *p_s_sb, unsigned long nblocks) 3167 struct super_block *sb, unsigned long nblocks)
3161{ 3168{
3162 struct reiserfs_transaction_handle *cur_th = current->journal_info; 3169 struct reiserfs_transaction_handle *cur_th = current->journal_info;
3163 3170
@@ -3166,11 +3173,11 @@ static int journal_join(struct reiserfs_transaction_handle *th,
3166 */ 3173 */
3167 th->t_handle_save = cur_th; 3174 th->t_handle_save = cur_th;
3168 BUG_ON(cur_th && cur_th->t_refcount > 1); 3175 BUG_ON(cur_th && cur_th->t_refcount > 1);
3169 return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN); 3176 return do_journal_begin_r(th, sb, nblocks, JBEGIN_JOIN);
3170} 3177}
3171 3178
3172int journal_join_abort(struct reiserfs_transaction_handle *th, 3179int journal_join_abort(struct reiserfs_transaction_handle *th,
3173 struct super_block *p_s_sb, unsigned long nblocks) 3180 struct super_block *sb, unsigned long nblocks)
3174{ 3181{
3175 struct reiserfs_transaction_handle *cur_th = current->journal_info; 3182 struct reiserfs_transaction_handle *cur_th = current->journal_info;
3176 3183
@@ -3179,11 +3186,11 @@ int journal_join_abort(struct reiserfs_transaction_handle *th,
3179 */ 3186 */
3180 th->t_handle_save = cur_th; 3187 th->t_handle_save = cur_th;
3181 BUG_ON(cur_th && cur_th->t_refcount > 1); 3188 BUG_ON(cur_th && cur_th->t_refcount > 1);
3182 return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT); 3189 return do_journal_begin_r(th, sb, nblocks, JBEGIN_ABORT);
3183} 3190}
3184 3191
3185int journal_begin(struct reiserfs_transaction_handle *th, 3192int journal_begin(struct reiserfs_transaction_handle *th,
3186 struct super_block *p_s_sb, unsigned long nblocks) 3193 struct super_block *sb, unsigned long nblocks)
3187{ 3194{
3188 struct reiserfs_transaction_handle *cur_th = current->journal_info; 3195 struct reiserfs_transaction_handle *cur_th = current->journal_info;
3189 int ret; 3196 int ret;
@@ -3191,28 +3198,29 @@ int journal_begin(struct reiserfs_transaction_handle *th,
3191 th->t_handle_save = NULL; 3198 th->t_handle_save = NULL;
3192 if (cur_th) { 3199 if (cur_th) {
3193 /* we are nesting into the current transaction */ 3200 /* we are nesting into the current transaction */
3194 if (cur_th->t_super == p_s_sb) { 3201 if (cur_th->t_super == sb) {
3195 BUG_ON(!cur_th->t_refcount); 3202 BUG_ON(!cur_th->t_refcount);
3196 cur_th->t_refcount++; 3203 cur_th->t_refcount++;
3197 memcpy(th, cur_th, sizeof(*th)); 3204 memcpy(th, cur_th, sizeof(*th));
3198 if (th->t_refcount <= 1) 3205 if (th->t_refcount <= 1)
3199 reiserfs_warning(p_s_sb, 3206 reiserfs_warning(sb, "reiserfs-2005",
3200 "BAD: refcount <= 1, but journal_info != 0"); 3207 "BAD: refcount <= 1, but "
3208 "journal_info != 0");
3201 return 0; 3209 return 0;
3202 } else { 3210 } else {
3203 /* we've ended up with a handle from a different filesystem. 3211 /* we've ended up with a handle from a different filesystem.
3204 ** save it and restore on journal_end. This should never 3212 ** save it and restore on journal_end. This should never
3205 ** really happen... 3213 ** really happen...
3206 */ 3214 */
3207 reiserfs_warning(p_s_sb, 3215 reiserfs_warning(sb, "clm-2100",
3208 "clm-2100: nesting into a different FS"); 3216 "nesting into a different FS");
3209 th->t_handle_save = current->journal_info; 3217 th->t_handle_save = current->journal_info;
3210 current->journal_info = th; 3218 current->journal_info = th;
3211 } 3219 }
3212 } else { 3220 } else {
3213 current->journal_info = th; 3221 current->journal_info = th;
3214 } 3222 }
3215 ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG); 3223 ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
3216 BUG_ON(current->journal_info != th); 3224 BUG_ON(current->journal_info != th);
3217 3225
3218 /* I guess this boils down to being the reciprocal of clm-2100 above. 3226 /* I guess this boils down to being the reciprocal of clm-2100 above.
@@ -3232,32 +3240,32 @@ int journal_begin(struct reiserfs_transaction_handle *th,
3232** 3240**
3233** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the 3241** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the
3234** transaction is committed. 3242** transaction is committed.
3235** 3243**
3236** if j_len is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. 3244** if j_len is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
3237*/ 3245*/
3238int journal_mark_dirty(struct reiserfs_transaction_handle *th, 3246int journal_mark_dirty(struct reiserfs_transaction_handle *th,
3239 struct super_block *p_s_sb, struct buffer_head *bh) 3247 struct super_block *sb, struct buffer_head *bh)
3240{ 3248{
3241 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 3249 struct reiserfs_journal *journal = SB_JOURNAL(sb);
3242 struct reiserfs_journal_cnode *cn = NULL; 3250 struct reiserfs_journal_cnode *cn = NULL;
3243 int count_already_incd = 0; 3251 int count_already_incd = 0;
3244 int prepared = 0; 3252 int prepared = 0;
3245 BUG_ON(!th->t_trans_id); 3253 BUG_ON(!th->t_trans_id);
3246 3254
3247 PROC_INFO_INC(p_s_sb, journal.mark_dirty); 3255 PROC_INFO_INC(sb, journal.mark_dirty);
3248 if (th->t_trans_id != journal->j_trans_id) { 3256 if (th->t_trans_id != journal->j_trans_id) {
3249 reiserfs_panic(th->t_super, 3257 reiserfs_panic(th->t_super, "journal-1577",
3250 "journal-1577: handle trans id %ld != current trans id %ld\n", 3258 "handle trans id %ld != current trans id %ld",
3251 th->t_trans_id, journal->j_trans_id); 3259 th->t_trans_id, journal->j_trans_id);
3252 } 3260 }
3253 3261
3254 p_s_sb->s_dirt = 1; 3262 sb->s_dirt = 1;
3255 3263
3256 prepared = test_clear_buffer_journal_prepared(bh); 3264 prepared = test_clear_buffer_journal_prepared(bh);
3257 clear_buffer_journal_restore_dirty(bh); 3265 clear_buffer_journal_restore_dirty(bh);
3258 /* already in this transaction, we are done */ 3266 /* already in this transaction, we are done */
3259 if (buffer_journaled(bh)) { 3267 if (buffer_journaled(bh)) {
3260 PROC_INFO_INC(p_s_sb, journal.mark_dirty_already); 3268 PROC_INFO_INC(sb, journal.mark_dirty_already);
3261 return 0; 3269 return 0;
3262 } 3270 }
3263 3271
@@ -3266,7 +3274,8 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
3266 ** could get to disk too early. NOT GOOD. 3274 ** could get to disk too early. NOT GOOD.
3267 */ 3275 */
3268 if (!prepared || buffer_dirty(bh)) { 3276 if (!prepared || buffer_dirty(bh)) {
3269 reiserfs_warning(p_s_sb, "journal-1777: buffer %llu bad state " 3277 reiserfs_warning(sb, "journal-1777",
3278 "buffer %llu bad state "
3270 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", 3279 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
3271 (unsigned long long)bh->b_blocknr, 3280 (unsigned long long)bh->b_blocknr,
3272 prepared ? ' ' : '!', 3281 prepared ? ' ' : '!',
@@ -3276,23 +3285,23 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
3276 } 3285 }
3277 3286
3278 if (atomic_read(&(journal->j_wcount)) <= 0) { 3287 if (atomic_read(&(journal->j_wcount)) <= 0) {
3279 reiserfs_warning(p_s_sb, 3288 reiserfs_warning(sb, "journal-1409",
3280 "journal-1409: journal_mark_dirty returning because j_wcount was %d", 3289 "returning because j_wcount was %d",
3281 atomic_read(&(journal->j_wcount))); 3290 atomic_read(&(journal->j_wcount)));
3282 return 1; 3291 return 1;
3283 } 3292 }
3284 /* this error means I've screwed up, and we've overflowed the transaction. 3293 /* this error means I've screwed up, and we've overflowed the transaction.
3285 ** Nothing can be done here, except make the FS readonly or panic. 3294 ** Nothing can be done here, except make the FS readonly or panic.
3286 */ 3295 */
3287 if (journal->j_len >= journal->j_trans_max) { 3296 if (journal->j_len >= journal->j_trans_max) {
3288 reiserfs_panic(th->t_super, 3297 reiserfs_panic(th->t_super, "journal-1413",
3289 "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n", 3298 "j_len (%lu) is too big",
3290 journal->j_len); 3299 journal->j_len);
3291 } 3300 }
3292 3301
3293 if (buffer_journal_dirty(bh)) { 3302 if (buffer_journal_dirty(bh)) {
3294 count_already_incd = 1; 3303 count_already_incd = 1;
3295 PROC_INFO_INC(p_s_sb, journal.mark_dirty_notjournal); 3304 PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
3296 clear_buffer_journal_dirty(bh); 3305 clear_buffer_journal_dirty(bh);
3297 } 3306 }
3298 3307
@@ -3304,9 +3313,9 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
3304 3313
3305 /* now put this guy on the end */ 3314 /* now put this guy on the end */
3306 if (!cn) { 3315 if (!cn) {
3307 cn = get_cnode(p_s_sb); 3316 cn = get_cnode(sb);
3308 if (!cn) { 3317 if (!cn) {
3309 reiserfs_panic(p_s_sb, "get_cnode failed!\n"); 3318 reiserfs_panic(sb, "journal-4", "get_cnode failed!");
3310 } 3319 }
3311 3320
3312 if (th->t_blocks_logged == th->t_blocks_allocated) { 3321 if (th->t_blocks_logged == th->t_blocks_allocated) {
@@ -3318,7 +3327,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
3318 3327
3319 cn->bh = bh; 3328 cn->bh = bh;
3320 cn->blocknr = bh->b_blocknr; 3329 cn->blocknr = bh->b_blocknr;
3321 cn->sb = p_s_sb; 3330 cn->sb = sb;
3322 cn->jlist = NULL; 3331 cn->jlist = NULL;
3323 insert_journal_hash(journal->j_hash_table, cn); 3332 insert_journal_hash(journal->j_hash_table, cn);
3324 if (!count_already_incd) { 3333 if (!count_already_incd) {
@@ -3339,11 +3348,11 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
3339} 3348}
3340 3349
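The header comment for journal_mark_dirty() above notes that when j_len overtakes j_len_alloc, j_len_alloc is pushed up to 10 + j_len. A toy model of just that accounting rule, using a simplified struct rather than the kernel's:

#include <stdio.h>

struct fake_journal {
	unsigned long j_len;		/* blocks logged so far */
	unsigned long j_len_alloc;	/* blocks reserved by open handles */
};

static void mark_dirty_accounting(struct fake_journal *j)
{
	j->j_len++;
	if (j->j_len > j->j_len_alloc)
		j->j_len_alloc = j->j_len + 10;	/* the 10 + j_len rule */
}

int main(void)
{
	struct fake_journal j = { .j_len = 0, .j_len_alloc = 3 };
	for (int i = 0; i < 6; i++)
		mark_dirty_accounting(&j);
	printf("j_len=%lu j_len_alloc=%lu\n", j.j_len, j.j_len_alloc);
	return 0;	/* prints j_len=6 j_len_alloc=14 */
}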
3341int journal_end(struct reiserfs_transaction_handle *th, 3350int journal_end(struct reiserfs_transaction_handle *th,
3342 struct super_block *p_s_sb, unsigned long nblocks) 3351 struct super_block *sb, unsigned long nblocks)
3343{ 3352{
3344 if (!current->journal_info && th->t_refcount > 1) 3353 if (!current->journal_info && th->t_refcount > 1)
3345 reiserfs_warning(p_s_sb, "REISER-NESTING: th NULL, refcount %d", 3354 reiserfs_warning(sb, "REISER-NESTING",
3346 th->t_refcount); 3355 "th NULL, refcount %d", th->t_refcount);
3347 3356
3348 if (!th->t_trans_id) { 3357 if (!th->t_trans_id) {
3349 WARN_ON(1); 3358 WARN_ON(1);
@@ -3366,26 +3375,26 @@ int journal_end(struct reiserfs_transaction_handle *th,
3366 } 3375 }
3367 return 0; 3376 return 0;
3368 } else { 3377 } else {
3369 return do_journal_end(th, p_s_sb, nblocks, 0); 3378 return do_journal_end(th, sb, nblocks, 0);
3370 } 3379 }
3371} 3380}
3372 3381
3373/* removes from the current transaction, relsing and decrementing any counters. 3382/* removes from the current transaction, relsing and decrementing any counters.
3374** also files the removed buffer directly onto the clean list 3383** also files the removed buffer directly onto the clean list
3375** 3384**
3376** called by journal_mark_freed when a block has been deleted 3385** called by journal_mark_freed when a block has been deleted
3377** 3386**
3378** returns 1 if it cleaned and relsed the buffer. 0 otherwise 3387** returns 1 if it cleaned and relsed the buffer. 0 otherwise
3379*/ 3388*/
3380static int remove_from_transaction(struct super_block *p_s_sb, 3389static int remove_from_transaction(struct super_block *sb,
3381 b_blocknr_t blocknr, int already_cleaned) 3390 b_blocknr_t blocknr, int already_cleaned)
3382{ 3391{
3383 struct buffer_head *bh; 3392 struct buffer_head *bh;
3384 struct reiserfs_journal_cnode *cn; 3393 struct reiserfs_journal_cnode *cn;
3385 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 3394 struct reiserfs_journal *journal = SB_JOURNAL(sb);
3386 int ret = 0; 3395 int ret = 0;
3387 3396
3388 cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr); 3397 cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
3389 if (!cn || !cn->bh) { 3398 if (!cn || !cn->bh) {
3390 return ret; 3399 return ret;
3391 } 3400 }
@@ -3403,7 +3412,7 @@ static int remove_from_transaction(struct super_block *p_s_sb,
3403 journal->j_last = cn->prev; 3412 journal->j_last = cn->prev;
3404 } 3413 }
3405 if (bh) 3414 if (bh)
3406 remove_journal_hash(p_s_sb, journal->j_hash_table, NULL, 3415 remove_journal_hash(sb, journal->j_hash_table, NULL,
3407 bh->b_blocknr, 0); 3416 bh->b_blocknr, 0);
3408 clear_buffer_journaled(bh); /* don't log this one */ 3417 clear_buffer_journaled(bh); /* don't log this one */
3409 3418
@@ -3413,14 +3422,14 @@ static int remove_from_transaction(struct super_block *p_s_sb,
3413 clear_buffer_journal_test(bh); 3422 clear_buffer_journal_test(bh);
3414 put_bh(bh); 3423 put_bh(bh);
3415 if (atomic_read(&(bh->b_count)) < 0) { 3424 if (atomic_read(&(bh->b_count)) < 0) {
3416 reiserfs_warning(p_s_sb, 3425 reiserfs_warning(sb, "journal-1752",
3417 "journal-1752: remove from trans, b_count < 0"); 3426 "b_count < 0");
3418 } 3427 }
3419 ret = 1; 3428 ret = 1;
3420 } 3429 }
3421 journal->j_len--; 3430 journal->j_len--;
3422 journal->j_len_alloc--; 3431 journal->j_len_alloc--;
3423 free_cnode(p_s_sb, cn); 3432 free_cnode(sb, cn);
3424 return ret; 3433 return ret;
3425} 3434}
3426 3435
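The journal-1752 warning above fires when dropping a reference sends a buffer's b_count negative, i.e. an unbalanced get_bh()/put_bh() pair. A standalone illustration of that sanity check, with a fake type in place of buffer_head:

#include <stdio.h>

struct fake_bh {
	int b_count;	/* stands in for the atomic b_count */
};

static void fake_put_bh(struct fake_bh *bh)
{
	bh->b_count--;
	if (bh->b_count < 0)
		fprintf(stderr, "journal-1752-style warning: b_count < 0\n");
}

int main(void)
{
	struct fake_bh bh = { .b_count = 1 };
	fake_put_bh(&bh);	/* balanced: 1 -> 0 */
	fake_put_bh(&bh);	/* unbalanced put triggers the warning */
	return 0;
}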
@@ -3468,22 +3477,22 @@ static int can_dirty(struct reiserfs_journal_cnode *cn)
3468} 3477}
3469 3478
3470/* syncs the commit blocks, but does not force the real buffers to disk 3479/* syncs the commit blocks, but does not force the real buffers to disk
3471** will wait until the current transaction is done/committed before returning 3480** will wait until the current transaction is done/committed before returning
3472*/ 3481*/
3473int journal_end_sync(struct reiserfs_transaction_handle *th, 3482int journal_end_sync(struct reiserfs_transaction_handle *th,
3474 struct super_block *p_s_sb, unsigned long nblocks) 3483 struct super_block *sb, unsigned long nblocks)
3475{ 3484{
3476 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 3485 struct reiserfs_journal *journal = SB_JOURNAL(sb);
3477 3486
3478 BUG_ON(!th->t_trans_id); 3487 BUG_ON(!th->t_trans_id);
3479 /* you can sync while nested, very, very bad */ 3488 /* you can sync while nested, very, very bad */
3480 BUG_ON(th->t_refcount > 1); 3489 BUG_ON(th->t_refcount > 1);
3481 if (journal->j_len == 0) { 3490 if (journal->j_len == 0) {
3482 reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 3491 reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
3483 1); 3492 1);
3484 journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)); 3493 journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
3485 } 3494 }
3486 return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT); 3495 return do_journal_end(th, sb, nblocks, COMMIT_NOW | WAIT);
3487} 3496}
3488 3497
3489/* 3498/*
@@ -3493,7 +3502,7 @@ static void flush_async_commits(struct work_struct *work)
3493{ 3502{
3494 struct reiserfs_journal *journal = 3503 struct reiserfs_journal *journal =
3495 container_of(work, struct reiserfs_journal, j_work.work); 3504 container_of(work, struct reiserfs_journal, j_work.work);
3496 struct super_block *p_s_sb = journal->j_work_sb; 3505 struct super_block *sb = journal->j_work_sb;
3497 struct reiserfs_journal_list *jl; 3506 struct reiserfs_journal_list *jl;
3498 struct list_head *entry; 3507 struct list_head *entry;
3499 3508
@@ -3502,7 +3511,7 @@ static void flush_async_commits(struct work_struct *work)
3502 /* last entry is the youngest, commit it and you get everything */ 3511 /* last entry is the youngest, commit it and you get everything */
3503 entry = journal->j_journal_list.prev; 3512 entry = journal->j_journal_list.prev;
3504 jl = JOURNAL_LIST_ENTRY(entry); 3513 jl = JOURNAL_LIST_ENTRY(entry);
3505 flush_commit_list(p_s_sb, jl, 1); 3514 flush_commit_list(sb, jl, 1);
3506 } 3515 }
3507 unlock_kernel(); 3516 unlock_kernel();
3508} 3517}
@@ -3511,11 +3520,11 @@ static void flush_async_commits(struct work_struct *work)
3511** flushes any old transactions to disk 3520** flushes any old transactions to disk
3512** ends the current transaction if it is too old 3521** ends the current transaction if it is too old
3513*/ 3522*/
3514int reiserfs_flush_old_commits(struct super_block *p_s_sb) 3523int reiserfs_flush_old_commits(struct super_block *sb)
3515{ 3524{
3516 time_t now; 3525 time_t now;
3517 struct reiserfs_transaction_handle th; 3526 struct reiserfs_transaction_handle th;
3518 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 3527 struct reiserfs_journal *journal = SB_JOURNAL(sb);
3519 3528
3520 now = get_seconds(); 3529 now = get_seconds();
3521 /* safety check so we don't flush while we are replaying the log during 3530 /* safety check so we don't flush while we are replaying the log during
@@ -3532,35 +3541,35 @@ int reiserfs_flush_old_commits(struct super_block *p_s_sb)
3532 journal->j_trans_start_time > 0 && 3541 journal->j_trans_start_time > 0 &&
3533 journal->j_len > 0 && 3542 journal->j_len > 0 &&
3534 (now - journal->j_trans_start_time) > journal->j_max_trans_age) { 3543 (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
3535 if (!journal_join(&th, p_s_sb, 1)) { 3544 if (!journal_join(&th, sb, 1)) {
3536 reiserfs_prepare_for_journal(p_s_sb, 3545 reiserfs_prepare_for_journal(sb,
3537 SB_BUFFER_WITH_SB(p_s_sb), 3546 SB_BUFFER_WITH_SB(sb),
3538 1); 3547 1);
3539 journal_mark_dirty(&th, p_s_sb, 3548 journal_mark_dirty(&th, sb,
3540 SB_BUFFER_WITH_SB(p_s_sb)); 3549 SB_BUFFER_WITH_SB(sb));
3541 3550
3542 /* we're only being called from kreiserfsd, it makes no sense to do 3551 /* we're only being called from kreiserfsd, it makes no sense to do
3543 ** an async commit so that kreiserfsd can do it later 3552 ** an async commit so that kreiserfsd can do it later
3544 */ 3553 */
3545 do_journal_end(&th, p_s_sb, 1, COMMIT_NOW | WAIT); 3554 do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT);
3546 } 3555 }
3547 } 3556 }
3548 return p_s_sb->s_dirt; 3557 return sb->s_dirt;
3549} 3558}
3550 3559
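reiserfs_flush_old_commits() above only ends the running transaction when it has a positive start time, a non-empty log, and has outlived j_max_trans_age. The guard condition in isolation, with illustrative names:

#include <stdio.h>
#include <time.h>

/* Mirrors the test that gates the journal_join() call above. */
static int transaction_too_old(time_t now, time_t start,
			       unsigned long j_len, long max_age)
{
	return start > 0 && j_len > 0 && (now - start) > max_age;
}

int main(void)
{
	time_t now = time(NULL);
	printf("%d\n", transaction_too_old(now, now - 60, 4, 30));	/* 1 */
	printf("%d\n", transaction_too_old(now, 0, 4, 30));		/* 0 */
	return 0;
}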
3551/* 3560/*
3552** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit 3561** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit
3553** 3562**
3554** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all 3563** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all
3555** the writers are done. By the time it wakes up, the transaction it was called on has already ended, so it just 3564** the writers are done. By the time it wakes up, the transaction it was called on has already ended, so it just
3556** flushes the commit list and returns 0. 3565** flushes the commit list and returns 0.
3557** 3566**
3558** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait. 3567** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait.
3559** 3568**
3560** Note, we can't allow the journal_end to proceed while there are still writers in the log. 3569** Note, we can't allow the journal_end to proceed while there are still writers in the log.
3561*/ 3570*/
3562static int check_journal_end(struct reiserfs_transaction_handle *th, 3571static int check_journal_end(struct reiserfs_transaction_handle *th,
3563 struct super_block *p_s_sb, unsigned long nblocks, 3572 struct super_block *sb, unsigned long nblocks,
3564 int flags) 3573 int flags)
3565{ 3574{
3566 3575
@@ -3569,13 +3578,13 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
3569 int commit_now = flags & COMMIT_NOW; 3578 int commit_now = flags & COMMIT_NOW;
3570 int wait_on_commit = flags & WAIT; 3579 int wait_on_commit = flags & WAIT;
3571 struct reiserfs_journal_list *jl; 3580 struct reiserfs_journal_list *jl;
3572 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 3581 struct reiserfs_journal *journal = SB_JOURNAL(sb);
3573 3582
3574 BUG_ON(!th->t_trans_id); 3583 BUG_ON(!th->t_trans_id);
3575 3584
3576 if (th->t_trans_id != journal->j_trans_id) { 3585 if (th->t_trans_id != journal->j_trans_id) {
3577 reiserfs_panic(th->t_super, 3586 reiserfs_panic(th->t_super, "journal-1577",
3578 "journal-1577: handle trans id %ld != current trans id %ld\n", 3587 "handle trans id %ld != current trans id %ld",
3579 th->t_trans_id, journal->j_trans_id); 3588 th->t_trans_id, journal->j_trans_id);
3580 } 3589 }
3581 3590
@@ -3584,7 +3593,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
3584 atomic_dec(&(journal->j_wcount)); 3593 atomic_dec(&(journal->j_wcount));
3585 } 3594 }
3586 3595
3587 /* BUG, deal with the case where j_len is 0 but blocks previously freed still need to be released; 3596 /* BUG, deal with the case where j_len is 0 but blocks previously freed still need to be released;
3588 ** this will be dealt with by the next transaction that actually writes something, but should be taken 3597 ** this will be dealt with by the next transaction that actually writes something, but should be taken
3589 ** care of in this trans 3598 ** care of in this trans
3590 */ 3599 */
@@ -3593,7 +3602,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
3593 /* if wcount > 0, and we are called with flush or commit_now, 3602 /* if wcount > 0, and we are called with flush or commit_now,
3594 ** we wait on j_join_wait. We will wake up when the last writer has 3603 ** we wait on j_join_wait. We will wake up when the last writer has
3595 ** finished the transaction, and started it on its way to the disk. 3604 ** finished the transaction, and started it on its way to the disk.
3596 ** Then, we flush the commit or journal list, and just return 0 3605 ** Then, we flush the commit or journal list, and just return 0
3597 ** because the rest of journal end was already done for this transaction. 3606 ** because the rest of journal end was already done for this transaction.
3598 */ 3607 */
3599 if (atomic_read(&(journal->j_wcount)) > 0) { 3608 if (atomic_read(&(journal->j_wcount)) > 0) {
@@ -3608,31 +3617,31 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
3608 if (flush) { 3617 if (flush) {
3609 journal->j_next_full_flush = 1; 3618 journal->j_next_full_flush = 1;
3610 } 3619 }
3611 unlock_journal(p_s_sb); 3620 unlock_journal(sb);
3612 3621
3613 /* sleep while the current transaction is still j_jlocked */ 3622 /* sleep while the current transaction is still j_jlocked */
3614 while (journal->j_trans_id == trans_id) { 3623 while (journal->j_trans_id == trans_id) {
3615 if (atomic_read(&journal->j_jlock)) { 3624 if (atomic_read(&journal->j_jlock)) {
3616 queue_log_writer(p_s_sb); 3625 queue_log_writer(sb);
3617 } else { 3626 } else {
3618 lock_journal(p_s_sb); 3627 lock_journal(sb);
3619 if (journal->j_trans_id == trans_id) { 3628 if (journal->j_trans_id == trans_id) {
3620 atomic_set(&(journal->j_jlock), 3629 atomic_set(&(journal->j_jlock),
3621 1); 3630 1);
3622 } 3631 }
3623 unlock_journal(p_s_sb); 3632 unlock_journal(sb);
3624 } 3633 }
3625 } 3634 }
3626 BUG_ON(journal->j_trans_id == trans_id); 3635 BUG_ON(journal->j_trans_id == trans_id);
3627 3636
3628 if (commit_now 3637 if (commit_now
3629 && journal_list_still_alive(p_s_sb, trans_id) 3638 && journal_list_still_alive(sb, trans_id)
3630 && wait_on_commit) { 3639 && wait_on_commit) {
3631 flush_commit_list(p_s_sb, jl, 1); 3640 flush_commit_list(sb, jl, 1);
3632 } 3641 }
3633 return 0; 3642 return 0;
3634 } 3643 }
3635 unlock_journal(p_s_sb); 3644 unlock_journal(sb);
3636 return 0; 3645 return 0;
3637 } 3646 }
3638 3647
@@ -3649,13 +3658,13 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
3649 && journal->j_len_alloc < journal->j_max_batch 3658 && journal->j_len_alloc < journal->j_max_batch
3650 && journal->j_cnode_free > (journal->j_trans_max * 3)) { 3659 && journal->j_cnode_free > (journal->j_trans_max * 3)) {
3651 journal->j_bcount++; 3660 journal->j_bcount++;
3652 unlock_journal(p_s_sb); 3661 unlock_journal(sb);
3653 return 0; 3662 return 0;
3654 } 3663 }
3655 3664
3656 if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { 3665 if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
3657 reiserfs_panic(p_s_sb, 3666 reiserfs_panic(sb, "journal-003",
3658 "journal-003: journal_end: j_start (%ld) is too high\n", 3667 "j_start (%ld) is too high",
3659 journal->j_start); 3668 journal->j_start);
3660 } 3669 }
3661 return 1; 3670 return 1;
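The batching rule described before check_journal_end() reduces to the test visible just above: batch only when there is no flush or commit_now request, nobody is queued on j_join_wait, the batch still has room, and cnodes are plentiful. As a pure function (parameter names are stand-ins for the j_* fields):

#include <stdio.h>

static int can_batch(int flush, int commit_now, int writers_queued,
		     unsigned long len_alloc, unsigned long max_batch,
		     unsigned long cnode_free, unsigned long trans_max)
{
	if (flush || commit_now || writers_queued)
		return 0;
	/* same shape as the j_len_alloc/j_cnode_free test above */
	return len_alloc < max_batch && cnode_free > trans_max * 3;
}

int main(void)
{
	printf("%d\n", can_batch(0, 0, 0, 100, 900, 4000, 1024));	/* 1 */
	printf("%d\n", can_batch(1, 0, 0, 100, 900, 4000, 1024));	/* 0 */
	return 0;
}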
@@ -3664,7 +3673,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
3664/* 3673/*
3665** Does all the work that makes deleting blocks safe. 3674** Does all the work that makes deleting blocks safe.
3666** when deleting a block marked BH_JNew, just remove it from the current transaction, clean its buffer_head and move on. 3675** when deleting a block marked BH_JNew, just remove it from the current transaction, clean its buffer_head and move on.
3667** 3676**
3668** otherwise: 3677** otherwise:
3669** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes 3678** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes
3670** before this transaction has finished. 3679** before this transaction has finished.
@@ -3676,16 +3685,16 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
3676** Then remove it from the current transaction, decrementing any counters and filing it on the clean list. 3685** Then remove it from the current transaction, decrementing any counters and filing it on the clean list.
3677*/ 3686*/
3678int journal_mark_freed(struct reiserfs_transaction_handle *th, 3687int journal_mark_freed(struct reiserfs_transaction_handle *th,
3679 struct super_block *p_s_sb, b_blocknr_t blocknr) 3688 struct super_block *sb, b_blocknr_t blocknr)
3680{ 3689{
3681 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 3690 struct reiserfs_journal *journal = SB_JOURNAL(sb);
3682 struct reiserfs_journal_cnode *cn = NULL; 3691 struct reiserfs_journal_cnode *cn = NULL;
3683 struct buffer_head *bh = NULL; 3692 struct buffer_head *bh = NULL;
3684 struct reiserfs_list_bitmap *jb = NULL; 3693 struct reiserfs_list_bitmap *jb = NULL;
3685 int cleaned = 0; 3694 int cleaned = 0;
3686 BUG_ON(!th->t_trans_id); 3695 BUG_ON(!th->t_trans_id);
3687 3696
3688 cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr); 3697 cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
3689 if (cn && cn->bh) { 3698 if (cn && cn->bh) {
3690 bh = cn->bh; 3699 bh = cn->bh;
3691 get_bh(bh); 3700 get_bh(bh);
@@ -3695,15 +3704,15 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
3695 clear_buffer_journal_new(bh); 3704 clear_buffer_journal_new(bh);
3696 clear_prepared_bits(bh); 3705 clear_prepared_bits(bh);
3697 reiserfs_clean_and_file_buffer(bh); 3706 reiserfs_clean_and_file_buffer(bh);
3698 cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned); 3707 cleaned = remove_from_transaction(sb, blocknr, cleaned);
3699 } else { 3708 } else {
3700 /* set the bit for this block in the journal bitmap for this transaction */ 3709 /* set the bit for this block in the journal bitmap for this transaction */
3701 jb = journal->j_current_jl->j_list_bitmap; 3710 jb = journal->j_current_jl->j_list_bitmap;
3702 if (!jb) { 3711 if (!jb) {
3703 reiserfs_panic(p_s_sb, 3712 reiserfs_panic(sb, "journal-1702",
3704 "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n"); 3713 "journal_list_bitmap is NULL");
3705 } 3714 }
3706 set_bit_in_list_bitmap(p_s_sb, blocknr, jb); 3715 set_bit_in_list_bitmap(sb, blocknr, jb);
3707 3716
3708 /* Note, the entire while loop is not allowed to schedule. */ 3717 /* Note, the entire while loop is not allowed to schedule. */
3709 3718
@@ -3711,13 +3720,13 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
3711 clear_prepared_bits(bh); 3720 clear_prepared_bits(bh);
3712 reiserfs_clean_and_file_buffer(bh); 3721 reiserfs_clean_and_file_buffer(bh);
3713 } 3722 }
3714 cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned); 3723 cleaned = remove_from_transaction(sb, blocknr, cleaned);
3715 3724
3716 /* find all older transactions with this block, make sure they don't try to write it out */ 3725 /* find all older transactions with this block, make sure they don't try to write it out */
3717 cn = get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, 3726 cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
3718 blocknr); 3727 blocknr);
3719 while (cn) { 3728 while (cn) {
3720 if (p_s_sb == cn->sb && blocknr == cn->blocknr) { 3729 if (sb == cn->sb && blocknr == cn->blocknr) {
3721 set_bit(BLOCK_FREED, &cn->state); 3730 set_bit(BLOCK_FREED, &cn->state);
3722 if (cn->bh) { 3731 if (cn->bh) {
3723 if (!cleaned) { 3732 if (!cleaned) {
@@ -3733,8 +3742,9 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
3733 put_bh(cn->bh); 3742 put_bh(cn->bh);
3734 if (atomic_read 3743 if (atomic_read
3735 (&(cn->bh->b_count)) < 0) { 3744 (&(cn->bh->b_count)) < 0) {
3736 reiserfs_warning(p_s_sb, 3745 reiserfs_warning(sb,
3737 "journal-2138: cn->bh->b_count < 0"); 3746 "journal-2138",
3747 "cn->bh->b_count < 0");
3738 } 3748 }
3739 } 3749 }
3740 if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */ 3750 if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */
@@ -3824,7 +3834,7 @@ static int __commit_trans_jl(struct inode *inode, unsigned long id,
3824 3834
3825int reiserfs_commit_for_inode(struct inode *inode) 3835int reiserfs_commit_for_inode(struct inode *inode)
3826{ 3836{
3827 unsigned long id = REISERFS_I(inode)->i_trans_id; 3837 unsigned int id = REISERFS_I(inode)->i_trans_id;
3828 struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; 3838 struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
3829 3839
3830 /* for the whole inode, assume unset id means it was 3840 /* for the whole inode, assume unset id means it was
@@ -3839,18 +3849,18 @@ int reiserfs_commit_for_inode(struct inode *inode)
3839 return __commit_trans_jl(inode, id, jl); 3849 return __commit_trans_jl(inode, id, jl);
3840} 3850}
3841 3851
3842void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, 3852void reiserfs_restore_prepared_buffer(struct super_block *sb,
3843 struct buffer_head *bh) 3853 struct buffer_head *bh)
3844{ 3854{
3845 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 3855 struct reiserfs_journal *journal = SB_JOURNAL(sb);
3846 PROC_INFO_INC(p_s_sb, journal.restore_prepared); 3856 PROC_INFO_INC(sb, journal.restore_prepared);
3847 if (!bh) { 3857 if (!bh) {
3848 return; 3858 return;
3849 } 3859 }
3850 if (test_clear_buffer_journal_restore_dirty(bh) && 3860 if (test_clear_buffer_journal_restore_dirty(bh) &&
3851 buffer_journal_dirty(bh)) { 3861 buffer_journal_dirty(bh)) {
3852 struct reiserfs_journal_cnode *cn; 3862 struct reiserfs_journal_cnode *cn;
3853 cn = get_journal_hash_dev(p_s_sb, 3863 cn = get_journal_hash_dev(sb,
3854 journal->j_list_hash_table, 3864 journal->j_list_hash_table,
3855 bh->b_blocknr); 3865 bh->b_blocknr);
3856 if (cn && can_dirty(cn)) { 3866 if (cn && can_dirty(cn)) {
@@ -3867,12 +3877,12 @@ extern struct tree_balance *cur_tb;
3867** be written to disk while we are altering it. So, we must: 3877** be written to disk while we are altering it. So, we must:
3868** clean it 3878** clean it
3869** wait on it. 3879** wait on it.
3870** 3880**
3871*/ 3881*/
3872int reiserfs_prepare_for_journal(struct super_block *p_s_sb, 3882int reiserfs_prepare_for_journal(struct super_block *sb,
3873 struct buffer_head *bh, int wait) 3883 struct buffer_head *bh, int wait)
3874{ 3884{
3875 PROC_INFO_INC(p_s_sb, journal.prepare); 3885 PROC_INFO_INC(sb, journal.prepare);
3876 3886
3877 if (!trylock_buffer(bh)) { 3887 if (!trylock_buffer(bh)) {
3878 if (!wait) 3888 if (!wait)
@@ -3909,7 +3919,7 @@ static void flush_old_journal_lists(struct super_block *s)
3909 } 3919 }
3910} 3920}
3911 3921
3912/* 3922/*
3913** long and ugly. If flush, will not return until all commit 3923** long and ugly. If flush, will not return until all commit
3914** blocks and all real buffers in the trans are on disk. 3924** blocks and all real buffers in the trans are on disk.
3915** If no_async, won't return until all commit blocks are on disk. 3925** If no_async, won't return until all commit blocks are on disk.
@@ -3920,10 +3930,10 @@ static void flush_old_journal_lists(struct super_block *s)
3920** journal lists, etc just won't happen. 3930** journal lists, etc just won't happen.
3921*/ 3931*/
3922static int do_journal_end(struct reiserfs_transaction_handle *th, 3932static int do_journal_end(struct reiserfs_transaction_handle *th,
3923 struct super_block *p_s_sb, unsigned long nblocks, 3933 struct super_block *sb, unsigned long nblocks,
3924 int flags) 3934 int flags)
3925{ 3935{
3926 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); 3936 struct reiserfs_journal *journal = SB_JOURNAL(sb);
3927 struct reiserfs_journal_cnode *cn, *next, *jl_cn; 3937 struct reiserfs_journal_cnode *cn, *next, *jl_cn;
3928 struct reiserfs_journal_cnode *last_cn = NULL; 3938 struct reiserfs_journal_cnode *last_cn = NULL;
3929 struct reiserfs_journal_desc *desc; 3939 struct reiserfs_journal_desc *desc;
@@ -3938,7 +3948,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
3938 struct reiserfs_journal_list *jl, *temp_jl; 3948 struct reiserfs_journal_list *jl, *temp_jl;
3939 struct list_head *entry, *safe; 3949 struct list_head *entry, *safe;
3940 unsigned long jindex; 3950 unsigned long jindex;
3941 unsigned long commit_trans_id; 3951 unsigned int commit_trans_id;
3942 int trans_half; 3952 int trans_half;
3943 3953
3944 BUG_ON(th->t_refcount > 1); 3954 BUG_ON(th->t_refcount > 1);
@@ -3946,21 +3956,21 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
3946 3956
3947 /* protect flush_older_commits from doing mistakes if the 3957 /* protect flush_older_commits from doing mistakes if the
3948 transaction ID counter gets overflowed. */ 3958 transaction ID counter gets overflowed. */
3949 if (th->t_trans_id == ~0UL) 3959 if (th->t_trans_id == ~0U)
3950 flags |= FLUSH_ALL | COMMIT_NOW | WAIT; 3960 flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
3951 flush = flags & FLUSH_ALL; 3961 flush = flags & FLUSH_ALL;
3952 wait_on_commit = flags & WAIT; 3962 wait_on_commit = flags & WAIT;
3953 3963
3954 put_fs_excl(); 3964 put_fs_excl();
3955 current->journal_info = th->t_handle_save; 3965 current->journal_info = th->t_handle_save;
3956 reiserfs_check_lock_depth(p_s_sb, "journal end"); 3966 reiserfs_check_lock_depth(sb, "journal end");
3957 if (journal->j_len == 0) { 3967 if (journal->j_len == 0) {
3958 reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 3968 reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
3959 1); 3969 1);
3960 journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)); 3970 journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
3961 } 3971 }
3962 3972
3963 lock_journal(p_s_sb); 3973 lock_journal(sb);
3964 if (journal->j_next_full_flush) { 3974 if (journal->j_next_full_flush) {
3965 flags |= FLUSH_ALL; 3975 flags |= FLUSH_ALL;
3966 flush = 1; 3976 flush = 1;
@@ -3970,13 +3980,13 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
3970 wait_on_commit = 1; 3980 wait_on_commit = 1;
3971 } 3981 }
3972 3982
3973 /* check_journal_end locks the journal, and unlocks if it does not return 1; 3983 /* check_journal_end locks the journal, and unlocks if it does not return 1;
3974 ** it tells us whether we should continue with the journal_end, or just return 3984 ** it tells us whether we should continue with the journal_end, or just return
3975 */ 3985 */
3976 if (!check_journal_end(th, p_s_sb, nblocks, flags)) { 3986 if (!check_journal_end(th, sb, nblocks, flags)) {
3977 p_s_sb->s_dirt = 1; 3987 sb->s_dirt = 1;
3978 wake_queued_writers(p_s_sb); 3988 wake_queued_writers(sb);
3979 reiserfs_async_progress_wait(p_s_sb); 3989 reiserfs_async_progress_wait(sb);
3980 goto out; 3990 goto out;
3981 } 3991 }
3982 3992
@@ -4005,8 +4015,8 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4005 4015
4006 /* setup description block */ 4016 /* setup description block */
4007 d_bh = 4017 d_bh =
4008 journal_getblk(p_s_sb, 4018 journal_getblk(sb,
4009 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 4019 SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
4010 journal->j_start); 4020 journal->j_start);
4011 set_buffer_uptodate(d_bh); 4021 set_buffer_uptodate(d_bh);
4012 desc = (struct reiserfs_journal_desc *)(d_bh)->b_data; 4022 desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
@@ -4015,9 +4025,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4015 set_desc_trans_id(desc, journal->j_trans_id); 4025 set_desc_trans_id(desc, journal->j_trans_id);
4016 4026
4017 /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */ 4027 /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */
4018 c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 4028 c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
4019 ((journal->j_start + journal->j_len + 4029 ((journal->j_start + journal->j_len +
4020 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))); 4030 1) % SB_ONDISK_JOURNAL_SIZE(sb)));
4021 commit = (struct reiserfs_journal_commit *)c_bh->b_data; 4031 commit = (struct reiserfs_journal_commit *)c_bh->b_data;
4022 memset(c_bh->b_data, 0, c_bh->b_size); 4032 memset(c_bh->b_data, 0, c_bh->b_size);
4023 set_commit_trans_id(commit, journal->j_trans_id); 4033 set_commit_trans_id(commit, journal->j_trans_id);
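As the code above shows, the description block sits at j_start and the commit block at (j_start + j_len + 1) modulo the on-disk journal size, one slot past the last logged block, wrapping around if needed. The same arithmetic with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long journal_size = 8192;		/* blocks, assumed */
	unsigned long j_start = 8190, j_len = 5;	/* wraps past the end */
	unsigned long desc_block = j_start % journal_size;
	unsigned long commit_block = (j_start + j_len + 1) % journal_size;
	printf("desc at %lu, commit at %lu\n", desc_block, commit_block);
	return 0;	/* desc at 8190, commit at 4 */
}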
@@ -4050,13 +4060,13 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4050 ** for each real block, add it to the journal list hash, 4060 ** for each real block, add it to the journal list hash,
4051 ** copy into real block index array in the commit or desc block 4061 ** copy into real block index array in the commit or desc block
4052 */ 4062 */
4053 trans_half = journal_trans_half(p_s_sb->s_blocksize); 4063 trans_half = journal_trans_half(sb->s_blocksize);
4054 for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { 4064 for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
4055 if (buffer_journaled(cn->bh)) { 4065 if (buffer_journaled(cn->bh)) {
4056 jl_cn = get_cnode(p_s_sb); 4066 jl_cn = get_cnode(sb);
4057 if (!jl_cn) { 4067 if (!jl_cn) {
4058 reiserfs_panic(p_s_sb, 4068 reiserfs_panic(sb, "journal-1676",
4059 "journal-1676, get_cnode returned NULL\n"); 4069 "get_cnode returned NULL");
4060 } 4070 }
4061 if (i == 0) { 4071 if (i == 0) {
4062 jl->j_realblock = jl_cn; 4072 jl->j_realblock = jl_cn;
@@ -4067,18 +4077,19 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4067 last_cn->next = jl_cn; 4077 last_cn->next = jl_cn;
4068 } 4078 }
4069 last_cn = jl_cn; 4079 last_cn = jl_cn;
4070 /* make sure the block we are trying to log is not a block 4080 /* make sure the block we are trying to log is not a block
4071 of journal or reserved area */ 4081 of journal or reserved area */
4072 4082
4073 if (is_block_in_log_or_reserved_area 4083 if (is_block_in_log_or_reserved_area
4074 (p_s_sb, cn->bh->b_blocknr)) { 4084 (sb, cn->bh->b_blocknr)) {
4075 reiserfs_panic(p_s_sb, 4085 reiserfs_panic(sb, "journal-2332",
4076 "journal-2332: Trying to log block %lu, which is a log block\n", 4086 "Trying to log block %lu, "
4087 "which is a log block",
4077 cn->bh->b_blocknr); 4088 cn->bh->b_blocknr);
4078 } 4089 }
4079 jl_cn->blocknr = cn->bh->b_blocknr; 4090 jl_cn->blocknr = cn->bh->b_blocknr;
4080 jl_cn->state = 0; 4091 jl_cn->state = 0;
4081 jl_cn->sb = p_s_sb; 4092 jl_cn->sb = sb;
4082 jl_cn->bh = cn->bh; 4093 jl_cn->bh = cn->bh;
4083 jl_cn->jlist = jl; 4094 jl_cn->jlist = jl;
4084 insert_journal_hash(journal->j_list_hash_table, jl_cn); 4095 insert_journal_hash(journal->j_list_hash_table, jl_cn);
@@ -4119,11 +4130,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4119 char *addr; 4130 char *addr;
4120 struct page *page; 4131 struct page *page;
4121 tmp_bh = 4132 tmp_bh =
4122 journal_getblk(p_s_sb, 4133 journal_getblk(sb,
4123 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 4134 SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
4124 ((cur_write_start + 4135 ((cur_write_start +
4125 jindex) % 4136 jindex) %
4126 SB_ONDISK_JOURNAL_SIZE(p_s_sb))); 4137 SB_ONDISK_JOURNAL_SIZE(sb)));
4127 set_buffer_uptodate(tmp_bh); 4138 set_buffer_uptodate(tmp_bh);
4128 page = cn->bh->b_page; 4139 page = cn->bh->b_page;
4129 addr = kmap(page); 4140 addr = kmap(page);
@@ -4137,12 +4148,13 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4137 clear_buffer_journaled(cn->bh); 4148 clear_buffer_journaled(cn->bh);
4138 } else { 4149 } else {
4139 /* JDirty cleared sometime during transaction. don't log this one */ 4150 /* JDirty cleared sometime during transaction. don't log this one */
4140 reiserfs_warning(p_s_sb, 4151 reiserfs_warning(sb, "journal-2048",
4141 "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!"); 4152 "BAD, buffer in journal hash, "
4153 "but not JDirty!");
4142 brelse(cn->bh); 4154 brelse(cn->bh);
4143 } 4155 }
4144 next = cn->next; 4156 next = cn->next;
4145 free_cnode(p_s_sb, cn); 4157 free_cnode(sb, cn);
4146 cn = next; 4158 cn = next;
4147 cond_resched(); 4159 cond_resched();
4148 } 4160 }
@@ -4152,7 +4164,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4152 ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. 4164 ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
4153 */ 4165 */
4154 4166
4155 journal->j_current_jl = alloc_journal_list(p_s_sb); 4167 journal->j_current_jl = alloc_journal_list(sb);
4156 4168
4157 /* now it is safe to insert this transaction on the main list */ 4169 /* now it is safe to insert this transaction on the main list */
4158 list_add_tail(&jl->j_list, &journal->j_journal_list); 4170 list_add_tail(&jl->j_list, &journal->j_journal_list);
@@ -4163,7 +4175,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4163 old_start = journal->j_start; 4175 old_start = journal->j_start;
4164 journal->j_start = 4176 journal->j_start =
4165 (journal->j_start + journal->j_len + 4177 (journal->j_start + journal->j_len +
4166 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb); 4178 2) % SB_ONDISK_JOURNAL_SIZE(sb);
4167 atomic_set(&(journal->j_wcount), 0); 4179 atomic_set(&(journal->j_wcount), 0);
4168 journal->j_bcount = 0; 4180 journal->j_bcount = 0;
4169 journal->j_last = NULL; 4181 journal->j_last = NULL;
@@ -4178,7 +4190,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4178 journal->j_len_alloc = 0; 4190 journal->j_len_alloc = 0;
4179 journal->j_next_full_flush = 0; 4191 journal->j_next_full_flush = 0;
4180 journal->j_next_async_flush = 0; 4192 journal->j_next_async_flush = 0;
4181 init_journal_hash(p_s_sb); 4193 init_journal_hash(sb);
4182 4194
4183 // make sure reiserfs_add_jh sees the new current_jl before we 4195 // make sure reiserfs_add_jh sees the new current_jl before we
4184 // write out the tails 4196 // write out the tails
@@ -4207,14 +4219,14 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4207 ** queue don't wait for this proc to flush journal lists and such. 4219 ** queue don't wait for this proc to flush journal lists and such.
4208 */ 4220 */
4209 if (flush) { 4221 if (flush) {
4210 flush_commit_list(p_s_sb, jl, 1); 4222 flush_commit_list(sb, jl, 1);
4211 flush_journal_list(p_s_sb, jl, 1); 4223 flush_journal_list(sb, jl, 1);
4212 } else if (!(jl->j_state & LIST_COMMIT_PENDING)) 4224 } else if (!(jl->j_state & LIST_COMMIT_PENDING))
4213 queue_delayed_work(commit_wq, &journal->j_work, HZ / 10); 4225 queue_delayed_work(commit_wq, &journal->j_work, HZ / 10);
4214 4226
4215 /* if the next transaction has any chance of wrapping, flush 4227 /* if the next transaction has any chance of wrapping, flush
4216 ** transactions that might get overwritten. If any journal lists are very 4228 ** transactions that might get overwritten. If any journal lists are very
4217 ** old, flush them as well. 4229 ** old, flush them as well.
4218 */ 4230 */
4219 first_jl: 4231 first_jl:
4220 list_for_each_safe(entry, safe, &journal->j_journal_list) { 4232 list_for_each_safe(entry, safe, &journal->j_journal_list) {
@@ -4222,11 +4234,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4222 if (journal->j_start <= temp_jl->j_start) { 4234 if (journal->j_start <= temp_jl->j_start) {
4223 if ((journal->j_start + journal->j_trans_max + 1) >= 4235 if ((journal->j_start + journal->j_trans_max + 1) >=
4224 temp_jl->j_start) { 4236 temp_jl->j_start) {
4225 flush_used_journal_lists(p_s_sb, temp_jl); 4237 flush_used_journal_lists(sb, temp_jl);
4226 goto first_jl; 4238 goto first_jl;
4227 } else if ((journal->j_start + 4239 } else if ((journal->j_start +
4228 journal->j_trans_max + 1) < 4240 journal->j_trans_max + 1) <
4229 SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { 4241 SB_ONDISK_JOURNAL_SIZE(sb)) {
4230 /* if we don't cross into the next transaction and we don't 4242 /* if we don't cross into the next transaction and we don't
4231 * wrap, there is no way we can overlap any later transactions 4243 * wrap, there is no way we can overlap any later transactions
4232 * break now 4244 * break now
@@ -4235,11 +4247,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4235 } 4247 }
4236 } else if ((journal->j_start + 4248 } else if ((journal->j_start +
4237 journal->j_trans_max + 1) > 4249 journal->j_trans_max + 1) >
4238 SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { 4250 SB_ONDISK_JOURNAL_SIZE(sb)) {
4239 if (((journal->j_start + journal->j_trans_max + 1) % 4251 if (((journal->j_start + journal->j_trans_max + 1) %
4240 SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= 4252 SB_ONDISK_JOURNAL_SIZE(sb)) >=
4241 temp_jl->j_start) { 4253 temp_jl->j_start) {
4242 flush_used_journal_lists(p_s_sb, temp_jl); 4254 flush_used_journal_lists(sb, temp_jl);
4243 goto first_jl; 4255 goto first_jl;
4244 } else { 4256 } else {
4245 /* we don't overlap anything from our start to the end of the 4257 /* we don't overlap anything from our start to the end of the
@@ -4250,46 +4262,47 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4250 } 4262 }
4251 } 4263 }
4252 } 4264 }
4253 flush_old_journal_lists(p_s_sb); 4265 flush_old_journal_lists(sb);
4254 4266
4255 journal->j_current_jl->j_list_bitmap = 4267 journal->j_current_jl->j_list_bitmap =
4256 get_list_bitmap(p_s_sb, journal->j_current_jl); 4268 get_list_bitmap(sb, journal->j_current_jl);
4257 4269
4258 if (!(journal->j_current_jl->j_list_bitmap)) { 4270 if (!(journal->j_current_jl->j_list_bitmap)) {
4259 reiserfs_panic(p_s_sb, 4271 reiserfs_panic(sb, "journal-1996",
4260 "journal-1996: do_journal_end, could not get a list bitmap\n"); 4272 "could not get a list bitmap");
4261 } 4273 }
4262 4274
4263 atomic_set(&(journal->j_jlock), 0); 4275 atomic_set(&(journal->j_jlock), 0);
4264 unlock_journal(p_s_sb); 4276 unlock_journal(sb);
4265 /* wake up anybody waiting to join. */ 4277 /* wake up anybody waiting to join. */
4266 clear_bit(J_WRITERS_QUEUED, &journal->j_state); 4278 clear_bit(J_WRITERS_QUEUED, &journal->j_state);
4267 wake_up(&(journal->j_join_wait)); 4279 wake_up(&(journal->j_join_wait));
4268 4280
4269 if (!flush && wait_on_commit && 4281 if (!flush && wait_on_commit &&
4270 journal_list_still_alive(p_s_sb, commit_trans_id)) { 4282 journal_list_still_alive(sb, commit_trans_id)) {
4271 flush_commit_list(p_s_sb, jl, 1); 4283 flush_commit_list(sb, jl, 1);
4272 } 4284 }
4273 out: 4285 out:
4274 reiserfs_check_lock_depth(p_s_sb, "journal end2"); 4286 reiserfs_check_lock_depth(sb, "journal end2");
4275 4287
4276 memset(th, 0, sizeof(*th)); 4288 memset(th, 0, sizeof(*th));
4277 /* Re-set th->t_super, so we can properly keep track of how many 4289 /* Re-set th->t_super, so we can properly keep track of how many
4278 * persistent transactions there are. We need to do this so if this 4290 * persistent transactions there are. We need to do this so if this
4279 * call is part of a failed restart_transaction, we can free it later */ 4291 * call is part of a failed restart_transaction, we can free it later */
4280 th->t_super = p_s_sb; 4292 th->t_super = sb;
4281 4293
4282 return journal->j_errno; 4294 return journal->j_errno;
4283} 4295}
4284 4296
4285static void __reiserfs_journal_abort_hard(struct super_block *sb) 4297/* Set the file system read-only and refuse new transactions */
4298void reiserfs_abort_journal(struct super_block *sb, int errno)
4286{ 4299{
4287 struct reiserfs_journal *journal = SB_JOURNAL(sb); 4300 struct reiserfs_journal *journal = SB_JOURNAL(sb);
4288 if (test_bit(J_ABORTED, &journal->j_state)) 4301 if (test_bit(J_ABORTED, &journal->j_state))
4289 return; 4302 return;
4290 4303
4291 printk(KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n", 4304 if (!journal->j_errno)
4292 reiserfs_bdevname(sb)); 4305 journal->j_errno = errno;
4293 4306
4294 sb->s_flags |= MS_RDONLY; 4307 sb->s_flags |= MS_RDONLY;
4295 set_bit(J_ABORTED, &journal->j_state); 4308 set_bit(J_ABORTED, &journal->j_state);
@@ -4299,19 +4312,3 @@ static void __reiserfs_journal_abort_hard(struct super_block *sb)
4299#endif 4312#endif
4300} 4313}
4301 4314
4302static void __reiserfs_journal_abort_soft(struct super_block *sb, int errno)
4303{
4304 struct reiserfs_journal *journal = SB_JOURNAL(sb);
4305 if (test_bit(J_ABORTED, &journal->j_state))
4306 return;
4307
4308 if (!journal->j_errno)
4309 journal->j_errno = errno;
4310
4311 __reiserfs_journal_abort_hard(sb);
4312}
4313
4314void reiserfs_journal_abort(struct super_block *sb, int errno)
4315{
4316 __reiserfs_journal_abort_soft(sb, errno);
4317}
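The reworked reiserfs_abort_journal() above records the first errno, forces the superblock read-only, and latches J_ABORTED so later calls become no-ops. A userspace model of that sticky-abort pattern; the flag names are borrowed, everything else is a stand-in:

#include <stdio.h>

#define FAKE_J_ABORTED	(1UL << 0)
#define FAKE_MS_RDONLY	(1UL << 1)

struct fake_sb {
	unsigned long s_flags;
	unsigned long j_state;
	int j_errno;
};

static void abort_journal(struct fake_sb *sb, int err)
{
	if (sb->j_state & FAKE_J_ABORTED)
		return;				/* abort is sticky */
	if (!sb->j_errno)
		sb->j_errno = err;		/* keep only the first error */
	sb->s_flags |= FAKE_MS_RDONLY;
	sb->j_state |= FAKE_J_ABORTED;
}

int main(void)
{
	struct fake_sb sb = { 0 };
	abort_journal(&sb, 5);	/* EIO */
	abort_journal(&sb, 22);	/* EINVAL: ignored, already aborted */
	printf("j_errno=%d aborted=%d\n", sb.j_errno,
	       !!(sb.j_state & FAKE_J_ABORTED));	/* j_errno=5 aborted=1 */
	return 0;
}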
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 6de060a6aa7f..381750a155f6 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -111,7 +111,7 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
111 item_num_in_dest = 111 item_num_in_dest =
112 (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0; 112 (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0;
113 113
114 leaf_paste_entries(dest_bi->bi_bh, item_num_in_dest, 114 leaf_paste_entries(dest_bi, item_num_in_dest,
115 (last_first == 115 (last_first ==
116 FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD(dest, 116 FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD(dest,
117 item_num_in_dest)) 117 item_num_in_dest))
@@ -119,8 +119,8 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
119 DEH_SIZE * copy_count + copy_records_len); 119 DEH_SIZE * copy_count + copy_records_len);
120} 120}
121 121
122/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or 122/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or
123 part of it or nothing (see the return 0 below) from SOURCE to the end 123 part of it or nothing (see the return 0 below) from SOURCE to the end
124 (if last_first) or beginning (!last_first) of the DEST */ 124 (if last_first) or beginning (!last_first) of the DEST */
125/* returns 1 if anything was copied, else 0 */ 125/* returns 1 if anything was copied, else 0 */
126static int leaf_copy_boundary_item(struct buffer_info *dest_bi, 126static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
@@ -168,10 +168,11 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
168 if (bytes_or_entries == ih_item_len(ih) 168 if (bytes_or_entries == ih_item_len(ih)
169 && is_indirect_le_ih(ih)) 169 && is_indirect_le_ih(ih))
170 if (get_ih_free_space(ih)) 170 if (get_ih_free_space(ih))
171 reiserfs_panic(NULL, 171 reiserfs_panic(sb_from_bi(dest_bi),
172 "vs-10020: leaf_copy_boundary_item: " 172 "vs-10020",
173 "last unformatted node must be filled entirely (%h)", 173 "last unformatted node "
174 ih); 174 "must be filled "
175 "entirely (%h)", ih);
175 } 176 }
176#endif 177#endif
177 178
@@ -395,7 +396,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
395 else { 396 else {
396 struct item_head n_ih; 397 struct item_head n_ih;
397 398
398 /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST 399 /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST
399 part defined by 'cpy_bytes'; create new item header; change old item_header (????); 400 part defined by 'cpy_bytes'; create new item header; change old item_header (????);
400 n_ih = new item_header; 401 n_ih = new item_header;
401 */ 402 */
@@ -425,7 +426,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
425 else { 426 else {
426 struct item_head n_ih; 427 struct item_head n_ih;
427 428
428 /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST 429 /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST
429 part defined by 'cpy_bytes'; create new item header; 430 part defined by 'cpy_bytes'; create new item header;
430 n_ih = new item_header; 431 n_ih = new item_header;
431 */ 432 */
@@ -622,9 +623,8 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
622 break; 623 break;
623 624
624 default: 625 default:
625 reiserfs_panic(NULL, 626 reiserfs_panic(sb_from_bi(src_bi), "vs-10250",
626 "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)", 627 "shift type is unknown (%d)", shift_mode);
627 shift_mode);
628 } 628 }
629 RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh, 629 RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh,
630 "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", 630 "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
@@ -674,9 +674,9 @@ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
674#ifdef CONFIG_REISERFS_CHECK 674#ifdef CONFIG_REISERFS_CHECK
675 if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { 675 if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
676 print_cur_tb("vs-10275"); 676 print_cur_tb("vs-10275");
677 reiserfs_panic(tb->tb_sb, 677 reiserfs_panic(tb->tb_sb, "vs-10275",
678 "vs-10275: leaf_shift_left: balance condition corrupted (%c)", 678 "balance condition corrupted "
679 tb->tb_mode); 679 "(%c)", tb->tb_mode);
680 } 680 }
681#endif 681#endif
682 682
@@ -724,7 +724,7 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
724static void leaf_delete_items_entirely(struct buffer_info *bi, 724static void leaf_delete_items_entirely(struct buffer_info *bi,
725 int first, int del_num); 725 int first, int del_num);
726/* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR. 726/* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR.
727 Otherwise: 727 Otherwise:
727 Otherwise: 727 Otherwise:
728 If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of 728 If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of
729 the first item. Part defined by del_bytes. Don't delete first item header 729 the first item. Part defined by del_bytes. Don't delete first item header
730 If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of 730 If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of
@@ -783,7 +783,7 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
783 /* len = body len of item */ 783 /* len = body len of item */
784 len = ih_item_len(ih); 784 len = ih_item_len(ih);
785 785
786 /* delete the part of the last item of the bh 786 /* delete the part of the last item of the bh
787 do not delete item header 787 do not delete item header
788 */ 788 */
789 leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1, 789 leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1,
@@ -865,7 +865,7 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before,
865 } 865 }
866} 866}
867 867
868/* paste paste_size bytes to affected_item_num-th item. 868/* paste paste_size bytes to affected_item_num-th item.
869 When item is a directory, this only prepares space for new entries */ 869 When item is a directory, this only prepares space for new entries */
870void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, 870void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
871 int pos_in_item, int paste_size, 871 int pos_in_item, int paste_size,
@@ -889,9 +889,12 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
889 889
890#ifdef CONFIG_REISERFS_CHECK 890#ifdef CONFIG_REISERFS_CHECK
891 if (zeros_number > paste_size) { 891 if (zeros_number > paste_size) {
892 struct super_block *sb = NULL;
893 if (bi && bi->tb)
894 sb = bi->tb->tb_sb;
892 print_cur_tb("10177"); 895 print_cur_tb("10177");
893 reiserfs_panic(NULL, 896 reiserfs_panic(sb, "vs-10177",
894 "vs-10177: leaf_paste_in_buffer: ero number == %d, paste_size == %d", 897 "zeros_number == %d, paste_size == %d",
895 zeros_number, paste_size); 898 zeros_number, paste_size);
896 } 899 }
897#endif /* CONFIG_REISERFS_CHECK */ 900#endif /* CONFIG_REISERFS_CHECK */
@@ -1019,7 +1022,7 @@ static int leaf_cut_entries(struct buffer_head *bh,
1019/* when cut item is part of regular file 1022/* when cut item is part of regular file
1020 pos_in_item - first byte that must be cut 1023 pos_in_item - first byte that must be cut
1021 cut_size - number of bytes to be cut beginning from pos_in_item 1024 cut_size - number of bytes to be cut beginning from pos_in_item
1022 1025
1023 when cut item is part of directory 1026 when cut item is part of directory
1024 pos_in_item - number of first deleted entry 1027 pos_in_item - number of first deleted entry
1025 cut_size - count of deleted entries 1028 cut_size - count of deleted entries
@@ -1191,7 +1194,7 @@ static void leaf_delete_items_entirely(struct buffer_info *bi,
1191} 1194}
1192 1195
1193/* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */ 1196/* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */
1194void leaf_paste_entries(struct buffer_head *bh, 1197void leaf_paste_entries(struct buffer_info *bi,
1195 int item_num, 1198 int item_num,
1196 int before, 1199 int before,
1197 int new_entry_count, 1200 int new_entry_count,
@@ -1203,6 +1206,7 @@ void leaf_paste_entries(struct buffer_head *bh,
1203 struct reiserfs_de_head *deh; 1206 struct reiserfs_de_head *deh;
1204 char *insert_point; 1207 char *insert_point;
1205 int i, old_entry_num; 1208 int i, old_entry_num;
1209 struct buffer_head *bh = bi->bi_bh;
1206 1210
1207 if (new_entry_count == 0) 1211 if (new_entry_count == 0)
1208 return; 1212 return;
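leaf_paste_entries() switching from a bare buffer_head to a buffer_info is what lets the directory-corruption checks further down reach the superblock through sb_from_bi(). For reference, buffer_info couples the buffer with its balance context; roughly (field list inferred from its uses in this file, so treat it as a sketch):

	struct buffer_info {
		struct tree_balance *tb;	/* balance in progress; tb->tb_sb is the sb */
		struct buffer_head *bi_bh;	/* the node this info describes */
		struct buffer_head *bi_parent;	/* its parent node, if any */
		int bi_position;		/* position within the parent */
	};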
@@ -1271,7 +1275,7 @@ void leaf_paste_entries(struct buffer_head *bh,
1271 /* change item key if necessary (when we paste before 0-th entry */ 1275 /* change item key if necessary (when we paste before 0-th entry */
1272 if (!before) { 1276 if (!before) {
1273 set_le_ih_k_offset(ih, deh_offset(new_dehs)); 1277 set_le_ih_k_offset(ih, deh_offset(new_dehs));
1274/* memcpy (&ih->ih_key.k_offset, 1278/* memcpy (&ih->ih_key.k_offset,
1275 &new_dehs->deh_offset, SHORT_KEY_SIZE);*/ 1279 &new_dehs->deh_offset, SHORT_KEY_SIZE);*/
1276 } 1280 }
1277#ifdef CONFIG_REISERFS_CHECK 1281#ifdef CONFIG_REISERFS_CHECK
@@ -1287,13 +1291,17 @@ void leaf_paste_entries(struct buffer_head *bh,
1287 prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0; 1291 prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0;
1288 1292
1289 if (prev && prev <= deh_location(&(deh[i]))) 1293 if (prev && prev <= deh_location(&(deh[i])))
1290 reiserfs_warning(NULL, 1294 reiserfs_error(sb_from_bi(bi), "vs-10240",
1291 "vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)", 1295 "directory item (%h) "
1292 ih, deh + i - 1, i, deh + i); 1296 "corrupted (prev %a, "
1297 "cur(%d) %a)",
1298 ih, deh + i - 1, i, deh + i);
1293 if (next && next >= deh_location(&(deh[i]))) 1299 if (next && next >= deh_location(&(deh[i])))
1294 reiserfs_warning(NULL, 1300 reiserfs_error(sb_from_bi(bi), "vs-10250",
1295 "vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)", 1301 "directory item (%h) "
1296 ih, i, deh + i, deh + i + 1); 1302 "corrupted (cur(%d) %a, "
1303 "next %a)",
1304 ih, i, deh + i, deh + i + 1);
1297 } 1305 }
1298 } 1306 }
1299#endif 1307#endif
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 738967f6c8ee..efd4d720718e 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -106,7 +106,7 @@ key of the first directory entry in it.
106This function first calls search_by_key, then, if item whose first 106This function first calls search_by_key, then, if item whose first
107entry matches is not found it looks for the entry inside directory 107entry matches is not found it looks for the entry inside directory
108item found by search_by_key. Fills the path to the entry, and to the 108item found by search_by_key. Fills the path to the entry, and to the
109entry position in the item 109entry position in the item
110 110
111*/ 111*/
112 112
@@ -120,8 +120,8 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
120 switch (retval) { 120 switch (retval) {
121 case ITEM_NOT_FOUND: 121 case ITEM_NOT_FOUND:
122 if (!PATH_LAST_POSITION(path)) { 122 if (!PATH_LAST_POSITION(path)) {
123 reiserfs_warning(sb, 123 reiserfs_error(sb, "vs-7000", "search_by_key "
124 "vs-7000: search_by_entry_key: search_by_key returned item position == 0"); 124 "returned item position == 0");
125 pathrelse(path); 125 pathrelse(path);
126 return IO_ERROR; 126 return IO_ERROR;
127 } 127 }
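This hunk shows the convention the whole patch follows: the error id moves out of the format string into its own argument, the hand-written function name goes away (it is presumably spliced back in by a wrapper macro; see the prints.c hunks below), and checks that imply on-disk damage are upgraded from reiserfs_warning() to reiserfs_error(). Side by side:

	/* before: id and function name baked into the format string */
	reiserfs_warning(sb,
			 "vs-7000: search_by_entry_key: search_by_key returned item position == 0");

	/* after: the id stands alone and __func__ arrives implicitly */
	reiserfs_error(sb, "vs-7000", "search_by_key returned item position == 0");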
@@ -135,8 +135,7 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
135 135
136 default: 136 default:
137 pathrelse(path); 137 pathrelse(path);
138 reiserfs_warning(sb, 138 reiserfs_error(sb, "vs-7002", "no path to here");
139 "vs-7002: search_by_entry_key: no path to here");
140 return IO_ERROR; 139 return IO_ERROR;
141 } 140 }
142 141
@@ -146,10 +145,9 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
146 if (!is_direntry_le_ih(de->de_ih) || 145 if (!is_direntry_le_ih(de->de_ih) ||
147 COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) { 146 COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) {
148 print_block(de->de_bh, 0, -1, -1); 147 print_block(de->de_bh, 0, -1, -1);
149 reiserfs_panic(sb, 148 reiserfs_panic(sb, "vs-7005", "found item %h is not directory "
150 "vs-7005: search_by_entry_key: found item %h is not directory item or " 149 "item or does not belong to the same directory "
151 "does not belong to the same directory as key %K", 150 "as key %K", de->de_ih, key);
152 de->de_ih, key);
153 } 151 }
154#endif /* CONFIG_REISERFS_CHECK */ 152#endif /* CONFIG_REISERFS_CHECK */
155 153
@@ -300,8 +298,7 @@ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
300 search_by_entry_key(dir->i_sb, &key_to_search, 298 search_by_entry_key(dir->i_sb, &key_to_search,
301 path_to_entry, de); 299 path_to_entry, de);
302 if (retval == IO_ERROR) { 300 if (retval == IO_ERROR) {
303 reiserfs_warning(dir->i_sb, "zam-7001: io error in %s", 301 reiserfs_error(dir->i_sb, "zam-7001", "io error");
304 __func__);
305 return IO_ERROR; 302 return IO_ERROR;
306 } 303 }
307 304
@@ -361,9 +358,10 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
361 return ERR_PTR(-EACCES); 358 return ERR_PTR(-EACCES);
362 } 359 }
363 360
364 /* Propogate the priv_object flag so we know we're in the priv tree */ 361 /* Propagate the private flag so we know we're
365 if (is_reiserfs_priv_object(dir)) 362 * in the priv tree */
366 reiserfs_mark_inode_private(inode); 363 if (IS_PRIVATE(dir))
364 inode->i_flags |= S_PRIVATE;
367 } 365 }
368 reiserfs_write_unlock(dir->i_sb); 366 reiserfs_write_unlock(dir->i_sb);
369 if (retval == IO_ERROR) { 367 if (retval == IO_ERROR) {
@@ -373,7 +371,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
373 return d_splice_alias(inode, dentry); 371 return d_splice_alias(inode, dentry);
374} 372}
375 373
376/* 374/*
377** looks up the dentry of the parent directory for child. 375** looks up the dentry of the parent directory for child.
378** taken from ext2_get_parent 376** taken from ext2_get_parent
379*/ 377*/
@@ -403,7 +401,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child)
403 return d_obtain_alias(inode); 401 return d_obtain_alias(inode);
404} 402}
405 403
406/* add entry to the directory (entry can be hidden). 404/* add entry to the directory (entry can be hidden).
407 405
408insert definition of when hidden directories are used here -Hans 406insert definition of when hidden directories are used here -Hans
409 407
@@ -484,10 +482,9 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
484 } 482 }
485 483
486 if (retval != NAME_FOUND) { 484 if (retval != NAME_FOUND) {
487 reiserfs_warning(dir->i_sb, 485 reiserfs_error(dir->i_sb, "zam-7002",
488 "zam-7002:%s: \"reiserfs_find_entry\" " 486 "reiserfs_find_entry() returned "
489 "has returned unexpected value (%d)", 487 "unexpected value (%d)", retval);
490 __func__, retval);
491 } 488 }
492 489
493 return -EEXIST; 490 return -EEXIST;
@@ -498,8 +495,9 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
498 MAX_GENERATION_NUMBER + 1); 495 MAX_GENERATION_NUMBER + 1);
499 if (gen_number > MAX_GENERATION_NUMBER) { 496 if (gen_number > MAX_GENERATION_NUMBER) {
500 /* there is no free generation number */ 497 /* there is no free generation number */
501 reiserfs_warning(dir->i_sb, 498 reiserfs_warning(dir->i_sb, "reiserfs-7010",
502 "reiserfs_add_entry: Congratulations! we have got hash function screwed up"); 499 "Congratulations! we have got hash function "
500 "screwed up");
503 if (buffer != small_buf) 501 if (buffer != small_buf)
504 kfree(buffer); 502 kfree(buffer);
505 pathrelse(&path); 503 pathrelse(&path);
@@ -515,10 +513,9 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
515 if (gen_number != 0) { /* we need to re-search for the insertion point */ 513 if (gen_number != 0) { /* we need to re-search for the insertion point */
516 if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) != 514 if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) !=
517 NAME_NOT_FOUND) { 515 NAME_NOT_FOUND) {
518 reiserfs_warning(dir->i_sb, 516 reiserfs_warning(dir->i_sb, "vs-7032",
519 "vs-7032: reiserfs_add_entry: " 517 "entry with this key (%K) already "
520 "entry with this key (%K) already exists", 518 "exists", &entry_key);
521 &entry_key);
522 519
523 if (buffer != small_buf) 520 if (buffer != small_buf)
524 kfree(buffer); 521 kfree(buffer);
@@ -555,15 +552,15 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
555*/ 552*/
556static int drop_new_inode(struct inode *inode) 553static int drop_new_inode(struct inode *inode)
557{ 554{
558 DQUOT_DROP(inode); 555 vfs_dq_drop(inode);
559 make_bad_inode(inode); 556 make_bad_inode(inode);
560 inode->i_flags |= S_NOQUOTA; 557 inode->i_flags |= S_NOQUOTA;
561 iput(inode); 558 iput(inode);
562 return 0; 559 return 0;
563} 560}
564 561
565/* utility function that does setup for reiserfs_new_inode. 562/* utility function that does setup for reiserfs_new_inode.
566** DQUOT_INIT needs lots of credits so it's better to have it 563** vfs_dq_init needs lots of credits so it's better to have it
567** outside of a transaction, so we had to pull some bits of 564** outside of a transaction, so we had to pull some bits of
568** reiserfs_new_inode out into this func. 565** reiserfs_new_inode out into this func.
569*/ 566*/
@@ -586,7 +583,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
586 } else { 583 } else {
587 inode->i_gid = current_fsgid(); 584 inode->i_gid = current_fsgid();
588 } 585 }
589 DQUOT_INIT(inode); 586 vfs_dq_init(inode);
590 return 0; 587 return 0;
591} 588}
592 589
@@ -601,20 +598,22 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
601 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + 598 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
602 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); 599 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
603 struct reiserfs_transaction_handle th; 600 struct reiserfs_transaction_handle th;
604 int locked; 601 struct reiserfs_security_handle security;
605 602
606 if (!(inode = new_inode(dir->i_sb))) { 603 if (!(inode = new_inode(dir->i_sb))) {
607 return -ENOMEM; 604 return -ENOMEM;
608 } 605 }
609 new_inode_init(inode, dir, mode); 606 new_inode_init(inode, dir, mode);
610 607
611 locked = reiserfs_cache_default_acl(dir); 608 jbegin_count += reiserfs_cache_default_acl(dir);
612 609 retval = reiserfs_security_init(dir, inode, &security);
610 if (retval < 0) {
611 drop_new_inode(inode);
612 return retval;
613 }
614 jbegin_count += retval;
613 reiserfs_write_lock(dir->i_sb); 615 reiserfs_write_lock(dir->i_sb);
614 616
615 if (locked)
616 reiserfs_write_lock_xattrs(dir->i_sb);
617
618 retval = journal_begin(&th, dir->i_sb, jbegin_count); 617 retval = journal_begin(&th, dir->i_sb, jbegin_count);
619 if (retval) { 618 if (retval) {
620 drop_new_inode(inode); 619 drop_new_inode(inode);
@@ -623,15 +622,10 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
623 622
624 retval = 623 retval =
625 reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, 624 reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
626 inode); 625 inode, &security);
627 if (retval) 626 if (retval)
628 goto out_failed; 627 goto out_failed;
629 628
630 if (locked) {
631 reiserfs_write_unlock_xattrs(dir->i_sb);
632 locked = 0;
633 }
634
635 inode->i_op = &reiserfs_file_inode_operations; 629 inode->i_op = &reiserfs_file_inode_operations;
636 inode->i_fop = &reiserfs_file_operations; 630 inode->i_fop = &reiserfs_file_operations;
637 inode->i_mapping->a_ops = &reiserfs_address_space_operations; 631 inode->i_mapping->a_ops = &reiserfs_address_space_operations;
@@ -658,8 +652,6 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
658 retval = journal_end(&th, dir->i_sb, jbegin_count); 652 retval = journal_end(&th, dir->i_sb, jbegin_count);
659 653
660 out_failed: 654 out_failed:
661 if (locked)
662 reiserfs_write_unlock_xattrs(dir->i_sb);
663 reiserfs_write_unlock(dir->i_sb); 655 reiserfs_write_unlock(dir->i_sb);
664 return retval; 656 return retval;
665} 657}
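reiserfs_create(), and the mknod/mkdir/symlink paths after it, all converge on the same shape: price the ACL and security-xattr work before the transaction opens, fold it into jbegin_count, and drop the separate xattr lock entirely. Condensed from this hunk (reiserfs_security_handle and reiserfs_security_init are introduced elsewhere in the series):

	struct reiserfs_security_handle security;

	jbegin_count += reiserfs_cache_default_acl(dir);	/* now returns a block count */
	retval = reiserfs_security_init(dir, inode, &security);
	if (retval < 0) {
		drop_new_inode(inode);	/* no transaction open yet, cleanup is cheap */
		return retval;
	}
	jbegin_count += retval;		/* journal blocks for the security xattr */

	reiserfs_write_lock(dir->i_sb);
	retval = journal_begin(&th, dir->i_sb, jbegin_count);

Doing the sizing up front means the failure paths before journal_begin() never have to unwind a transaction.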
@@ -670,12 +662,12 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
670 int retval; 662 int retval;
671 struct inode *inode; 663 struct inode *inode;
672 struct reiserfs_transaction_handle th; 664 struct reiserfs_transaction_handle th;
665 struct reiserfs_security_handle security;
673 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ 666 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
674 int jbegin_count = 667 int jbegin_count =
675 JOURNAL_PER_BALANCE_CNT * 3 + 668 JOURNAL_PER_BALANCE_CNT * 3 +
676 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + 669 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
677 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); 670 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
678 int locked;
679 671
680 if (!new_valid_dev(rdev)) 672 if (!new_valid_dev(rdev))
681 return -EINVAL; 673 return -EINVAL;
@@ -685,13 +677,15 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
685 } 677 }
686 new_inode_init(inode, dir, mode); 678 new_inode_init(inode, dir, mode);
687 679
688 locked = reiserfs_cache_default_acl(dir); 680 jbegin_count += reiserfs_cache_default_acl(dir);
689 681 retval = reiserfs_security_init(dir, inode, &security);
682 if (retval < 0) {
683 drop_new_inode(inode);
684 return retval;
685 }
686 jbegin_count += retval;
690 reiserfs_write_lock(dir->i_sb); 687 reiserfs_write_lock(dir->i_sb);
691 688
692 if (locked)
693 reiserfs_write_lock_xattrs(dir->i_sb);
694
695 retval = journal_begin(&th, dir->i_sb, jbegin_count); 689 retval = journal_begin(&th, dir->i_sb, jbegin_count);
696 if (retval) { 690 if (retval) {
697 drop_new_inode(inode); 691 drop_new_inode(inode);
@@ -700,16 +694,11 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
700 694
701 retval = 695 retval =
702 reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, 696 reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
703 inode); 697 inode, &security);
704 if (retval) { 698 if (retval) {
705 goto out_failed; 699 goto out_failed;
706 } 700 }
707 701
708 if (locked) {
709 reiserfs_write_unlock_xattrs(dir->i_sb);
710 locked = 0;
711 }
712
713 inode->i_op = &reiserfs_special_inode_operations; 702 inode->i_op = &reiserfs_special_inode_operations;
714 init_special_inode(inode, inode->i_mode, rdev); 703 init_special_inode(inode, inode->i_mode, rdev);
715 704
@@ -739,8 +728,6 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
739 retval = journal_end(&th, dir->i_sb, jbegin_count); 728 retval = journal_end(&th, dir->i_sb, jbegin_count);
740 729
741 out_failed: 730 out_failed:
742 if (locked)
743 reiserfs_write_unlock_xattrs(dir->i_sb);
744 reiserfs_write_unlock(dir->i_sb); 731 reiserfs_write_unlock(dir->i_sb);
745 return retval; 732 return retval;
746} 733}
@@ -750,12 +737,12 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
750 int retval; 737 int retval;
751 struct inode *inode; 738 struct inode *inode;
752 struct reiserfs_transaction_handle th; 739 struct reiserfs_transaction_handle th;
740 struct reiserfs_security_handle security;
753 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ 741 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
754 int jbegin_count = 742 int jbegin_count =
755 JOURNAL_PER_BALANCE_CNT * 3 + 743 JOURNAL_PER_BALANCE_CNT * 3 +
756 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + 744 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
757 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); 745 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
758 int locked;
759 746
760#ifdef DISPLACE_NEW_PACKING_LOCALITIES 747#ifdef DISPLACE_NEW_PACKING_LOCALITIES
761 /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ 748 /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */
@@ -767,11 +754,14 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
767 } 754 }
768 new_inode_init(inode, dir, mode); 755 new_inode_init(inode, dir, mode);
769 756
770 locked = reiserfs_cache_default_acl(dir); 757 jbegin_count += reiserfs_cache_default_acl(dir);
771 758 retval = reiserfs_security_init(dir, inode, &security);
759 if (retval < 0) {
760 drop_new_inode(inode);
761 return retval;
762 }
763 jbegin_count += retval;
772 reiserfs_write_lock(dir->i_sb); 764 reiserfs_write_lock(dir->i_sb);
773 if (locked)
774 reiserfs_write_lock_xattrs(dir->i_sb);
775 765
776 retval = journal_begin(&th, dir->i_sb, jbegin_count); 766 retval = journal_begin(&th, dir->i_sb, jbegin_count);
777 if (retval) { 767 if (retval) {
@@ -787,17 +777,12 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
787 retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ , 777 retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ ,
788 old_format_only(dir->i_sb) ? 778 old_format_only(dir->i_sb) ?
789 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, 779 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
790 dentry, inode); 780 dentry, inode, &security);
791 if (retval) { 781 if (retval) {
792 dir->i_nlink--; 782 dir->i_nlink--;
793 goto out_failed; 783 goto out_failed;
794 } 784 }
795 785
796 if (locked) {
797 reiserfs_write_unlock_xattrs(dir->i_sb);
798 locked = 0;
799 }
800
801 reiserfs_update_inode_transaction(inode); 786 reiserfs_update_inode_transaction(inode);
802 reiserfs_update_inode_transaction(dir); 787 reiserfs_update_inode_transaction(dir);
803 788
@@ -827,8 +812,6 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
827 unlock_new_inode(inode); 812 unlock_new_inode(inode);
828 retval = journal_end(&th, dir->i_sb, jbegin_count); 813 retval = journal_end(&th, dir->i_sb, jbegin_count);
829 out_failed: 814 out_failed:
830 if (locked)
831 reiserfs_write_unlock_xattrs(dir->i_sb);
832 reiserfs_write_unlock(dir->i_sb); 815 reiserfs_write_unlock(dir->i_sb);
833 return retval; 816 return retval;
834} 817}
@@ -837,7 +820,7 @@ static inline int reiserfs_empty_dir(struct inode *inode)
837{ 820{
838 /* we can cheat because an old format dir cannot have 821 /* we can cheat because an old format dir cannot have
839 ** EMPTY_DIR_SIZE, and a new format dir cannot have 822 ** EMPTY_DIR_SIZE, and a new format dir cannot have
840 ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, 823 ** EMPTY_DIR_SIZE_V1. So, if the inode is either size,
841 ** regardless of disk format version, the directory is empty. 824 ** regardless of disk format version, the directory is empty.
842 */ 825 */
843 if (inode->i_size != EMPTY_DIR_SIZE && 826 if (inode->i_size != EMPTY_DIR_SIZE &&
@@ -903,8 +886,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
903 goto end_rmdir; 886 goto end_rmdir;
904 887
905 if (inode->i_nlink != 2 && inode->i_nlink != 1) 888 if (inode->i_nlink != 2 && inode->i_nlink != 1)
906 reiserfs_warning(inode->i_sb, "%s: empty directory has nlink " 889 reiserfs_error(inode->i_sb, "reiserfs-7040",
907 "!= 2 (%d)", __func__, inode->i_nlink); 890 "empty directory has nlink != 2 (%d)",
891 inode->i_nlink);
908 892
909 clear_nlink(inode); 893 clear_nlink(inode);
910 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; 894 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
@@ -980,10 +964,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
980 } 964 }
981 965
982 if (!inode->i_nlink) { 966 if (!inode->i_nlink) {
983 reiserfs_warning(inode->i_sb, "%s: deleting nonexistent file " 967 reiserfs_warning(inode->i_sb, "reiserfs-7042",
984 "(%s:%lu), %d", __func__, 968 "deleting nonexistent file (%lu), %d",
985 reiserfs_bdevname(inode->i_sb), inode->i_ino, 969 inode->i_ino, inode->i_nlink);
986 inode->i_nlink);
987 inode->i_nlink = 1; 970 inode->i_nlink = 1;
988 } 971 }
989 972
@@ -1037,6 +1020,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
1037 char *name; 1020 char *name;
1038 int item_len; 1021 int item_len;
1039 struct reiserfs_transaction_handle th; 1022 struct reiserfs_transaction_handle th;
1023 struct reiserfs_security_handle security;
1040 int mode = S_IFLNK | S_IRWXUGO; 1024 int mode = S_IFLNK | S_IRWXUGO;
1041 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ 1025 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
1042 int jbegin_count = 1026 int jbegin_count =
@@ -1049,6 +1033,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
1049 } 1033 }
1050 new_inode_init(inode, parent_dir, mode); 1034 new_inode_init(inode, parent_dir, mode);
1051 1035
1036 retval = reiserfs_security_init(parent_dir, inode, &security);
1037 if (retval < 0) {
1038 drop_new_inode(inode);
1039 return retval;
1040 }
1041 jbegin_count += retval;
1042
1052 reiserfs_write_lock(parent_dir->i_sb); 1043 reiserfs_write_lock(parent_dir->i_sb);
1053 item_len = ROUND_UP(strlen(symname)); 1044 item_len = ROUND_UP(strlen(symname));
1054 if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) { 1045 if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) {
@@ -1066,8 +1057,6 @@ static int reiserfs_symlink(struct inode *parent_dir,
1066 memcpy(name, symname, strlen(symname)); 1057 memcpy(name, symname, strlen(symname));
1067 padd_item(name, item_len, strlen(symname)); 1058 padd_item(name, item_len, strlen(symname));
1068 1059
1069 /* We would inherit the default ACL here, but symlinks don't get ACLs */
1070
1071 retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); 1060 retval = journal_begin(&th, parent_dir->i_sb, jbegin_count);
1072 if (retval) { 1061 if (retval) {
1073 drop_new_inode(inode); 1062 drop_new_inode(inode);
@@ -1077,7 +1066,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
1077 1066
1078 retval = 1067 retval =
1079 reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname), 1068 reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname),
1080 dentry, inode); 1069 dentry, inode, &security);
1081 kfree(name); 1070 kfree(name);
1082 if (retval) { /* reiserfs_new_inode iputs for us */ 1071 if (retval) { /* reiserfs_new_inode iputs for us */
1083 goto out_failed; 1072 goto out_failed;
@@ -1173,7 +1162,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1173 return retval; 1162 return retval;
1174} 1163}
1175 1164
1176// de contains information pointing to an entry which 1165/* de contains information pointing to an entry which */
1177static int de_still_valid(const char *name, int len, 1166static int de_still_valid(const char *name, int len,
1178 struct reiserfs_dir_entry *de) 1167 struct reiserfs_dir_entry *de)
1179{ 1168{
@@ -1196,15 +1185,14 @@ static int entry_points_to_object(const char *name, int len,
1196 1185
1197 if (inode) { 1186 if (inode) {
1198 if (!de_visible(de->de_deh + de->de_entry_num)) 1187 if (!de_visible(de->de_deh + de->de_entry_num))
1199 reiserfs_panic(NULL, 1188 reiserfs_panic(inode->i_sb, "vs-7042",
1200 "vs-7042: entry_points_to_object: entry must be visible"); 1189 "entry must be visible");
1201 return (de->de_objectid == inode->i_ino) ? 1 : 0; 1190 return (de->de_objectid == inode->i_ino) ? 1 : 0;
1202 } 1191 }
1203 1192
1204 /* this must be added hidden entry */ 1193 /* this must be added hidden entry */
1205 if (de_visible(de->de_deh + de->de_entry_num)) 1194 if (de_visible(de->de_deh + de->de_entry_num))
1206 reiserfs_panic(NULL, 1195 reiserfs_panic(NULL, "vs-7043", "entry must be visible");
1207 "vs-7043: entry_points_to_object: entry must be visible");
1208 1196
1209 return 1; 1197 return 1;
1210} 1198}
@@ -1218,10 +1206,10 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
1218 de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; 1206 de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid;
1219} 1207}
1220 1208
1221/* 1209/*
1222 * process, that is going to call fix_nodes/do_balance must hold only 1210 * process, that is going to call fix_nodes/do_balance must hold only
1223 * one path. If it holds 2 or more, it can get into endless waiting in 1211 * one path. If it holds 2 or more, it can get into endless waiting in
1224 * get_empty_nodes or its clones 1212 * get_empty_nodes or its clones
1225 */ 1213 */
1226static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, 1214static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1227 struct inode *new_dir, struct dentry *new_dentry) 1215 struct inode *new_dir, struct dentry *new_dentry)
@@ -1275,7 +1263,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1275 1263
1276 old_inode_mode = old_inode->i_mode; 1264 old_inode_mode = old_inode->i_mode;
1277 if (S_ISDIR(old_inode_mode)) { 1265 if (S_ISDIR(old_inode_mode)) {
1278 // make sure, that directory being renamed has correct ".." 1266 // make sure, that directory being renamed has correct ".."
1279 // and that its new parent directory has not too many links 1267 // and that its new parent directory has not too many links
1280 // already 1268 // already
1281 1269
@@ -1286,8 +1274,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1286 } 1274 }
1287 } 1275 }
1288 1276
1289 /* directory is renamed, its parent directory will be changed, 1277 /* directory is renamed, its parent directory will be changed,
1290 ** so find ".." entry 1278 ** so find ".." entry
1291 */ 1279 */
1292 dot_dot_de.de_gen_number_bit_string = NULL; 1280 dot_dot_de.de_gen_number_bit_string = NULL;
1293 retval = 1281 retval =
@@ -1318,8 +1306,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1318 new_dentry->d_name.len, old_inode, 0); 1306 new_dentry->d_name.len, old_inode, 0);
1319 if (retval == -EEXIST) { 1307 if (retval == -EEXIST) {
1320 if (!new_dentry_inode) { 1308 if (!new_dentry_inode) {
1321 reiserfs_panic(old_dir->i_sb, 1309 reiserfs_panic(old_dir->i_sb, "vs-7050",
1322 "vs-7050: new entry is found, new inode == 0\n"); 1310 "new entry is found, new inode == 0");
1323 } 1311 }
1324 } else if (retval) { 1312 } else if (retval) {
1325 int err = journal_end(&th, old_dir->i_sb, jbegin_count); 1313 int err = journal_end(&th, old_dir->i_sb, jbegin_count);
@@ -1397,9 +1385,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1397 this stuff, yes? Then, having 1385 this stuff, yes? Then, having
1398 gathered everything into RAM we 1386 gathered everything into RAM we
1399 should lock the buffers, yes? -Hans */ 1387 should lock the buffers, yes? -Hans */
1400 /* probably. our rename needs to hold more 1388 /* probably. our rename needs to hold more
1401 ** than one path at once. The seals would 1389 ** than one path at once. The seals would
1402 ** have to be written to deal with multi-path 1390 ** have to be written to deal with multi-path
1403 ** issues -chris 1391 ** issues -chris
1404 */ 1392 */
1405 /* sanity checking before doing the rename - avoid races many 1393 /* sanity checking before doing the rename - avoid races many
@@ -1477,7 +1465,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1477 } 1465 }
1478 1466
1479 if (S_ISDIR(old_inode_mode)) { 1467 if (S_ISDIR(old_inode_mode)) {
1480 // adjust ".." of renamed directory 1468 /* adjust ".." of renamed directory */
1481 set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); 1469 set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
1482 journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh); 1470 journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh);
1483 1471
@@ -1499,8 +1487,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1499 if (reiserfs_cut_from_item 1487 if (reiserfs_cut_from_item
1500 (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, 1488 (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL,
1501 0) < 0) 1489 0) < 0)
1502 reiserfs_warning(old_dir->i_sb, 1490 reiserfs_error(old_dir->i_sb, "vs-7060",
1503 "vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?"); 1491 "couldn't not cut old name. Fsck later?");
1504 1492
1505 old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; 1493 old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
1506 1494
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index ea0cf8c28a99..3a6de810bd61 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -18,8 +18,7 @@
18static void check_objectid_map(struct super_block *s, __le32 * map) 18static void check_objectid_map(struct super_block *s, __le32 * map)
19{ 19{
20 if (le32_to_cpu(map[0]) != 1) 20 if (le32_to_cpu(map[0]) != 1)
21 reiserfs_panic(s, 21 reiserfs_panic(s, "vs-15010", "map corrupted: %lx",
22 "vs-15010: check_objectid_map: map corrupted: %lx",
23 (long unsigned int)le32_to_cpu(map[0])); 22 (long unsigned int)le32_to_cpu(map[0]));
24 23
25 // FIXME: add something else here 24 // FIXME: add something else here
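For orientation: the objectid map is an array of little-endian u32 boundaries in the superblock, alternating between the start of a busy objectid interval and the first free id after it. Objectid 1 is permanently busy (it is the dir_id of the root directory key), so the first interval must begin at 1, which is the invariant vs-15010 asserts. A userspace restatement under that reading of the layout (an assumption; le32toh is the glibc endian helper):

	#include <endian.h>
	#include <stdint.h>

	/* map_le[0] starts the first busy interval; map_le[1] is the first
	 * unused objectid, which reiserfs_get_unused_objectid() below hands out. */
	static int objectid_map_sane(const uint32_t *map_le)
	{
		return le32toh(map_le[0]) == 1;
	}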
@@ -61,7 +60,7 @@ __u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
61 /* comment needed -Hans */ 60 /* comment needed -Hans */
62 unused_objectid = le32_to_cpu(map[1]); 61 unused_objectid = le32_to_cpu(map[1]);
63 if (unused_objectid == U32_MAX) { 62 if (unused_objectid == U32_MAX) {
64 reiserfs_warning(s, "%s: no more object ids", __func__); 63 reiserfs_warning(s, "reiserfs-15100", "no more object ids");
65 reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)); 64 reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s));
66 return 0; 65 return 0;
67 } 66 }
@@ -160,9 +159,8 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
160 i += 2; 159 i += 2;
161 } 160 }
162 161
163 reiserfs_warning(s, 162 reiserfs_error(s, "vs-15011", "tried to free free object id (%lu)",
164 "vs-15011: reiserfs_release_objectid: tried to free free object id (%lu)", 163 (long unsigned)objectid_to_release);
165 (long unsigned)objectid_to_release);
166} 164}
167 165
168int reiserfs_convert_objectid_map_v1(struct super_block *s) 166int reiserfs_convert_objectid_map_v1(struct super_block *s)
@@ -182,7 +180,7 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s)
182 180
183 if (cur_size > new_size) { 181 if (cur_size > new_size) {
184 /* mark everyone used that was listed as free at the end of the objectid 182 /* mark everyone used that was listed as free at the end of the objectid
185 ** map 183 ** map
186 */ 184 */
187 objectid_map[new_size - 1] = objectid_map[cur_size - 1]; 185 objectid_map[new_size - 1] = objectid_map[cur_size - 1];
188 set_sb_oid_cursize(disk_sb, new_size); 186 set_sb_oid_cursize(disk_sb, new_size);
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 740bb8c0c1ae..536eacaeb710 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -157,19 +157,16 @@ static void sprintf_disk_child(char *buf, struct disk_child *dc)
157 dc_size(dc)); 157 dc_size(dc));
158} 158}
159 159
160static char *is_there_reiserfs_struct(char *fmt, int *what, int *skip) 160static char *is_there_reiserfs_struct(char *fmt, int *what)
161{ 161{
162 char *k = fmt; 162 char *k = fmt;
163 163
164 *skip = 0;
165
166 while ((k = strchr(k, '%')) != NULL) { 164 while ((k = strchr(k, '%')) != NULL) {
167 if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' || 165 if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' ||
168 k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') { 166 k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') {
169 *what = k[1]; 167 *what = k[1];
170 break; 168 break;
171 } 169 }
172 (*skip)++;
173 k++; 170 k++;
174 } 171 }
175 return k; 172 return k;
@@ -181,30 +178,29 @@ static char *is_there_reiserfs_struct(char *fmt, int *what, int *skip)
181 appropriate printk. With this reiserfs_warning you can use format 178 appropriate printk. With this reiserfs_warning you can use format
182 specification for complex structures like you used to do with 179 specification for complex structures like you used to do with
183 printfs for integers, doubles and pointers. For instance, to print 180 printfs for integers, doubles and pointers. For instance, to print
184 out key structure you have to write just: 181 out key structure you have to write just:
185 reiserfs_warning ("bad key %k", key); 182 reiserfs_warning ("bad key %k", key);
186 instead of 183 instead of
187 printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, 184 printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
188 key->k_offset, key->k_uniqueness); 185 key->k_offset, key->k_uniqueness);
189*/ 186*/
190 187static DEFINE_SPINLOCK(error_lock);
191static void prepare_error_buf(const char *fmt, va_list args) 188static void prepare_error_buf(const char *fmt, va_list args)
192{ 189{
193 char *fmt1 = fmt_buf; 190 char *fmt1 = fmt_buf;
194 char *k; 191 char *k;
195 char *p = error_buf; 192 char *p = error_buf;
196 int i, j, what, skip; 193 int what;
194
195 spin_lock(&error_lock);
197 196
198 strcpy(fmt1, fmt); 197 strcpy(fmt1, fmt);
199 198
200 while ((k = is_there_reiserfs_struct(fmt1, &what, &skip)) != NULL) { 199 while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) {
201 *k = 0; 200 *k = 0;
202 201
203 p += vsprintf(p, fmt1, args); 202 p += vsprintf(p, fmt1, args);
204 203
205 for (i = 0; i < skip; i++)
206 j = va_arg(args, int);
207
208 switch (what) { 204 switch (what) {
209 case 'k': 205 case 'k':
210 sprintf_le_key(p, va_arg(args, struct reiserfs_key *)); 206 sprintf_le_key(p, va_arg(args, struct reiserfs_key *));
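The new error_lock closes a race rather than just tidying the scanner: prepare_error_buf() formats into file-scope scratch buffers, so two CPUs reporting at once could interleave their messages. Those buffers are declared earlier in prints.c, outside this hunk; roughly (sizes are an assumption):

	static char error_buf[1024];	/* finished message, printed by the callers */
	static char fmt_buf[1024];	/* mutable working copy of the format string */

	static DEFINE_SPINLOCK(error_lock);	/* what this hunk adds */

Dropping the skip/va_arg loop also means ordinary conversions are now consumed only by vsprintf() itself instead of being skipped by hand as ints.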
@@ -243,15 +239,16 @@ static void prepare_error_buf(const char *fmt, va_list args)
243 fmt1 = k + 2; 239 fmt1 = k + 2;
244 } 240 }
245 vsprintf(p, fmt1, args); 241 vsprintf(p, fmt1, args);
242 spin_unlock(&error_lock);
246 243
247} 244}
248 245
249/* in addition to usual conversion specifiers this accepts reiserfs 246/* in addition to usual conversion specifiers this accepts reiserfs
250 specific conversion specifiers: 247 specific conversion specifiers:
251 %k to print little endian key, 248 %k to print little endian key,
252 %K to print cpu key, 249 %K to print cpu key,
253 %h to print item_head, 250 %h to print item_head,
254 %t to print directory entry 251 %t to print directory entry
255 %z to print block head (arg must be struct buffer_head * 252 %z to print block head (arg must be struct buffer_head *
256 %b to print buffer_head 253 %b to print buffer_head
257*/ 254*/
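In practice callers hand reiserfs structures straight through, as the lbalance.c hunks above already do. For instance (the id string here is made up for illustration):

	/* %k consumes a struct reiserfs_key *, %b a struct buffer_head * */
	reiserfs_warning(sb, "vs-500", "bad key %k in block %b", key, bh);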
@@ -264,14 +261,17 @@ static void prepare_error_buf(const char *fmt, va_list args)
264 va_end( args );\ 261 va_end( args );\
265} 262}
266 263
267void reiserfs_warning(struct super_block *sb, const char *fmt, ...) 264void __reiserfs_warning(struct super_block *sb, const char *id,
265 const char *function, const char *fmt, ...)
268{ 266{
269 do_reiserfs_warning(fmt); 267 do_reiserfs_warning(fmt);
270 if (sb) 268 if (sb)
271 printk(KERN_WARNING "ReiserFS: %s: warning: %s\n", 269 printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: "
272 reiserfs_bdevname(sb), error_buf); 270 "%s\n", sb->s_id, id ? id : "", id ? " " : "",
271 function, error_buf);
273 else 272 else
274 printk(KERN_WARNING "ReiserFS: warning: %s\n", error_buf); 273 printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n",
274 id ? id : "", id ? " " : "", function, error_buf);
275} 275}
276 276
277/* No newline.. reiserfs_info calls can be followed by printk's */ 277/* No newline.. reiserfs_info calls can be followed by printk's */
@@ -279,10 +279,10 @@ void reiserfs_info(struct super_block *sb, const char *fmt, ...)
279{ 279{
280 do_reiserfs_warning(fmt); 280 do_reiserfs_warning(fmt);
281 if (sb) 281 if (sb)
282 printk(KERN_NOTICE "ReiserFS: %s: %s", 282 printk(KERN_NOTICE "REISERFS (device %s): %s",
283 reiserfs_bdevname(sb), error_buf); 283 sb->s_id, error_buf);
284 else 284 else
285 printk(KERN_NOTICE "ReiserFS: %s", error_buf); 285 printk(KERN_NOTICE "REISERFS %s:", error_buf);
286} 286}
287 287
288/* No newline.. reiserfs_printk calls can be followed by printk's */ 288/* No newline.. reiserfs_printk calls can be followed by printk's */
@@ -297,10 +297,10 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
297#ifdef CONFIG_REISERFS_CHECK 297#ifdef CONFIG_REISERFS_CHECK
298 do_reiserfs_warning(fmt); 298 do_reiserfs_warning(fmt);
299 if (s) 299 if (s)
300 printk(KERN_DEBUG "ReiserFS: %s: %s\n", 300 printk(KERN_DEBUG "REISERFS debug (device %s): %s\n",
301 reiserfs_bdevname(s), error_buf); 301 s->s_id, error_buf);
302 else 302 else
303 printk(KERN_DEBUG "ReiserFS: %s\n", error_buf); 303 printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf);
304#endif 304#endif
305} 305}
306 306
@@ -314,17 +314,17 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
314 maintainer-errorid. Don't bother with reusing errorids, there are 314 maintainer-errorid. Don't bother with reusing errorids, there are
315 lots of numbers out there. 315 lots of numbers out there.
316 316
317 Example: 317 Example:
318 318
319 reiserfs_panic( 319 reiserfs_panic(
320 p_sb, "reiser-29: reiserfs_new_blocknrs: " 320 p_sb, "reiser-29: reiserfs_new_blocknrs: "
321 "one of search_start or rn(%d) is equal to MAX_B_NUM," 321 "one of search_start or rn(%d) is equal to MAX_B_NUM,"
322 "which means that we are optimizing location based on the bogus location of a temp buffer (%p).", 322 "which means that we are optimizing location based on the bogus location of a temp buffer (%p).",
323 rn, bh 323 rn, bh
324 ); 324 );
325 325
326 Regular panic()s sometimes clear the screen before the message can 326 Regular panic()s sometimes clear the screen before the message can
327 be read, thus the need for the while loop. 327 be read, thus the need for the while loop.
328 328
329 Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it 329 Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it
330 pointless complexity): 330 pointless complexity):
@@ -353,14 +353,46 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
353extern struct tree_balance *cur_tb; 353extern struct tree_balance *cur_tb;
354#endif 354#endif
355 355
356void reiserfs_panic(struct super_block *sb, const char *fmt, ...) 356void __reiserfs_panic(struct super_block *sb, const char *id,
357 const char *function, const char *fmt, ...)
357{ 358{
358 do_reiserfs_warning(fmt); 359 do_reiserfs_warning(fmt);
359 360
361#ifdef CONFIG_REISERFS_CHECK
360 dump_stack(); 362 dump_stack();
363#endif
364 if (sb)
365 panic(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n",
366 sb->s_id, id ? id : "", id ? " " : "",
367 function, error_buf);
368 else
369 panic(KERN_WARNING "REISERFS panic: %s%s%s: %s\n",
370 id ? id : "", id ? " " : "", function, error_buf);
371}
372
373void __reiserfs_error(struct super_block *sb, const char *id,
374 const char *function, const char *fmt, ...)
375{
376 do_reiserfs_warning(fmt);
361 377
362 panic(KERN_EMERG "REISERFS: panic (device %s): %s\n", 378 BUG_ON(sb == NULL);
363 reiserfs_bdevname(sb), error_buf); 379
380 if (reiserfs_error_panic(sb))
381 __reiserfs_panic(sb, id, function, error_buf);
382
383 if (id && id[0])
384 printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n",
385 sb->s_id, id, function, error_buf);
386 else
387 printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n",
388 sb->s_id, function, error_buf);
389
390 if (sb->s_flags & MS_RDONLY)
391 return;
392
393 reiserfs_info(sb, "Remounting filesystem read-only\n");
394 sb->s_flags |= MS_RDONLY;
395 reiserfs_abort_journal(sb, -EIO);
364} 396}
365 397
366void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) 398void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
@@ -368,18 +400,18 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
368 do_reiserfs_warning(fmt); 400 do_reiserfs_warning(fmt);
369 401
370 if (reiserfs_error_panic(sb)) { 402 if (reiserfs_error_panic(sb)) {
371 panic(KERN_CRIT "REISERFS: panic (device %s): %s\n", 403 panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id,
372 reiserfs_bdevname(sb), error_buf); 404 error_buf);
373 } 405 }
374 406
375 if (sb->s_flags & MS_RDONLY) 407 if (reiserfs_is_journal_aborted(SB_JOURNAL(sb)))
376 return; 408 return;
377 409
378 printk(KERN_CRIT "REISERFS: abort (device %s): %s\n", 410 printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id,
379 reiserfs_bdevname(sb), error_buf); 411 error_buf);
380 412
381 sb->s_flags |= MS_RDONLY; 413 sb->s_flags |= MS_RDONLY;
382 reiserfs_journal_abort(sb, errno); 414 reiserfs_abort_journal(sb, errno);
383} 415}
384 416
385/* this prints internal nodes (4 keys/items in line) (dc_number, 417/* this prints internal nodes (4 keys/items in line) (dc_number,
@@ -681,12 +713,10 @@ static void check_leaf_block_head(struct buffer_head *bh)
681 blkh = B_BLK_HEAD(bh); 713 blkh = B_BLK_HEAD(bh);
682 nr = blkh_nr_item(blkh); 714 nr = blkh_nr_item(blkh);
683 if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) 715 if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE)
684 reiserfs_panic(NULL, 716 reiserfs_panic(NULL, "vs-6010", "invalid item number %z",
685 "vs-6010: check_leaf_block_head: invalid item number %z",
686 bh); 717 bh);
687 if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr) 718 if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr)
688 reiserfs_panic(NULL, 719 reiserfs_panic(NULL, "vs-6020", "invalid free space %z",
689 "vs-6020: check_leaf_block_head: invalid free space %z",
690 bh); 720 bh);
691 721
692} 722}
@@ -697,21 +727,15 @@ static void check_internal_block_head(struct buffer_head *bh)
697 727
698 blkh = B_BLK_HEAD(bh); 728 blkh = B_BLK_HEAD(bh);
699 if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) 729 if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
700 reiserfs_panic(NULL, 730 reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
701 "vs-6025: check_internal_block_head: invalid level %z",
702 bh);
703 731
704 if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) 732 if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE)
705 reiserfs_panic(NULL, 733 reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh);
706 "vs-6030: check_internal_block_head: invalid item number %z",
707 bh);
708 734
709 if (B_FREE_SPACE(bh) != 735 if (B_FREE_SPACE(bh) !=
710 bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) - 736 bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) -
711 DC_SIZE * (B_NR_ITEMS(bh) + 1)) 737 DC_SIZE * (B_NR_ITEMS(bh) + 1))
712 reiserfs_panic(NULL, 738 reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh);
713 "vs-6040: check_internal_block_head: invalid free space %z",
714 bh);
715 739
716} 740}
717 741
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 37173fa07d15..9229e5514a4e 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -321,7 +321,7 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
321 /* incore fields */ 321 /* incore fields */
322 "j_1st_reserved_block: \t%i\n" 322 "j_1st_reserved_block: \t%i\n"
323 "j_state: \t%li\n" 323 "j_state: \t%li\n"
324 "j_trans_id: \t%lu\n" 324 "j_trans_id: \t%u\n"
325 "j_mount_id: \t%lu\n" 325 "j_mount_id: \t%lu\n"
326 "j_start: \t%lu\n" 326 "j_start: \t%lu\n"
327 "j_len: \t%lu\n" 327 "j_len: \t%lu\n"
@@ -329,7 +329,7 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
329 "j_wcount: \t%i\n" 329 "j_wcount: \t%i\n"
330 "j_bcount: \t%lu\n" 330 "j_bcount: \t%lu\n"
331 "j_first_unflushed_offset: \t%lu\n" 331 "j_first_unflushed_offset: \t%lu\n"
332 "j_last_flush_trans_id: \t%lu\n" 332 "j_last_flush_trans_id: \t%u\n"
333 "j_trans_start_time: \t%li\n" 333 "j_trans_start_time: \t%li\n"
334 "j_list_bitmap_index: \t%i\n" 334 "j_list_bitmap_index: \t%i\n"
335 "j_must_wait: \t%i\n" 335 "j_must_wait: \t%i\n"
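These two specifier changes track a type change made elsewhere in the series: the journal's transaction ids are narrowed from unsigned long to unsigned int (presumably to match the 32-bit on-disk field), and seq_printf() is format-checked, so the old %lu would now trip -Wformat. A minimal illustration:

	#include <linux/seq_file.h>

	static void show_trans_id(struct seq_file *m, unsigned int trans_id)
	{
		/* j_trans_id is now 32 bits wide: %u matches, %lu would warn */
		seq_printf(m, "j_trans_id: \t%u\n", trans_id);
	}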
@@ -492,7 +492,6 @@ int reiserfs_proc_info_init(struct super_block *sb)
492 spin_lock_init(&__PINFO(sb).lock); 492 spin_lock_init(&__PINFO(sb).lock);
493 REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root); 493 REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root);
494 if (REISERFS_SB(sb)->procdir) { 494 if (REISERFS_SB(sb)->procdir) {
495 REISERFS_SB(sb)->procdir->owner = THIS_MODULE;
496 REISERFS_SB(sb)->procdir->data = sb; 495 REISERFS_SB(sb)->procdir->data = sb;
497 add_file(sb, "version", show_version); 496 add_file(sb, "version", show_version);
498 add_file(sb, "super", show_super); 497 add_file(sb, "super", show_super);
@@ -503,7 +502,7 @@ int reiserfs_proc_info_init(struct super_block *sb)
503 add_file(sb, "journal", show_journal); 502 add_file(sb, "journal", show_journal);
504 return 0; 503 return 0;
505 } 504 }
506 reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s", 505 reiserfs_warning(sb, "cannot create /proc/%s/%s",
507 proc_info_root_name, b); 506 proc_info_root_name, b);
508 return 1; 507 return 1;
509} 508}
@@ -556,11 +555,8 @@ int reiserfs_proc_info_global_init(void)
556{ 555{
557 if (proc_info_root == NULL) { 556 if (proc_info_root == NULL) {
558 proc_info_root = proc_mkdir(proc_info_root_name, NULL); 557 proc_info_root = proc_mkdir(proc_info_root_name, NULL);
559 if (proc_info_root) { 558 if (!proc_info_root) {
560 proc_info_root->owner = THIS_MODULE; 559 reiserfs_warning(NULL, "cannot create /proc/%s",
561 } else {
562 reiserfs_warning(NULL,
563 "reiserfs: cannot create /proc/%s",
564 proc_info_root_name); 560 proc_info_root_name);
565 return 1; 561 return 1;
566 } 562 }
@@ -634,7 +630,7 @@ int reiserfs_global_version_in_proc(char *buffer, char **start,
634 * 630 *
635 */ 631 */
636 632
637/* 633/*
638 * Make Linus happy. 634 * Make Linus happy.
639 * Local variables: 635 * Local variables:
640 * c-indentation-style: "K&R" 636 * c-indentation-style: "K&R"
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index f71c3948edef..238e9d9b31e0 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README 2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */ 3 */
4 4
5/* 5/*
6 * Written by Alexander Zarochentcev. 6 * Written by Alexander Zarochentcev.
7 * 7 *
8 * The kernel part of the (on-line) reiserfs resizer. 8 * The kernel part of the (on-line) reiserfs resizer.
@@ -101,7 +101,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
101 memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size); 101 memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
102 102
103 /* just in case vfree schedules on us, copy the new 103 /* just in case vfree schedules on us, copy the new
104 ** pointer into the journal struct before freeing the 104 ** pointer into the journal struct before freeing the
105 ** old one 105 ** old one
106 */ 106 */
107 node_tmp = jb->bitmaps; 107 node_tmp = jb->bitmaps;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index abbc64dcc8d4..d036ee5b1c81 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -23,7 +23,6 @@
23 * get_rkey 23 * get_rkey
24 * key_in_buffer 24 * key_in_buffer
25 * decrement_bcount 25 * decrement_bcount
26 * decrement_counters_in_path
27 * reiserfs_check_path 26 * reiserfs_check_path
28 * pathrelse_and_restore 27 * pathrelse_and_restore
29 * pathrelse 28 * pathrelse
@@ -57,28 +56,28 @@
57#include <linux/quotaops.h> 56#include <linux/quotaops.h>
58 57
59/* Does the buffer contain a disk block which is in the tree. */ 58/* Does the buffer contain a disk block which is in the tree. */
60inline int B_IS_IN_TREE(const struct buffer_head *p_s_bh) 59inline int B_IS_IN_TREE(const struct buffer_head *bh)
61{ 60{
62 61
63 RFALSE(B_LEVEL(p_s_bh) > MAX_HEIGHT, 62 RFALSE(B_LEVEL(bh) > MAX_HEIGHT,
64 "PAP-1010: block (%b) has too big level (%z)", p_s_bh, p_s_bh); 63 "PAP-1010: block (%b) has too big level (%z)", bh, bh);
65 64
66 return (B_LEVEL(p_s_bh) != FREE_LEVEL); 65 return (B_LEVEL(bh) != FREE_LEVEL);
67} 66}
68 67
69// 68//
70// to get item head in le form 69// to get item head in le form
71// 70//
72inline void copy_item_head(struct item_head *p_v_to, 71inline void copy_item_head(struct item_head *to,
73 const struct item_head *p_v_from) 72 const struct item_head *from)
74{ 73{
75 memcpy(p_v_to, p_v_from, IH_SIZE); 74 memcpy(to, from, IH_SIZE);
76} 75}
77 76
78/* k1 is pointer to on-disk structure which is stored in little-endian 77/* k1 is pointer to on-disk structure which is stored in little-endian
79 form. k2 is pointer to cpu variable. For key of items of the same 78 form. k2 is pointer to cpu variable. For key of items of the same
80 object this returns 0. 79 object this returns 0.
81 Returns: -1 if key1 < key2 80 Returns: -1 if key1 < key2
82 0 if key1 == key2 81 0 if key1 == key2
83 1 if key1 > key2 */ 82 1 if key1 > key2 */
84inline int comp_short_keys(const struct reiserfs_key *le_key, 83inline int comp_short_keys(const struct reiserfs_key *le_key,
@@ -136,15 +135,15 @@ static inline int comp_keys(const struct reiserfs_key *le_key,
136inline int comp_short_le_keys(const struct reiserfs_key *key1, 135inline int comp_short_le_keys(const struct reiserfs_key *key1,
137 const struct reiserfs_key *key2) 136 const struct reiserfs_key *key2)
138{ 137{
139 __u32 *p_s_1_u32, *p_s_2_u32; 138 __u32 *k1_u32, *k2_u32;
140 int n_key_length = REISERFS_SHORT_KEY_LEN; 139 int key_length = REISERFS_SHORT_KEY_LEN;
141 140
142 p_s_1_u32 = (__u32 *) key1; 141 k1_u32 = (__u32 *) key1;
143 p_s_2_u32 = (__u32 *) key2; 142 k2_u32 = (__u32 *) key2;
144 for (; n_key_length--; ++p_s_1_u32, ++p_s_2_u32) { 143 for (; key_length--; ++k1_u32, ++k2_u32) {
145 if (le32_to_cpu(*p_s_1_u32) < le32_to_cpu(*p_s_2_u32)) 144 if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32))
146 return -1; 145 return -1;
147 if (le32_to_cpu(*p_s_1_u32) > le32_to_cpu(*p_s_2_u32)) 146 if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32))
148 return 1; 147 return 1;
149 } 148 }
150 return 0; 149 return 0;
@@ -175,52 +174,51 @@ inline int comp_le_keys(const struct reiserfs_key *k1,
175 * Binary search toolkit function * 174 * Binary search toolkit function *
176 * Search for an item in the array by the item key * 175 * Search for an item in the array by the item key *
177 * Returns: 1 if found, 0 if not found; * 176 * Returns: 1 if found, 0 if not found; *
178 * *p_n_pos = number of the searched element if found, else the * 177 * *pos = number of the searched element if found, else the *
179 * number of the first element that is larger than p_v_key. * 178 * number of the first element that is larger than key. *
180 **************************************************************************/ 179 **************************************************************************/
181/* For those not familiar with binary search: n_lbound is the leftmost item that it 180/* For those not familiar with binary search: lbound is the leftmost item that it
182 could be, n_rbound the rightmost item that it could be. We examine the item 181 could be, rbound the rightmost item that it could be. We examine the item
183 halfway between n_lbound and n_rbound, and that tells us either that we can increase 182 halfway between lbound and rbound, and that tells us either that we can increase
184 n_lbound, or decrease n_rbound, or that we have found it, or if n_rbound < n_lbound that 183 lbound, or decrease rbound, or that we have found it, or if rbound < lbound that
185 there are no possible items, and we have not found it. With each examination we 184 there are no possible items, and we have not found it. With each examination we
186 cut the number of possible items it could be by one more than half rounded down, 185 cut the number of possible items it could be by one more than half rounded down,
187 or we find it. */ 186 or we find it. */
188static inline int bin_search(const void *p_v_key, /* Key to search for. */ 187static inline int bin_search(const void *key, /* Key to search for. */
189 const void *p_v_base, /* First item in the array. */ 188 const void *base, /* First item in the array. */
190 int p_n_num, /* Number of items in the array. */ 189 int num, /* Number of items in the array. */
191 int p_n_width, /* Item size in the array 190 int width, /* Item size in the array
192 searched. Lest the reader be 191 searched. Lest the reader be
193 confused, note that this is crafted 192 confused, note that this is crafted
194 as a general function, and when it 193 as a general function, and when it
195 is applied specifically to the array 194 is applied specifically to the array
196 of item headers in a node, p_n_width 195 of item headers in a node, width
197 is actually the item header size not 196 is actually the item header size not
198 the item size. */ 197 the item size. */
199 int *p_n_pos /* Number of the searched for element. */ 198 int *pos /* Number of the searched for element. */
200 ) 199 )
201{ 200{
202 int n_rbound, n_lbound, n_j; 201 int rbound, lbound, j;
203 202
204 for (n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0)) / 2; 203 for (j = ((rbound = num - 1) + (lbound = 0)) / 2;
205 n_lbound <= n_rbound; n_j = (n_rbound + n_lbound) / 2) 204 lbound <= rbound; j = (rbound + lbound) / 2)
206 switch (comp_keys 205 switch (comp_keys
207 ((struct reiserfs_key *)((char *)p_v_base + 206 ((struct reiserfs_key *)((char *)base + j * width),
208 n_j * p_n_width), 207 (struct cpu_key *)key)) {
209 (struct cpu_key *)p_v_key)) {
210 case -1: 208 case -1:
211 n_lbound = n_j + 1; 209 lbound = j + 1;
212 continue; 210 continue;
213 case 1: 211 case 1:
214 n_rbound = n_j - 1; 212 rbound = j - 1;
215 continue; 213 continue;
216 case 0: 214 case 0:
217 *p_n_pos = n_j; 215 *pos = j;
218 return ITEM_FOUND; /* Key found in the array. */ 216 return ITEM_FOUND; /* Key found in the array. */
219 } 217 }
220 218
221 /* bin_search did not find the given key; it returns the position 219 /* bin_search did not find the given key; it returns the position
222 of the minimal key greater than the given one. */ 220 of the minimal key greater than the given one. */
223 *p_n_pos = n_lbound; 221 *pos = lbound;
224 return ITEM_NOT_FOUND; 222 return ITEM_NOT_FOUND;
225} 223}
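
Outside the kernel, the same fixed-width binary search is easy to exercise on its own; the sketch below is a hedged restatement under the assumption of a -1/0/1 comparator, not the routine above:

#define ITEM_FOUND	1
#define ITEM_NOT_FOUND	0

/* Binary search over 'num' records of 'width' bytes each; cmp() uses
 * the -1/0/1 convention of comp_keys(). On a miss, *pos is set to the
 * index of the first record greater than the key. */
static int bin_search_sketch(const void *key, const void *base, int num,
			     int width, int *pos,
			     int (*cmp)(const void *rec, const void *key))
{
	int lbound = 0, rbound = num - 1;

	while (lbound <= rbound) {
		int j = (lbound + rbound) / 2;

		switch (cmp((const char *)base + j * width, key)) {
		case -1:
			lbound = j + 1;
			break;
		case 1:
			rbound = j - 1;
			break;
		case 0:
			*pos = j;
			return ITEM_FOUND;
		}
	}
	*pos = lbound;
	return ITEM_NOT_FOUND;
}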
226 224
@@ -243,90 +241,88 @@ static const struct reiserfs_key MAX_KEY = {
243 of the path, and going upwards. We must check the path's validity at each step. If the key is not in 241 of the path, and going upwards. We must check the path's validity at each step. If the key is not in
244 the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this 242 the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this
245 case we return a special key, either MIN_KEY or MAX_KEY. */ 243 case we return a special key, either MIN_KEY or MAX_KEY. */
246static inline const struct reiserfs_key *get_lkey(const struct treepath 244static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
247 *p_s_chk_path, 245 const struct super_block *sb)
248 const struct super_block
249 *p_s_sb)
250{ 246{
251 int n_position, n_path_offset = p_s_chk_path->path_length; 247 int position, path_offset = chk_path->path_length;
252 struct buffer_head *p_s_parent; 248 struct buffer_head *parent;
253 249
254 RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET, 250 RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
255 "PAP-5010: invalid offset in the path"); 251 "PAP-5010: invalid offset in the path");
256 252
257 /* While not higher in path than first element. */ 253 /* While not higher in path than first element. */
258 while (n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { 254 while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
259 255
260 RFALSE(!buffer_uptodate 256 RFALSE(!buffer_uptodate
261 (PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), 257 (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
262 "PAP-5020: parent is not uptodate"); 258 "PAP-5020: parent is not uptodate");
263 259
264 /* Parent at the path is not in the tree now. */ 260 /* Parent at the path is not in the tree now. */
265 if (!B_IS_IN_TREE 261 if (!B_IS_IN_TREE
266 (p_s_parent = 262 (parent =
267 PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset))) 263 PATH_OFFSET_PBUFFER(chk_path, path_offset)))
268 return &MAX_KEY; 264 return &MAX_KEY;
269 /* Check whether position in the parent is correct. */ 265 /* Check whether position in the parent is correct. */
270 if ((n_position = 266 if ((position =
271 PATH_OFFSET_POSITION(p_s_chk_path, 267 PATH_OFFSET_POSITION(chk_path,
272 n_path_offset)) > 268 path_offset)) >
273 B_NR_ITEMS(p_s_parent)) 269 B_NR_ITEMS(parent))
274 return &MAX_KEY; 270 return &MAX_KEY;
275 /* Check whether parent at the path really points to the child. */ 271 /* Check whether parent at the path really points to the child. */
276 if (B_N_CHILD_NUM(p_s_parent, n_position) != 272 if (B_N_CHILD_NUM(parent, position) !=
277 PATH_OFFSET_PBUFFER(p_s_chk_path, 273 PATH_OFFSET_PBUFFER(chk_path,
278 n_path_offset + 1)->b_blocknr) 274 path_offset + 1)->b_blocknr)
279 return &MAX_KEY; 275 return &MAX_KEY;
280 /* Return delimiting key if position in the parent is not equal to zero. */ 276 /* Return delimiting key if position in the parent is not equal to zero. */
281 if (n_position) 277 if (position)
282 return B_N_PDELIM_KEY(p_s_parent, n_position - 1); 278 return B_N_PDELIM_KEY(parent, position - 1);
283 } 279 }
284 /* Return MIN_KEY if we are in the root of the buffer tree. */ 280 /* Return MIN_KEY if we are in the root of the buffer tree. */
285 if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)-> 281 if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
286 b_blocknr == SB_ROOT_BLOCK(p_s_sb)) 282 b_blocknr == SB_ROOT_BLOCK(sb))
287 return &MIN_KEY; 283 return &MIN_KEY;
288 return &MAX_KEY; 284 return &MAX_KEY;
289} 285}
290 286
291/* Get delimiting key of the buffer at the path and its right neighbor. */ 287/* Get delimiting key of the buffer at the path and its right neighbor. */
292inline const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path, 288inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
293 const struct super_block *p_s_sb) 289 const struct super_block *sb)
294{ 290{
295 int n_position, n_path_offset = p_s_chk_path->path_length; 291 int position, path_offset = chk_path->path_length;
296 struct buffer_head *p_s_parent; 292 struct buffer_head *parent;
297 293
298 RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET, 294 RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
299 "PAP-5030: invalid offset in the path"); 295 "PAP-5030: invalid offset in the path");
300 296
301 while (n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { 297 while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
302 298
303 RFALSE(!buffer_uptodate 299 RFALSE(!buffer_uptodate
304 (PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), 300 (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
305 "PAP-5040: parent is not uptodate"); 301 "PAP-5040: parent is not uptodate");
306 302
307 /* Parent at the path is not in the tree now. */ 303 /* Parent at the path is not in the tree now. */
308 if (!B_IS_IN_TREE 304 if (!B_IS_IN_TREE
309 (p_s_parent = 305 (parent =
310 PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset))) 306 PATH_OFFSET_PBUFFER(chk_path, path_offset)))
311 return &MIN_KEY; 307 return &MIN_KEY;
312 /* Check whether position in the parent is correct. */ 308 /* Check whether position in the parent is correct. */
313 if ((n_position = 309 if ((position =
314 PATH_OFFSET_POSITION(p_s_chk_path, 310 PATH_OFFSET_POSITION(chk_path,
315 n_path_offset)) > 311 path_offset)) >
316 B_NR_ITEMS(p_s_parent)) 312 B_NR_ITEMS(parent))
317 return &MIN_KEY; 313 return &MIN_KEY;
318 /* Check whether parent at the path really points to the child. */ 314 /* Check whether parent at the path really points to the child. */
319 if (B_N_CHILD_NUM(p_s_parent, n_position) != 315 if (B_N_CHILD_NUM(parent, position) !=
320 PATH_OFFSET_PBUFFER(p_s_chk_path, 316 PATH_OFFSET_PBUFFER(chk_path,
321 n_path_offset + 1)->b_blocknr) 317 path_offset + 1)->b_blocknr)
322 return &MIN_KEY; 318 return &MIN_KEY;
323 /* Return delimiting key if position in the parent is not the last one. */ 319 /* Return delimiting key if position in the parent is not the last one. */
324 if (n_position != B_NR_ITEMS(p_s_parent)) 320 if (position != B_NR_ITEMS(parent))
325 return B_N_PDELIM_KEY(p_s_parent, n_position); 321 return B_N_PDELIM_KEY(parent, position);
326 } 322 }
327 /* Return MAX_KEY if we are in the root of the buffer tree. */ 323 /* Return MAX_KEY if we are in the root of the buffer tree. */
328 if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)-> 324 if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
329 b_blocknr == SB_ROOT_BLOCK(p_s_sb)) 325 b_blocknr == SB_ROOT_BLOCK(sb))
330 return &MAX_KEY; 326 return &MAX_KEY;
331 return &MIN_KEY; 327 return &MIN_KEY;
332} 328}
@@ -336,60 +332,29 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *p_s_chk_path,
336 the path. These delimiting keys are stored at least one level above that buffer in the tree. If the 332 the path. These delimiting keys are stored at least one level above that buffer in the tree. If the
337 buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in 333 buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in
338 this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ 334 this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */
339static inline int key_in_buffer(struct treepath *p_s_chk_path, /* Path which should be checked. */ 335static inline int key_in_buffer(struct treepath *chk_path, /* Path which should be checked. */
340 const struct cpu_key *p_s_key, /* Key which should be checked. */ 336 const struct cpu_key *key, /* Key which should be checked. */
341 struct super_block *p_s_sb /* Super block pointer. */ 337 struct super_block *sb
342 ) 338 )
343{ 339{
344 340
345 RFALSE(!p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET 341 RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
346 || p_s_chk_path->path_length > MAX_HEIGHT, 342 || chk_path->path_length > MAX_HEIGHT,
347 "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)", 343 "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
348 p_s_key, p_s_chk_path->path_length); 344 key, chk_path->path_length);
349 RFALSE(!PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev, 345 RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev,
350 "PAP-5060: device must not be NODEV"); 346 "PAP-5060: device must not be NODEV");
351 347
352 if (comp_keys(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1) 348 if (comp_keys(get_lkey(chk_path, sb), key) == 1)
353 /* left delimiting key is bigger than the key we look for */ 349 /* left delimiting key is bigger than the key we look for */
354 return 0; 350 return 0;
355 // if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 ) 351 /* if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */
356 if (comp_keys(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1) 352 if (comp_keys(get_rkey(chk_path, sb), key) != 1)
357 /* p_s_key must be less than right delimiting key */ 353 /* key must be less than right delimiting key */
358 return 0; 354 return 0;
359 return 1; 355 return 1;
360} 356}
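
key_in_buffer() encodes the S+tree invariant that a buffer covers the half-open range [left delimiting key, right delimiting key). A schematic restatement of the two tests (the inputs are assumed comparator results, not the kernel helpers):

/* lkey_vs_key and rkey_vs_key are comp_keys()-style results (-1/0/1).
 * The key belongs to the buffer iff lkey <= key < rkey. */
static int key_in_range(int lkey_vs_key, int rkey_vs_key)
{
	if (lkey_vs_key == 1)	/* lkey > key: key lies left of the buffer */
		return 0;
	if (rkey_vs_key != 1)	/* rkey <= key: key lies at or past rkey */
		return 0;
	return 1;
}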
361 357
362inline void decrement_bcount(struct buffer_head *p_s_bh)
363{
364 if (p_s_bh) {
365 if (atomic_read(&(p_s_bh->b_count))) {
366 put_bh(p_s_bh);
367 return;
368 }
369 reiserfs_panic(NULL,
370 "PAP-5070: decrement_bcount: trying to free free buffer %b",
371 p_s_bh);
372 }
373}
374
375/* Decrement b_count field of the all buffers in the path. */
376void decrement_counters_in_path(struct treepath *p_s_search_path)
377{
378 int n_path_offset = p_s_search_path->path_length;
379
380 RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET ||
381 n_path_offset > EXTENDED_MAX_HEIGHT - 1,
382 "PAP-5080: invalid path offset of %d", n_path_offset);
383
384 while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
385 struct buffer_head *bh;
386
387 bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--);
388 decrement_bcount(bh);
389 }
390 p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
391}
392
393int reiserfs_check_path(struct treepath *p) 358int reiserfs_check_path(struct treepath *p)
394{ 359{
395 RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET, 360 RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
@@ -397,40 +362,38 @@ int reiserfs_check_path(struct treepath *p)
397 return 0; 362 return 0;
398} 363}
399 364
400/* Release all buffers in the path. Restore dirty bits 365/* Drop the reference to each buffer in a path and restore
401** cleared while preparing the buffer for the log 366 * dirty bits cleared while preparing the buffer for the log.
402** 367 * This version should only be called from fix_nodes() */
403** only called from fix_nodes() 368void pathrelse_and_restore(struct super_block *sb,
404*/ 369 struct treepath *search_path)
405void pathrelse_and_restore(struct super_block *s, struct treepath *p_s_search_path)
406{ 370{
407 int n_path_offset = p_s_search_path->path_length; 371 int path_offset = search_path->path_length;
408 372
409 RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, 373 RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
410 "clm-4000: invalid path offset"); 374 "clm-4000: invalid path offset");
411 375
412 while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) { 376 while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
413 reiserfs_restore_prepared_buffer(s, 377 struct buffer_head *bh;
414 PATH_OFFSET_PBUFFER 378 bh = PATH_OFFSET_PBUFFER(search_path, path_offset--);
415 (p_s_search_path, 379 reiserfs_restore_prepared_buffer(sb, bh);
416 n_path_offset)); 380 brelse(bh);
417 brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--));
418 } 381 }
419 p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; 382 search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
420} 383}
421 384
422/* Release all buffers in the path. */ 385/* Drop the reference to each buffer in a path */
423void pathrelse(struct treepath *p_s_search_path) 386void pathrelse(struct treepath *search_path)
424{ 387{
425 int n_path_offset = p_s_search_path->path_length; 388 int path_offset = search_path->path_length;
426 389
427 RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, 390 RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
428 "PAP-5090: invalid path offset"); 391 "PAP-5090: invalid path offset");
429 392
430 while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) 393 while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
431 brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); 394 brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--));
432 395
433 p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; 396 search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
434} 397}
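
Every successful descent is expected to be paired with a pathrelse(); a hedged sketch of the usual caller pattern in this file, assuming a struct cpu_key key prepared by the caller (INITIALIZE_PATH() and the return codes are used as they appear elsewhere in reiserfs, but treat this as illustration, not a canonical caller):

	INITIALIZE_PATH(path);
	int retval;

	retval = search_by_key(sb, &key, &path, DISK_LEAF_NODE_LEVEL);
	if (retval == IO_ERROR)
		return -EIO;	/* search_by_key() released the path itself */
	if (retval == ITEM_FOUND) {
		struct item_head *ih = PATH_PITEM_HEAD(&path);
		/* ... use the leaf while the path holds its references ... */
	}
	pathrelse(&path);	/* drop every buffer reference taken above */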
435 398
436static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) 399static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
@@ -444,23 +407,24 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
444 407
445 blkh = (struct block_head *)buf; 408 blkh = (struct block_head *)buf;
446 if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) { 409 if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
447 reiserfs_warning(NULL, 410 reiserfs_warning(NULL, "reiserfs-5080",
448 "is_leaf: this should be caught earlier"); 411 "this should be caught earlier");
449 return 0; 412 return 0;
450 } 413 }
451 414
452 nr = blkh_nr_item(blkh); 415 nr = blkh_nr_item(blkh);
453 if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { 416 if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
454 /* item number is too big or too small */ 417 /* item number is too big or too small */
455 reiserfs_warning(NULL, "is_leaf: nr_item seems wrong: %z", bh); 418 reiserfs_warning(NULL, "reiserfs-5081",
419 "nr_item seems wrong: %z", bh);
456 return 0; 420 return 0;
457 } 421 }
458 ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; 422 ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
459 used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih)); 423 used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
460 if (used_space != blocksize - blkh_free_space(blkh)) { 424 if (used_space != blocksize - blkh_free_space(blkh)) {
461 /* free space does not match the calculated amount of used space */ 425 /* free space does not match the calculated amount of used space */
462 reiserfs_warning(NULL, "is_leaf: free space seems wrong: %z", 426 reiserfs_warning(NULL, "reiserfs-5082",
463 bh); 427 "free space seems wrong: %z", bh);
464 return 0; 428 return 0;
465 } 429 }
466 // FIXME: is_leaf will hit performance too much - we may have 430 // FIXME: is_leaf will hit performance too much - we may have
@@ -471,29 +435,29 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
471 prev_location = blocksize; 435 prev_location = blocksize;
472 for (i = 0; i < nr; i++, ih++) { 436 for (i = 0; i < nr; i++, ih++) {
473 if (le_ih_k_type(ih) == TYPE_ANY) { 437 if (le_ih_k_type(ih) == TYPE_ANY) {
474 reiserfs_warning(NULL, 438 reiserfs_warning(NULL, "reiserfs-5083",
475 "is_leaf: wrong item type for item %h", 439 "wrong item type for item %h",
476 ih); 440 ih);
477 return 0; 441 return 0;
478 } 442 }
479 if (ih_location(ih) >= blocksize 443 if (ih_location(ih) >= blocksize
480 || ih_location(ih) < IH_SIZE * nr) { 444 || ih_location(ih) < IH_SIZE * nr) {
481 reiserfs_warning(NULL, 445 reiserfs_warning(NULL, "reiserfs-5084",
482 "is_leaf: item location seems wrong: %h", 446 "item location seems wrong: %h",
483 ih); 447 ih);
484 return 0; 448 return 0;
485 } 449 }
486 if (ih_item_len(ih) < 1 450 if (ih_item_len(ih) < 1
487 || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) { 451 || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) {
488 reiserfs_warning(NULL, 452 reiserfs_warning(NULL, "reiserfs-5085",
489 "is_leaf: item length seems wrong: %h", 453 "item length seems wrong: %h",
490 ih); 454 ih);
491 return 0; 455 return 0;
492 } 456 }
493 if (prev_location - ih_location(ih) != ih_item_len(ih)) { 457 if (prev_location - ih_location(ih) != ih_item_len(ih)) {
494 reiserfs_warning(NULL, 458 reiserfs_warning(NULL, "reiserfs-5086",
495 "is_leaf: item location seems wrong (second one): %h", 459 "item location seems wrong "
496 ih); 460 "(second one): %h", ih);
497 return 0; 461 return 0;
498 } 462 }
499 prev_location = ih_location(ih); 463 prev_location = ih_location(ih);
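
As a worked illustration of the free-space consistency test earlier in is_leaf() (the sizes below are assumed for the example, not read from a real block):

/* blocksize = 4096, nr = 3 items, last item body at ih_location = 3000,
 * and assume BLKH_SIZE = 24 and IH_SIZE = 24 for the arithmetic:
 *
 *   used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location)
 *              = 24 + 24 * 3 + (4096 - 3000) = 1192
 *
 * The check passes only if blkh_free_space(blkh) == 4096 - 1192 = 2904. */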
@@ -514,24 +478,23 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
514 nr = blkh_level(blkh); 478 nr = blkh_level(blkh);
515 if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) { 479 if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
516 /* this level is not possible for internal nodes */ 480 /* this level is not possible for internal nodes */
517 reiserfs_warning(NULL, 481 reiserfs_warning(NULL, "reiserfs-5087",
518 "is_internal: this should be caught earlier"); 482 "this should be caught earlier");
519 return 0; 483 return 0;
520 } 484 }
521 485
522 nr = blkh_nr_item(blkh); 486 nr = blkh_nr_item(blkh);
523 if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { 487 if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
524 /* for an internal node which is not the root we might check the min number of keys */ 488 /* for an internal node which is not the root we might check the min number of keys */
525 reiserfs_warning(NULL, 489 reiserfs_warning(NULL, "reiserfs-5088",
526 "is_internal: number of key seems wrong: %z", 490 "number of key seems wrong: %z", bh);
527 bh);
528 return 0; 491 return 0;
529 } 492 }
530 493
531 used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); 494 used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
532 if (used_space != blocksize - blkh_free_space(blkh)) { 495 if (used_space != blocksize - blkh_free_space(blkh)) {
533 reiserfs_warning(NULL, 496 reiserfs_warning(NULL, "reiserfs-5089",
534 "is_internal: free space seems wrong: %z", bh); 497 "free space seems wrong: %z", bh);
535 return 0; 498 return 0;
536 } 499 }
537 // one may imagine many more checks 500 // one may imagine many more checks
@@ -543,8 +506,8 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
543static int is_tree_node(struct buffer_head *bh, int level) 506static int is_tree_node(struct buffer_head *bh, int level)
544{ 507{
545 if (B_LEVEL(bh) != level) { 508 if (B_LEVEL(bh) != level) {
546 reiserfs_warning(NULL, 509 reiserfs_warning(NULL, "reiserfs-5090", "node level %d does "
547 "is_tree_node: node level %d does not match to the expected one %d", 510 "not match to the expected one %d",
548 B_LEVEL(bh), level); 511 B_LEVEL(bh), level);
549 return 0; 512 return 0;
550 } 513 }
@@ -580,10 +543,10 @@ static void search_by_key_reada(struct super_block *s,
580/************************************************************************** 543/**************************************************************************
581 * Algorithm SearchByKey * 544 * Algorithm SearchByKey *
582 * look for item in the Disk S+Tree by its key * 545 * look for item in the Disk S+Tree by its key *
583 * Input: p_s_sb - super block * 546 * Input: sb - super block *
584 * p_s_key - pointer to the key to search * 547 * key - pointer to the key to search *
585 * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR * 548 * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR *
586 * p_s_search_path - path from the root to the needed leaf * 549 * search_path - path from the root to the needed leaf *
587 **************************************************************************/ 550 **************************************************************************/
588 551
589/* This function fills up the path from the root to the leaf as it 552/* This function fills up the path from the root to the leaf as it
@@ -600,22 +563,22 @@ static void search_by_key_reada(struct super_block *s,
600 correctness of the top of the path but need not be checked for the 563 correctness of the top of the path but need not be checked for the
601 correctness of the bottom of the path */ 564 correctness of the bottom of the path */
602/* The function is NOT SCHEDULE-SAFE! */ 565/* The function is NOT SCHEDULE-SAFE! */
603int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* Key to search. */ 566int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to search. */
604 struct treepath *p_s_search_path,/* This structure was 567 struct treepath *search_path,/* This structure was
605 allocated and initialized 568 allocated and initialized
606 by the calling 569 by the calling
607 function. It is filled up 570 function. It is filled up
608 by this function. */ 571 by this function. */
609 int n_stop_level /* How far down the tree to search. To 572 int stop_level /* How far down the tree to search. To
610 stop at leaf level - set to 573 stop at leaf level - set to
611 DISK_LEAF_NODE_LEVEL */ 574 DISK_LEAF_NODE_LEVEL */
612 ) 575 )
613{ 576{
614 b_blocknr_t n_block_number; 577 b_blocknr_t block_number;
615 int expected_level; 578 int expected_level;
616 struct buffer_head *p_s_bh; 579 struct buffer_head *bh;
617 struct path_element *p_s_last_element; 580 struct path_element *last_element;
618 int n_node_level, n_retval; 581 int node_level, retval;
619 int right_neighbor_of_leaf_node; 582 int right_neighbor_of_leaf_node;
620 int fs_gen; 583 int fs_gen;
621 struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; 584 struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
@@ -623,80 +586,79 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /*
623 int reada_count = 0; 586 int reada_count = 0;
624 587
625#ifdef CONFIG_REISERFS_CHECK 588#ifdef CONFIG_REISERFS_CHECK
626 int n_repeat_counter = 0; 589 int repeat_counter = 0;
627#endif 590#endif
628 591
629 PROC_INFO_INC(p_s_sb, search_by_key); 592 PROC_INFO_INC(sb, search_by_key);
630 593
631 /* As we add each node to a path we increase its count. This means that 594 /* As we add each node to a path we increase its count. This means that
632 we must be careful to release all nodes in a path before we either 595 we must be careful to release all nodes in a path before we either
633 discard the path struct or re-use the path struct, as we do here. */ 596 discard the path struct or re-use the path struct, as we do here. */
634 597
635 decrement_counters_in_path(p_s_search_path); 598 pathrelse(search_path);
636 599
637 right_neighbor_of_leaf_node = 0; 600 right_neighbor_of_leaf_node = 0;
638 601
639 /* With each iteration of this loop we search through the items in the 602 /* With each iteration of this loop we search through the items in the
640 current node, and calculate the next current node (next path element) 603 current node, and calculate the next current node (next path element)
641 for the next iteration of this loop. */ 604 for the next iteration of this loop. */
642 n_block_number = SB_ROOT_BLOCK(p_s_sb); 605 block_number = SB_ROOT_BLOCK(sb);
643 expected_level = -1; 606 expected_level = -1;
644 while (1) { 607 while (1) {
645 608
646#ifdef CONFIG_REISERFS_CHECK 609#ifdef CONFIG_REISERFS_CHECK
647 if (!(++n_repeat_counter % 50000)) 610 if (!(++repeat_counter % 50000))
648 reiserfs_warning(p_s_sb, "PAP-5100: search_by_key: %s:" 611 reiserfs_warning(sb, "PAP-5100",
649 "there were %d iterations of while loop " 612 "%s: there were %d iterations of "
650 "looking for key %K", 613 "while loop looking for key %K",
651 current->comm, n_repeat_counter, 614 current->comm, repeat_counter,
652 p_s_key); 615 key);
653#endif 616#endif
654 617
655 /* prep path to have another element added to it. */ 618 /* prep path to have another element added to it. */
656 p_s_last_element = 619 last_element =
657 PATH_OFFSET_PELEMENT(p_s_search_path, 620 PATH_OFFSET_PELEMENT(search_path,
658 ++p_s_search_path->path_length); 621 ++search_path->path_length);
659 fs_gen = get_generation(p_s_sb); 622 fs_gen = get_generation(sb);
660 623
661 /* Read the next tree node, and set the last element in the path to 624 /* Read the next tree node, and set the last element in the path to
662 have a pointer to it. */ 625 have a pointer to it. */
663 if ((p_s_bh = p_s_last_element->pe_buffer = 626 if ((bh = last_element->pe_buffer =
664 sb_getblk(p_s_sb, n_block_number))) { 627 sb_getblk(sb, block_number))) {
665 if (!buffer_uptodate(p_s_bh) && reada_count > 1) { 628 if (!buffer_uptodate(bh) && reada_count > 1)
666 search_by_key_reada(p_s_sb, reada_bh, 629 search_by_key_reada(sb, reada_bh,
667 reada_blocks, reada_count); 630 reada_blocks, reada_count);
668 } 631 ll_rw_block(READ, 1, &bh);
669 ll_rw_block(READ, 1, &p_s_bh); 632 wait_on_buffer(bh);
670 wait_on_buffer(p_s_bh); 633 if (!buffer_uptodate(bh))
671 if (!buffer_uptodate(p_s_bh))
672 goto io_error; 634 goto io_error;
673 } else { 635 } else {
674 io_error: 636 io_error:
675 p_s_search_path->path_length--; 637 search_path->path_length--;
676 pathrelse(p_s_search_path); 638 pathrelse(search_path);
677 return IO_ERROR; 639 return IO_ERROR;
678 } 640 }
679 reada_count = 0; 641 reada_count = 0;
680 if (expected_level == -1) 642 if (expected_level == -1)
681 expected_level = SB_TREE_HEIGHT(p_s_sb); 643 expected_level = SB_TREE_HEIGHT(sb);
682 expected_level--; 644 expected_level--;
683 645
684 /* It is possible that schedule occurred. We must check whether the key 646 /* It is possible that schedule occurred. We must check whether the key
685 to search is still in the tree rooted from the current buffer. If 647 to search is still in the tree rooted from the current buffer. If
686 not then repeat search from the root. */ 648 not then repeat search from the root. */
687 if (fs_changed(fs_gen, p_s_sb) && 649 if (fs_changed(fs_gen, sb) &&
688 (!B_IS_IN_TREE(p_s_bh) || 650 (!B_IS_IN_TREE(bh) ||
689 B_LEVEL(p_s_bh) != expected_level || 651 B_LEVEL(bh) != expected_level ||
690 !key_in_buffer(p_s_search_path, p_s_key, p_s_sb))) { 652 !key_in_buffer(search_path, key, sb))) {
691 PROC_INFO_INC(p_s_sb, search_by_key_fs_changed); 653 PROC_INFO_INC(sb, search_by_key_fs_changed);
692 PROC_INFO_INC(p_s_sb, search_by_key_restarted); 654 PROC_INFO_INC(sb, search_by_key_restarted);
693 PROC_INFO_INC(p_s_sb, 655 PROC_INFO_INC(sb,
694 sbk_restarted[expected_level - 1]); 656 sbk_restarted[expected_level - 1]);
695 decrement_counters_in_path(p_s_search_path); 657 pathrelse(search_path);
696 658
697 /* Get the root block number so that we can repeat the search 659 /* Get the root block number so that we can repeat the search
698 starting from the root. */ 660 starting from the root. */
699 n_block_number = SB_ROOT_BLOCK(p_s_sb); 661 block_number = SB_ROOT_BLOCK(sb);
700 expected_level = -1; 662 expected_level = -1;
701 right_neighbor_of_leaf_node = 0; 663 right_neighbor_of_leaf_node = 0;
702 664
@@ -704,53 +666,53 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /*
704 continue; 666 continue;
705 } 667 }
706 668
707 /* only check that the key is in the buffer if p_s_key is not 669 /* only check that the key is in the buffer if key is not
708 equal to the MAX_KEY. The latter case is only possible in 670 equal to the MAX_KEY. The latter case is only possible in
709 "finish_unfinished()" processing during mount. */ 671 "finish_unfinished()" processing during mount. */
710 RFALSE(comp_keys(&MAX_KEY, p_s_key) && 672 RFALSE(comp_keys(&MAX_KEY, key) &&
711 !key_in_buffer(p_s_search_path, p_s_key, p_s_sb), 673 !key_in_buffer(search_path, key, sb),
712 "PAP-5130: key is not in the buffer"); 674 "PAP-5130: key is not in the buffer");
713#ifdef CONFIG_REISERFS_CHECK 675#ifdef CONFIG_REISERFS_CHECK
714 if (cur_tb) { 676 if (cur_tb) {
715 print_cur_tb("5140"); 677 print_cur_tb("5140");
716 reiserfs_panic(p_s_sb, 678 reiserfs_panic(sb, "PAP-5140",
717 "PAP-5140: search_by_key: schedule occurred in do_balance!"); 679 "schedule occurred in do_balance!");
718 } 680 }
719#endif 681#endif
720 682
721 // make sure that the node contents look like a node of 683 // make sure that the node contents look like a node of
722 // a certain level 684 // a certain level
723 if (!is_tree_node(p_s_bh, expected_level)) { 685 if (!is_tree_node(bh, expected_level)) {
724 reiserfs_warning(p_s_sb, "vs-5150: search_by_key: " 686 reiserfs_error(sb, "vs-5150",
725 "invalid format found in block %ld. Fsck?", 687 "invalid format found in block %ld. "
726 p_s_bh->b_blocknr); 688 "Fsck?", bh->b_blocknr);
727 pathrelse(p_s_search_path); 689 pathrelse(search_path);
728 return IO_ERROR; 690 return IO_ERROR;
729 } 691 }
730 692
731 /* ok, we have acquired next formatted node in the tree */ 693 /* ok, we have acquired next formatted node in the tree */
732 n_node_level = B_LEVEL(p_s_bh); 694 node_level = B_LEVEL(bh);
733 695
734 PROC_INFO_BH_STAT(p_s_sb, p_s_bh, n_node_level - 1); 696 PROC_INFO_BH_STAT(sb, bh, node_level - 1);
735 697
736 RFALSE(n_node_level < n_stop_level, 698 RFALSE(node_level < stop_level,
737 "vs-5152: tree level (%d) is less than stop level (%d)", 699 "vs-5152: tree level (%d) is less than stop level (%d)",
738 n_node_level, n_stop_level); 700 node_level, stop_level);
739 701
740 n_retval = bin_search(p_s_key, B_N_PITEM_HEAD(p_s_bh, 0), 702 retval = bin_search(key, B_N_PITEM_HEAD(bh, 0),
741 B_NR_ITEMS(p_s_bh), 703 B_NR_ITEMS(bh),
742 (n_node_level == 704 (node_level ==
743 DISK_LEAF_NODE_LEVEL) ? IH_SIZE : 705 DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
744 KEY_SIZE, 706 KEY_SIZE,
745 &(p_s_last_element->pe_position)); 707 &(last_element->pe_position));
746 if (n_node_level == n_stop_level) { 708 if (node_level == stop_level) {
747 return n_retval; 709 return retval;
748 } 710 }
749 711
750 /* we are not in the stop level */ 712 /* we are not in the stop level */
751 if (n_retval == ITEM_FOUND) 713 if (retval == ITEM_FOUND)
752 /* item has been found, so we choose the pointer which is to the right of the found one */ 714 /* item has been found, so we choose the pointer which is to the right of the found one */
753 p_s_last_element->pe_position++; 715 last_element->pe_position++;
754 716
755 /* if item was not found we choose the position which is to 717 /* if item was not found we choose the position which is to
756 the left of the found item. This requires no code, 718 the left of the found item. This requires no code,
@@ -759,24 +721,24 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /*
759 /* So we have chosen a position in the current node which is 721 /* So we have chosen a position in the current node which is
760 an internal node. Now we calculate child block number by 722 an internal node. Now we calculate child block number by
761 position in the node. */ 723 position in the node. */
762 n_block_number = 724 block_number =
763 B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position); 725 B_N_CHILD_NUM(bh, last_element->pe_position);
764 726
765 /* if we are going to read leaf nodes, try for read ahead as well */ 727 /* if we are going to read leaf nodes, try for read ahead as well */
766 if ((p_s_search_path->reada & PATH_READA) && 728 if ((search_path->reada & PATH_READA) &&
767 n_node_level == DISK_LEAF_NODE_LEVEL + 1) { 729 node_level == DISK_LEAF_NODE_LEVEL + 1) {
768 int pos = p_s_last_element->pe_position; 730 int pos = last_element->pe_position;
769 int limit = B_NR_ITEMS(p_s_bh); 731 int limit = B_NR_ITEMS(bh);
770 struct reiserfs_key *le_key; 732 struct reiserfs_key *le_key;
771 733
772 if (p_s_search_path->reada & PATH_READA_BACK) 734 if (search_path->reada & PATH_READA_BACK)
773 limit = 0; 735 limit = 0;
774 while (reada_count < SEARCH_BY_KEY_READA) { 736 while (reada_count < SEARCH_BY_KEY_READA) {
775 if (pos == limit) 737 if (pos == limit)
776 break; 738 break;
777 reada_blocks[reada_count++] = 739 reada_blocks[reada_count++] =
778 B_N_CHILD_NUM(p_s_bh, pos); 740 B_N_CHILD_NUM(bh, pos);
779 if (p_s_search_path->reada & PATH_READA_BACK) 741 if (search_path->reada & PATH_READA_BACK)
780 pos--; 742 pos--;
781 else 743 else
782 pos++; 744 pos++;
@@ -784,9 +746,9 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /*
784 /* 746 /*
785 * check to make sure we're in the same object 747 * check to make sure we're in the same object
786 */ 748 */
787 le_key = B_N_PDELIM_KEY(p_s_bh, pos); 749 le_key = B_N_PDELIM_KEY(bh, pos);
788 if (le32_to_cpu(le_key->k_objectid) != 750 if (le32_to_cpu(le_key->k_objectid) !=
789 p_s_key->on_disk_key.k_objectid) { 751 key->on_disk_key.k_objectid) {
790 break; 752 break;
791 } 753 }
792 } 754 }
@@ -795,11 +757,11 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /*
795} 757}
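
The read-ahead branch near the end of the loop just gathers sibling child block numbers from the current internal node, walking left or right of the chosen position; a stand-alone sketch with array stand-ins for the node accessors (the same-object-id cutoff from the code above is omitted here):

#define READA_MAX 16	/* stand-in for SEARCH_BY_KEY_READA */

/* Collect up to READA_MAX child block numbers starting at 'pos' and
 * walking toward 'limit'; 'backwards' mirrors PATH_READA_BACK. */
static int collect_reada(const unsigned long *child_blocks, int pos,
			 int limit, int backwards, unsigned long *out)
{
	int n = 0;

	while (n < READA_MAX && pos != limit) {
		out[n++] = child_blocks[pos];
		pos += backwards ? -1 : 1;
	}
	return n;
}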
796 758
797/* Form the path to an item, and the position in this item, which contains 759/* Form the path to an item, and the position in this item, which contains
798 the file byte defined by p_s_key. If there is no such item 760 the file byte defined by key. If there is no such item
799 corresponding to the key, we point the path to the item with 761 corresponding to the key, we point the path to the item with
800 maximal key less than p_s_key, and *p_n_pos_in_item is set to one 762 maximal key less than key, and *pos_in_item is set to one
801 past the last entry/byte in the item. If searching for entry in a 763 past the last entry/byte in the item. If searching for entry in a
802 directory item, and it is not found, *p_n_pos_in_item is set to one 764 directory item, and it is not found, *pos_in_item is set to one
803 entry more than the entry with maximal key which is less than the 765 entry more than the entry with maximal key which is less than the
804 sought key. 766 sought key.
805 767
@@ -810,48 +772,48 @@ int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /*
810 units of directory entries. */ 772 units of directory entries. */
811 773
812/* The function is NOT SCHEDULE-SAFE! */ 774/* The function is NOT SCHEDULE-SAFE! */
813int search_for_position_by_key(struct super_block *p_s_sb, /* Pointer to the super block. */ 775int search_for_position_by_key(struct super_block *sb, /* Pointer to the super block. */
814 const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */ 776 const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */
815 struct treepath *p_s_search_path /* Filled up by this function. */ 777 struct treepath *search_path /* Filled up by this function. */
816 ) 778 )
817{ 779{
818 struct item_head *p_le_ih; /* pointer to on-disk structure */ 780 struct item_head *p_le_ih; /* pointer to on-disk structure */
819 int n_blk_size; 781 int blk_size;
820 loff_t item_offset, offset; 782 loff_t item_offset, offset;
821 struct reiserfs_dir_entry de; 783 struct reiserfs_dir_entry de;
822 int retval; 784 int retval;
823 785
824 /* If searching for directory entry. */ 786 /* If searching for directory entry. */
825 if (is_direntry_cpu_key(p_cpu_key)) 787 if (is_direntry_cpu_key(p_cpu_key))
826 return search_by_entry_key(p_s_sb, p_cpu_key, p_s_search_path, 788 return search_by_entry_key(sb, p_cpu_key, search_path,
827 &de); 789 &de);
828 790
829 /* If not searching for directory entry. */ 791 /* If not searching for directory entry. */
830 792
831 /* If item is found. */ 793 /* If item is found. */
832 retval = search_item(p_s_sb, p_cpu_key, p_s_search_path); 794 retval = search_item(sb, p_cpu_key, search_path);
833 if (retval == IO_ERROR) 795 if (retval == IO_ERROR)
834 return retval; 796 return retval;
835 if (retval == ITEM_FOUND) { 797 if (retval == ITEM_FOUND) {
836 798
837 RFALSE(!ih_item_len 799 RFALSE(!ih_item_len
838 (B_N_PITEM_HEAD 800 (B_N_PITEM_HEAD
839 (PATH_PLAST_BUFFER(p_s_search_path), 801 (PATH_PLAST_BUFFER(search_path),
840 PATH_LAST_POSITION(p_s_search_path))), 802 PATH_LAST_POSITION(search_path))),
841 "PAP-5165: item length equals zero"); 803 "PAP-5165: item length equals zero");
842 804
843 pos_in_item(p_s_search_path) = 0; 805 pos_in_item(search_path) = 0;
844 return POSITION_FOUND; 806 return POSITION_FOUND;
845 } 807 }
846 808
847 RFALSE(!PATH_LAST_POSITION(p_s_search_path), 809 RFALSE(!PATH_LAST_POSITION(search_path),
848 "PAP-5170: position equals zero"); 810 "PAP-5170: position equals zero");
849 811
850 /* Item is not found. Set path to the previous item. */ 812 /* Item is not found. Set path to the previous item. */
851 p_le_ih = 813 p_le_ih =
852 B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), 814 B_N_PITEM_HEAD(PATH_PLAST_BUFFER(search_path),
853 --PATH_LAST_POSITION(p_s_search_path)); 815 --PATH_LAST_POSITION(search_path));
854 n_blk_size = p_s_sb->s_blocksize; 816 blk_size = sb->s_blocksize;
855 817
856 if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) { 818 if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) {
857 return FILE_NOT_FOUND; 819 return FILE_NOT_FOUND;
@@ -863,10 +825,10 @@ int search_for_position_by_key(struct super_block *p_s_sb, /* Pointer to the sup
863 825
864 /* Needed byte is contained in the item pointed to by the path. */ 826 /* Needed byte is contained in the item pointed to by the path. */
865 if (item_offset <= offset && 827 if (item_offset <= offset &&
866 item_offset + op_bytes_number(p_le_ih, n_blk_size) > offset) { 828 item_offset + op_bytes_number(p_le_ih, blk_size) > offset) {
867 pos_in_item(p_s_search_path) = offset - item_offset; 829 pos_in_item(search_path) = offset - item_offset;
868 if (is_indirect_le_ih(p_le_ih)) { 830 if (is_indirect_le_ih(p_le_ih)) {
869 pos_in_item(p_s_search_path) /= n_blk_size; 831 pos_in_item(search_path) /= blk_size;
870 } 832 }
871 return POSITION_FOUND; 833 return POSITION_FOUND;
872 } 834 }
@@ -874,30 +836,30 @@ int search_for_position_by_key(struct super_block *p_s_sb, /* Pointer to the sup
874 /* Needed byte is not contained in the item pointed to by the 836 /* Needed byte is not contained in the item pointed to by the
875 path. Set pos_in_item out of the item. */ 837 path. Set pos_in_item out of the item. */
876 if (is_indirect_le_ih(p_le_ih)) 838 if (is_indirect_le_ih(p_le_ih))
877 pos_in_item(p_s_search_path) = 839 pos_in_item(search_path) =
878 ih_item_len(p_le_ih) / UNFM_P_SIZE; 840 ih_item_len(p_le_ih) / UNFM_P_SIZE;
879 else 841 else
880 pos_in_item(p_s_search_path) = ih_item_len(p_le_ih); 842 pos_in_item(search_path) = ih_item_len(p_le_ih);
881 843
882 return POSITION_NOT_FOUND; 844 return POSITION_NOT_FOUND;
883} 845}
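
The tail of the function reduces to simple offset arithmetic: pos_in_item is the byte distance into the item, divided by the block size when the item is indirect (one pointer per block). A worked example with assumed numbers:

/* Assume 4096-byte blocks, an indirect item whose first mapped byte is
 * item_offset = 8193, and a sought offset of 20000:
 *
 *   byte position inside the item:  20000 - 8193 = 11807
 *   indirect slot (one per block):  11807 / 4096 = 2
 *
 * so pos_in_item selects the third unformatted-node pointer. */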
884 846
885/* Compare the given item and the item pointed to by the path. */ 847/* Compare the given item and the item pointed to by the path. */
886int comp_items(const struct item_head *stored_ih, const struct treepath *p_s_path) 848int comp_items(const struct item_head *stored_ih, const struct treepath *path)
887{ 849{
888 struct buffer_head *p_s_bh; 850 struct buffer_head *bh = PATH_PLAST_BUFFER(path);
889 struct item_head *ih; 851 struct item_head *ih;
890 852
891 /* Last buffer at the path is not in the tree. */ 853 /* Last buffer at the path is not in the tree. */
892 if (!B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path))) 854 if (!B_IS_IN_TREE(bh))
893 return 1; 855 return 1;
894 856
895 /* Last path position is invalid. */ 857 /* Last path position is invalid. */
896 if (PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh)) 858 if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh))
897 return 1; 859 return 1;
898 860
899 /* we only need to know whether it is the same item */ 861 /* we only need to know whether it is the same item */
900 ih = get_ih(p_s_path); 862 ih = get_ih(path);
901 return memcmp(stored_ih, ih, IH_SIZE); 863 return memcmp(stored_ih, ih, IH_SIZE);
902} 864}
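
comp_items() exists to detect that a leaf item moved while the caller slept; the snapshot-and-compare pattern visible in prepare_for_delete_or_cut() below is to copy the header first and re-check after any operation that may schedule. A hedged sketch of that caller pattern:

	struct item_head s_ih;

	copy_item_head(&s_ih, PATH_PITEM_HEAD(path));	/* snapshot */
	/* ... do work that may schedule ... */
	if (comp_items(&s_ih, path)) {
		/* item moved or path went stale: repeat the search */
	}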
903 865
@@ -924,9 +886,9 @@ static inline int prepare_for_direct_item(struct treepath *path,
924 } 886 }
925 // new file gets truncated 887 // new file gets truncated
926 if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) { 888 if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
927 // 889 //
928 round_len = ROUND_UP(new_file_length); 890 round_len = ROUND_UP(new_file_length);
929 /* this was n_new_file_length < le_ih ... */ 891 /* this was new_file_length < le_ih ... */
930 if (round_len < le_ih_k_offset(le_ih)) { 892 if (round_len < le_ih_k_offset(le_ih)) {
931 *cut_size = -(IH_SIZE + ih_item_len(le_ih)); 893 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
932 return M_DELETE; /* Delete this item. */ 894 return M_DELETE; /* Delete this item. */
@@ -986,96 +948,95 @@ static inline int prepare_for_direntry_item(struct treepath *path,
986 In the case of a file truncate, calculate whether this item must be deleted/truncated or the last 948 In the case of a file truncate, calculate whether this item must be deleted/truncated or the last
987 unformatted node of this item will be converted to a direct item. 949 unformatted node of this item will be converted to a direct item.
988 This function returns a determination of what balance mode the calling function should employ. */ 950 This function returns a determination of what balance mode the calling function should employ. */
989static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *p_s_path, const struct cpu_key *p_s_item_key, int *p_n_removed, /* Number of unformatted nodes which were removed 951static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, const struct cpu_key *item_key, int *removed, /* Number of unformatted nodes which were removed
990 from end of the file. */ 952 from end of the file. */
991 int *p_n_cut_size, unsigned long long n_new_file_length /* MAX_KEY_OFFSET in case of delete. */ 953 int *cut_size, unsigned long long new_file_length /* MAX_KEY_OFFSET in case of delete. */
992 ) 954 )
993{ 955{
994 struct super_block *p_s_sb = inode->i_sb; 956 struct super_block *sb = inode->i_sb;
995 struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_path); 957 struct item_head *p_le_ih = PATH_PITEM_HEAD(path);
996 struct buffer_head *p_s_bh = PATH_PLAST_BUFFER(p_s_path); 958 struct buffer_head *bh = PATH_PLAST_BUFFER(path);
997 959
998 BUG_ON(!th->t_trans_id); 960 BUG_ON(!th->t_trans_id);
999 961
1000 /* Stat_data item. */ 962 /* Stat_data item. */
1001 if (is_statdata_le_ih(p_le_ih)) { 963 if (is_statdata_le_ih(p_le_ih)) {
1002 964
1003 RFALSE(n_new_file_length != max_reiserfs_offset(inode), 965 RFALSE(new_file_length != max_reiserfs_offset(inode),
1004 "PAP-5210: mode must be M_DELETE"); 966 "PAP-5210: mode must be M_DELETE");
1005 967
1006 *p_n_cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); 968 *cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
1007 return M_DELETE; 969 return M_DELETE;
1008 } 970 }
1009 971
1010 /* Directory item. */ 972 /* Directory item. */
1011 if (is_direntry_le_ih(p_le_ih)) 973 if (is_direntry_le_ih(p_le_ih))
1012 return prepare_for_direntry_item(p_s_path, p_le_ih, inode, 974 return prepare_for_direntry_item(path, p_le_ih, inode,
1013 n_new_file_length, 975 new_file_length,
1014 p_n_cut_size); 976 cut_size);
1015 977
1016 /* Direct item. */ 978 /* Direct item. */
1017 if (is_direct_le_ih(p_le_ih)) 979 if (is_direct_le_ih(p_le_ih))
1018 return prepare_for_direct_item(p_s_path, p_le_ih, inode, 980 return prepare_for_direct_item(path, p_le_ih, inode,
1019 n_new_file_length, p_n_cut_size); 981 new_file_length, cut_size);
1020 982
1021 /* Case of an indirect item. */ 983 /* Case of an indirect item. */
1022 { 984 {
1023 int blk_size = p_s_sb->s_blocksize; 985 int blk_size = sb->s_blocksize;
1024 struct item_head s_ih; 986 struct item_head s_ih;
1025 int need_re_search; 987 int need_re_search;
1026 int delete = 0; 988 int delete = 0;
1027 int result = M_CUT; 989 int result = M_CUT;
1028 int pos = 0; 990 int pos = 0;
1029 991
1030 if ( n_new_file_length == max_reiserfs_offset (inode) ) { 992 if ( new_file_length == max_reiserfs_offset (inode) ) {
1031 /* prepare_for_delete_or_cut() is called by 993 /* prepare_for_delete_or_cut() is called by
1032 * reiserfs_delete_item() */ 994 * reiserfs_delete_item() */
1033 n_new_file_length = 0; 995 new_file_length = 0;
1034 delete = 1; 996 delete = 1;
1035 } 997 }
1036 998
1037 do { 999 do {
1038 need_re_search = 0; 1000 need_re_search = 0;
1039 *p_n_cut_size = 0; 1001 *cut_size = 0;
1040 p_s_bh = PATH_PLAST_BUFFER(p_s_path); 1002 bh = PATH_PLAST_BUFFER(path);
1041 copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); 1003 copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
1042 pos = I_UNFM_NUM(&s_ih); 1004 pos = I_UNFM_NUM(&s_ih);
1043 1005
1044 while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > n_new_file_length) { 1006 while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
1045 __le32 *unfm; 1007 __le32 *unfm;
1046 __u32 block; 1008 __u32 block;
1047 1009
1048 /* Each unformatted block deletion may pull one additional 1010 /* Each unformatted block deletion may pull one additional
1049 * bitmap block into the transaction, so the initial 1011 * bitmap block into the transaction, so the initial
1050 * journal space reservation might not be enough. */ 1012 * journal space reservation might not be enough. */
1051 if (!delete && (*p_n_cut_size) != 0 && 1013 if (!delete && (*cut_size) != 0 &&
1052 reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) { 1014 reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
1053 break; 1015 break;
1054 }
1055 1016
1056 unfm = (__le32 *)B_I_PITEM(p_s_bh, &s_ih) + pos - 1; 1017 unfm = (__le32 *)B_I_PITEM(bh, &s_ih) + pos - 1;
1057 block = get_block_num(unfm, 0); 1018 block = get_block_num(unfm, 0);
1058 1019
1059 if (block != 0) { 1020 if (block != 0) {
1060 reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1); 1021 reiserfs_prepare_for_journal(sb, bh, 1);
1061 put_block_num(unfm, 0, 0); 1022 put_block_num(unfm, 0, 0);
1062 journal_mark_dirty (th, p_s_sb, p_s_bh); 1023 journal_mark_dirty(th, sb, bh);
1063 reiserfs_free_block(th, inode, block, 1); 1024 reiserfs_free_block(th, inode, block, 1);
1064 } 1025 }
1065 1026
1066 cond_resched(); 1027 cond_resched();
1067 1028
1068 if (item_moved (&s_ih, p_s_path)) { 1029 if (item_moved (&s_ih, path)) {
1069 need_re_search = 1; 1030 need_re_search = 1;
1070 break; 1031 break;
1071 } 1032 }
1072 1033
1073 pos --; 1034 pos --;
1074 (*p_n_removed) ++; 1035 (*removed)++;
1075 (*p_n_cut_size) -= UNFM_P_SIZE; 1036 (*cut_size) -= UNFM_P_SIZE;
1076 1037
1077 if (pos == 0) { 1038 if (pos == 0) {
1078 (*p_n_cut_size) -= IH_SIZE; 1039 (*cut_size) -= IH_SIZE;
1079 result = M_DELETE; 1040 result = M_DELETE;
1080 break; 1041 break;
1081 } 1042 }
@@ -1083,12 +1044,12 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
1083 /* a trick. If the buffer has been logged, this will do nothing. If 1044 /* a trick. If the buffer has been logged, this will do nothing. If
1084 ** we've broken the loop without logging it, it will restore the 1045 ** we've broken the loop without logging it, it will restore the
1085 ** buffer */ 1046 ** buffer */
1086 reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh); 1047 reiserfs_restore_prepared_buffer(sb, bh);
1087 } while (need_re_search && 1048 } while (need_re_search &&
1088 search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_FOUND); 1049 search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
1089 pos_in_item(p_s_path) = pos * UNFM_P_SIZE; 1050 pos_in_item(path) = pos * UNFM_P_SIZE;
1090 1051
1091 if (*p_n_cut_size == 0) { 1052 if (*cut_size == 0) {
1092 /* Nothing was cut. Maybe convert the last unformatted node to a 1053 /* Nothing was cut. Maybe convert the last unformatted node to a
1093 * direct item? */ 1054 * direct item? */
1094 result = M_CONVERT; 1055 result = M_CONVERT;
@@ -1098,45 +1059,45 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
1098} 1059}
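
The indirect branch above walks the unformatted-node pointers from the tail of the item toward the new length, freeing one block per pointer and accumulating a negative cut size. A simplified user-space model (hypothetical free_block() stub; journal logging and the rescheduling/re-search dance are deliberately omitted):

static void free_block(unsigned long blk)
{
	/* stand-in for reiserfs_free_block(); does nothing here */
	(void)blk;
}

/* 'ptrs' holds 'pos' block numbers; item_start is the file offset of
 * the first byte mapped by the item (1-based, as in the key offsets). */
static int truncate_indirect(unsigned long *ptrs, int pos,
			     unsigned long long item_start,
			     unsigned long long new_file_length,
			     int blk_size, int *cut_size)
{
	int removed = 0;

	while (pos > 0 &&
	       item_start + (unsigned long long)(pos - 1) * blk_size >
	       new_file_length) {
		free_block(ptrs[--pos]);
		removed++;
		*cut_size -= 4;	/* one 4-byte pointer (UNFM_P_SIZE) gone */
	}
	return removed;
}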
1099 1060
1100/* Calculate number of bytes which will be deleted or cut during balance */ 1061/* Calculate number of bytes which will be deleted or cut during balance */
1101static int calc_deleted_bytes_number(struct tree_balance *p_s_tb, char c_mode) 1062static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
1102{ 1063{
1103 int n_del_size; 1064 int del_size;
1104 struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path); 1065 struct item_head *p_le_ih = PATH_PITEM_HEAD(tb->tb_path);
1105 1066
1106 if (is_statdata_le_ih(p_le_ih)) 1067 if (is_statdata_le_ih(p_le_ih))
1107 return 0; 1068 return 0;
1108 1069
1109 n_del_size = 1070 del_size =
1110 (c_mode == 1071 (mode ==
1111 M_DELETE) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0]; 1072 M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
1112 if (is_direntry_le_ih(p_le_ih)) { 1073 if (is_direntry_le_ih(p_le_ih)) {
1113 // return EMPTY_DIR_SIZE; /* We delete empty directories only. */ 1074 /* return EMPTY_DIR_SIZE; We delete empty directories only.
1114 // we can't use EMPTY_DIR_SIZE, as old format dirs have a different 1075 * we can't use EMPTY_DIR_SIZE, as old format dirs have a different
1115 // empty size. ick. FIXME, is this right? 1076 * empty size. ick. FIXME, is this right? */
1116 // 1077 return del_size;
1117 return n_del_size;
1118 } 1078 }
1119 1079
1120 if (is_indirect_le_ih(p_le_ih)) 1080 if (is_indirect_le_ih(p_le_ih))
1121 n_del_size = (n_del_size / UNFM_P_SIZE) * (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size); // - get_ih_free_space (p_le_ih); 1081 del_size = (del_size / UNFM_P_SIZE) *
1122 return n_del_size; 1082 (PATH_PLAST_BUFFER(tb->tb_path)->b_size);
1083 return del_size;
1123} 1084}
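
For the indirect case just above, the item length counts 4-byte pointers (UNFM_P_SIZE) while the freed data is whole blocks, hence the scaling. A worked example with assumed numbers:

/* Assume del_size = 12 bytes of pointers cut from an indirect item on
 * a filesystem with 4096-byte buffers:
 *
 *   pointers removed:  12 / UNFM_P_SIZE = 12 / 4   = 3
 *   data bytes freed:  3 * b_size       = 3 * 4096 = 12288
 *
 * so calc_deleted_bytes_number() reports 12288 in this case. */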
1124 1085
1125static void init_tb_struct(struct reiserfs_transaction_handle *th, 1086static void init_tb_struct(struct reiserfs_transaction_handle *th,
1126 struct tree_balance *p_s_tb, 1087 struct tree_balance *tb,
1127 struct super_block *p_s_sb, 1088 struct super_block *sb,
1128 struct treepath *p_s_path, int n_size) 1089 struct treepath *path, int size)
1129{ 1090{
1130 1091
1131 BUG_ON(!th->t_trans_id); 1092 BUG_ON(!th->t_trans_id);
1132 1093
1133 memset(p_s_tb, '\0', sizeof(struct tree_balance)); 1094 memset(tb, '\0', sizeof(struct tree_balance));
1134 p_s_tb->transaction_handle = th; 1095 tb->transaction_handle = th;
1135 p_s_tb->tb_sb = p_s_sb; 1096 tb->tb_sb = sb;
1136 p_s_tb->tb_path = p_s_path; 1097 tb->tb_path = path;
1137 PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; 1098 PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
1138 PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; 1099 PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
1139 p_s_tb->insert_size[0] = n_size; 1100 tb->insert_size[0] = size;
1140} 1101}
1141 1102
1142void padd_item(char *item, int total_length, int length) 1103void padd_item(char *item, int total_length, int length)
@@ -1175,73 +1136,77 @@ char head2type(struct item_head *ih)
1175} 1136}
1176#endif 1137#endif
1177 1138
1178/* Delete object item. */ 1139/* Delete object item.
1179int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_path, /* Path to the deleted item. */ 1140 * th - active transaction handle
1180 const struct cpu_key *p_s_item_key, /* Key to search for the deleted item. */ 1141 * path - path to the deleted item
1181 struct inode *p_s_inode, /* inode is here just to update i_blocks and quotas */ 1142 * item_key - key to search for the deleted item
1182 struct buffer_head *p_s_un_bh) 1143 * inode - used for updating i_blocks and quotas
1183{ /* NULL or unformatted node pointer. */ 1144 * un_bh - NULL or unformatted node pointer
1184 struct super_block *p_s_sb = p_s_inode->i_sb; 1145 */
1146int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
1147 struct treepath *path, const struct cpu_key *item_key,
1148 struct inode *inode, struct buffer_head *un_bh)
1149{
1150 struct super_block *sb = inode->i_sb;
1185 struct tree_balance s_del_balance; 1151 struct tree_balance s_del_balance;
1186 struct item_head s_ih; 1152 struct item_head s_ih;
1187 struct item_head *q_ih; 1153 struct item_head *q_ih;
1188 int quota_cut_bytes; 1154 int quota_cut_bytes;
1189 int n_ret_value, n_del_size, n_removed; 1155 int ret_value, del_size, removed;
1190 1156
1191#ifdef CONFIG_REISERFS_CHECK 1157#ifdef CONFIG_REISERFS_CHECK
1192 char c_mode; 1158 char mode;
1193 int n_iter = 0; 1159 int iter = 0;
1194#endif 1160#endif
1195 1161
1196 BUG_ON(!th->t_trans_id); 1162 BUG_ON(!th->t_trans_id);
1197 1163
1198 init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path, 1164 init_tb_struct(th, &s_del_balance, sb, path,
1199 0 /*size is unknown */ ); 1165 0 /*size is unknown */ );
1200 1166
1201 while (1) { 1167 while (1) {
1202 n_removed = 0; 1168 removed = 0;
1203 1169
1204#ifdef CONFIG_REISERFS_CHECK 1170#ifdef CONFIG_REISERFS_CHECK
1205 n_iter++; 1171 iter++;
1206 c_mode = 1172 mode =
1207#endif 1173#endif
1208 prepare_for_delete_or_cut(th, p_s_inode, p_s_path, 1174 prepare_for_delete_or_cut(th, inode, path,
1209 p_s_item_key, &n_removed, 1175 item_key, &removed,
1210 &n_del_size, 1176 &del_size,
1211 max_reiserfs_offset(p_s_inode)); 1177 max_reiserfs_offset(inode));
1212 1178
1213 RFALSE(c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); 1179 RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
1214 1180
1215 copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); 1181 copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
1216 s_del_balance.insert_size[0] = n_del_size; 1182 s_del_balance.insert_size[0] = del_size;
1217 1183
1218 n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); 1184 ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
1219 if (n_ret_value != REPEAT_SEARCH) 1185 if (ret_value != REPEAT_SEARCH)
1220 break; 1186 break;
1221 1187
1222 PROC_INFO_INC(p_s_sb, delete_item_restarted); 1188 PROC_INFO_INC(sb, delete_item_restarted);
1223 1189
1224 // file system changed, repeat search 1190 // file system changed, repeat search
1225 n_ret_value = 1191 ret_value =
1226 search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); 1192 search_for_position_by_key(sb, item_key, path);
1227 if (n_ret_value == IO_ERROR) 1193 if (ret_value == IO_ERROR)
1228 break; 1194 break;
1229 if (n_ret_value == FILE_NOT_FOUND) { 1195 if (ret_value == FILE_NOT_FOUND) {
1230 reiserfs_warning(p_s_sb, 1196 reiserfs_warning(sb, "vs-5340",
1231 "vs-5340: reiserfs_delete_item: "
1232 "no items of the file %K found", 1197 "no items of the file %K found",
1233 p_s_item_key); 1198 item_key);
1234 break; 1199 break;
1235 } 1200 }
1236 } /* while (1) */ 1201 } /* while (1) */
1237 1202
1238 if (n_ret_value != CARRY_ON) { 1203 if (ret_value != CARRY_ON) {
1239 unfix_nodes(&s_del_balance); 1204 unfix_nodes(&s_del_balance);
1240 return 0; 1205 return 0;
1241 } 1206 }
1242 // reiserfs_delete_item returns item length when success 1207 // reiserfs_delete_item returns item length when success
1243 n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); 1208 ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
1244 q_ih = get_ih(p_s_path); 1209 q_ih = get_ih(path);
1245 quota_cut_bytes = ih_item_len(q_ih); 1210 quota_cut_bytes = ih_item_len(q_ih);
1246 1211
1247 /* hack so the quota code doesn't have to guess if the file 1212 /* hack so the quota code doesn't have to guess if the file
@@ -1250,15 +1215,15 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath
1250 ** split into multiple items, and we only want to decrement for 1215 ** split into multiple items, and we only want to decrement for
1251 ** the unfm node once 1216 ** the unfm node once
1252 */ 1217 */
1253 if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(q_ih)) { 1218 if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
1254 if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) { 1219 if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
1255 quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE; 1220 quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
1256 } else { 1221 } else {
1257 quota_cut_bytes = 0; 1222 quota_cut_bytes = 0;
1258 } 1223 }
1259 } 1224 }
1260 1225
1261 if (p_s_un_bh) { 1226 if (un_bh) {
1262 int off; 1227 int off;
1263 char *data; 1228 char *data;
1264 1229
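
A note on the quota special case just above: reiserfs key offsets are 1-based, so (le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1 means the direct item (the file tail) starts exactly at a block boundary. Only in that case did the tail occupy a block of its own plus one unformatted-node pointer, which is what gets refunded; otherwise the tail shared a block that was already accounted elsewhere and nothing extra is freed. A minimal sketch of that rule, with UNFM_P_SIZE assumed to be 4 bytes:

#include <stdio.h>

#define UNFM_P_SIZE 4   /* size of one unformatted-node pointer; assumed */

static long quota_cut(long offset, long blocksize)
{
        /* 1-based offset lands on byte 1 of a block: the tail had a
         * whole block (plus its pointer) accounted to it */
        if ((offset & (blocksize - 1)) == 1)
                return blocksize + UNFM_P_SIZE;
        return 0;       /* tail shares a block counted elsewhere */
}

int main(void)
{
        printf("%ld\n", quota_cut(4097, 4096)); /* tail starts a block: 4100 */
        printf("%ld\n", quota_cut(4200, 4096)); /* mid-block tail: 0 */
        return 0;
}
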
@@ -1276,31 +1241,31 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath
1276 ** The unformatted node must be dirtied later on. We can't be 1241 ** The unformatted node must be dirtied later on. We can't be
1277 ** sure here if the entire tail has been deleted yet. 1242 ** sure here if the entire tail has been deleted yet.
1278 ** 1243 **
1279 ** p_s_un_bh is from the page cache (all unformatted nodes are 1244 ** un_bh is from the page cache (all unformatted nodes are
1280 ** from the page cache) and might be a highmem page. So, we 1245 ** from the page cache) and might be a highmem page. So, we
1281 ** can't use p_s_un_bh->b_data. 1246 ** can't use un_bh->b_data.
1282 ** -clm 1247 ** -clm
1283 */ 1248 */
1284 1249
1285 data = kmap_atomic(p_s_un_bh->b_page, KM_USER0); 1250 data = kmap_atomic(un_bh->b_page, KM_USER0);
1286 off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); 1251 off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
1287 memcpy(data + off, 1252 memcpy(data + off,
1288 B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), 1253 B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih),
1289 n_ret_value); 1254 ret_value);
1290 kunmap_atomic(data, KM_USER0); 1255 kunmap_atomic(data, KM_USER0);
1291 } 1256 }
1292 /* Perform balancing after all resources have been collected at once. */ 1257 /* Perform balancing after all resources have been collected at once. */
1293 do_balance(&s_del_balance, NULL, NULL, M_DELETE); 1258 do_balance(&s_del_balance, NULL, NULL, M_DELETE);
1294 1259
1295#ifdef REISERQUOTA_DEBUG 1260#ifdef REISERQUOTA_DEBUG
1296 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 1261 reiserfs_debug(sb, REISERFS_DEBUG_CODE,
1297 "reiserquota delete_item(): freeing %u, id=%u type=%c", 1262 "reiserquota delete_item(): freeing %u, id=%u type=%c",
1298 quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih)); 1263 quota_cut_bytes, inode->i_uid, head2type(&s_ih));
1299#endif 1264#endif
1300 DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); 1265 vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
1301 1266
1302 /* Return deleted body length */ 1267 /* Return deleted body length */
1303 return n_ret_value; 1268 return ret_value;
1304} 1269}
1305 1270
1306/* Summary Of Mechanisms For Handling Collisions Between Processes: 1271/* Summary Of Mechanisms For Handling Collisions Between Processes:
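
The while (1) loop in reiserfs_delete_item() above is reiserfs's standard retry shape: fix_nodes() can schedule, and if the tree changed underneath it returns REPEAT_SEARCH, so the caller re-runs the key search and tries again until it gets CARRY_ON or a hard error. A minimal, self-contained sketch of that control flow; fake_fix_nodes() and fake_search() are stand-ins, not kernel functions:

#include <stdio.h>

enum { CARRY_ON, REPEAT_SEARCH, IO_ERROR };

static int attempts;

static int fake_fix_nodes(void)
{
        /* first call: pretend the "filesystem" changed while we slept */
        return (attempts++ == 0) ? REPEAT_SEARCH : CARRY_ON;
}

static int fake_search(void)
{
        return 0;       /* item re-found at its (possibly new) position */
}

int main(void)
{
        int ret;

        while (1) {
                ret = fake_fix_nodes();
                if (ret != REPEAT_SEARCH)
                        break;          /* CARRY_ON or a hard error */
                if (fake_search() != 0) {
                        ret = IO_ERROR; /* abort: item gone or I/O failed */
                        break;
                }
                /* otherwise loop and prepare the balance again */
        }
        printf("final state %d after %d attempt(s)\n", ret, attempts);
        return 0;
}
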
@@ -1338,10 +1303,9 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
1338 while (1) { 1303 while (1) {
1339 retval = search_item(th->t_super, &cpu_key, &path); 1304 retval = search_item(th->t_super, &cpu_key, &path);
1340 if (retval == IO_ERROR) { 1305 if (retval == IO_ERROR) {
1341 reiserfs_warning(th->t_super, 1306 reiserfs_error(th->t_super, "vs-5350",
1342 "vs-5350: reiserfs_delete_solid_item: " 1307 "i/o failure occurred trying "
1343 "i/o failure occurred trying to delete %K", 1308 "to delete %K", &cpu_key);
1344 &cpu_key);
1345 break; 1309 break;
1346 } 1310 }
1347 if (retval != ITEM_FOUND) { 1311 if (retval != ITEM_FOUND) {
@@ -1355,9 +1319,8 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
1355 GET_GENERATION_NUMBER(le_key_k_offset 1319 GET_GENERATION_NUMBER(le_key_k_offset
1356 (le_key_version(key), 1320 (le_key_version(key),
1357 key)) == 1)) 1321 key)) == 1))
1358 reiserfs_warning(th->t_super, 1322 reiserfs_warning(th->t_super, "vs-5355",
1359 "vs-5355: reiserfs_delete_solid_item: %k not found", 1323 "%k not found", key);
1360 key);
1361 break; 1324 break;
1362 } 1325 }
1363 if (!tb_init) { 1326 if (!tb_init) {
@@ -1383,14 +1346,13 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
1383 quota_cut_bytes, inode->i_uid, 1346 quota_cut_bytes, inode->i_uid,
1384 key2type(key)); 1347 key2type(key));
1385#endif 1348#endif
1386 DQUOT_FREE_SPACE_NODIRTY(inode, 1349 vfs_dq_free_space_nodirty(inode,
1387 quota_cut_bytes); 1350 quota_cut_bytes);
1388 } 1351 }
1389 break; 1352 break;
1390 } 1353 }
1391 // IO_ERROR, NO_DISK_SPACE, etc 1354 // IO_ERROR, NO_DISK_SPACE, etc
1392 reiserfs_warning(th->t_super, 1355 reiserfs_warning(th->t_super, "vs-5360",
1393 "vs-5360: reiserfs_delete_solid_item: "
1394 "could not delete %K due to fix_nodes failure", 1356 "could not delete %K due to fix_nodes failure",
1395 &cpu_key); 1357 &cpu_key);
1396 unfix_nodes(&tb); 1358 unfix_nodes(&tb);
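
Nearly every hunk in this patch makes the same mechanical change to reiserfs_warning()/reiserfs_error()/reiserfs_panic(): the message ID such as "vs-5350" moves out of the format string into its own argument, and the hand-written function name is dropped (the in-tree macros can record the caller themselves; that detail is omitted below). A hedged sketch of what such a helper could look like; my_warning and struct super are illustrative stand-ins, not the kernel implementation:

#include <stdarg.h>
#include <stdio.h>

struct super { const char *id; };   /* stand-in for struct super_block */

/* prints "REISERFS warning (device x): msgid: message" */
static void my_warning(struct super *sb, const char *msgid,
                       const char *fmt, ...)
{
        va_list args;

        fprintf(stderr, "REISERFS warning (device %s): %s: ", sb->id, msgid);
        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
        fputc('\n', stderr);
}

int main(void)
{
        struct super sb = { "sda1" };

        /* the ID is a separate argument; the format string stays short */
        my_warning(&sb, "vs-5350",
                   "i/o failure occurred trying to delete %d", 42);
        return 0;
}

With the ID separated out, callers no longer duplicate their own function name in every string, which is exactly the boilerplate these hunks delete.
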
@@ -1462,36 +1424,37 @@ static void unmap_buffers(struct page *page, loff_t pos)
1462} 1424}
1463 1425
1464static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, 1426static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
1465 struct inode *p_s_inode, 1427 struct inode *inode,
1466 struct page *page, 1428 struct page *page,
1467 struct treepath *p_s_path, 1429 struct treepath *path,
1468 const struct cpu_key *p_s_item_key, 1430 const struct cpu_key *item_key,
1469 loff_t n_new_file_size, char *p_c_mode) 1431 loff_t new_file_size, char *mode)
1470{ 1432{
1471 struct super_block *p_s_sb = p_s_inode->i_sb; 1433 struct super_block *sb = inode->i_sb;
1472 int n_block_size = p_s_sb->s_blocksize; 1434 int block_size = sb->s_blocksize;
1473 int cut_bytes; 1435 int cut_bytes;
1474 BUG_ON(!th->t_trans_id); 1436 BUG_ON(!th->t_trans_id);
1475 BUG_ON(n_new_file_size != p_s_inode->i_size); 1437 BUG_ON(new_file_size != inode->i_size);
1476 1438
1477 /* the page being sent in could be NULL if there was an i/o error 1439 /* the page being sent in could be NULL if there was an i/o error
1478 ** reading in the last block. The user will hit problems trying to 1440 ** reading in the last block. The user will hit problems trying to
1479 ** read the file, but for now we just skip the indirect2direct 1441 ** read the file, but for now we just skip the indirect2direct
1480 */ 1442 */
1481 if (atomic_read(&p_s_inode->i_count) > 1 || 1443 if (atomic_read(&inode->i_count) > 1 ||
1482 !tail_has_to_be_packed(p_s_inode) || 1444 !tail_has_to_be_packed(inode) ||
1483 !page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) { 1445 !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) {
1484 // leave tail in an unformatted node 1446 /* leave tail in an unformatted node */
1485 *p_c_mode = M_SKIP_BALANCING; 1447 *mode = M_SKIP_BALANCING;
1486 cut_bytes = 1448 cut_bytes =
1487 n_block_size - (n_new_file_size & (n_block_size - 1)); 1449 block_size - (new_file_size & (block_size - 1));
1488 pathrelse(p_s_path); 1450 pathrelse(path);
1489 return cut_bytes; 1451 return cut_bytes;
1490 } 1452 }
1491 /* Permorm the conversion to a direct_item. */ 1453 /* Perform the conversion to a direct_item. */
1492 /*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode); */ 1454 /* return indirect_to_direct(inode, path, item_key,
1493 return indirect2direct(th, p_s_inode, page, p_s_path, p_s_item_key, 1455 new_file_size, mode); */
1494 n_new_file_size, p_c_mode); 1456 return indirect2direct(th, inode, page, path, item_key,
1457 new_file_size, mode);
1495} 1458}
1496 1459
1497/* we did indirect_to_direct conversion. And we have inserted direct 1460/* we did indirect_to_direct conversion. And we have inserted direct
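
When maybe_indirect_to_direct() decides to leave the tail unpacked, it computes the bytes to cut as block_size - (new_file_size & (block_size - 1)), which only works because the block size is a power of two. A quick standalone check of that arithmetic; note that a size which is an exact multiple of the block size yields a full block of "cut" bytes:

#include <stdio.h>

int main(void)
{
        long block_size = 4096;         /* must be a power of two */
        long sizes[] = { 0, 1, 4095, 4096, 4097, 10000 };
        unsigned i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                long in_last = sizes[i] & (block_size - 1); /* bytes in last block */
                long cut = block_size - in_last;            /* bytes to cut */
                printf("size %5ld: %4ld used in last block, %4ld cut\n",
                       sizes[i], in_last, cut);
        }
        return 0;
}
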
@@ -1515,8 +1478,8 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
1515 /* look for the last byte of the tail */ 1478 /* look for the last byte of the tail */
1516 if (search_for_position_by_key(inode->i_sb, &tail_key, path) == 1479 if (search_for_position_by_key(inode->i_sb, &tail_key, path) ==
1517 POSITION_NOT_FOUND) 1480 POSITION_NOT_FOUND)
1518 reiserfs_panic(inode->i_sb, 1481 reiserfs_panic(inode->i_sb, "vs-5615",
1519 "vs-5615: indirect_to_direct_roll_back: found invalid item"); 1482 "found invalid item");
1520 RFALSE(path->pos_in_item != 1483 RFALSE(path->pos_in_item !=
1521 ih_item_len(PATH_PITEM_HEAD(path)) - 1, 1484 ih_item_len(PATH_PITEM_HEAD(path)) - 1,
1522 "vs-5616: appended bytes found"); 1485 "vs-5616: appended bytes found");
@@ -1533,38 +1496,39 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
1533 set_cpu_key_k_offset(&tail_key, 1496 set_cpu_key_k_offset(&tail_key,
1534 cpu_key_k_offset(&tail_key) - removed); 1497 cpu_key_k_offset(&tail_key) - removed);
1535 } 1498 }
1536 reiserfs_warning(inode->i_sb, 1499 reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
1537 "indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space"); 1500 "conversion has been rolled back due to "
1501 "lack of disk space");
1538 //mark_file_without_tail (inode); 1502 //mark_file_without_tail (inode);
1539 mark_inode_dirty(inode); 1503 mark_inode_dirty(inode);
1540} 1504}
1541 1505
1542/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */ 1506/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
1543int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, 1507int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
1544 struct treepath *p_s_path, 1508 struct treepath *path,
1545 struct cpu_key *p_s_item_key, 1509 struct cpu_key *item_key,
1546 struct inode *p_s_inode, 1510 struct inode *inode,
1547 struct page *page, loff_t n_new_file_size) 1511 struct page *page, loff_t new_file_size)
1548{ 1512{
1549 struct super_block *p_s_sb = p_s_inode->i_sb; 1513 struct super_block *sb = inode->i_sb;
1550 /* Every function which is going to call do_balance must first 1514 /* Every function which is going to call do_balance must first
1551 create a tree_balance structure. Then it must fill up this 1515 create a tree_balance structure. Then it must fill up this
1552 structure by using the init_tb_struct and fix_nodes functions. 1516 structure by using the init_tb_struct and fix_nodes functions.
1553 After that we can make tree balancing. */ 1517 After that we can make tree balancing. */
1554 struct tree_balance s_cut_balance; 1518 struct tree_balance s_cut_balance;
1555 struct item_head *p_le_ih; 1519 struct item_head *p_le_ih;
1556 int n_cut_size = 0, /* Amount to be cut. */ 1520 int cut_size = 0, /* Amount to be cut. */
1557 n_ret_value = CARRY_ON, n_removed = 0, /* Number of the removed unformatted nodes. */ 1521 ret_value = CARRY_ON, removed = 0, /* Number of the removed unformatted nodes. */
1558 n_is_inode_locked = 0; 1522 is_inode_locked = 0;
1559 char c_mode; /* Mode of the balance. */ 1523 char mode; /* Mode of the balance. */
1560 int retval2 = -1; 1524 int retval2 = -1;
1561 int quota_cut_bytes; 1525 int quota_cut_bytes;
1562 loff_t tail_pos = 0; 1526 loff_t tail_pos = 0;
1563 1527
1564 BUG_ON(!th->t_trans_id); 1528 BUG_ON(!th->t_trans_id);
1565 1529
1566 init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, 1530 init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
1567 n_cut_size); 1531 cut_size);
1568 1532
1569 /* Repeat this loop until we either cut the item without needing 1533 /* Repeat this loop until we either cut the item without needing
1570 to balance, or we fix_nodes without schedule occurring */ 1534 to balance, or we fix_nodes without schedule occurring */
@@ -1574,144 +1538,142 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
1574 free unformatted nodes which are pointed to by the cut 1538 free unformatted nodes which are pointed to by the cut
1575 pointers. */ 1539 pointers. */
1576 1540
1577 c_mode = 1541 mode =
1578 prepare_for_delete_or_cut(th, p_s_inode, p_s_path, 1542 prepare_for_delete_or_cut(th, inode, path,
1579 p_s_item_key, &n_removed, 1543 item_key, &removed,
1580 &n_cut_size, n_new_file_size); 1544 &cut_size, new_file_size);
1581 if (c_mode == M_CONVERT) { 1545 if (mode == M_CONVERT) {
1582 /* convert last unformatted node to direct item or leave 1546 /* convert last unformatted node to direct item or leave
1583 tail in the unformatted node */ 1547 tail in the unformatted node */
1584 RFALSE(n_ret_value != CARRY_ON, 1548 RFALSE(ret_value != CARRY_ON,
1585 "PAP-5570: can not convert twice"); 1549 "PAP-5570: can not convert twice");
1586 1550
1587 n_ret_value = 1551 ret_value =
1588 maybe_indirect_to_direct(th, p_s_inode, page, 1552 maybe_indirect_to_direct(th, inode, page,
1589 p_s_path, p_s_item_key, 1553 path, item_key,
1590 n_new_file_size, &c_mode); 1554 new_file_size, &mode);
1591 if (c_mode == M_SKIP_BALANCING) 1555 if (mode == M_SKIP_BALANCING)
1592 /* tail has been left in the unformatted node */ 1556 /* tail has been left in the unformatted node */
1593 return n_ret_value; 1557 return ret_value;
1594 1558
1595 n_is_inode_locked = 1; 1559 is_inode_locked = 1;
1596 1560
1597 /* removing of last unformatted node will change value we 1561 /* removing of last unformatted node will change value we
1598 have to return to truncate. Save it */ 1562 have to return to truncate. Save it */
1599 retval2 = n_ret_value; 1563 retval2 = ret_value;
1600 /*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1)); */ 1564 /*retval2 = sb->s_blocksize - (new_file_size & (sb->s_blocksize - 1)); */
1601 1565
1602 /* So, we have performed the first part of the conversion: 1566 /* So, we have performed the first part of the conversion:
1603 inserting the new direct item. Now we are removing the 1567 inserting the new direct item. Now we are removing the
1604 last unformatted node pointer. Set key to search for 1568 last unformatted node pointer. Set key to search for
1605 it. */ 1569 it. */
1606 set_cpu_key_k_type(p_s_item_key, TYPE_INDIRECT); 1570 set_cpu_key_k_type(item_key, TYPE_INDIRECT);
1607 p_s_item_key->key_length = 4; 1571 item_key->key_length = 4;
1608 n_new_file_size -= 1572 new_file_size -=
1609 (n_new_file_size & (p_s_sb->s_blocksize - 1)); 1573 (new_file_size & (sb->s_blocksize - 1));
1610 tail_pos = n_new_file_size; 1574 tail_pos = new_file_size;
1611 set_cpu_key_k_offset(p_s_item_key, n_new_file_size + 1); 1575 set_cpu_key_k_offset(item_key, new_file_size + 1);
1612 if (search_for_position_by_key 1576 if (search_for_position_by_key
1613 (p_s_sb, p_s_item_key, 1577 (sb, item_key,
1614 p_s_path) == POSITION_NOT_FOUND) { 1578 path) == POSITION_NOT_FOUND) {
1615 print_block(PATH_PLAST_BUFFER(p_s_path), 3, 1579 print_block(PATH_PLAST_BUFFER(path), 3,
1616 PATH_LAST_POSITION(p_s_path) - 1, 1580 PATH_LAST_POSITION(path) - 1,
1617 PATH_LAST_POSITION(p_s_path) + 1); 1581 PATH_LAST_POSITION(path) + 1);
1618 reiserfs_panic(p_s_sb, 1582 reiserfs_panic(sb, "PAP-5580", "item to "
1619 "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%K)", 1583 "convert does not exist (%K)",
1620 p_s_item_key); 1584 item_key);
1621 } 1585 }
1622 continue; 1586 continue;
1623 } 1587 }
1624 if (n_cut_size == 0) { 1588 if (cut_size == 0) {
1625 pathrelse(p_s_path); 1589 pathrelse(path);
1626 return 0; 1590 return 0;
1627 } 1591 }
1628 1592
1629 s_cut_balance.insert_size[0] = n_cut_size; 1593 s_cut_balance.insert_size[0] = cut_size;
1630 1594
1631 n_ret_value = fix_nodes(c_mode, &s_cut_balance, NULL, NULL); 1595 ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL);
1632 if (n_ret_value != REPEAT_SEARCH) 1596 if (ret_value != REPEAT_SEARCH)
1633 break; 1597 break;
1634 1598
1635 PROC_INFO_INC(p_s_sb, cut_from_item_restarted); 1599 PROC_INFO_INC(sb, cut_from_item_restarted);
1636 1600
1637 n_ret_value = 1601 ret_value =
1638 search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); 1602 search_for_position_by_key(sb, item_key, path);
1639 if (n_ret_value == POSITION_FOUND) 1603 if (ret_value == POSITION_FOUND)
1640 continue; 1604 continue;
1641 1605
1642 reiserfs_warning(p_s_sb, 1606 reiserfs_warning(sb, "PAP-5610", "item %K not found",
1643 "PAP-5610: reiserfs_cut_from_item: item %K not found", 1607 item_key);
1644 p_s_item_key);
1645 unfix_nodes(&s_cut_balance); 1608 unfix_nodes(&s_cut_balance);
1646 return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT; 1609 return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
1647 } /* while */ 1610 } /* while */
1648 1611
1649 // check fix_nodes results (IO_ERROR or NO_DISK_SPACE) 1612 // check fix_nodes results (IO_ERROR or NO_DISK_SPACE)
1650 if (n_ret_value != CARRY_ON) { 1613 if (ret_value != CARRY_ON) {
1651 if (n_is_inode_locked) { 1614 if (is_inode_locked) {
1652 // FIXME: this seems to be not needed: we are always able 1615 // FIXME: this seems to be not needed: we are always able
1653 // to cut item 1616 // to cut item
1654 indirect_to_direct_roll_back(th, p_s_inode, p_s_path); 1617 indirect_to_direct_roll_back(th, inode, path);
1655 } 1618 }
1656 if (n_ret_value == NO_DISK_SPACE) 1619 if (ret_value == NO_DISK_SPACE)
1657 reiserfs_warning(p_s_sb, "NO_DISK_SPACE"); 1620 reiserfs_warning(sb, "reiserfs-5092",
1621 "NO_DISK_SPACE");
1658 unfix_nodes(&s_cut_balance); 1622 unfix_nodes(&s_cut_balance);
1659 return -EIO; 1623 return -EIO;
1660 } 1624 }
1661 1625
1662 /* go ahead and perform balancing */ 1626 /* go ahead and perform balancing */
1663 1627
1664 RFALSE(c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode"); 1628 RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode");
1665 1629
1666 /* Calculate number of bytes that need to be cut from the item. */ 1630 /* Calculate number of bytes that need to be cut from the item. */
1667 quota_cut_bytes = 1631 quota_cut_bytes =
1668 (c_mode == 1632 (mode ==
1669 M_DELETE) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance. 1633 M_DELETE) ? ih_item_len(get_ih(path)) : -s_cut_balance.
1670 insert_size[0]; 1634 insert_size[0];
1671 if (retval2 == -1) 1635 if (retval2 == -1)
1672 n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode); 1636 ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
1673 else 1637 else
1674 n_ret_value = retval2; 1638 ret_value = retval2;
1675 1639
1676 /* For direct items, we only change the quota when deleting the last 1640 /* For direct items, we only change the quota when deleting the last
1677 ** item. 1641 ** item.
1678 */ 1642 */
1679 p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path); 1643 p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path);
1680 if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) { 1644 if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
1681 if (c_mode == M_DELETE && 1645 if (mode == M_DELETE &&
1682 (le_ih_k_offset(p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1646 (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
1683 1) { 1647 1) {
1684 // FIXME: this is to keep 3.5 happy 1648 // FIXME: this is to keep 3.5 happy
1685 REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX; 1649 REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
1686 quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE; 1650 quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
1687 } else { 1651 } else {
1688 quota_cut_bytes = 0; 1652 quota_cut_bytes = 0;
1689 } 1653 }
1690 } 1654 }
1691#ifdef CONFIG_REISERFS_CHECK 1655#ifdef CONFIG_REISERFS_CHECK
1692 if (n_is_inode_locked) { 1656 if (is_inode_locked) {
1693 struct item_head *le_ih = 1657 struct item_head *le_ih =
1694 PATH_PITEM_HEAD(s_cut_balance.tb_path); 1658 PATH_PITEM_HEAD(s_cut_balance.tb_path);
1695 /* we are going to complete indirect2direct conversion. Make 1659 /* we are going to complete indirect2direct conversion. Make
1696 sure, that we exactly remove last unformatted node pointer 1660 sure, that we exactly remove last unformatted node pointer
1697 of the item */ 1661 of the item */
1698 if (!is_indirect_le_ih(le_ih)) 1662 if (!is_indirect_le_ih(le_ih))
1699 reiserfs_panic(p_s_sb, 1663 reiserfs_panic(sb, "vs-5652",
1700 "vs-5652: reiserfs_cut_from_item: "
1701 "item must be indirect %h", le_ih); 1664 "item must be indirect %h", le_ih);
1702 1665
1703 if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE) 1666 if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
1704 reiserfs_panic(p_s_sb, 1667 reiserfs_panic(sb, "vs-5653", "completing "
1705 "vs-5653: reiserfs_cut_from_item: " 1668 "indirect2direct conversion indirect "
1706 "completing indirect2direct conversion indirect item %h " 1669 "item %h being deleted must be of "
1707 "being deleted must be of 4 byte long", 1670 "4 byte long", le_ih);
1708 le_ih);
1709 1671
1710 if (c_mode == M_CUT 1672 if (mode == M_CUT
1711 && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { 1673 && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
1712 reiserfs_panic(p_s_sb, 1674 reiserfs_panic(sb, "vs-5654", "can not complete "
1713 "vs-5654: reiserfs_cut_from_item: " 1675 "indirect2direct conversion of %h "
1714 "can not complete indirect2direct conversion of %h (CUT, insert_size==%d)", 1676 "(CUT, insert_size==%d)",
1715 le_ih, s_cut_balance.insert_size[0]); 1677 le_ih, s_cut_balance.insert_size[0]);
1716 } 1678 }
1717 /* it would be useful to make sure, that right neighboring 1679 /* it would be useful to make sure, that right neighboring
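
The M_CONVERT branch above is a two-phase edit: indirect2direct() first inserts the new direct item, then the loop comes back around to remove the last unformatted-node pointer, and if resources for that second half are denied, indirect_to_direct_roll_back() undoes the first. The generic shape of that commit-or-roll-back pattern, with do_phase1/do_phase2/undo_phase1 as hypothetical stand-ins:

#include <stdio.h>

static int do_phase1(void)    { puts("insert direct item");   return 0; }
static int do_phase2(int ok)  { puts("drop indirect pointer"); return ok ? 0 : -1; }
static void undo_phase1(void) { puts("roll back conversion"); }

static int convert(int phase2_ok)
{
        if (do_phase1())
                return -1;
        if (do_phase2(phase2_ok)) {
                /* resources for the second half were denied: undo */
                undo_phase1();
                return -1;
        }
        return 0;
}

int main(void)
{
        printf("result: %d\n\n", convert(1)); /* both phases succeed */
        printf("result: %d\n",   convert(0)); /* phase two fails, rolled back */
        return 0;
}
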
@@ -1719,23 +1681,23 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
1719 } 1681 }
1720#endif 1682#endif
1721 1683
1722 do_balance(&s_cut_balance, NULL, NULL, c_mode); 1684 do_balance(&s_cut_balance, NULL, NULL, mode);
1723 if (n_is_inode_locked) { 1685 if (is_inode_locked) {
1724 /* we've done an indirect->direct conversion. when the data block 1686 /* we've done an indirect->direct conversion. when the data block
1725 ** was freed, it was removed from the list of blocks that must 1687 ** was freed, it was removed from the list of blocks that must
1726 ** be flushed before the transaction commits, make sure to 1688 ** be flushed before the transaction commits, make sure to
1727 ** unmap and invalidate it 1689 ** unmap and invalidate it
1728 */ 1690 */
1729 unmap_buffers(page, tail_pos); 1691 unmap_buffers(page, tail_pos);
1730 REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask; 1692 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
1731 } 1693 }
1732#ifdef REISERQUOTA_DEBUG 1694#ifdef REISERQUOTA_DEBUG
1733 reiserfs_debug(p_s_inode->i_sb, REISERFS_DEBUG_CODE, 1695 reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
1734 "reiserquota cut_from_item(): freeing %u id=%u type=%c", 1696 "reiserquota cut_from_item(): freeing %u id=%u type=%c",
1735 quota_cut_bytes, p_s_inode->i_uid, '?'); 1697 quota_cut_bytes, inode->i_uid, '?');
1736#endif 1698#endif
1737 DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); 1699 vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
1738 return n_ret_value; 1700 return ret_value;
1739} 1701}
1740 1702
1741static void truncate_directory(struct reiserfs_transaction_handle *th, 1703static void truncate_directory(struct reiserfs_transaction_handle *th,
@@ -1743,8 +1705,7 @@ static void truncate_directory(struct reiserfs_transaction_handle *th,
1743{ 1705{
1744 BUG_ON(!th->t_trans_id); 1706 BUG_ON(!th->t_trans_id);
1745 if (inode->i_nlink) 1707 if (inode->i_nlink)
1746 reiserfs_warning(inode->i_sb, 1708 reiserfs_error(inode->i_sb, "vs-5655", "link count != 0");
1747 "vs-5655: truncate_directory: link count != 0");
1748 1709
1749 set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET); 1710 set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET);
1750 set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY); 1711 set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY);
@@ -1756,8 +1717,8 @@ static void truncate_directory(struct reiserfs_transaction_handle *th,
1756 1717
1757/* Truncate file to the new size. Note, this must be called with a transaction 1718/* Truncate file to the new size. Note, this must be called with a transaction
1758 already started */ 1719 already started */
1759int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, /* ->i_size contains new 1720int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
1760 size */ 1721 struct inode *inode, /* ->i_size contains new size */
1761 struct page *page, /* up to date for last block */ 1722 struct page *page, /* up to date for last block */
1762 int update_timestamps /* when it is called by 1723 int update_timestamps /* when it is called by
1763 file_release to convert 1724 file_release to convert
@@ -1768,47 +1729,45 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p
1768 INITIALIZE_PATH(s_search_path); /* Path to the current object item. */ 1729 INITIALIZE_PATH(s_search_path); /* Path to the current object item. */
1769 struct item_head *p_le_ih; /* Pointer to an item header. */ 1730 struct item_head *p_le_ih; /* Pointer to an item header. */
1770 struct cpu_key s_item_key; /* Key to search for a previous file item. */ 1731 struct cpu_key s_item_key; /* Key to search for a previous file item. */
1771 loff_t n_file_size, /* Old file size. */ 1732 loff_t file_size, /* Old file size. */
1772 n_new_file_size; /* New file size. */ 1733 new_file_size; /* New file size. */
1773 int n_deleted; /* Number of deleted or truncated bytes. */ 1734 int deleted; /* Number of deleted or truncated bytes. */
1774 int retval; 1735 int retval;
1775 int err = 0; 1736 int err = 0;
1776 1737
1777 BUG_ON(!th->t_trans_id); 1738 BUG_ON(!th->t_trans_id);
1778 if (! 1739 if (!
1779 (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) 1740 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
1780 || S_ISLNK(p_s_inode->i_mode))) 1741 || S_ISLNK(inode->i_mode)))
1781 return 0; 1742 return 0;
1782 1743
1783 if (S_ISDIR(p_s_inode->i_mode)) { 1744 if (S_ISDIR(inode->i_mode)) {
1784 // deletion of directory - no need to update timestamps 1745 // deletion of directory - no need to update timestamps
1785 truncate_directory(th, p_s_inode); 1746 truncate_directory(th, inode);
1786 return 0; 1747 return 0;
1787 } 1748 }
1788 1749
1789 /* Get new file size. */ 1750 /* Get new file size. */
1790 n_new_file_size = p_s_inode->i_size; 1751 new_file_size = inode->i_size;
1791 1752
1792 // FIXME: note, that key type is unimportant here 1753 // FIXME: note, that key type is unimportant here
1793 make_cpu_key(&s_item_key, p_s_inode, max_reiserfs_offset(p_s_inode), 1754 make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
1794 TYPE_DIRECT, 3); 1755 TYPE_DIRECT, 3);
1795 1756
1796 retval = 1757 retval =
1797 search_for_position_by_key(p_s_inode->i_sb, &s_item_key, 1758 search_for_position_by_key(inode->i_sb, &s_item_key,
1798 &s_search_path); 1759 &s_search_path);
1799 if (retval == IO_ERROR) { 1760 if (retval == IO_ERROR) {
1800 reiserfs_warning(p_s_inode->i_sb, 1761 reiserfs_error(inode->i_sb, "vs-5657",
1801 "vs-5657: reiserfs_do_truncate: " 1762 "i/o failure occurred trying to truncate %K",
1802 "i/o failure occurred trying to truncate %K", 1763 &s_item_key);
1803 &s_item_key);
1804 err = -EIO; 1764 err = -EIO;
1805 goto out; 1765 goto out;
1806 } 1766 }
1807 if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { 1767 if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
1808 reiserfs_warning(p_s_inode->i_sb, 1768 reiserfs_error(inode->i_sb, "PAP-5660",
1809 "PAP-5660: reiserfs_do_truncate: " 1769 "wrong result %d of search for %K", retval,
1810 "wrong result %d of search for %K", retval, 1770 &s_item_key);
1811 &s_item_key);
1812 1771
1813 err = -EIO; 1772 err = -EIO;
1814 goto out; 1773 goto out;
@@ -1819,56 +1778,56 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p
1819 /* Get real file size (total length of all file items) */ 1778 /* Get real file size (total length of all file items) */
1820 p_le_ih = PATH_PITEM_HEAD(&s_search_path); 1779 p_le_ih = PATH_PITEM_HEAD(&s_search_path);
1821 if (is_statdata_le_ih(p_le_ih)) 1780 if (is_statdata_le_ih(p_le_ih))
1822 n_file_size = 0; 1781 file_size = 0;
1823 else { 1782 else {
1824 loff_t offset = le_ih_k_offset(p_le_ih); 1783 loff_t offset = le_ih_k_offset(p_le_ih);
1825 int bytes = 1784 int bytes =
1826 op_bytes_number(p_le_ih, p_s_inode->i_sb->s_blocksize); 1785 op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
1827 1786
1828 /* this may mismatch with real file size: if last direct item 1787 /* this may mismatch with real file size: if last direct item
1829 had no padding zeros and last unformatted node had no free 1788 had no padding zeros and last unformatted node had no free
1830 space, this file would have this file size */ 1789 space, this file would have this file size */
1831 n_file_size = offset + bytes - 1; 1790 file_size = offset + bytes - 1;
1832 } 1791 }
1833 /* 1792 /*
1834 * are we doing a full truncate or delete, if so 1793 * are we doing a full truncate or delete, if so
1835 * kick in the reada code 1794 * kick in the reada code
1836 */ 1795 */
1837 if (n_new_file_size == 0) 1796 if (new_file_size == 0)
1838 s_search_path.reada = PATH_READA | PATH_READA_BACK; 1797 s_search_path.reada = PATH_READA | PATH_READA_BACK;
1839 1798
1840 if (n_file_size == 0 || n_file_size < n_new_file_size) { 1799 if (file_size == 0 || file_size < new_file_size) {
1841 goto update_and_out; 1800 goto update_and_out;
1842 } 1801 }
1843 1802
1844 /* Update key to search for the last file item. */ 1803 /* Update key to search for the last file item. */
1845 set_cpu_key_k_offset(&s_item_key, n_file_size); 1804 set_cpu_key_k_offset(&s_item_key, file_size);
1846 1805
1847 do { 1806 do {
1848 /* Cut or delete file item. */ 1807 /* Cut or delete file item. */
1849 n_deleted = 1808 deleted =
1850 reiserfs_cut_from_item(th, &s_search_path, &s_item_key, 1809 reiserfs_cut_from_item(th, &s_search_path, &s_item_key,
1851 p_s_inode, page, n_new_file_size); 1810 inode, page, new_file_size);
1852 if (n_deleted < 0) { 1811 if (deleted < 0) {
1853 reiserfs_warning(p_s_inode->i_sb, 1812 reiserfs_warning(inode->i_sb, "vs-5665",
1854 "vs-5665: reiserfs_do_truncate: reiserfs_cut_from_item failed"); 1813 "reiserfs_cut_from_item failed");
1855 reiserfs_check_path(&s_search_path); 1814 reiserfs_check_path(&s_search_path);
1856 return 0; 1815 return 0;
1857 } 1816 }
1858 1817
1859 RFALSE(n_deleted > n_file_size, 1818 RFALSE(deleted > file_size,
1860 "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K", 1819 "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
1861 n_deleted, n_file_size, &s_item_key); 1820 deleted, file_size, &s_item_key);
1862 1821
1863 /* Change key to search the last file item. */ 1822 /* Change key to search the last file item. */
1864 n_file_size -= n_deleted; 1823 file_size -= deleted;
1865 1824
1866 set_cpu_key_k_offset(&s_item_key, n_file_size); 1825 set_cpu_key_k_offset(&s_item_key, file_size);
1867 1826
1868 /* While there are bytes to truncate and previous file item is presented in the tree. */ 1827 /* While there are bytes to truncate and previous file item is presented in the tree. */
1869 1828
1870 /* 1829 /*
1871 ** This loop could take a really long time, and could log 1830 ** This loop could take a really long time, and could log
1872 ** many more blocks than a transaction can hold. So, we do a polite 1831 ** many more blocks than a transaction can hold. So, we do a polite
1873 ** journal end here, and if the transaction needs ending, we make 1832 ** journal end here, and if the transaction needs ending, we make
1874 ** sure the file is consistent before ending the current trans 1833 ** sure the file is consistent before ending the current trans
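
Since key offsets are 1-based, reiserfs_do_truncate() above reconstructs the old size from the last item as offset + bytes - 1 (with the caveat noted in the comment that missing padding can make this differ from the true i_size). A two-line check with illustrative numbers:

#include <stdio.h>

int main(void)
{
        long offset = 4097;     /* 1-based offset of the last item's first byte */
        long bytes  = 4096;     /* bytes covered by that item */

        /* last covered byte, i.e. the apparent file size */
        printf("file_size = %ld\n", offset + bytes - 1);  /* prints 8192 */
        return 0;
}
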
@@ -1877,37 +1836,38 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p
1877 if (journal_transaction_should_end(th, 0) || 1836 if (journal_transaction_should_end(th, 0) ||
1878 reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) { 1837 reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
1879 int orig_len_alloc = th->t_blocks_allocated; 1838 int orig_len_alloc = th->t_blocks_allocated;
1880 decrement_counters_in_path(&s_search_path); 1839 pathrelse(&s_search_path);
1881 1840
1882 if (update_timestamps) { 1841 if (update_timestamps) {
1883 p_s_inode->i_mtime = p_s_inode->i_ctime = 1842 inode->i_mtime = CURRENT_TIME_SEC;
1884 CURRENT_TIME_SEC; 1843 inode->i_ctime = CURRENT_TIME_SEC;
1885 } 1844 }
1886 reiserfs_update_sd(th, p_s_inode); 1845 reiserfs_update_sd(th, inode);
1887 1846
1888 err = journal_end(th, p_s_inode->i_sb, orig_len_alloc); 1847 err = journal_end(th, inode->i_sb, orig_len_alloc);
1889 if (err) 1848 if (err)
1890 goto out; 1849 goto out;
1891 err = journal_begin(th, p_s_inode->i_sb, 1850 err = journal_begin(th, inode->i_sb,
1892 JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ; 1851 JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ;
1893 if (err) 1852 if (err)
1894 goto out; 1853 goto out;
1895 reiserfs_update_inode_transaction(p_s_inode); 1854 reiserfs_update_inode_transaction(inode);
1896 } 1855 }
1897 } while (n_file_size > ROUND_UP(n_new_file_size) && 1856 } while (file_size > ROUND_UP(new_file_size) &&
1898 search_for_position_by_key(p_s_inode->i_sb, &s_item_key, 1857 search_for_position_by_key(inode->i_sb, &s_item_key,
1899 &s_search_path) == POSITION_FOUND); 1858 &s_search_path) == POSITION_FOUND);
1900 1859
1901 RFALSE(n_file_size > ROUND_UP(n_new_file_size), 1860 RFALSE(file_size > ROUND_UP(new_file_size),
1902 "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", 1861 "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d",
1903 n_new_file_size, n_file_size, s_item_key.on_disk_key.k_objectid); 1862 new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
1904 1863
1905 update_and_out: 1864 update_and_out:
1906 if (update_timestamps) { 1865 if (update_timestamps) {
1907 // this is truncate, not file closing 1866 // this is truncate, not file closing
1908 p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC; 1867 inode->i_mtime = CURRENT_TIME_SEC;
1868 inode->i_ctime = CURRENT_TIME_SEC;
1909 } 1869 }
1910 reiserfs_update_sd(th, p_s_inode); 1870 reiserfs_update_sd(th, inode);
1911 1871
1912 out: 1872 out:
1913 pathrelse(&s_search_path); 1873 pathrelse(&s_search_path);
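
The comment before this hunk explains the pattern: a long truncate could log more blocks than one transaction holds, so when journal_transaction_should_end() fires, the loop first writes a consistent stat item, then ends the transaction and opens a fresh one before continuing. A schematic userspace version of that polite transaction split; every function here is a stand-in:

#include <stdio.h>

#define BLOCKS_PER_TRANS 8

static int blocks_logged;

static int should_end(void)     { return blocks_logged >= BLOCKS_PER_TRANS; }
static void journal_end(void)   { puts("commit transaction"); blocks_logged = 0; }
static void journal_begin(void) { puts("open new transaction"); }
static void update_sd(void)     { puts("write consistent inode state"); }

int main(void)
{
        int work = 20;  /* pretend units of truncate work */

        journal_begin();
        while (work-- > 0) {
                blocks_logged++;        /* one balance step logs blocks */
                if (should_end()) {
                        update_sd();    /* file must look consistent first */
                        journal_end();
                        journal_begin();
                }
        }
        update_sd();
        journal_end();
        return 0;
}
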
@@ -1917,7 +1877,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p
1917#ifdef CONFIG_REISERFS_CHECK 1877#ifdef CONFIG_REISERFS_CHECK
1918// this makes sure, that we __append__, not overwrite or add holes 1878// this makes sure, that we __append__, not overwrite or add holes
1919static void check_research_for_paste(struct treepath *path, 1879static void check_research_for_paste(struct treepath *path,
1920 const struct cpu_key *p_s_key) 1880 const struct cpu_key *key)
1921{ 1881{
1922 struct item_head *found_ih = get_ih(path); 1882 struct item_head *found_ih = get_ih(path);
1923 1883
@@ -1925,36 +1885,36 @@ static void check_research_for_paste(struct treepath *path,
1925 if (le_ih_k_offset(found_ih) + 1885 if (le_ih_k_offset(found_ih) +
1926 op_bytes_number(found_ih, 1886 op_bytes_number(found_ih,
1927 get_last_bh(path)->b_size) != 1887 get_last_bh(path)->b_size) !=
1928 cpu_key_k_offset(p_s_key) 1888 cpu_key_k_offset(key)
1929 || op_bytes_number(found_ih, 1889 || op_bytes_number(found_ih,
1930 get_last_bh(path)->b_size) != 1890 get_last_bh(path)->b_size) !=
1931 pos_in_item(path)) 1891 pos_in_item(path))
1932 reiserfs_panic(NULL, 1892 reiserfs_panic(NULL, "PAP-5720", "found direct item "
1933 "PAP-5720: check_research_for_paste: " 1893 "%h or position (%d) does not match "
1934 "found direct item %h or position (%d) does not match to key %K", 1894 "to key %K", found_ih,
1935 found_ih, pos_in_item(path), p_s_key); 1895 pos_in_item(path), key);
1936 } 1896 }
1937 if (is_indirect_le_ih(found_ih)) { 1897 if (is_indirect_le_ih(found_ih)) {
1938 if (le_ih_k_offset(found_ih) + 1898 if (le_ih_k_offset(found_ih) +
1939 op_bytes_number(found_ih, 1899 op_bytes_number(found_ih,
1940 get_last_bh(path)->b_size) != 1900 get_last_bh(path)->b_size) !=
1941 cpu_key_k_offset(p_s_key) 1901 cpu_key_k_offset(key)
1942 || I_UNFM_NUM(found_ih) != pos_in_item(path) 1902 || I_UNFM_NUM(found_ih) != pos_in_item(path)
1943 || get_ih_free_space(found_ih) != 0) 1903 || get_ih_free_space(found_ih) != 0)
1944 reiserfs_panic(NULL, 1904 reiserfs_panic(NULL, "PAP-5730", "found indirect "
1945 "PAP-5730: check_research_for_paste: " 1905 "item (%h) or position (%d) does not "
1946 "found indirect item (%h) or position (%d) does not match to key (%K)", 1906 "match to key (%K)",
1947 found_ih, pos_in_item(path), p_s_key); 1907 found_ih, pos_in_item(path), key);
1948 } 1908 }
1949} 1909}
1950#endif /* config reiserfs check */ 1910#endif /* config reiserfs check */
1951 1911
1952/* Paste bytes to the existing item. Returns bytes number pasted into the item. */ 1912/* Paste bytes to the existing item. Returns bytes number pasted into the item. */
1953int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_search_path, /* Path to the pasted item. */ 1913int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *search_path, /* Path to the pasted item. */
1954 const struct cpu_key *p_s_key, /* Key to search for the needed item. */ 1914 const struct cpu_key *key, /* Key to search for the needed item. */
1955 struct inode *inode, /* Inode item belongs to */ 1915 struct inode *inode, /* Inode item belongs to */
1956 const char *p_c_body, /* Pointer to the bytes to paste. */ 1916 const char *body, /* Pointer to the bytes to paste. */
1957 int n_pasted_size) 1917 int pasted_size)
1958{ /* Size of pasted bytes. */ 1918{ /* Size of pasted bytes. */
1959 struct tree_balance s_paste_balance; 1919 struct tree_balance s_paste_balance;
1960 int retval; 1920 int retval;
@@ -1967,18 +1927,18 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
1967#ifdef REISERQUOTA_DEBUG 1927#ifdef REISERQUOTA_DEBUG
1968 reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, 1928 reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
1969 "reiserquota paste_into_item(): allocating %u id=%u type=%c", 1929 "reiserquota paste_into_item(): allocating %u id=%u type=%c",
1970 n_pasted_size, inode->i_uid, 1930 pasted_size, inode->i_uid,
1971 key2type(&(p_s_key->on_disk_key))); 1931 key2type(&(key->on_disk_key)));
1972#endif 1932#endif
1973 1933
1974 if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) { 1934 if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) {
1975 pathrelse(p_s_search_path); 1935 pathrelse(search_path);
1976 return -EDQUOT; 1936 return -EDQUOT;
1977 } 1937 }
1978 init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, 1938 init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
1979 n_pasted_size); 1939 pasted_size);
1980#ifdef DISPLACE_NEW_PACKING_LOCALITIES 1940#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1981 s_paste_balance.key = p_s_key->on_disk_key; 1941 s_paste_balance.key = key->on_disk_key;
1982#endif 1942#endif
1983 1943
1984 /* DQUOT_* can schedule, must check before the fix_nodes */ 1944 /* DQUOT_* can schedule, must check before the fix_nodes */
@@ -1988,33 +1948,33 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
1988 1948
1989 while ((retval = 1949 while ((retval =
1990 fix_nodes(M_PASTE, &s_paste_balance, NULL, 1950 fix_nodes(M_PASTE, &s_paste_balance, NULL,
1991 p_c_body)) == REPEAT_SEARCH) { 1951 body)) == REPEAT_SEARCH) {
1992 search_again: 1952 search_again:
1993 /* file system changed while we were in the fix_nodes */ 1953 /* file system changed while we were in the fix_nodes */
1994 PROC_INFO_INC(th->t_super, paste_into_item_restarted); 1954 PROC_INFO_INC(th->t_super, paste_into_item_restarted);
1995 retval = 1955 retval =
1996 search_for_position_by_key(th->t_super, p_s_key, 1956 search_for_position_by_key(th->t_super, key,
1997 p_s_search_path); 1957 search_path);
1998 if (retval == IO_ERROR) { 1958 if (retval == IO_ERROR) {
1999 retval = -EIO; 1959 retval = -EIO;
2000 goto error_out; 1960 goto error_out;
2001 } 1961 }
2002 if (retval == POSITION_FOUND) { 1962 if (retval == POSITION_FOUND) {
2003 reiserfs_warning(inode->i_sb, 1963 reiserfs_warning(inode->i_sb, "PAP-5710",
2004 "PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists", 1964 "entry or pasted byte (%K) exists",
2005 p_s_key); 1965 key);
2006 retval = -EEXIST; 1966 retval = -EEXIST;
2007 goto error_out; 1967 goto error_out;
2008 } 1968 }
2009#ifdef CONFIG_REISERFS_CHECK 1969#ifdef CONFIG_REISERFS_CHECK
2010 check_research_for_paste(p_s_search_path, p_s_key); 1970 check_research_for_paste(search_path, key);
2011#endif 1971#endif
2012 } 1972 }
2013 1973
2014 /* Perform balancing after all resources are collected by fix_nodes, and 1974 /* Perform balancing after all resources are collected by fix_nodes, and
2015 accessing them will not risk triggering schedule. */ 1975 accessing them will not risk triggering schedule. */
2016 if (retval == CARRY_ON) { 1976 if (retval == CARRY_ON) {
2017 do_balance(&s_paste_balance, NULL /*ih */ , p_c_body, M_PASTE); 1977 do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
2018 return 0; 1978 return 0;
2019 } 1979 }
2020 retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; 1980 retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
@@ -2024,18 +1984,24 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
2024#ifdef REISERQUOTA_DEBUG 1984#ifdef REISERQUOTA_DEBUG
2025 reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, 1985 reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
2026 "reiserquota paste_into_item(): freeing %u id=%u type=%c", 1986 "reiserquota paste_into_item(): freeing %u id=%u type=%c",
2027 n_pasted_size, inode->i_uid, 1987 pasted_size, inode->i_uid,
2028 key2type(&(p_s_key->on_disk_key))); 1988 key2type(&(key->on_disk_key)));
2029#endif 1989#endif
2030 DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size); 1990 vfs_dq_free_space_nodirty(inode, pasted_size);
2031 return retval; 1991 return retval;
2032} 1992}
2033 1993
2034/* Insert new item into the buffer at the path. */ 1994/* Insert new item into the buffer at the path.
2035int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath *p_s_path, /* Path to the inserteded item. */ 1995 * th - active transaction handle
2036 const struct cpu_key *key, struct item_head *p_s_ih, /* Pointer to the item header to insert. */ 1996 * path - path to the inserted item
2037 struct inode *inode, const char *p_c_body) 1997 * ih - pointer to the item header to insert
2038{ /* Pointer to the bytes to insert. */ 1998 * body - pointer to the bytes to insert
1999 */
2000int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2001 struct treepath *path, const struct cpu_key *key,
2002 struct item_head *ih, struct inode *inode,
2003 const char *body)
2004{
2039 struct tree_balance s_ins_balance; 2005 struct tree_balance s_ins_balance;
2040 int retval; 2006 int retval;
2041 int fs_gen = 0; 2007 int fs_gen = 0;
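
reiserfs_paste_into_item() above charges quota with vfs_dq_alloc_space_nodirty() before calling fix_nodes() (which may schedule), and every failure path refunds the charge with vfs_dq_free_space_nodirty(); only the success path keeps it. A compact sketch of that acquire-then-refund-on-error pairing, with dq_alloc/dq_free as stubs:

#include <stdio.h>

static long charged;

static int  dq_alloc(long n) { charged += n; return 0; /* 0 = within quota */ }
static void dq_free(long n)  { charged -= n; }

static int paste(long bytes, int balance_ok)
{
        if (dq_alloc(bytes))
                return -1;      /* over quota, nothing to undo */
        if (!balance_ok) {      /* fix_nodes/do_balance failed */
                dq_free(bytes); /* error path must refund the charge */
                return -1;
        }
        return 0;               /* success: the charge is kept */
}

int main(void)
{
        paste(512, 1);
        printf("after success: %ld bytes charged\n", charged);
        paste(512, 0);
        printf("after failure: %ld bytes charged\n", charged);
        return 0;
}
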
@@ -2045,28 +2011,27 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath
2045 2011
2046 if (inode) { /* Do we count quotas for item? */ 2012 if (inode) { /* Do we count quotas for item? */
2047 fs_gen = get_generation(inode->i_sb); 2013 fs_gen = get_generation(inode->i_sb);
2048 quota_bytes = ih_item_len(p_s_ih); 2014 quota_bytes = ih_item_len(ih);
2049 2015
2050 /* hack so the quota code doesn't have to guess if the file has 2016 /* hack so the quota code doesn't have to guess if the file has
2051 ** a tail, links are always tails, so there's no guessing needed 2017 ** a tail, links are always tails, so there's no guessing needed
2052 */ 2018 */
2053 if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_s_ih)) { 2019 if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
2054 quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE; 2020 quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
2055 }
2056#ifdef REISERQUOTA_DEBUG 2021#ifdef REISERQUOTA_DEBUG
2057 reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, 2022 reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
2058 "reiserquota insert_item(): allocating %u id=%u type=%c", 2023 "reiserquota insert_item(): allocating %u id=%u type=%c",
2059 quota_bytes, inode->i_uid, head2type(p_s_ih)); 2024 quota_bytes, inode->i_uid, head2type(ih));
2060#endif 2025#endif
2061 /* We can't dirty inode here. It would be immediately written but 2026 /* We can't dirty inode here. It would be immediately written but
2062 * appropriate stat item isn't inserted yet... */ 2027 * appropriate stat item isn't inserted yet... */
2063 if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) { 2028 if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) {
2064 pathrelse(p_s_path); 2029 pathrelse(path);
2065 return -EDQUOT; 2030 return -EDQUOT;
2066 } 2031 }
2067 } 2032 }
2068 init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, 2033 init_tb_struct(th, &s_ins_balance, th->t_super, path,
2069 IH_SIZE + ih_item_len(p_s_ih)); 2034 IH_SIZE + ih_item_len(ih));
2070#ifdef DISPLACE_NEW_PACKING_LOCALITIES 2035#ifdef DISPLACE_NEW_PACKING_LOCALITIES
2071 s_ins_balance.key = key->on_disk_key; 2036 s_ins_balance.key = key->on_disk_key;
2072#endif 2037#endif
@@ -2076,19 +2041,18 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath
2076 } 2041 }
2077 2042
2078 while ((retval = 2043 while ((retval =
2079 fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, 2044 fix_nodes(M_INSERT, &s_ins_balance, ih,
2080 p_c_body)) == REPEAT_SEARCH) { 2045 body)) == REPEAT_SEARCH) {
2081 search_again: 2046 search_again:
2082 /* file system changed while we were in the fix_nodes */ 2047 /* file system changed while we were in the fix_nodes */
2083 PROC_INFO_INC(th->t_super, insert_item_restarted); 2048 PROC_INFO_INC(th->t_super, insert_item_restarted);
2084 retval = search_item(th->t_super, key, p_s_path); 2049 retval = search_item(th->t_super, key, path);
2085 if (retval == IO_ERROR) { 2050 if (retval == IO_ERROR) {
2086 retval = -EIO; 2051 retval = -EIO;
2087 goto error_out; 2052 goto error_out;
2088 } 2053 }
2089 if (retval == ITEM_FOUND) { 2054 if (retval == ITEM_FOUND) {
2090 reiserfs_warning(th->t_super, 2055 reiserfs_warning(th->t_super, "PAP-5760",
2091 "PAP-5760: reiserfs_insert_item: "
2092 "key %K already exists in the tree", 2056 "key %K already exists in the tree",
2093 key); 2057 key);
2094 retval = -EEXIST; 2058 retval = -EEXIST;
@@ -2098,7 +2062,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath
2098 2062
2099 /* make balancing after all resources will be collected at a time */ 2063 /* make balancing after all resources will be collected at a time */
2100 if (retval == CARRY_ON) { 2064 if (retval == CARRY_ON) {
2101 do_balance(&s_ins_balance, p_s_ih, p_c_body, M_INSERT); 2065 do_balance(&s_ins_balance, ih, body, M_INSERT);
2102 return 0; 2066 return 0;
2103 } 2067 }
2104 2068
@@ -2109,9 +2073,9 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath
2109#ifdef REISERQUOTA_DEBUG 2073#ifdef REISERQUOTA_DEBUG
2110 reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, 2074 reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
2111 "reiserquota insert_item(): freeing %u id=%u type=%c", 2075 "reiserquota insert_item(): freeing %u id=%u type=%c",
2112 quota_bytes, inode->i_uid, head2type(p_s_ih)); 2076 quota_bytes, inode->i_uid, head2type(ih));
2113#endif 2077#endif
2114 if (inode) 2078 if (inode)
2115 DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes); 2079 vfs_dq_free_space_nodirty(inode, quota_bytes);
2116 return retval; 2080 return retval;
2117} 2081}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f3c820b75829..0ae6486d9046 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -27,6 +27,7 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/crc32.h>
30 31
31struct file_system_type reiserfs_fs_type; 32struct file_system_type reiserfs_fs_type;
32 33
@@ -183,9 +184,9 @@ static int finish_unfinished(struct super_block *s)
183 if (REISERFS_SB(s)->s_qf_names[i]) { 184 if (REISERFS_SB(s)->s_qf_names[i]) {
184 int ret = reiserfs_quota_on_mount(s, i); 185 int ret = reiserfs_quota_on_mount(s, i);
185 if (ret < 0) 186 if (ret < 0)
186 reiserfs_warning(s, 187 reiserfs_warning(s, "reiserfs-2500",
187 "reiserfs: cannot turn on journaled quota: error %d", 188 "cannot turn on journaled "
188 ret); 189 "quota: error %d", ret);
189 } 190 }
190 } 191 }
191#endif 192#endif
@@ -195,17 +196,16 @@ static int finish_unfinished(struct super_block *s)
195 while (!retval) { 196 while (!retval) {
196 retval = search_item(s, &max_cpu_key, &path); 197 retval = search_item(s, &max_cpu_key, &path);
197 if (retval != ITEM_NOT_FOUND) { 198 if (retval != ITEM_NOT_FOUND) {
198 reiserfs_warning(s, 199 reiserfs_error(s, "vs-2140",
199 "vs-2140: finish_unfinished: search_by_key returned %d", 200 "search_by_key returned %d", retval);
200 retval);
201 break; 201 break;
202 } 202 }
203 203
204 bh = get_last_bh(&path); 204 bh = get_last_bh(&path);
205 item_pos = get_item_pos(&path); 205 item_pos = get_item_pos(&path);
206 if (item_pos != B_NR_ITEMS(bh)) { 206 if (item_pos != B_NR_ITEMS(bh)) {
207 reiserfs_warning(s, 207 reiserfs_warning(s, "vs-2060",
208 "vs-2060: finish_unfinished: wrong position found"); 208 "wrong position found");
209 break; 209 break;
210 } 210 }
211 item_pos--; 211 item_pos--;
@@ -235,8 +235,7 @@ static int finish_unfinished(struct super_block *s)
235 if (!inode) { 235 if (!inode) {
236 /* the unlink almost completed, it just did not manage to remove 236 /* the unlink almost completed, it just did not manage to remove
237 "save" link and release objectid */ 237 "save" link and release objectid */
238 reiserfs_warning(s, 238 reiserfs_warning(s, "vs-2180", "iget failed for %K",
239 "vs-2180: finish_unfinished: iget failed for %K",
240 &obj_key); 239 &obj_key);
241 retval = remove_save_link_only(s, &save_link_key, 1); 240 retval = remove_save_link_only(s, &save_link_key, 1);
242 continue; 241 continue;
@@ -244,21 +243,22 @@ static int finish_unfinished(struct super_block *s)
244 243
245 if (!truncate && inode->i_nlink) { 244 if (!truncate && inode->i_nlink) {
246 /* file is not unlinked */ 245 /* file is not unlinked */
247 reiserfs_warning(s, 246 reiserfs_warning(s, "vs-2185",
248 "vs-2185: finish_unfinished: file %K is not unlinked", 247 "file %K is not unlinked",
249 &obj_key); 248 &obj_key);
250 retval = remove_save_link_only(s, &save_link_key, 0); 249 retval = remove_save_link_only(s, &save_link_key, 0);
251 continue; 250 continue;
252 } 251 }
253 DQUOT_INIT(inode); 252 vfs_dq_init(inode);
254 253
255 if (truncate && S_ISDIR(inode->i_mode)) { 254 if (truncate && S_ISDIR(inode->i_mode)) {
256 /* We got a truncate request for a dir which is impossible. 255 /* We got a truncate request for a dir which is impossible.
257 The only imaginable way is to execute unfinished truncate request 256 The only imaginable way is to execute unfinished truncate request
258 then boot into old kernel, remove the file and create dir with 257 then boot into old kernel, remove the file and create dir with
259 the same key. */ 258 the same key. */
260 reiserfs_warning(s, 259 reiserfs_warning(s, "green-2101",
261 "green-2101: impossible truncate on a directory %k. Please report", 260 "impossible truncate on a "
261 "directory %k. Please report",
262 INODE_PKEY(inode)); 262 INODE_PKEY(inode));
263 retval = remove_save_link_only(s, &save_link_key, 0); 263 retval = remove_save_link_only(s, &save_link_key, 0);
264 truncate = 0; 264 truncate = 0;
@@ -288,9 +288,10 @@ static int finish_unfinished(struct super_block *s)
288 /* removal gets completed in iput */ 288 /* removal gets completed in iput */
289 retval = 0; 289 retval = 0;
290 } else { 290 } else {
291 reiserfs_warning(s, "Dead loop in " 291 reiserfs_warning(s, "super-2189", "Dead loop "
292 "finish_unfinished detected, " 292 "in finish_unfinished "
293 "just remove save link\n"); 293 "detected, just remove "
294 "save link\n");
294 retval = remove_save_link_only(s, 295 retval = remove_save_link_only(s,
295 &save_link_key, 0); 296 &save_link_key, 0);
296 } 297 }
@@ -360,8 +361,9 @@ void add_save_link(struct reiserfs_transaction_handle *th,
360 } else { 361 } else {
361 /* truncate */ 362 /* truncate */
362 if (S_ISDIR(inode->i_mode)) 363 if (S_ISDIR(inode->i_mode))
363 reiserfs_warning(inode->i_sb, 364 reiserfs_warning(inode->i_sb, "green-2102",
364 "green-2102: Adding a truncate savelink for a directory %k! Please report", 365 "Adding a truncate savelink for "
366 "a directory %k! Please report",
365 INODE_PKEY(inode)); 367 INODE_PKEY(inode));
366 set_cpu_key_k_offset(&key, 1); 368 set_cpu_key_k_offset(&key, 1);
367 set_cpu_key_k_type(&key, TYPE_INDIRECT); 369 set_cpu_key_k_type(&key, TYPE_INDIRECT);
@@ -376,9 +378,9 @@ void add_save_link(struct reiserfs_transaction_handle *th,
376 retval = search_item(inode->i_sb, &key, &path); 378 retval = search_item(inode->i_sb, &key, &path);
377 if (retval != ITEM_NOT_FOUND) { 379 if (retval != ITEM_NOT_FOUND) {
378 if (retval != -ENOSPC) 380 if (retval != -ENOSPC)
379 reiserfs_warning(inode->i_sb, "vs-2100: add_save_link:" 381 reiserfs_error(inode->i_sb, "vs-2100",
380 "search_by_key (%K) returned %d", &key, 382 "search_by_key (%K) returned %d", &key,
381 retval); 383 retval);
382 pathrelse(&path); 384 pathrelse(&path);
383 return; 385 return;
384 } 386 }
@@ -391,9 +393,8 @@ void add_save_link(struct reiserfs_transaction_handle *th,
391 reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link); 393 reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link);
392 if (retval) { 394 if (retval) {
393 if (retval != -ENOSPC) 395 if (retval != -ENOSPC)
394 reiserfs_warning(inode->i_sb, 396 reiserfs_error(inode->i_sb, "vs-2120",
395 "vs-2120: add_save_link: insert_item returned %d", 397 "insert_item returned %d", retval);
396 retval);
397 } else { 398 } else {
398 if (truncate) 399 if (truncate)
399 REISERFS_I(inode)->i_flags |= 400 REISERFS_I(inode)->i_flags |=
@@ -492,8 +493,7 @@ static void reiserfs_put_super(struct super_block *s)
492 print_statistics(s); 493 print_statistics(s);
493 494
494 if (REISERFS_SB(s)->reserved_blocks != 0) { 495 if (REISERFS_SB(s)->reserved_blocks != 0) {
495 reiserfs_warning(s, 496 reiserfs_warning(s, "green-2005", "reserved blocks left %d",
496 "green-2005: reiserfs_put_super: reserved blocks left %d",
497 REISERFS_SB(s)->reserved_blocks); 497 REISERFS_SB(s)->reserved_blocks);
498 } 498 }
499 499
@@ -559,8 +559,8 @@ static void reiserfs_dirty_inode(struct inode *inode)
559 559
560 int err = 0; 560 int err = 0;
561 if (inode->i_sb->s_flags & MS_RDONLY) { 561 if (inode->i_sb->s_flags & MS_RDONLY) {
562 reiserfs_warning(inode->i_sb, 562 reiserfs_warning(inode->i_sb, "clm-6006",
563 "clm-6006: writing inode %lu on readonly FS", 563 "writing inode %lu on readonly FS",
564 inode->i_ino); 564 inode->i_ino);
565 return; 565 return;
566 } 566 }
@@ -629,8 +629,6 @@ static const struct super_operations reiserfs_sops = {
629#ifdef CONFIG_QUOTA 629#ifdef CONFIG_QUOTA
630#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") 630#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
631 631
632static int reiserfs_dquot_initialize(struct inode *, int);
633static int reiserfs_dquot_drop(struct inode *);
634static int reiserfs_write_dquot(struct dquot *); 632static int reiserfs_write_dquot(struct dquot *);
635static int reiserfs_acquire_dquot(struct dquot *); 633static int reiserfs_acquire_dquot(struct dquot *);
636static int reiserfs_release_dquot(struct dquot *); 634static int reiserfs_release_dquot(struct dquot *);
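
This hunk drops reiserfs's private dquot initialize/drop wrappers and points the dquot_operations table straight at the generic dquot_initialize/dquot_drop. A toy version of such a function-pointer ops table, using local stand-in types rather than the real struct dquot_operations:

#include <stdio.h>

struct qops {
        void (*initialize)(void);
        void (*drop)(void);
};

static void generic_initialize(void) { puts("generic init"); }
static void generic_drop(void)       { puts("generic drop"); }

/* before the patch these slots pointed at fs-private wrappers that
 * only called the generic helpers; now they point at them directly */
static struct qops quota_ops = {
        .initialize = generic_initialize,
        .drop       = generic_drop,
};

int main(void)
{
        quota_ops.initialize();
        quota_ops.drop();
        return 0;
}
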
@@ -639,8 +637,8 @@ static int reiserfs_write_info(struct super_block *, int);
639static int reiserfs_quota_on(struct super_block *, int, int, char *, int); 637static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
640 638
641static struct dquot_operations reiserfs_quota_operations = { 639static struct dquot_operations reiserfs_quota_operations = {
642 .initialize = reiserfs_dquot_initialize, 640 .initialize = dquot_initialize,
643 .drop = reiserfs_dquot_drop, 641 .drop = dquot_drop,
644 .alloc_space = dquot_alloc_space, 642 .alloc_space = dquot_alloc_space,
645 .alloc_inode = dquot_alloc_inode, 643 .alloc_inode = dquot_alloc_inode,
646 .free_space = dquot_free_space, 644 .free_space = dquot_free_space,
@@ -759,7 +757,7 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
759 char **opt_arg, unsigned long *bit_flags) 757 char **opt_arg, unsigned long *bit_flags)
760{ 758{
761 char *p; 759 char *p;
762 /* foo=bar, 760 /* foo=bar,
763 ^ ^ ^ 761 ^ ^ ^
764 | | +-- option_end 762 | | +-- option_end
765 | +-- arg_start 763 | +-- arg_start
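Note: the diagram in the hunk above shows how reiserfs_getopt() carves one name=value pair out of a comma-separated option string, handing back the option name, its argument, and an advanced cursor. A minimal userspace sketch of that contract, as an illustration only and not the kernel routine:

#include <stdio.h>
#include <string.h>

/* Split the next "name=value" out of a comma-separated string,
 * NUL-terminating in place (option_end), splitting name from
 * argument (arg_start), and advancing *cur past the comma. */
static char *next_opt(char **cur, char **arg)
{
    char *p = *cur;
    char *end = strchr(p, ',');

    if (end)
        *end++ = '\0';          /* option_end: terminate this option   */
    *cur = end ? end : p + strlen(p);

    *arg = strchr(p, '=');
    if (*arg)
        *(*arg)++ = '\0';       /* arg_start: split name from argument */
    return *p ? p : NULL;       /* option_start                        */
}

int main(void)
{
    char opts[] = "resize=1024,commit=5,noattrs";
    char *cur = opts, *name, *arg;

    while ((name = next_opt(&cur, &arg)))
        printf("option '%s' arg '%s'\n", name, arg ? arg : "(none)");
    return 0;
}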
@@ -794,13 +792,15 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
794 if (bit_flags) { 792 if (bit_flags) {
795 if (opt->clrmask == 793 if (opt->clrmask ==
796 (1 << REISERFS_UNSUPPORTED_OPT)) 794 (1 << REISERFS_UNSUPPORTED_OPT))
797 reiserfs_warning(s, "%s not supported.", 795 reiserfs_warning(s, "super-6500",
796 "%s not supported.\n",
798 p); 797 p);
799 else 798 else
800 *bit_flags &= ~opt->clrmask; 799 *bit_flags &= ~opt->clrmask;
801 if (opt->setmask == 800 if (opt->setmask ==
802 (1 << REISERFS_UNSUPPORTED_OPT)) 801 (1 << REISERFS_UNSUPPORTED_OPT))
803 reiserfs_warning(s, "%s not supported.", 802 reiserfs_warning(s, "super-6501",
803 "%s not supported.\n",
804 p); 804 p);
805 else 805 else
806 *bit_flags |= opt->setmask; 806 *bit_flags |= opt->setmask;
@@ -809,7 +809,8 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
809 } 809 }
810 } 810 }
811 if (!opt->option_name) { 811 if (!opt->option_name) {
812 reiserfs_warning(s, "unknown mount option \"%s\"", p); 812 reiserfs_warning(s, "super-6502",
813 "unknown mount option \"%s\"", p);
813 return -1; 814 return -1;
814 } 815 }
815 816
@@ -817,8 +818,9 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
817 switch (*p) { 818 switch (*p) {
818 case '=': 819 case '=':
819 if (!opt->arg_required) { 820 if (!opt->arg_required) {
820 reiserfs_warning(s, 821 reiserfs_warning(s, "super-6503",
821 "the option \"%s\" does not require an argument", 822 "the option \"%s\" does not "
823 "require an argument\n",
822 opt->option_name); 824 opt->option_name);
823 return -1; 825 return -1;
824 } 826 }
@@ -826,14 +828,15 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
826 828
827 case 0: 829 case 0:
828 if (opt->arg_required) { 830 if (opt->arg_required) {
829 reiserfs_warning(s, 831 reiserfs_warning(s, "super-6504",
830 "the option \"%s\" requires an argument", 832 "the option \"%s\" requires an "
831 opt->option_name); 833 "argument\n", opt->option_name);
832 return -1; 834 return -1;
833 } 835 }
834 break; 836 break;
835 default: 837 default:
836 reiserfs_warning(s, "head of option \"%s\" is only correct", 838 reiserfs_warning(s, "super-6505",
839 "head of option \"%s\" is only correct\n",
837 opt->option_name); 840 opt->option_name);
838 return -1; 841 return -1;
839 } 842 }
@@ -845,7 +848,8 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
845 && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY)) 848 && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY))
846 && !strlen(p)) { 849 && !strlen(p)) {
847 /* this catches "option=," if not allowed */ 850 /* this catches "option=," if not allowed */
848 reiserfs_warning(s, "empty argument for \"%s\"", 851 reiserfs_warning(s, "super-6506",
852 "empty argument for \"%s\"\n",
849 opt->option_name); 853 opt->option_name);
850 return -1; 854 return -1;
851 } 855 }
@@ -867,7 +871,8 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
867 } 871 }
868 } 872 }
869 873
870 reiserfs_warning(s, "bad value \"%s\" for option \"%s\"", p, 874 reiserfs_warning(s, "super-6506",
875 "bad value \"%s\" for option \"%s\"\n", p,
871 opt->option_name); 876 opt->option_name);
872 return -1; 877 return -1;
873} 878}
@@ -957,9 +962,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
957 *blocks = simple_strtoul(arg, &p, 0); 962 *blocks = simple_strtoul(arg, &p, 0);
958 if (*p != '\0') { 963 if (*p != '\0') {
959 /* NNN does not look like a number */ 964 /* NNN does not look like a number */
960 reiserfs_warning(s, 965 reiserfs_warning(s, "super-6507",
961 "reiserfs_parse_options: bad value %s", 966 "bad value %s for "
962 arg); 967 "-oresize\n", arg);
963 return 0; 968 return 0;
964 } 969 }
965 } 970 }
@@ -970,8 +975,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
970 unsigned long val = simple_strtoul(arg, &p, 0); 975 unsigned long val = simple_strtoul(arg, &p, 0);
971 /* commit=NNN (time in seconds) */ 976 /* commit=NNN (time in seconds) */
972 if (*p != '\0' || val >= (unsigned int)-1) { 977 if (*p != '\0' || val >= (unsigned int)-1) {
973 reiserfs_warning(s, 978 reiserfs_warning(s, "super-6508",
974 "reiserfs_parse_options: bad value %s", 979 "bad value %s for -ocommit\n",
975 arg); 980 arg);
976 return 0; 981 return 0;
977 } 982 }
@@ -979,16 +984,18 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
979 } 984 }
980 985
981 if (c == 'w') { 986 if (c == 'w') {
982 reiserfs_warning(s, "reiserfs: nolargeio option is no longer supported"); 987 reiserfs_warning(s, "super-6509", "nolargeio option "
988 "is no longer supported");
983 return 0; 989 return 0;
984 } 990 }
985 991
986 if (c == 'j') { 992 if (c == 'j') {
987 if (arg && *arg && jdev_name) { 993 if (arg && *arg && jdev_name) {
988 if (*jdev_name) { //Hm, already assigned? 994 if (*jdev_name) { //Hm, already assigned?
989 reiserfs_warning(s, 995 reiserfs_warning(s, "super-6510",
990 "reiserfs_parse_options: journal device was already specified to be %s", 996 "journal device was "
991 *jdev_name); 997 "already specified to "
998 "be %s", *jdev_name);
992 return 0; 999 return 0;
993 } 1000 }
994 *jdev_name = arg; 1001 *jdev_name = arg;
@@ -1000,29 +1007,35 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
1000 1007
1001 if (sb_any_quota_loaded(s) && 1008 if (sb_any_quota_loaded(s) &&
1002 (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { 1009 (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
1003 reiserfs_warning(s, 1010 reiserfs_warning(s, "super-6511",
1004 "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); 1011 "cannot change journaled "
1012 "quota options when quota "
1013 "turned on.");
1005 return 0; 1014 return 0;
1006 } 1015 }
1007 if (*arg) { /* Some filename specified? */ 1016 if (*arg) { /* Some filename specified? */
1008 if (REISERFS_SB(s)->s_qf_names[qtype] 1017 if (REISERFS_SB(s)->s_qf_names[qtype]
1009 && strcmp(REISERFS_SB(s)->s_qf_names[qtype], 1018 && strcmp(REISERFS_SB(s)->s_qf_names[qtype],
1010 arg)) { 1019 arg)) {
1011 reiserfs_warning(s, 1020 reiserfs_warning(s, "super-6512",
1012 "reiserfs_parse_options: %s quota file already specified.", 1021 "%s quota file "
1022 "already specified.",
1013 QTYPE2NAME(qtype)); 1023 QTYPE2NAME(qtype));
1014 return 0; 1024 return 0;
1015 } 1025 }
1016 if (strchr(arg, '/')) { 1026 if (strchr(arg, '/')) {
1017 reiserfs_warning(s, 1027 reiserfs_warning(s, "super-6513",
1018 "reiserfs_parse_options: quotafile must be on filesystem root."); 1028 "quotafile must be "
1029 "on filesystem root.");
1019 return 0; 1030 return 0;
1020 } 1031 }
1021 qf_names[qtype] = 1032 qf_names[qtype] =
1022 kmalloc(strlen(arg) + 1, GFP_KERNEL); 1033 kmalloc(strlen(arg) + 1, GFP_KERNEL);
1023 if (!qf_names[qtype]) { 1034 if (!qf_names[qtype]) {
1024 reiserfs_warning(s, 1035 reiserfs_warning(s, "reiserfs-2502",
1025 "reiserfs_parse_options: not enough memory for storing quotafile name."); 1036 "not enough memory "
1037 "for storing "
1038 "quotafile name.");
1026 return 0; 1039 return 0;
1027 } 1040 }
1028 strcpy(qf_names[qtype], arg); 1041 strcpy(qf_names[qtype], arg);
@@ -1040,21 +1053,24 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
1040 else if (!strcmp(arg, "vfsv0")) 1053 else if (!strcmp(arg, "vfsv0"))
1041 *qfmt = QFMT_VFS_V0; 1054 *qfmt = QFMT_VFS_V0;
1042 else { 1055 else {
1043 reiserfs_warning(s, 1056 reiserfs_warning(s, "super-6514",
1044 "reiserfs_parse_options: unknown quota format specified."); 1057 "unknown quota format "
1058 "specified.");
1045 return 0; 1059 return 0;
1046 } 1060 }
1047 if (sb_any_quota_loaded(s) && 1061 if (sb_any_quota_loaded(s) &&
1048 *qfmt != REISERFS_SB(s)->s_jquota_fmt) { 1062 *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
1049 reiserfs_warning(s, 1063 reiserfs_warning(s, "super-6515",
1050 "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); 1064 "cannot change journaled "
1065 "quota options when quota "
1066 "turned on.");
1051 return 0; 1067 return 0;
1052 } 1068 }
1053 } 1069 }
1054#else 1070#else
1055 if (c == 'u' || c == 'g' || c == 'f') { 1071 if (c == 'u' || c == 'g' || c == 'f') {
1056 reiserfs_warning(s, 1072 reiserfs_warning(s, "reiserfs-2503", "journaled "
1057 "reiserfs_parse_options: journaled quota options not supported."); 1073 "quota options not supported.");
1058 return 0; 1074 return 0;
1059 } 1075 }
1060#endif 1076#endif
@@ -1063,15 +1079,15 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
1063#ifdef CONFIG_QUOTA 1079#ifdef CONFIG_QUOTA
1064 if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt 1080 if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt
1065 && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) { 1081 && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) {
1066 reiserfs_warning(s, 1082 reiserfs_warning(s, "super-6515",
1067 "reiserfs_parse_options: journaled quota format not specified."); 1083 "journaled quota format not specified.");
1068 return 0; 1084 return 0;
1069 } 1085 }
1070 /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ 1086 /* This checking is not precise wrt the quota type but for our purposes it is sufficient */
1071 if (!(*mount_options & (1 << REISERFS_QUOTA)) 1087 if (!(*mount_options & (1 << REISERFS_QUOTA))
1072 && sb_any_quota_loaded(s)) { 1088 && sb_any_quota_loaded(s)) {
1073 reiserfs_warning(s, 1089 reiserfs_warning(s, "super-6516", "quota options must "
1074 "reiserfs_parse_options: quota options must be present when quota is turned on."); 1090 "be present when quota is turned on.");
1075 return 0; 1091 return 0;
1076 } 1092 }
1077#endif 1093#endif
@@ -1131,14 +1147,15 @@ static void handle_attrs(struct super_block *s)
1131 1147
1132 if (reiserfs_attrs(s)) { 1148 if (reiserfs_attrs(s)) {
1133 if (old_format_only(s)) { 1149 if (old_format_only(s)) {
1134 reiserfs_warning(s, 1150 reiserfs_warning(s, "super-6517", "cannot support "
1135 "reiserfs: cannot support attributes on 3.5.x disk format"); 1151 "attributes on 3.5.x disk format");
1136 REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); 1152 REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
1137 return; 1153 return;
1138 } 1154 }
1139 if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) { 1155 if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) {
1140 reiserfs_warning(s, 1156 reiserfs_warning(s, "super-6518", "cannot support "
1141 "reiserfs: cannot support attributes until flag is set in super-block"); 1157 "attributes until flag is set in "
1158 "super-block");
1142 REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); 1159 REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
1143 } 1160 }
1144 } 1161 }
@@ -1280,6 +1297,8 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1280 REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); 1297 REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
1281 s->s_flags &= ~MS_RDONLY; 1298 s->s_flags &= ~MS_RDONLY;
1282 set_sb_umount_state(rs, REISERFS_ERROR_FS); 1299 set_sb_umount_state(rs, REISERFS_ERROR_FS);
1300 if (!old_format_only(s))
1301 set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
1283 /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ 1302 /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
1284 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); 1303 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
1285 REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS; 1304 REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS;
@@ -1314,7 +1333,7 @@ static int read_super_block(struct super_block *s, int offset)
1314 1333
1315 bh = sb_bread(s, offset / s->s_blocksize); 1334 bh = sb_bread(s, offset / s->s_blocksize);
1316 if (!bh) { 1335 if (!bh) {
1317 reiserfs_warning(s, "sh-2006: read_super_block: " 1336 reiserfs_warning(s, "sh-2006",
1318 "bread failed (dev %s, block %lu, size %lu)", 1337 "bread failed (dev %s, block %lu, size %lu)",
1319 reiserfs_bdevname(s), offset / s->s_blocksize, 1338 reiserfs_bdevname(s), offset / s->s_blocksize,
1320 s->s_blocksize); 1339 s->s_blocksize);
@@ -1328,15 +1347,15 @@ static int read_super_block(struct super_block *s, int offset)
1328 } 1347 }
1329 // 1348 //
1330 // ok, reiserfs signature (old or new) found at the given offset 1349 // ok, reiserfs signature (old or new) found at the given offset
1331 // 1350 //
1332 fs_blocksize = sb_blocksize(rs); 1351 fs_blocksize = sb_blocksize(rs);
1333 brelse(bh); 1352 brelse(bh);
1334 sb_set_blocksize(s, fs_blocksize); 1353 sb_set_blocksize(s, fs_blocksize);
1335 1354
1336 bh = sb_bread(s, offset / s->s_blocksize); 1355 bh = sb_bread(s, offset / s->s_blocksize);
1337 if (!bh) { 1356 if (!bh) {
1338 reiserfs_warning(s, "sh-2007: read_super_block: " 1357 reiserfs_warning(s, "sh-2007",
1339 "bread failed (dev %s, block %lu, size %lu)\n", 1358 "bread failed (dev %s, block %lu, size %lu)",
1340 reiserfs_bdevname(s), offset / s->s_blocksize, 1359 reiserfs_bdevname(s), offset / s->s_blocksize,
1341 s->s_blocksize); 1360 s->s_blocksize);
1342 return 1; 1361 return 1;
@@ -1344,8 +1363,8 @@ static int read_super_block(struct super_block *s, int offset)
1344 1363
1345 rs = (struct reiserfs_super_block *)bh->b_data; 1364 rs = (struct reiserfs_super_block *)bh->b_data;
1346 if (sb_blocksize(rs) != s->s_blocksize) { 1365 if (sb_blocksize(rs) != s->s_blocksize) {
1347 reiserfs_warning(s, "sh-2011: read_super_block: " 1366 reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
1348 "can't find a reiserfs filesystem on (dev %s, block %Lu, size %lu)\n", 1367 "filesystem on (dev %s, block %Lu, size %lu)",
1349 reiserfs_bdevname(s), 1368 reiserfs_bdevname(s),
1350 (unsigned long long)bh->b_blocknr, 1369 (unsigned long long)bh->b_blocknr,
1351 s->s_blocksize); 1370 s->s_blocksize);
@@ -1355,9 +1374,10 @@ static int read_super_block(struct super_block *s, int offset)
1355 1374
1356 if (rs->s_v1.s_root_block == cpu_to_le32(-1)) { 1375 if (rs->s_v1.s_root_block == cpu_to_le32(-1)) {
1357 brelse(bh); 1376 brelse(bh);
1358 reiserfs_warning(s, 1377 reiserfs_warning(s, "super-6519", "Unfinished reiserfsck "
1359 "Unfinished reiserfsck --rebuild-tree run detected. Please run\n" 1378 "--rebuild-tree run detected. Please run\n"
1360 "reiserfsck --rebuild-tree and wait for a completion. If that fails\n" 1379 "reiserfsck --rebuild-tree and wait for a "
1380 "completion. If that fails\n"
1361 "get newer reiserfsprogs package"); 1381 "get newer reiserfsprogs package");
1362 return 1; 1382 return 1;
1363 } 1383 }
@@ -1369,18 +1389,15 @@ static int read_super_block(struct super_block *s, int offset)
1369 /* magic is that of a filesystem with a non-standard journal; look at s_version to 1389 /* magic is that of a filesystem with a non-standard journal; look at s_version to
1370 find which format is in use */ 1390 find which format is in use */
1371 if (sb_version(rs) == REISERFS_VERSION_2) 1391 if (sb_version(rs) == REISERFS_VERSION_2)
1372 reiserfs_warning(s, 1392 reiserfs_info(s, "found reiserfs format \"3.6\""
1373 "read_super_block: found reiserfs format \"3.6\"" 1393 " with non-standard journal\n");
1374 " with non-standard journal");
1375 else if (sb_version(rs) == REISERFS_VERSION_1) 1394 else if (sb_version(rs) == REISERFS_VERSION_1)
1376 reiserfs_warning(s, 1395 reiserfs_info(s, "found reiserfs format \"3.5\""
1377 "read_super_block: found reiserfs format \"3.5\"" 1396 " with non-standard journal\n");
1378 " with non-standard journal");
1379 else { 1397 else {
1380 reiserfs_warning(s, 1398 reiserfs_warning(s, "sh-2012", "found unknown "
1381 "sh-2012: read_super_block: found unknown " 1399 "format \"%u\" of reiserfs with "
1382 "format \"%u\" of reiserfs with non-standard magic", 1400 "non-standard magic", sb_version(rs));
1383 sb_version(rs));
1384 return 1; 1401 return 1;
1385 } 1402 }
1386 } else 1403 } else
@@ -1410,8 +1427,7 @@ static int reread_meta_blocks(struct super_block *s)
1410 ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); 1427 ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
1411 wait_on_buffer(SB_BUFFER_WITH_SB(s)); 1428 wait_on_buffer(SB_BUFFER_WITH_SB(s));
1412 if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { 1429 if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
1413 reiserfs_warning(s, 1430 reiserfs_warning(s, "reiserfs-2504", "error reading the super");
1414 "reread_meta_blocks, error reading the super");
1415 return 1; 1431 return 1;
1416 } 1432 }
1417 1433
@@ -1454,8 +1470,8 @@ static __u32 find_hash_out(struct super_block *s)
1454 if (reiserfs_rupasov_hash(s)) { 1470 if (reiserfs_rupasov_hash(s)) {
1455 hash = YURA_HASH; 1471 hash = YURA_HASH;
1456 } 1472 }
1457 reiserfs_warning(s, "FS seems to be empty, autodetect " 1473 reiserfs_info(s, "FS seems to be empty, autodetect "
1458 "is using the default hash"); 1474 "is using the default hash\n");
1459 break; 1475 break;
1460 } 1476 }
1461 r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen)); 1477 r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
@@ -1475,10 +1491,10 @@ static __u32 find_hash_out(struct super_block *s)
1475 && (yurahash == 1491 && (yurahash ==
1476 GET_HASH_VALUE(deh_offset 1492 GET_HASH_VALUE(deh_offset
1477 (&(de.de_deh[de.de_entry_num])))))) { 1493 (&(de.de_deh[de.de_entry_num])))))) {
1478 reiserfs_warning(s, 1494 reiserfs_warning(s, "reiserfs-2506", "Unable to "
1479 "Unable to automatically detect hash function. " 1495 "automatically detect hash function. "
1480 "Please mount with -o hash={tea,rupasov,r5}", 1496 "Please mount with -o "
1481 reiserfs_bdevname(s)); 1497 "hash={tea,rupasov,r5}");
1482 hash = UNSET_HASH; 1498 hash = UNSET_HASH;
1483 break; 1499 break;
1484 } 1500 }
@@ -1492,7 +1508,8 @@ static __u32 find_hash_out(struct super_block *s)
1492 (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) 1508 (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash)
1493 hash = R5_HASH; 1509 hash = R5_HASH;
1494 else { 1510 else {
1495 reiserfs_warning(s, "Unrecognised hash function"); 1511 reiserfs_warning(s, "reiserfs-2506",
1512 "Unrecognised hash function");
1496 hash = UNSET_HASH; 1513 hash = UNSET_HASH;
1497 } 1514 }
1498 } while (0); 1515 } while (0);
@@ -1516,21 +1533,24 @@ static int what_hash(struct super_block *s)
1516 code = find_hash_out(s); 1533 code = find_hash_out(s);
1517 1534
1518 if (code != UNSET_HASH && reiserfs_hash_detect(s)) { 1535 if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
1519 /* detection has found the hash, and we must check against the 1536 /* detection has found the hash, and we must check against the
1520 ** mount options 1537 ** mount options
1521 */ 1538 */
1522 if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { 1539 if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
1523 reiserfs_warning(s, "Error, %s hash detected, " 1540 reiserfs_warning(s, "reiserfs-2507",
1541 "Error, %s hash detected, "
1524 "unable to force rupasov hash", 1542 "unable to force rupasov hash",
1525 reiserfs_hashname(code)); 1543 reiserfs_hashname(code));
1526 code = UNSET_HASH; 1544 code = UNSET_HASH;
1527 } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { 1545 } else if (reiserfs_tea_hash(s) && code != TEA_HASH) {
1528 reiserfs_warning(s, "Error, %s hash detected, " 1546 reiserfs_warning(s, "reiserfs-2508",
1547 "Error, %s hash detected, "
1529 "unable to force tea hash", 1548 "unable to force tea hash",
1530 reiserfs_hashname(code)); 1549 reiserfs_hashname(code));
1531 code = UNSET_HASH; 1550 code = UNSET_HASH;
1532 } else if (reiserfs_r5_hash(s) && code != R5_HASH) { 1551 } else if (reiserfs_r5_hash(s) && code != R5_HASH) {
1533 reiserfs_warning(s, "Error, %s hash detected, " 1552 reiserfs_warning(s, "reiserfs-2509",
1553 "Error, %s hash detected, "
1534 "unable to force r5 hash", 1554 "unable to force r5 hash",
1535 reiserfs_hashname(code)); 1555 reiserfs_hashname(code));
1536 code = UNSET_HASH; 1556 code = UNSET_HASH;
@@ -1546,7 +1566,7 @@ static int what_hash(struct super_block *s)
1546 } 1566 }
1547 } 1567 }
1548 1568
1549 /* if we are mounted RW, and we have a new valid hash code, update 1569 /* if we are mounted RW, and we have a new valid hash code, update
1550 ** the super 1570 ** the super
1551 */ 1571 */
1552 if (code != UNSET_HASH && 1572 if (code != UNSET_HASH &&
@@ -1589,9 +1609,9 @@ static int function2code(hashf_t func)
1589 return 0; 1609 return 0;
1590} 1610}
1591 1611
1592#define SWARN(silent, s, ...) \ 1612#define SWARN(silent, s, id, ...) \
1593 if (!(silent)) \ 1613 if (!(silent)) \
1594 reiserfs_warning (s, __VA_ARGS__) 1614 reiserfs_warning(s, id, __VA_ARGS__)
1595 1615
1596static int reiserfs_fill_super(struct super_block *s, void *data, int silent) 1616static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1597{ 1617{
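Note: the hunk above is the pattern behind most of this file's churn: a stable message id (e.g. "sh-2021", "jmacd-8") becomes its own argument to reiserfs_warning() instead of being embedded, along with the function name, in every format string, and SWARN just forwards it when the mount is not silent. A userspace sketch of the idea; names and the exact output format are illustrative, not the kernel's:

#include <stdio.h>
#include <stdarg.h>

/* The warning helper takes the stable identifier as a separate
 * argument, so callers stop duplicating "id: function:" prefixes
 * in their format strings. */
static void fs_warning(const char *dev, const char *id, const char *fmt, ...)
{
    va_list ap;

    fprintf(stderr, "REISERFS warning (device %s): %s ", dev, id);
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
    fputc('\n', stderr);
}

/* The SWARN wrapper forwards the id and mutes output when the
 * caller asked for a silent mount. */
#define SWARN(silent, dev, id, ...) \
    do { if (!(silent)) fs_warning(dev, id, __VA_ARGS__); } while (0)

int main(void)
{
    int silent = 0;
    SWARN(silent, "sda1", "sh-2021", "can not find reiserfs on %s", "sda1");
    return 0;
}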
@@ -1625,10 +1645,6 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1625 REISERFS_SB(s)->s_alloc_options.preallocmin = 0; 1645 REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
1626 /* Preallocate by 16 blocks (17-1) at once */ 1646 /* Preallocate by 16 blocks (17-1) at once */
1627 REISERFS_SB(s)->s_alloc_options.preallocsize = 17; 1647 REISERFS_SB(s)->s_alloc_options.preallocsize = 17;
1628#ifdef CONFIG_REISERFS_FS_XATTR
1629 /* Initialize the rwsem for xattr dir */
1630 init_rwsem(&REISERFS_SB(s)->xattr_dir_sem);
1631#endif
1632 /* setup default block allocator options */ 1648 /* setup default block allocator options */
1633 reiserfs_init_alloc_options(s); 1649 reiserfs_init_alloc_options(s);
1634 1650
@@ -1643,8 +1659,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1643#endif 1659#endif
1644 1660
1645 if (blocks) { 1661 if (blocks) {
1646 SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option " 1662 SWARN(silent, s, "jmacd-7", "resize option for remount only");
1647 "for remount only");
1648 goto error; 1663 goto error;
1649 } 1664 }
1650 1665
@@ -1653,8 +1668,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1653 old_format = 1; 1668 old_format = 1;
1654 /* try new format (64-th 1k block), which can contain reiserfs super block */ 1669 /* try new format (64-th 1k block), which can contain reiserfs super block */
1655 else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { 1670 else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
1656 SWARN(silent, s, 1671 SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
1657 "sh-2021: reiserfs_fill_super: can not find reiserfs on %s",
1658 reiserfs_bdevname(s)); 1672 reiserfs_bdevname(s));
1659 goto error; 1673 goto error;
1660 } 1674 }
@@ -1666,13 +1680,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1666 if (s->s_bdev && s->s_bdev->bd_inode 1680 if (s->s_bdev && s->s_bdev->bd_inode
1667 && i_size_read(s->s_bdev->bd_inode) < 1681 && i_size_read(s->s_bdev->bd_inode) <
1668 sb_block_count(rs) * sb_blocksize(rs)) { 1682 sb_block_count(rs) * sb_blocksize(rs)) {
1669 SWARN(silent, s, 1683 SWARN(silent, s, "", "Filesystem cannot be "
1670 "Filesystem on %s cannot be mounted because it is bigger than the device", 1684 "mounted because it is bigger than the device");
1671 reiserfs_bdevname(s)); 1685 SWARN(silent, s, "", "You may need to run fsck "
1672 SWARN(silent, s, 1686 "or increase size of your LVM partition");
1673 "You may need to run fsck or increase size of your LVM partition"); 1687 SWARN(silent, s, "", "Or may be you forgot to "
1674 SWARN(silent, s, 1688 "reboot after fdisk when it told you to");
1675 "Or may be you forgot to reboot after fdisk when it told you to");
1676 goto error; 1689 goto error;
1677 } 1690 }
1678 1691
@@ -1680,14 +1693,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1680 sbi->s_mount_state = REISERFS_VALID_FS; 1693 sbi->s_mount_state = REISERFS_VALID_FS;
1681 1694
1682 if ((errval = reiserfs_init_bitmap_cache(s))) { 1695 if ((errval = reiserfs_init_bitmap_cache(s))) {
1683 SWARN(silent, s, 1696 SWARN(silent, s, "jmacd-8", "unable to read bitmap");
1684 "jmacd-8: reiserfs_fill_super: unable to read bitmap");
1685 goto error; 1697 goto error;
1686 } 1698 }
1687 errval = -EINVAL; 1699 errval = -EINVAL;
1688#ifdef CONFIG_REISERFS_CHECK 1700#ifdef CONFIG_REISERFS_CHECK
1689 SWARN(silent, s, "CONFIG_REISERFS_CHECK is set ON"); 1701 SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
1690 SWARN(silent, s, "- it is slow mode for debugging."); 1702 SWARN(silent, s, "", "- it is slow mode for debugging.");
1691#endif 1703#endif
1692 1704
1693 /* make data=ordered the default */ 1705 /* make data=ordered the default */
@@ -1708,8 +1720,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1708 } 1720 }
1709 // set_device_ro(s->s_dev, 1) ; 1721 // set_device_ro(s->s_dev, 1) ;
1710 if (journal_init(s, jdev_name, old_format, commit_max_age)) { 1722 if (journal_init(s, jdev_name, old_format, commit_max_age)) {
1711 SWARN(silent, s, 1723 SWARN(silent, s, "sh-2022",
1712 "sh-2022: reiserfs_fill_super: unable to initialize journal space"); 1724 "unable to initialize journal space");
1713 goto error; 1725 goto error;
1714 } else { 1726 } else {
1715 jinit_done = 1; /* once this is set, journal_release must be called 1727 jinit_done = 1; /* once this is set, journal_release must be called
@@ -1717,8 +1729,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1717 */ 1729 */
1718 } 1730 }
1719 if (reread_meta_blocks(s)) { 1731 if (reread_meta_blocks(s)) {
1720 SWARN(silent, s, 1732 SWARN(silent, s, "jmacd-9",
1721 "jmacd-9: reiserfs_fill_super: unable to reread meta blocks after journal init"); 1733 "unable to reread meta blocks after journal init");
1722 goto error; 1734 goto error;
1723 } 1735 }
1724 1736
@@ -1726,8 +1738,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1726 goto error; 1738 goto error;
1727 1739
1728 if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { 1740 if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
1729 SWARN(silent, s, 1741 SWARN(silent, s, "clm-7000",
1730 "clm-7000: Detected readonly device, marking FS readonly"); 1742 "Detected readonly device, marking FS readonly");
1731 s->s_flags |= MS_RDONLY; 1743 s->s_flags |= MS_RDONLY;
1732 } 1744 }
1733 args.objectid = REISERFS_ROOT_OBJECTID; 1745 args.objectid = REISERFS_ROOT_OBJECTID;
@@ -1736,8 +1748,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1736 iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, 1748 iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor,
1737 reiserfs_init_locked_inode, (void *)(&args)); 1749 reiserfs_init_locked_inode, (void *)(&args));
1738 if (!root_inode) { 1750 if (!root_inode) {
1739 SWARN(silent, s, 1751 SWARN(silent, s, "jmacd-10", "get root inode failed");
1740 "jmacd-10: reiserfs_fill_super: get root inode failed");
1741 goto error; 1752 goto error;
1742 } 1753 }
1743 1754
@@ -1786,7 +1797,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1786 * avoiding corruption. -jeffm */ 1797 * avoiding corruption. -jeffm */
1787 if (bmap_would_wrap(reiserfs_bmap_count(s)) && 1798 if (bmap_would_wrap(reiserfs_bmap_count(s)) &&
1788 sb_bmap_nr(rs) != 0) { 1799 sb_bmap_nr(rs) != 0) {
1789 reiserfs_warning(s, "super-2030: This file system " 1800 reiserfs_warning(s, "super-2030", "This file system "
1790 "claims to use %u bitmap blocks in " 1801 "claims to use %u bitmap blocks in "
1791 "its super block, but requires %u. " 1802 "its super block, but requires %u. "
1792 "Clearing to zero.", sb_bmap_nr(rs), 1803 "Clearing to zero.", sb_bmap_nr(rs),
@@ -1819,7 +1830,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1819 } else if (!silent) { 1830 } else if (!silent) {
1820 reiserfs_info(s, "using 3.5.x disk format\n"); 1831 reiserfs_info(s, "using 3.5.x disk format\n");
1821 } 1832 }
1822 } 1833 } else
1834 set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
1835
1823 1836
1824 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); 1837 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
1825 errval = journal_end(&th, s, 1); 1838 errval = journal_end(&th, s, 1);
@@ -1892,62 +1905,14 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1892 buf->f_bsize = dentry->d_sb->s_blocksize; 1905 buf->f_bsize = dentry->d_sb->s_blocksize;
1893 /* changed to accommodate gcc folks. */ 1906 /* changed to accommodate gcc folks. */
1894 buf->f_type = REISERFS_SUPER_MAGIC; 1907 buf->f_type = REISERFS_SUPER_MAGIC;
1908 buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
1909 buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
1910 sizeof(rs->s_uuid)/2);
1911
1895 return 0; 1912 return 0;
1896} 1913}
1897 1914
1898#ifdef CONFIG_QUOTA 1915#ifdef CONFIG_QUOTA
1899static int reiserfs_dquot_initialize(struct inode *inode, int type)
1900{
1901 struct reiserfs_transaction_handle th;
1902 int ret, err;
1903
1904 /* We may create quota structure so we need to reserve enough blocks */
1905 reiserfs_write_lock(inode->i_sb);
1906 ret =
1907 journal_begin(&th, inode->i_sb,
1908 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
1909 if (ret)
1910 goto out;
1911 ret = dquot_initialize(inode, type);
1912 err =
1913 journal_end(&th, inode->i_sb,
1914 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
1915 if (!ret && err)
1916 ret = err;
1917 out:
1918 reiserfs_write_unlock(inode->i_sb);
1919 return ret;
1920}
1921
1922static int reiserfs_dquot_drop(struct inode *inode)
1923{
1924 struct reiserfs_transaction_handle th;
1925 int ret, err;
1926
1927 /* We may delete quota structure so we need to reserve enough blocks */
1928 reiserfs_write_lock(inode->i_sb);
1929 ret =
1930 journal_begin(&th, inode->i_sb,
1931 2 * REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
1932 if (ret) {
1933 /*
1934 * We call dquot_drop() anyway to at least release references
1935 * to quota structures so that umount does not hang.
1936 */
1937 dquot_drop(inode);
1938 goto out;
1939 }
1940 ret = dquot_drop(inode);
1941 err =
1942 journal_end(&th, inode->i_sb,
1943 2 * REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
1944 if (!ret && err)
1945 ret = err;
1946 out:
1947 reiserfs_write_unlock(inode->i_sb);
1948 return ret;
1949}
1950
1951static int reiserfs_write_dquot(struct dquot *dquot) 1916static int reiserfs_write_dquot(struct dquot *dquot)
1952{ 1917{
1953 struct reiserfs_transaction_handle th; 1918 struct reiserfs_transaction_handle th;
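Note: the statfs hunk above derives f_fsid by CRC-ing each half of the 16-byte superblock uuid into one 32-bit word. A self-contained sketch of that derivation, assuming a bitwise CRC-32 with the reflected 0xEDB88320 polynomial (the bit ordering crc32_le() implements, with the seed passed in and no post-inversion); the uuid bytes below are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Bitwise CRC-32 in crc32_le() bit order: reflected polynomial,
 * caller-supplied seed, no pre- or post-conditioning. */
static uint32_t crc32_le(uint32_t crc, const uint8_t *p, size_t len)
{
    while (len--) {
        crc ^= *p++;
        for (int i = 0; i < 8; i++)
            crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1u));
    }
    return crc;
}

int main(void)
{
    /* Illustrative 16-byte volume uuid; a real superblock carries
     * its own. */
    uint8_t uuid[16] = {
        0xde, 0xad, 0xbe, 0xef, 0x01, 0x23, 0x45, 0x67,
        0x89, 0xab, 0xcd, 0xef, 0x00, 0x11, 0x22, 0x33,
    };
    uint32_t fsid[2];

    /* Same split as the hunk above: one CRC per uuid half. */
    fsid[0] = crc32_le(0, uuid, sizeof(uuid) / 2);
    fsid[1] = crc32_le(0, uuid + sizeof(uuid) / 2, sizeof(uuid) / 2);
    printf("f_fsid = %08x:%08x\n", (unsigned)fsid[0], (unsigned)fsid[1]);
    return 0;
}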
@@ -2085,8 +2050,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2085 if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { 2050 if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
2086 err = reiserfs_unpack(inode, NULL); 2051 err = reiserfs_unpack(inode, NULL);
2087 if (err) { 2052 if (err) {
2088 reiserfs_warning(sb, 2053 reiserfs_warning(sb, "super-6520",
2089 "reiserfs: Unpacking tail of quota file failed" 2054 "Unpacking tail of quota file failed"
2090 " (%d). Cannot turn on quotas.", err); 2055 " (%d). Cannot turn on quotas.", err);
2091 err = -EINVAL; 2056 err = -EINVAL;
2092 goto out; 2057 goto out;
@@ -2097,8 +2062,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2097 if (REISERFS_SB(sb)->s_qf_names[type]) { 2062 if (REISERFS_SB(sb)->s_qf_names[type]) {
2098 /* Quotafile not of fs root? */ 2063 /* Quotafile not of fs root? */
2099 if (path.dentry->d_parent != sb->s_root) 2064 if (path.dentry->d_parent != sb->s_root)
2100 reiserfs_warning(sb, 2065 reiserfs_warning(sb, "super-6521",
2101 "reiserfs: Quota file not on filesystem root. " 2066 "Quota file not on filesystem root. "
2102 "Journalled quota will not work."); 2067 "Journalled quota will not work.");
2103 } 2068 }
2104 2069
@@ -2249,9 +2214,6 @@ static int __init init_reiserfs_fs(void)
2249 return ret; 2214 return ret;
2250 } 2215 }
2251 2216
2252 if ((ret = reiserfs_xattr_register_handlers()))
2253 goto failed_reiserfs_xattr_register_handlers;
2254
2255 reiserfs_proc_info_global_init(); 2217 reiserfs_proc_info_global_init();
2256 reiserfs_proc_register_global("version", 2218 reiserfs_proc_register_global("version",
2257 reiserfs_global_version_in_proc); 2219 reiserfs_global_version_in_proc);
@@ -2262,9 +2224,6 @@ static int __init init_reiserfs_fs(void)
2262 return 0; 2224 return 0;
2263 } 2225 }
2264 2226
2265 reiserfs_xattr_unregister_handlers();
2266
2267 failed_reiserfs_xattr_register_handlers:
2268 reiserfs_proc_unregister_global("version"); 2227 reiserfs_proc_unregister_global("version");
2269 reiserfs_proc_info_global_done(); 2228 reiserfs_proc_info_global_done();
2270 destroy_inodecache(); 2229 destroy_inodecache();
@@ -2274,7 +2233,6 @@ static int __init init_reiserfs_fs(void)
2274 2233
2275static void __exit exit_reiserfs_fs(void) 2234static void __exit exit_reiserfs_fs(void)
2276{ 2235{
2277 reiserfs_xattr_unregister_handlers();
2278 reiserfs_proc_unregister_global("version"); 2236 reiserfs_proc_unregister_global("version");
2279 reiserfs_proc_info_global_done(); 2237 reiserfs_proc_info_global_done();
2280 unregister_filesystem(&reiserfs_fs_type); 2238 unregister_filesystem(&reiserfs_fs_type);
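Note: the last two super.c hunks shorten the module init and exit paths: with the xattr handler registration gone, one setup call, one teardown call, and one unwind label drop out. The surrounding structure is the usual goto-based unwind; a sketch of that pattern under illustrative names:

#include <stdio.h>

/* Each completed init step has a matching teardown; the error
 * path unwinds only what succeeded, so removing a step removes
 * exactly one call site and one label. */
static int step_a(void) { puts("a up"); return 0; }
static void undo_a(void) { puts("a down"); }
static int step_b(void) { puts("b up"); return 0; }
static void undo_b(void) { puts("b down"); }

static int init_sketch(void)
{
    int ret;

    ret = step_a();
    if (ret)
        goto fail_a;
    ret = step_b();
    if (ret)
        goto fail_b;
    return 0;

fail_b:
    undo_a();
fail_a:
    return ret;
}

/* Module exit tears down in reverse order of setup. */
static void exit_sketch(void)
{
    undo_b();
    undo_a();
}

int main(void)
{
    int ret = init_sketch();
    if (!ret)
        exit_sketch();
    return ret;
}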
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index f8121a1147e8..d7f6e51bef2a 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -26,7 +26,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
26 converted item. */ 26 converted item. */
27 struct item_head ind_ih; /* new indirect item to be inserted or 27 struct item_head ind_ih; /* new indirect item to be inserted or
28 key of unfm pointer to be pasted */ 28 key of unfm pointer to be pasted */
29 int n_blk_size, n_retval; /* returned value for reiserfs_insert_item and clones */ 29 int blk_size, retval; /* returned value for reiserfs_insert_item and clones */
30 unp_t unfm_ptr; /* Handle on an unformatted node 30 unp_t unfm_ptr; /* Handle on an unformatted node
31 that will be inserted in the 31 that will be inserted in the
32 tree. */ 32 tree. */
@@ -35,7 +35,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
35 35
36 REISERFS_SB(sb)->s_direct2indirect++; 36 REISERFS_SB(sb)->s_direct2indirect++;
37 37
38 n_blk_size = sb->s_blocksize; 38 blk_size = sb->s_blocksize;
39 39
40 /* and key to search for append or insert pointer to the new 40 /* and key to search for append or insert pointer to the new
41 unformatted node. */ 41 unformatted node. */
@@ -46,11 +46,11 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
46 /* Set the key to search for the place for new unfm pointer */ 46 /* Set the key to search for the place for new unfm pointer */
47 make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4); 47 make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4);
48 48
49 // FIXME: we could avoid this 49 /* FIXME: we could avoid this */
50 if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) { 50 if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) {
51 reiserfs_warning(sb, "PAP-14030: direct2indirect: " 51 reiserfs_error(sb, "PAP-14030",
52 "pasted or inserted byte exists in the tree %K. " 52 "pasted or inserted byte exists in "
53 "Use fsck to repair.", &end_key); 53 "the tree %K. Use fsck to repair.", &end_key);
54 pathrelse(path); 54 pathrelse(path);
55 return -EIO; 55 return -EIO;
56 } 56 }
@@ -64,17 +64,17 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
64 set_ih_free_space(&ind_ih, 0); /* delete at nearest future */ 64 set_ih_free_space(&ind_ih, 0); /* delete at nearest future */
65 put_ih_item_len(&ind_ih, UNFM_P_SIZE); 65 put_ih_item_len(&ind_ih, UNFM_P_SIZE);
66 PATH_LAST_POSITION(path)++; 66 PATH_LAST_POSITION(path)++;
67 n_retval = 67 retval =
68 reiserfs_insert_item(th, path, &end_key, &ind_ih, inode, 68 reiserfs_insert_item(th, path, &end_key, &ind_ih, inode,
69 (char *)&unfm_ptr); 69 (char *)&unfm_ptr);
70 } else { 70 } else {
71 /* Paste into last indirect item of an object. */ 71 /* Paste into last indirect item of an object. */
72 n_retval = reiserfs_paste_into_item(th, path, &end_key, inode, 72 retval = reiserfs_paste_into_item(th, path, &end_key, inode,
73 (char *)&unfm_ptr, 73 (char *)&unfm_ptr,
74 UNFM_P_SIZE); 74 UNFM_P_SIZE);
75 } 75 }
76 if (n_retval) { 76 if (retval) {
77 return n_retval; 77 return retval;
78 } 78 }
79 // note: from here there are two keys which have matching first 79 // note: from here there are two keys which have matching first
80 // three key components. They only differ by the fourth one. 80 // three key components. They only differ by the fourth one.
@@ -92,14 +92,13 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
92 last item of the file */ 92 last item of the file */
93 if (search_for_position_by_key(sb, &end_key, path) == 93 if (search_for_position_by_key(sb, &end_key, path) ==
94 POSITION_FOUND) 94 POSITION_FOUND)
95 reiserfs_panic(sb, 95 reiserfs_panic(sb, "PAP-14050",
96 "PAP-14050: direct2indirect: "
97 "direct item (%K) not found", &end_key); 96 "direct item (%K) not found", &end_key);
98 p_le_ih = PATH_PITEM_HEAD(path); 97 p_le_ih = PATH_PITEM_HEAD(path);
99 RFALSE(!is_direct_le_ih(p_le_ih), 98 RFALSE(!is_direct_le_ih(p_le_ih),
100 "vs-14055: direct item expected(%K), found %h", 99 "vs-14055: direct item expected(%K), found %h",
101 &end_key, p_le_ih); 100 &end_key, p_le_ih);
102 tail_size = (le_ih_k_offset(p_le_ih) & (n_blk_size - 1)) 101 tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1))
103 + ih_item_len(p_le_ih) - 1; 102 + ih_item_len(p_le_ih) - 1;
104 103
105 /* we only send the unbh pointer if the buffer is not up to date. 104 /* we only send the unbh pointer if the buffer is not up to date.
@@ -114,11 +113,11 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
114 } else { 113 } else {
115 up_to_date_bh = unbh; 114 up_to_date_bh = unbh;
116 } 115 }
117 n_retval = reiserfs_delete_item(th, path, &end_key, inode, 116 retval = reiserfs_delete_item(th, path, &end_key, inode,
118 up_to_date_bh); 117 up_to_date_bh);
119 118
120 total_tail += n_retval; 119 total_tail += retval;
121 if (tail_size == n_retval) 120 if (tail_size == retval)
122 // done: file does not have direct items anymore 121 // done: file does not have direct items anymore
123 break; 122 break;
124 123
@@ -130,7 +129,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
130 unsigned pgoff = 129 unsigned pgoff =
131 (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1); 130 (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1);
132 char *kaddr = kmap_atomic(up_to_date_bh->b_page, KM_USER0); 131 char *kaddr = kmap_atomic(up_to_date_bh->b_page, KM_USER0);
133 memset(kaddr + pgoff, 0, n_blk_size - total_tail); 132 memset(kaddr + pgoff, 0, blk_size - total_tail);
134 kunmap_atomic(kaddr, KM_USER0); 133 kunmap_atomic(kaddr, KM_USER0);
135 } 134 }
136 135
@@ -171,14 +170,18 @@ void reiserfs_unmap_buffer(struct buffer_head *bh)
171 what we expect from it (number of cut bytes). But when tail remains 170 what we expect from it (number of cut bytes). But when tail remains
172 in the unformatted node, we set mode to SKIP_BALANCING and unlock 171 in the unformatted node, we set mode to SKIP_BALANCING and unlock
173 inode */ 172 inode */
174int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, struct page *page, struct treepath *p_s_path, /* path to the indirect item. */ 173int indirect2direct(struct reiserfs_transaction_handle *th,
175 const struct cpu_key *p_s_item_key, /* Key to look for unformatted node pointer to be cut. */ 174 struct inode *inode, struct page *page,
175 struct treepath *path, /* path to the indirect item. */
176 const struct cpu_key *item_key, /* Key to look for
177 * unformatted node
178 * pointer to be cut. */
176 loff_t n_new_file_size, /* New file size. */ 179 loff_t n_new_file_size, /* New file size. */
177 char *p_c_mode) 180 char *mode)
178{ 181{
179 struct super_block *p_s_sb = p_s_inode->i_sb; 182 struct super_block *sb = inode->i_sb;
180 struct item_head s_ih; 183 struct item_head s_ih;
181 unsigned long n_block_size = p_s_sb->s_blocksize; 184 unsigned long block_size = sb->s_blocksize;
182 char *tail; 185 char *tail;
183 int tail_len, round_tail_len; 186 int tail_len, round_tail_len;
184 loff_t pos, pos1; /* position of first byte of the tail */ 187 loff_t pos, pos1; /* position of first byte of the tail */
@@ -186,22 +189,22 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
186 189
187 BUG_ON(!th->t_trans_id); 190 BUG_ON(!th->t_trans_id);
188 191
189 REISERFS_SB(p_s_sb)->s_indirect2direct++; 192 REISERFS_SB(sb)->s_indirect2direct++;
190 193
191 *p_c_mode = M_SKIP_BALANCING; 194 *mode = M_SKIP_BALANCING;
192 195
193 /* store item head path points to. */ 196 /* store item head path points to. */
194 copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); 197 copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
195 198
196 tail_len = (n_new_file_size & (n_block_size - 1)); 199 tail_len = (n_new_file_size & (block_size - 1));
197 if (get_inode_sd_version(p_s_inode) == STAT_DATA_V2) 200 if (get_inode_sd_version(inode) == STAT_DATA_V2)
198 round_tail_len = ROUND_UP(tail_len); 201 round_tail_len = ROUND_UP(tail_len);
199 else 202 else
200 round_tail_len = tail_len; 203 round_tail_len = tail_len;
201 204
202 pos = 205 pos =
203 le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - 206 le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE -
204 1) * p_s_sb->s_blocksize; 207 1) * sb->s_blocksize;
205 pos1 = pos; 208 pos1 = pos;
206 209
207 // we are protected by i_mutex. The tail can not disappear, not 210 // we are protected by i_mutex. The tail can not disappear, not
@@ -210,27 +213,26 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
210 213
211 tail = (char *)kmap(page); /* this can schedule */ 214 tail = (char *)kmap(page); /* this can schedule */
212 215
213 if (path_changed(&s_ih, p_s_path)) { 216 if (path_changed(&s_ih, path)) {
214 /* re-search indirect item */ 217 /* re-search indirect item */
215 if (search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) 218 if (search_for_position_by_key(sb, item_key, path)
216 == POSITION_NOT_FOUND) 219 == POSITION_NOT_FOUND)
217 reiserfs_panic(p_s_sb, 220 reiserfs_panic(sb, "PAP-5520",
218 "PAP-5520: indirect2direct: "
219 "item to be converted %K does not exist", 221 "item to be converted %K does not exist",
220 p_s_item_key); 222 item_key);
221 copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); 223 copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
222#ifdef CONFIG_REISERFS_CHECK 224#ifdef CONFIG_REISERFS_CHECK
223 pos = le_ih_k_offset(&s_ih) - 1 + 225 pos = le_ih_k_offset(&s_ih) - 1 +
224 (ih_item_len(&s_ih) / UNFM_P_SIZE - 226 (ih_item_len(&s_ih) / UNFM_P_SIZE -
225 1) * p_s_sb->s_blocksize; 227 1) * sb->s_blocksize;
226 if (pos != pos1) 228 if (pos != pos1)
227 reiserfs_panic(p_s_sb, "vs-5530: indirect2direct: " 229 reiserfs_panic(sb, "vs-5530", "tail position "
228 "tail position changed while we were reading it"); 230 "changed while we were reading it");
229#endif 231#endif
230 } 232 }
231 233
232 /* Set direct item header to insert. */ 234 /* Set direct item header to insert. */
233 make_le_item_head(&s_ih, NULL, get_inode_item_key_version(p_s_inode), 235 make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode),
234 pos1 + 1, TYPE_DIRECT, round_tail_len, 236 pos1 + 1, TYPE_DIRECT, round_tail_len,
235 0xffff /*ih_free_space */ ); 237 0xffff /*ih_free_space */ );
236 238
@@ -240,13 +242,13 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
240 */ 242 */
241 tail = tail + (pos & (PAGE_CACHE_SIZE - 1)); 243 tail = tail + (pos & (PAGE_CACHE_SIZE - 1));
242 244
243 PATH_LAST_POSITION(p_s_path)++; 245 PATH_LAST_POSITION(path)++;
244 246
245 key = *p_s_item_key; 247 key = *item_key;
246 set_cpu_key_k_type(&key, TYPE_DIRECT); 248 set_cpu_key_k_type(&key, TYPE_DIRECT);
247 key.key_length = 4; 249 key.key_length = 4;
248 /* Insert tail as new direct item in the tree */ 250 /* Insert tail as new direct item in the tree */
249 if (reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode, 251 if (reiserfs_insert_item(th, path, &key, &s_ih, inode,
250 tail ? tail : NULL) < 0) { 252 tail ? tail : NULL) < 0) {
251 /* No disk memory. So we can not convert last unformatted node 253 /* No disk memory. So we can not convert last unformatted node
252 to the direct item. In this case we used to adjust 254 to the direct item. In this case we used to adjust
@@ -255,12 +257,12 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
255 unformatted node. For now i_size is considered as guard for 257 unformatted node. For now i_size is considered as guard for
256 going out of file size */ 258 going out of file size */
257 kunmap(page); 259 kunmap(page);
258 return n_block_size - round_tail_len; 260 return block_size - round_tail_len;
259 } 261 }
260 kunmap(page); 262 kunmap(page);
261 263
262 /* make sure to get the i_blocks changes from reiserfs_insert_item */ 264 /* make sure to get the i_blocks changes from reiserfs_insert_item */
263 reiserfs_update_sd(th, p_s_inode); 265 reiserfs_update_sd(th, inode);
264 266
265 // note: we have now the same as in above direct2indirect 267 // note: we have now the same as in above direct2indirect
266 // conversion: there are two keys which have matching first three 268 // conversion: there are two keys which have matching first three
@@ -268,11 +270,11 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in
268 270
269 /* We have inserted new direct item and must remove last 271 /* We have inserted new direct item and must remove last
270 unformatted node. */ 272 unformatted node. */
271 *p_c_mode = M_CUT; 273 *mode = M_CUT;
272 274
273 /* we store position of first direct item in the in-core inode */ 275 /* we store position of first direct item in the in-core inode */
274 //mark_file_with_tail (p_s_inode, pos1 + 1); 276 /* mark_file_with_tail (inode, pos1 + 1); */
275 REISERFS_I(p_s_inode)->i_first_direct_byte = pos1 + 1; 277 REISERFS_I(inode)->i_first_direct_byte = pos1 + 1;
276 278
277 return n_block_size - round_tail_len; 279 return block_size - round_tail_len;
278} 280}
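Note: indirect2direct() above computes the tail as the bytes past the last full block with a power-of-two mask, then rounds the length up for v2 stat-data items. A small sketch of that arithmetic; the round-to-8 constant mirrors reiserfs's ROUND_UP but is stated here as an assumption:

#include <stdio.h>

/* Round up to the next multiple of 8 (assumed direct-item
 * granularity for v2 stat data). */
#define ROUND_UP8(x) (((x) + 7) & ~7UL)

int main(void)
{
    unsigned long block_size = 4096;     /* must be a power of two */
    unsigned long new_file_size = 10003;

    /* Bytes past the last full block, via the mask trick. */
    unsigned long tail_len = new_file_size & (block_size - 1);

    printf("tail_len = %lu, rounded = %lu\n",
           tail_len, ROUND_UP8(tail_len));
    return 0;
}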
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ad92461cbfc3..f83f52bae390 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -27,6 +27,10 @@
27 * these are special cases for filesystem ACLs, they are interpreted by the 27 * these are special cases for filesystem ACLs, they are interpreted by the
28 * kernel, in addition, they are negatively and positively cached and attached 28 * kernel, in addition, they are negatively and positively cached and attached
29 * to the inode so that unnecessary lookups are avoided. 29 * to the inode so that unnecessary lookups are avoided.
30 *
31 * Locking works like so:
32 * Directory components (xattr root, xattr dir) are protected by their i_mutex.
33 * The xattrs themselves are protected by the xattr_sem.
30 */ 34 */
31 35
32#include <linux/reiserfs_fs.h> 36#include <linux/reiserfs_fs.h>
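Note: the new comment block above documents a two-level scheme: directory pieces under their i_mutex, attribute bodies under the xattr_sem. A userspace analogue using a mutex and an rwlock; the ordering shown is one plausible illustration of the idea, not a statement of the kernel's actual lock order:

#include <pthread.h>
#include <stdio.h>

/* Stand-ins: a mutex for the xattr directory's i_mutex, an rwlock
 * for the xattr_sem guarding the attribute bodies. */
static pthread_mutex_t xattr_dir_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t xattr_sem = PTHREAD_RWLOCK_INITIALIZER;

static void set_xattr_sketch(const char *name, const char *value)
{
    pthread_mutex_lock(&xattr_dir_mutex);   /* directory entries */
    pthread_rwlock_wrlock(&xattr_sem);      /* attribute body    */
    printf("write %s=%s\n", name, value);
    pthread_rwlock_unlock(&xattr_sem);
    pthread_mutex_unlock(&xattr_dir_mutex);
}

int main(void)
{
    set_xattr_sketch("user.comment", "hello");
    return 0;
}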
@@ -44,328 +48,334 @@
44#include <net/checksum.h> 48#include <net/checksum.h>
45#include <linux/smp_lock.h> 49#include <linux/smp_lock.h>
46#include <linux/stat.h> 50#include <linux/stat.h>
51#include <linux/quotaops.h>
47 52
48#define FL_READONLY 128
49#define FL_DIR_SEM_HELD 256
50#define PRIVROOT_NAME ".reiserfs_priv" 53#define PRIVROOT_NAME ".reiserfs_priv"
51#define XAROOT_NAME "xattrs" 54#define XAROOT_NAME "xattrs"
52 55
53static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char
54 *prefix);
55 56
56/* Returns the dentry referring to the root of the extended attribute 57/* Helpers for inode ops. We do this so that we don't have all the VFS
57 * directory tree. If it has already been retrieved, it is used. If it 58 * overhead and also for proper i_mutex annotation.
58 * hasn't been created and the flags indicate creation is allowed, we 59 * dir->i_mutex must be held for all of them. */
59 * attempt to create it. On error, we return a pointer-encoded error. 60#ifdef CONFIG_REISERFS_FS_XATTR
60 */ 61static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
61static struct dentry *get_xa_root(struct super_block *sb, int flags)
62{ 62{
63 struct dentry *privroot = dget(REISERFS_SB(sb)->priv_root); 63 BUG_ON(!mutex_is_locked(&dir->i_mutex));
64 struct dentry *xaroot; 64 vfs_dq_init(dir);
65 return dir->i_op->create(dir, dentry, mode, NULL);
66}
67#endif
65 68
66 /* This needs to be created at mount-time */ 69static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode)
67 if (!privroot) 70{
68 return ERR_PTR(-ENODATA); 71 BUG_ON(!mutex_is_locked(&dir->i_mutex));
72 vfs_dq_init(dir);
73 return dir->i_op->mkdir(dir, dentry, mode);
74}
69 75
70 mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR); 76/* We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
71 if (REISERFS_SB(sb)->xattr_root) { 77 * mutation ops aren't called during rename or splice, which are the
72 xaroot = dget(REISERFS_SB(sb)->xattr_root); 78 * only other users of I_MUTEX_CHILD. It violates the ordering, but that's
73 goto out; 79 * better than allocating another subclass just for this code. */
74 } 80static int xattr_unlink(struct inode *dir, struct dentry *dentry)
81{
82 int error;
83 BUG_ON(!mutex_is_locked(&dir->i_mutex));
84 vfs_dq_init(dir);
75 85
76 xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME)); 86 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
77 if (IS_ERR(xaroot)) { 87 error = dir->i_op->unlink(dir, dentry);
78 goto out; 88 mutex_unlock(&dentry->d_inode->i_mutex);
79 } else if (!xaroot->d_inode) { 89
90 if (!error)
91 d_delete(dentry);
92 return error;
93}
94
95static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
96{
97 int error;
98 BUG_ON(!mutex_is_locked(&dir->i_mutex));
99 vfs_dq_init(dir);
100
101 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
102 dentry_unhash(dentry);
103 error = dir->i_op->rmdir(dir, dentry);
104 if (!error)
105 dentry->d_inode->i_flags |= S_DEAD;
106 mutex_unlock(&dentry->d_inode->i_mutex);
107 if (!error)
108 d_delete(dentry);
109 dput(dentry);
110
111 return error;
112}
113
114#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE)
115
116/* Returns and possibly creates the xattr dir. */
117static struct dentry *lookup_or_create_dir(struct dentry *parent,
118 const char *name, int flags)
119{
120 struct dentry *dentry;
121 BUG_ON(!parent);
122
123 dentry = lookup_one_len(name, parent, strlen(name));
124 if (IS_ERR(dentry))
125 return dentry;
126 else if (!dentry->d_inode) {
80 int err = -ENODATA; 127 int err = -ENODATA;
81 if (flags == 0 || flags & XATTR_CREATE) 128
82 err = privroot->d_inode->i_op->mkdir(privroot->d_inode, 129 if (xattr_may_create(flags)) {
83 xaroot, 0700); 130 mutex_lock_nested(&parent->d_inode->i_mutex,
131 I_MUTEX_XATTR);
132 err = xattr_mkdir(parent->d_inode, dentry, 0700);
133 mutex_unlock(&parent->d_inode->i_mutex);
134 }
135
84 if (err) { 136 if (err) {
85 dput(xaroot); 137 dput(dentry);
86 xaroot = ERR_PTR(err); 138 dentry = ERR_PTR(err);
87 goto out;
88 } 139 }
89 } 140 }
90 REISERFS_SB(sb)->xattr_root = dget(xaroot);
91 141
92 out: 142 return dentry;
93 mutex_unlock(&privroot->d_inode->i_mutex); 143}
94 dput(privroot); 144
95 return xaroot; 145static struct dentry *open_xa_root(struct super_block *sb, int flags)
146{
147 struct dentry *privroot = REISERFS_SB(sb)->priv_root;
148 if (!privroot)
149 return ERR_PTR(-ENODATA);
150 return lookup_or_create_dir(privroot, XAROOT_NAME, flags);
96} 151}
97 152
98/* Opens the directory corresponding to the inode's extended attribute store.
99 * If flags allow, the tree to the directory may be created. If creation is
100 * prohibited, -ENODATA is returned. */
101static struct dentry *open_xa_dir(const struct inode *inode, int flags) 153static struct dentry *open_xa_dir(const struct inode *inode, int flags)
102{ 154{
103 struct dentry *xaroot, *xadir; 155 struct dentry *xaroot, *xadir;
104 char namebuf[17]; 156 char namebuf[17];
105 157
106 xaroot = get_xa_root(inode->i_sb, flags); 158 xaroot = open_xa_root(inode->i_sb, flags);
107 if (IS_ERR(xaroot)) 159 if (IS_ERR(xaroot))
108 return xaroot; 160 return xaroot;
109 161
110 /* ok, we have xaroot open */
111 snprintf(namebuf, sizeof(namebuf), "%X.%X", 162 snprintf(namebuf, sizeof(namebuf), "%X.%X",
112 le32_to_cpu(INODE_PKEY(inode)->k_objectid), 163 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
113 inode->i_generation); 164 inode->i_generation);
114 xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
115 if (IS_ERR(xadir)) {
116 dput(xaroot);
117 return xadir;
118 }
119
120 if (!xadir->d_inode) {
121 int err;
122 if (flags == 0 || flags & XATTR_CREATE) {
123 /* Although there is nothing else trying to create this directory,
124 * another directory with the same hash may be created, so we need
125 * to protect against that */
126 err =
127 xaroot->d_inode->i_op->mkdir(xaroot->d_inode, xadir,
128 0700);
129 if (err) {
130 dput(xaroot);
131 dput(xadir);
132 return ERR_PTR(err);
133 }
134 }
135 if (!xadir->d_inode) {
136 dput(xaroot);
137 dput(xadir);
138 return ERR_PTR(-ENODATA);
139 }
140 }
141 165
166 xadir = lookup_or_create_dir(xaroot, namebuf, flags);
142 dput(xaroot); 167 dput(xaroot);
143 return xadir; 168 return xadir;
169
144} 170}
145 171
146/* Returns a dentry corresponding to a specific extended attribute file 172/* The following are side effects of other operations that aren't explicitly
147 * for the inode. If flags allow, the file is created. Otherwise, a 173 * modifying extended attributes. This includes operations such as permissions
148 * valid or negative dentry, or an error is returned. */ 174 * or ownership changes, object deletions, etc. */
149static struct dentry *get_xa_file_dentry(const struct inode *inode, 175struct reiserfs_dentry_buf {
150 const char *name, int flags) 176 struct dentry *xadir;
151{ 177 int count;
152 struct dentry *xadir, *xafile; 178 struct dentry *dentries[8];
153 int err = 0; 179};
154 180
155 xadir = open_xa_dir(inode, flags); 181static int
156 if (IS_ERR(xadir)) { 182fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
157 return ERR_CAST(xadir); 183 u64 ino, unsigned int d_type)
158 } else if (!xadir->d_inode) { 184{
159 dput(xadir); 185 struct reiserfs_dentry_buf *dbuf = buf;
160 return ERR_PTR(-ENODATA); 186 struct dentry *dentry;
161 }
162 187
163 xafile = lookup_one_len(name, xadir, strlen(name)); 188 if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
164 if (IS_ERR(xafile)) { 189 return -ENOSPC;
165 dput(xadir);
166 return ERR_CAST(xafile);
167 }
168 190
169 if (xafile->d_inode) { /* file exists */ 191 if (name[0] == '.' && (name[1] == '\0' ||
170 if (flags & XATTR_CREATE) { 192 (name[1] == '.' && name[2] == '\0')))
171 err = -EEXIST; 193 return 0;
172 dput(xafile);
173 goto out;
174 }
175 } else if (flags & XATTR_REPLACE || flags & FL_READONLY) {
176 goto out;
177 } else {
178 /* inode->i_mutex is down, so nothing else can try to create
179 * the same xattr */
180 err = xadir->d_inode->i_op->create(xadir->d_inode, xafile,
181 0700 | S_IFREG, NULL);
182 194
183 if (err) { 195 dentry = lookup_one_len(name, dbuf->xadir, namelen);
184 dput(xafile); 196 if (IS_ERR(dentry)) {
185 goto out; 197 return PTR_ERR(dentry);
186 } 198 } else if (!dentry->d_inode) {
199 /* A directory entry exists, but no file? */
200 reiserfs_error(dentry->d_sb, "xattr-20003",
201 "Corrupted directory: xattr %s listed but "
202 "not found for file %s.\n",
203 dentry->d_name.name, dbuf->xadir->d_name.name);
204 dput(dentry);
205 return -EIO;
187 } 206 }
188 207
189 out: 208 dbuf->dentries[dbuf->count++] = dentry;
190 dput(xadir); 209 return 0;
191 if (err)
192 xafile = ERR_PTR(err);
193 else if (!xafile->d_inode) {
194 dput(xafile);
195 xafile = ERR_PTR(-ENODATA);
196 }
197 return xafile;
198} 210}
199 211
200/* 212static void
201 * this is very similar to fs/reiserfs/dir.c:reiserfs_readdir, but 213cleanup_dentry_buf(struct reiserfs_dentry_buf *buf)
202 * we need to drop the path before calling the filldir struct. That
203 * would be a big performance hit to the non-xattr case, so I've copied
204 * the whole thing for now. --clm
205 *
206 * the big difference is that I go backwards through the directory,
207 * and don't mess with f->f_pos, but the idea is the same. Do some
208 * action on each and every entry in the directory.
209 *
210 * we're called with i_mutex held, so there are no worries about the directory
211 * changing underneath us.
212 */
213static int __xattr_readdir(struct inode *inode, void *dirent, filldir_t filldir)
214{ 214{
215 struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ 215 int i;
216 INITIALIZE_PATH(path_to_entry); 216 for (i = 0; i < buf->count; i++)
217 struct buffer_head *bh; 217 if (buf->dentries[i])
218 int entry_num; 218 dput(buf->dentries[i]);
219 struct item_head *ih, tmp_ih; 219}
220 int search_res; 220
221 char *local_buf; 221static int reiserfs_for_each_xattr(struct inode *inode,
222 loff_t next_pos; 222 int (*action)(struct dentry *, void *),
223 char small_buf[32]; /* avoid kmalloc if we can */ 223 void *data)
224 struct reiserfs_de_head *deh; 224{
225 int d_reclen; 225 struct dentry *dir;
226 char *d_name; 226 int i, err = 0;
227 off_t d_off; 227 loff_t pos = 0;
228 ino_t d_ino; 228 struct reiserfs_dentry_buf buf = {
229 struct reiserfs_dir_entry de; 229 .count = 0,
230 230 };
231 /* form key for search the next directory entry using f_pos field of
232 file structure */
233 next_pos = max_reiserfs_offset(inode);
234
235 while (1) {
236 research:
237 if (next_pos <= DOT_DOT_OFFSET)
238 break;
239 make_cpu_key(&pos_key, inode, next_pos, TYPE_DIRENTRY, 3);
240
241 search_res =
242 search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
243 &de);
244 if (search_res == IO_ERROR) {
245 // FIXME: we could just skip part of directory which could
246 // not be read
247 pathrelse(&path_to_entry);
248 return -EIO;
249 }
250 231
251 if (search_res == NAME_NOT_FOUND) 232 /* Skip out: an xattr has no xattrs associated with it */
252 de.de_entry_num--; 233 if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
234 return 0;
253 235
254 set_de_name_and_namelen(&de); 236 dir = open_xa_dir(inode, XATTR_REPLACE);
255 entry_num = de.de_entry_num; 237 if (IS_ERR(dir)) {
256 deh = &(de.de_deh[entry_num]); 238 err = PTR_ERR(dir);
239 goto out;
240 } else if (!dir->d_inode) {
241 err = 0;
242 goto out_dir;
243 }
257 244
258 bh = de.de_bh; 245 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
259 ih = de.de_ih; 246 buf.xadir = dir;
247 err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
248 while ((err == 0 || err == -ENOSPC) && buf.count) {
249 err = 0;
260 250
261 if (!is_direntry_le_ih(ih)) { 251 for (i = 0; i < buf.count && buf.dentries[i]; i++) {
262 reiserfs_warning(inode->i_sb, "not direntry %h", ih); 252 int lerr = 0;
263 break; 253 struct dentry *dentry = buf.dentries[i];
264 }
265 copy_item_head(&tmp_ih, ih);
266 254
267 /* we must have found item, that is item of this directory, */ 255 if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode))
268 RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key), 256 lerr = action(dentry, data);
269 "vs-9000: found item %h does not match to dir we readdir %K",
270 ih, &pos_key);
271 257
272 if (deh_offset(deh) <= DOT_DOT_OFFSET) { 258 dput(dentry);
273 break; 259 buf.dentries[i] = NULL;
260 err = lerr ?: err;
274 } 261 }
262 buf.count = 0;
263 if (!err)
264 err = reiserfs_readdir_dentry(dir, &buf,
265 fill_with_dentries, &pos);
266 }
267 mutex_unlock(&dir->d_inode->i_mutex);
275 268
276 /* look for the previous entry in the directory */ 269 /* Clean up after a failed readdir */
277 next_pos = deh_offset(deh) - 1; 270 cleanup_dentry_buf(&buf);
278
279 if (!de_visible(deh))
280 /* it is hidden entry */
281 continue;
282 271
283 d_reclen = entry_length(bh, ih, entry_num); 272 if (!err) {
284 d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh); 273 /* We start a transaction here to avoid an ABBA situation
285 d_off = deh_offset(deh); 274 * between the xattr root's i_mutex and the journal lock.
286 d_ino = deh_objectid(deh); 275 * This doesn't incur much additional overhead since the
276 * new transaction will just nest inside the
277 * outer transaction. */
278 int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
279 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
280 struct reiserfs_transaction_handle th;
281 err = journal_begin(&th, inode->i_sb, blocks);
282 if (!err) {
283 int jerror;
284 mutex_lock_nested(&dir->d_parent->d_inode->i_mutex,
285 I_MUTEX_XATTR);
286 err = action(dir, data);
287 jerror = journal_end(&th, inode->i_sb, blocks);
288 mutex_unlock(&dir->d_parent->d_inode->i_mutex);
289 err = jerror ?: err;
290 }
291 }
292out_dir:
293 dput(dir);
294out:
295 /* -ENODATA isn't an error */
296 if (err == -ENODATA)
297 err = 0;
298 return err;
299}
287 300
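/*
 * Sketch of the action-callback contract used by reiserfs_for_each_xattr()
 * above; this particular helper is hypothetical and not part of the patch.
 * Each action receives one xattr file dentry plus the opaque data pointer
 * and returns 0 or a negative errno, exactly like delete_one_xattr() and
 * chown_one_xattr() below:
 */
static int count_one_xattr(struct dentry *dentry, void *data)
{
	int *count = data;

	(*count)++;	/* just tally one xattr file; dentry is valid here */
	return 0;
}
/*
 * A caller would then do:
 *	int count = 0;
 *	err = reiserfs_for_each_xattr(inode, count_one_xattr, &count);
 */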
288 if (!d_name[d_reclen - 1]) 301static int delete_one_xattr(struct dentry *dentry, void *data)
289 d_reclen = strlen(d_name); 302{
303 struct inode *dir = dentry->d_parent->d_inode;
290 304
291 if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)) { 305 /* This is the xattr dir, handle specially. */
292 /* too big to send back to VFS */ 306 if (S_ISDIR(dentry->d_inode->i_mode))
293 continue; 307 return xattr_rmdir(dir, dentry);
294 }
295 308
296 /* Ignore the .reiserfs_priv entry */ 309 return xattr_unlink(dir, dentry);
297 if (reiserfs_xattrs(inode->i_sb) && 310}
298 !old_format_only(inode->i_sb) &&
299 deh_objectid(deh) ==
300 le32_to_cpu(INODE_PKEY
301 (REISERFS_SB(inode->i_sb)->priv_root->d_inode)->
302 k_objectid))
303 continue;
304
305 if (d_reclen <= 32) {
306 local_buf = small_buf;
307 } else {
308 local_buf = kmalloc(d_reclen, GFP_NOFS);
309 if (!local_buf) {
310 pathrelse(&path_to_entry);
311 return -ENOMEM;
312 }
313 if (item_moved(&tmp_ih, &path_to_entry)) {
314 kfree(local_buf);
315 311
316 /* sigh, must retry. Do this same offset again */ 312static int chown_one_xattr(struct dentry *dentry, void *data)
317 next_pos = d_off; 313{
318 goto research; 314 struct iattr *attrs = data;
319 } 315 return reiserfs_setattr(dentry, attrs);
320 } 316}
321 317
322 // Note, that we copy name to user space via temporary 318/* No i_mutex, but the inode is unconnected. */
323 // buffer (local_buf) because filldir will block if 319int reiserfs_delete_xattrs(struct inode *inode)
324 // user space buffer is swapped out. At that time 320{
325 // entry can move to somewhere else 321 int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL);
326 memcpy(local_buf, d_name, d_reclen); 322 if (err)
327 323 reiserfs_warning(inode->i_sb, "jdm-20004",
328 /* the filldir function might need to start transactions, 324 "Couldn't delete all xattrs (%d)\n", err);
329 * or do who knows what. Release the path now that we've 325 return err;
330 * copied all the important stuff out of the deh 326}
331 */
332 pathrelse(&path_to_entry);
333
334 if (filldir(dirent, local_buf, d_reclen, d_off, d_ino,
335 DT_UNKNOWN) < 0) {
336 if (local_buf != small_buf) {
337 kfree(local_buf);
338 }
339 goto end;
340 }
341 if (local_buf != small_buf) {
342 kfree(local_buf);
343 }
344 } /* while */
345 327
346 end: 328/* inode->i_mutex: down */
347 pathrelse(&path_to_entry); 329int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
348 return 0; 330{
331 int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs);
332 if (err)
333 reiserfs_warning(inode->i_sb, "jdm-20007",
334 "Couldn't chown all xattrs (%d)\n", err);
335 return err;
349} 336}
350 337
351/* 338#ifdef CONFIG_REISERFS_FS_XATTR
352 * this could be done with dedicated readdir ops for the xattr files, 339/* Returns a dentry corresponding to a specific extended attribute file
353 * but I want to get something working asap 340 * for the inode. If flags allow, the file is created. Otherwise, a
354 * this is stolen from vfs_readdir 341 * valid or negative dentry, or an error is returned. */
355 * 342static struct dentry *xattr_lookup(struct inode *inode, const char *name,
356 */ 343 int flags)
357static
358int xattr_readdir(struct inode *inode, filldir_t filler, void *buf)
359{ 344{
360 int res = -ENOENT; 345 struct dentry *xadir, *xafile;
361 mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR); 346 int err = 0;
362 if (!IS_DEADDIR(inode)) { 347
363 lock_kernel(); 348 xadir = open_xa_dir(inode, flags);
364 res = __xattr_readdir(inode, buf, filler); 349 if (IS_ERR(xadir))
365 unlock_kernel(); 350 return ERR_CAST(xadir);
351
352 xafile = lookup_one_len(name, xadir, strlen(name));
353 if (IS_ERR(xafile)) {
354 err = PTR_ERR(xafile);
355 goto out;
366 } 356 }
367 mutex_unlock(&inode->i_mutex); 357
368 return res; 358 if (xafile->d_inode && (flags & XATTR_CREATE))
359 err = -EEXIST;
360
361 if (!xafile->d_inode) {
362 err = -ENODATA;
363 if (xattr_may_create(flags)) {
364 mutex_lock_nested(&xadir->d_inode->i_mutex,
365 I_MUTEX_XATTR);
366 err = xattr_create(xadir->d_inode, xafile,
367 0700|S_IFREG);
368 mutex_unlock(&xadir->d_inode->i_mutex);
369 }
370 }
371
372 if (err)
373 dput(xafile);
374out:
375 dput(xadir);
376 if (err)
377 return ERR_PTR(err);
378 return xafile;
369} 379}
370 380
371/* Internal operations on file data */ 381/* Internal operations on file data */
@@ -375,14 +385,14 @@ static inline void reiserfs_put_page(struct page *page)
375 page_cache_release(page); 385 page_cache_release(page);
376} 386}
377 387
378static struct page *reiserfs_get_page(struct inode *dir, unsigned long n) 388static struct page *reiserfs_get_page(struct inode *dir, size_t n)
379{ 389{
380 struct address_space *mapping = dir->i_mapping; 390 struct address_space *mapping = dir->i_mapping;
381 struct page *page; 391 struct page *page;
382 /* We can deadlock if we try to free dentries, 392 /* We can deadlock if we try to free dentries,
383 and an unlink/rmdir has just occurred - GFP_NOFS avoids this */ 393 and an unlink/rmdir has just occurred - GFP_NOFS avoids this */
384 mapping_set_gfp_mask(mapping, GFP_NOFS); 394 mapping_set_gfp_mask(mapping, GFP_NOFS);
385 page = read_mapping_page(mapping, n, NULL); 395 page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL);
386 if (!IS_ERR(page)) { 396 if (!IS_ERR(page)) {
387 kmap(page); 397 kmap(page);
388 if (PageError(page)) 398 if (PageError(page))
@@ -405,6 +415,45 @@ int reiserfs_commit_write(struct file *f, struct page *page,
405int reiserfs_prepare_write(struct file *f, struct page *page, 415int reiserfs_prepare_write(struct file *f, struct page *page,
406 unsigned from, unsigned to); 416 unsigned from, unsigned to);
407 417
418static void update_ctime(struct inode *inode)
419{
420 struct timespec now = current_fs_time(inode->i_sb);
421 if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink ||
422 timespec_equal(&inode->i_ctime, &now))
423 return;
424
425 inode->i_ctime = now;
426 mark_inode_dirty(inode);
427}
428
429static int lookup_and_delete_xattr(struct inode *inode, const char *name)
430{
431 int err = 0;
432 struct dentry *dentry, *xadir;
433
434 xadir = open_xa_dir(inode, XATTR_REPLACE);
435 if (IS_ERR(xadir))
436 return PTR_ERR(xadir);
437
438 dentry = lookup_one_len(name, xadir, strlen(name));
439 if (IS_ERR(dentry)) {
440 err = PTR_ERR(dentry);
441 goto out_dput;
442 }
443
444 if (dentry->d_inode) {
445 mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
446 err = xattr_unlink(xadir->d_inode, dentry);
447 mutex_unlock(&xadir->d_inode->i_mutex);
448 update_ctime(inode);
449 }
450
451 dput(dentry);
452out_dput:
453 dput(xadir);
454 return err;
455}
456
408 457
409/* Generic extended attribute operations that can be used by xa plugins */ 458/* Generic extended attribute operations that can be used by xa plugins */
410 459
@@ -412,58 +461,32 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
412 * inode->i_mutex: down 461 * inode->i_mutex: down
413 */ 462 */
414int 463int
415reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, 464reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
416 size_t buffer_size, int flags) 465 struct inode *inode, const char *name,
466 const void *buffer, size_t buffer_size, int flags)
417{ 467{
418 int err = 0; 468 int err = 0;
419 struct dentry *dentry; 469 struct dentry *dentry;
420 struct page *page; 470 struct page *page;
421 char *data; 471 char *data;
422 struct address_space *mapping;
423 size_t file_pos = 0; 472 size_t file_pos = 0;
424 size_t buffer_pos = 0; 473 size_t buffer_pos = 0;
425 struct inode *xinode; 474 size_t new_size;
426 struct iattr newattrs;
427 __u32 xahash = 0; 475 __u32 xahash = 0;
428 476
429 if (get_inode_sd_version(inode) == STAT_DATA_V1) 477 if (get_inode_sd_version(inode) == STAT_DATA_V1)
430 return -EOPNOTSUPP; 478 return -EOPNOTSUPP;
431 479
432 /* Empty xattrs are ok, they're just empty files, no hash */ 480 if (!buffer)
433 if (buffer && buffer_size) 481 return lookup_and_delete_xattr(inode, name);
434 xahash = xattr_hash(buffer, buffer_size);
435 482
436 open_file: 483 dentry = xattr_lookup(inode, name, flags);
437 dentry = get_xa_file_dentry(inode, name, flags); 484 if (IS_ERR(dentry))
438 if (IS_ERR(dentry)) { 485 return PTR_ERR(dentry);
439 err = PTR_ERR(dentry);
440 goto out;
441 }
442
443 xinode = dentry->d_inode;
444 REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
445 486
446 /* we need to copy it off.. */ 487 down_write(&REISERFS_I(inode)->i_xattr_sem);
447 if (xinode->i_nlink > 1) {
448 dput(dentry);
449 err = reiserfs_xattr_del(inode, name);
450 if (err < 0)
451 goto out;
452 /* We just killed the old one, we're not replacing anymore */
453 if (flags & XATTR_REPLACE)
454 flags &= ~XATTR_REPLACE;
455 goto open_file;
456 }
457 488
458 /* Resize it so we're ok to write there */ 489 xahash = xattr_hash(buffer, buffer_size);
459 newattrs.ia_size = buffer_size;
460 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
461 mutex_lock_nested(&xinode->i_mutex, I_MUTEX_XATTR);
462 err = notify_change(dentry, &newattrs);
463 if (err)
464 goto out_filp;
465
466 mapping = xinode->i_mapping;
467 while (buffer_pos < buffer_size || buffer_pos == 0) { 490 while (buffer_pos < buffer_size || buffer_pos == 0) {
468 size_t chunk; 491 size_t chunk;
469 size_t skip = 0; 492 size_t skip = 0;
@@ -473,10 +496,10 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
473 else 496 else
474 chunk = buffer_size - buffer_pos; 497 chunk = buffer_size - buffer_pos;
475 498
476 page = reiserfs_get_page(xinode, file_pos >> PAGE_CACHE_SHIFT); 499 page = reiserfs_get_page(dentry->d_inode, file_pos);
477 if (IS_ERR(page)) { 500 if (IS_ERR(page)) {
478 err = PTR_ERR(page); 501 err = PTR_ERR(page);
479 goto out_filp; 502 goto out_unlock;
480 } 503 }
481 504
482 lock_page(page); 505 lock_page(page);
@@ -510,28 +533,61 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
510 break; 533 break;
511 } 534 }
512 535
513 /* We can't mark the inode dirty if it's not hashed. This is the case 536 new_size = buffer_size + sizeof(struct reiserfs_xattr_header);
514 * when we're inheriting the default ACL. If we dirty it, the inode 537 if (!err && new_size < i_size_read(dentry->d_inode)) {
515 * gets marked dirty, but won't (ever) make it onto the dirty list until 538 struct iattr newattrs = {
516 * it's synced explicitly to clear I_DIRTY. This is bad. */ 539 .ia_ctime = current_fs_time(inode->i_sb),
517 if (!hlist_unhashed(&inode->i_hash)) { 540 .ia_size = buffer_size,
518 inode->i_ctime = CURRENT_TIME_SEC; 541 .ia_valid = ATTR_SIZE | ATTR_CTIME,
519 mark_inode_dirty(inode); 542 };
543 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
544 down_write(&dentry->d_inode->i_alloc_sem);
545 err = reiserfs_setattr(dentry, &newattrs);
546 up_write(&dentry->d_inode->i_alloc_sem);
547 mutex_unlock(&dentry->d_inode->i_mutex);
548 } else
549 update_ctime(inode);
550out_unlock:
551 up_write(&REISERFS_I(inode)->i_xattr_sem);
552 dput(dentry);
553 return err;
554}
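/*
 * Worked sketch of the chunking above (illustrative only, not part of the
 * patch): the value is written through the page cache one page at a time,
 * and the first page also carries the 8-byte on-disk reiserfs_xattr_header
 * (h_magic + h_hash), so it holds less payload than the pages after it.
 * Assuming 4096-byte pages and a 10000-byte value, the loop in
 * reiserfs_xattr_set_handle() writes chunks of 4088, 4096, and 1816 bytes:
 */
#include <stdio.h>

int main(void)
{
	const unsigned int page_size = 4096, header = 8;
	unsigned int buffer_size = 10000, buffer_pos = 0, file_pos = 0;

	while (buffer_pos < buffer_size) {
		unsigned int page = file_pos / page_size;
		unsigned int skip = (file_pos == 0) ? header : 0;
		unsigned int chunk = buffer_size - buffer_pos;

		if (chunk > page_size - skip)
			chunk = page_size - skip;
		printf("page %u: skip=%u chunk=%u\n", page, skip, chunk);
		file_pos += skip + chunk;
		buffer_pos += chunk;
	}
	return 0;
}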
555
556/* We need to start a transaction to maintain lock ordering */
557int reiserfs_xattr_set(struct inode *inode, const char *name,
558 const void *buffer, size_t buffer_size, int flags)
559{
560
561 struct reiserfs_transaction_handle th;
562 int error, error2;
563 size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
564
565 if (!(flags & XATTR_REPLACE))
566 jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
567
568 reiserfs_write_lock(inode->i_sb);
569 error = journal_begin(&th, inode->i_sb, jbegin_count);
570 if (error) {
571 reiserfs_write_unlock(inode->i_sb);
572 return error;
520 } 573 }
521 574
522 out_filp: 575 error = reiserfs_xattr_set_handle(&th, inode, name,
523 mutex_unlock(&xinode->i_mutex); 576 buffer, buffer_size, flags);
524 dput(dentry);
525 577
526 out: 578 error2 = journal_end(&th, inode->i_sb, jbegin_count);
527 return err; 579 if (error == 0)
580 error = error2;
581 reiserfs_write_unlock(inode->i_sb);
582
583 return error;
528} 584}
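/*
 * Usage sketch for the wrapper above (the caller and attribute name are
 * hypothetical): the flag semantics come from xattr_lookup() --
 * XATTR_CREATE fails with -EEXIST if the attribute already exists,
 * XATTR_REPLACE fails with -ENODATA if it doesn't, and a NULL buffer
 * deletes the attribute via lookup_and_delete_xattr():
 *
 *	err = reiserfs_xattr_set(inode, "user.example", value, len,
 *				 XATTR_CREATE);
 *	err = reiserfs_xattr_set(inode, "user.example", NULL, 0,
 *				 XATTR_REPLACE);
 */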
529 585
530/* 586/*
531 * inode->i_mutex: down 587 * inode->i_mutex: down
532 */ 588 */
533int 589int
534reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, 590reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
535 size_t buffer_size) 591 size_t buffer_size)
536{ 592{
537 ssize_t err = 0; 593 ssize_t err = 0;
@@ -540,7 +596,6 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
540 size_t file_pos = 0; 596 size_t file_pos = 0;
541 size_t buffer_pos = 0; 597 size_t buffer_pos = 0;
542 struct page *page; 598 struct page *page;
543 struct inode *xinode;
544 __u32 hash = 0; 599 __u32 hash = 0;
545 600
546 if (name == NULL) 601 if (name == NULL)
@@ -551,25 +606,25 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
551 if (get_inode_sd_version(inode) == STAT_DATA_V1) 606 if (get_inode_sd_version(inode) == STAT_DATA_V1)
552 return -EOPNOTSUPP; 607 return -EOPNOTSUPP;
553 608
554 dentry = get_xa_file_dentry(inode, name, FL_READONLY); 609 dentry = xattr_lookup(inode, name, XATTR_REPLACE);
555 if (IS_ERR(dentry)) { 610 if (IS_ERR(dentry)) {
556 err = PTR_ERR(dentry); 611 err = PTR_ERR(dentry);
557 goto out; 612 goto out;
558 } 613 }
559 614
560 xinode = dentry->d_inode; 615 down_read(&REISERFS_I(inode)->i_xattr_sem);
561 isize = xinode->i_size; 616
562 REISERFS_I(inode)->i_flags |= i_has_xattr_dir; 617 isize = i_size_read(dentry->d_inode);
563 618
564 /* Just return the size needed */ 619 /* Just return the size needed */
565 if (buffer == NULL) { 620 if (buffer == NULL) {
566 err = isize - sizeof(struct reiserfs_xattr_header); 621 err = isize - sizeof(struct reiserfs_xattr_header);
567 goto out_dput; 622 goto out_unlock;
568 } 623 }
569 624
570 if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) { 625 if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) {
571 err = -ERANGE; 626 err = -ERANGE;
572 goto out_dput; 627 goto out_unlock;
573 } 628 }
574 629
575 while (file_pos < isize) { 630 while (file_pos < isize) {
@@ -581,10 +636,10 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
581 else 636 else
582 chunk = isize - file_pos; 637 chunk = isize - file_pos;
583 638
584 page = reiserfs_get_page(xinode, file_pos >> PAGE_CACHE_SHIFT); 639 page = reiserfs_get_page(dentry->d_inode, file_pos);
585 if (IS_ERR(page)) { 640 if (IS_ERR(page)) {
586 err = PTR_ERR(page); 641 err = PTR_ERR(page);
587 goto out_dput; 642 goto out_unlock;
588 } 643 }
589 644
590 lock_page(page); 645 lock_page(page);
@@ -598,12 +653,12 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
598 if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) { 653 if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) {
599 unlock_page(page); 654 unlock_page(page);
600 reiserfs_put_page(page); 655 reiserfs_put_page(page);
601 reiserfs_warning(inode->i_sb, 656 reiserfs_warning(inode->i_sb, "jdm-20001",
602 "Invalid magic for xattr (%s) " 657 "Invalid magic for xattr (%s) "
603 "associated with %k", name, 658 "associated with %k", name,
604 INODE_PKEY(inode)); 659 INODE_PKEY(inode));
605 err = -EIO; 660 err = -EIO;
606 goto out_dput; 661 goto out_unlock;
607 } 662 }
608 hash = le32_to_cpu(rxh->h_hash); 663 hash = le32_to_cpu(rxh->h_hash);
609 } 664 }
@@ -618,256 +673,83 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
618 673
619 if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) != 674 if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) !=
620 hash) { 675 hash) {
621 reiserfs_warning(inode->i_sb, 676 reiserfs_warning(inode->i_sb, "jdm-20002",
622 "Invalid hash for xattr (%s) associated " 677 "Invalid hash for xattr (%s) associated "
623 "with %k", name, INODE_PKEY(inode)); 678 "with %k", name, INODE_PKEY(inode));
624 err = -EIO; 679 err = -EIO;
625 } 680 }
626 681
627 out_dput: 682out_unlock:
683 up_read(&REISERFS_I(inode)->i_xattr_sem);
628 dput(dentry); 684 dput(dentry);
629 685
630 out: 686out:
631 return err; 687 return err;
632} 688}
633 689
634static int 690/* Actual operations that are exported to VFS-land */
635__reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen) 691struct xattr_handler *reiserfs_xattr_handlers[] = {
636{ 692 &reiserfs_xattr_user_handler,
637 struct dentry *dentry; 693 &reiserfs_xattr_trusted_handler,
638 struct inode *dir = xadir->d_inode; 694#ifdef CONFIG_REISERFS_FS_SECURITY
639 int err = 0; 695 &reiserfs_xattr_security_handler,
640 696#endif
641 dentry = lookup_one_len(name, xadir, namelen); 697#ifdef CONFIG_REISERFS_FS_POSIX_ACL
642 if (IS_ERR(dentry)) { 698 &reiserfs_posix_acl_access_handler,
643 err = PTR_ERR(dentry); 699 &reiserfs_posix_acl_default_handler,
644 goto out; 700#endif
645 } else if (!dentry->d_inode) { 701 NULL
646 err = -ENODATA;
647 goto out_file;
648 }
649
650 /* Skip directories.. */
651 if (S_ISDIR(dentry->d_inode->i_mode))
652 goto out_file;
653
654 if (!is_reiserfs_priv_object(dentry->d_inode)) {
655 reiserfs_warning(dir->i_sb, "OID %08x [%.*s/%.*s] doesn't have "
656 "priv flag set [parent is %sset].",
657 le32_to_cpu(INODE_PKEY(dentry->d_inode)->
658 k_objectid), xadir->d_name.len,
659 xadir->d_name.name, namelen, name,
660 is_reiserfs_priv_object(xadir->
661 d_inode) ? "" :
662 "not ");
663 dput(dentry);
664 return -EIO;
665 }
666
667 err = dir->i_op->unlink(dir, dentry);
668 if (!err)
669 d_delete(dentry);
670
671 out_file:
672 dput(dentry);
673
674 out:
675 return err;
676}
677
678int reiserfs_xattr_del(struct inode *inode, const char *name)
679{
680 struct dentry *dir;
681 int err;
682
683 dir = open_xa_dir(inode, FL_READONLY);
684 if (IS_ERR(dir)) {
685 err = PTR_ERR(dir);
686 goto out;
687 }
688
689 err = __reiserfs_xattr_del(dir, name, strlen(name));
690 dput(dir);
691
692 if (!err) {
693 inode->i_ctime = CURRENT_TIME_SEC;
694 mark_inode_dirty(inode);
695 }
696
697 out:
698 return err;
699}
700
701/* The following are side effects of other operations that aren't explicitly
702 * modifying extended attributes. This includes operations such as permissions
703 * or ownership changes, object deletions, etc. */
704
705static int
706reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen,
707 loff_t offset, u64 ino, unsigned int d_type)
708{
709 struct dentry *xadir = (struct dentry *)buf;
710
711 return __reiserfs_xattr_del(xadir, name, namelen);
712
713}
714
715/* This is called w/ inode->i_mutex downed */
716int reiserfs_delete_xattrs(struct inode *inode)
717{
718 struct dentry *dir, *root;
719 int err = 0;
720
721 /* Skip out, an xattr has no xattrs associated with it */
722 if (is_reiserfs_priv_object(inode) ||
723 get_inode_sd_version(inode) == STAT_DATA_V1 ||
724 !reiserfs_xattrs(inode->i_sb)) {
725 return 0;
726 }
727 reiserfs_read_lock_xattrs(inode->i_sb);
728 dir = open_xa_dir(inode, FL_READONLY);
729 reiserfs_read_unlock_xattrs(inode->i_sb);
730 if (IS_ERR(dir)) {
731 err = PTR_ERR(dir);
732 goto out;
733 } else if (!dir->d_inode) {
734 dput(dir);
735 return 0;
736 }
737
738 lock_kernel();
739 err = xattr_readdir(dir->d_inode, reiserfs_delete_xattrs_filler, dir);
740 if (err) {
741 unlock_kernel();
742 goto out_dir;
743 }
744
745 /* Leftovers besides . and .. -- that's not good. */
746 if (dir->d_inode->i_nlink <= 2) {
747 root = get_xa_root(inode->i_sb, XATTR_REPLACE);
748 reiserfs_write_lock_xattrs(inode->i_sb);
749 err = vfs_rmdir(root->d_inode, dir);
750 reiserfs_write_unlock_xattrs(inode->i_sb);
751 dput(root);
752 } else {
753 reiserfs_warning(inode->i_sb,
754 "Couldn't remove all entries in directory");
755 }
756 unlock_kernel();
757
758 out_dir:
759 dput(dir);
760
761 out:
762 if (!err)
763 REISERFS_I(inode)->i_flags =
764 REISERFS_I(inode)->i_flags & ~i_has_xattr_dir;
765 return err;
766}
767
768struct reiserfs_chown_buf {
769 struct inode *inode;
770 struct dentry *xadir;
771 struct iattr *attrs;
772}; 702};
773 703
774/* XXX: If there is a better way to do this, I'd love to hear about it */ 704/*
775static int 705 * In order to implement different sets of xattr operations for each xattr
776reiserfs_chown_xattrs_filler(void *buf, const char *name, int namelen, 706 * prefix with the generic xattr API, a filesystem should create a
777 loff_t offset, u64 ino, unsigned int d_type) 707 * null-terminated array of struct xattr_handler (one for each prefix) and
778{ 708 * hang a pointer to it off of the s_xattr field of the superblock.
779 struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf; 709 *
780 struct dentry *xafile, *xadir = chown_buf->xadir; 710 * The generic_fooxattr() functions will use this list to dispatch xattr
781 struct iattr *attrs = chown_buf->attrs; 711 * operations to the correct xattr_handler.
782 int err = 0; 712 */
783 713#define for_each_xattr_handler(handlers, handler) \
784 xafile = lookup_one_len(name, xadir, namelen); 714 for ((handler) = *(handlers)++; \
785 if (IS_ERR(xafile)) 715 (handler) != NULL; \
786 return PTR_ERR(xafile); 716 (handler) = *(handlers)++)
787 else if (!xafile->d_inode) {
788 dput(xafile);
789 return -ENODATA;
790 }
791
792 if (!S_ISDIR(xafile->d_inode->i_mode))
793 err = notify_change(xafile, attrs);
794 dput(xafile);
795
796 return err;
797}
798 717
799int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) 718/* This is the implementation for the xattr plugin infrastructure */
719static inline struct xattr_handler *
720find_xattr_handler_prefix(struct xattr_handler **handlers,
721 const char *name)
800{ 722{
801 struct dentry *dir; 723 struct xattr_handler *xah;
802 int err = 0;
803 struct reiserfs_chown_buf buf;
804 unsigned int ia_valid = attrs->ia_valid;
805 724
806 /* Skip out, an xattr has no xattrs associated with it */ 725 if (!handlers)
807 if (is_reiserfs_priv_object(inode) || 726 return NULL;
808 get_inode_sd_version(inode) == STAT_DATA_V1 ||
809 !reiserfs_xattrs(inode->i_sb)) {
810 return 0;
811 }
812 reiserfs_read_lock_xattrs(inode->i_sb);
813 dir = open_xa_dir(inode, FL_READONLY);
814 reiserfs_read_unlock_xattrs(inode->i_sb);
815 if (IS_ERR(dir)) {
816 if (PTR_ERR(dir) != -ENODATA)
817 err = PTR_ERR(dir);
818 goto out;
819 } else if (!dir->d_inode) {
820 dput(dir);
821 goto out;
822 }
823 727
824 lock_kernel(); 728 for_each_xattr_handler(handlers, xah) {
825 729 if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0)
826 attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME); 730 break;
827 buf.xadir = dir;
828 buf.attrs = attrs;
829 buf.inode = inode;
830
831 err = xattr_readdir(dir->d_inode, reiserfs_chown_xattrs_filler, &buf);
832 if (err) {
833 unlock_kernel();
834 goto out_dir;
835 } 731 }
836 732
837 err = notify_change(dir, attrs); 733 return xah;
838 unlock_kernel();
839
840 out_dir:
841 dput(dir);
842
843 out:
844 attrs->ia_valid = ia_valid;
845 return err;
846} 734}
847 735
848/* Actual operations that are exported to VFS-land */
849 736
850/* 737/*
851 * Inode operation getxattr() 738 * Inode operation getxattr()
852 * Preliminary locking: we down dentry->d_inode->i_mutex
853 */ 739 */
854ssize_t 740ssize_t
855reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, 741reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
856 size_t size) 742 size_t size)
857{ 743{
858 struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name); 744 struct inode *inode = dentry->d_inode;
859 int err; 745 struct xattr_handler *handler;
860 746
861 if (!xah || !reiserfs_xattrs(dentry->d_sb) || 747 handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
862 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 748
749 if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
863 return -EOPNOTSUPP; 750 return -EOPNOTSUPP;
864 751
865 reiserfs_read_lock_xattr_i(dentry->d_inode); 752 return handler->get(inode, name, buffer, size);
866 reiserfs_read_lock_xattrs(dentry->d_sb);
867 err = xah->get(dentry->d_inode, name, buffer, size);
868 reiserfs_read_unlock_xattrs(dentry->d_sb);
869 reiserfs_read_unlock_xattr_i(dentry->d_inode);
870 return err;
871} 753}
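/*
 * Dispatch sketch (the attribute name here is made up): for a name like
 * "user.example", find_xattr_handler_prefix() walks the NULL-terminated
 * s_xattr array and matches each handler's prefix ("user.", "trusted.",
 * ...) with strncmp(), so the lookup resolves to
 * reiserfs_xattr_user_handler and the call chain becomes roughly:
 *
 *	handler = find_xattr_handler_prefix(inode->i_sb->s_xattr,
 *					    "user.example");
 *	if (handler)
 *		err = handler->get(inode, "user.example", buffer, size);
 */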
872 754
873/* 755/*
@@ -879,27 +761,15 @@ int
879reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, 761reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
880 size_t size, int flags) 762 size_t size, int flags)
881{ 763{
882 struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name); 764 struct inode *inode = dentry->d_inode;
883 int err; 765 struct xattr_handler *handler;
884 int lock;
885 766
886 if (!xah || !reiserfs_xattrs(dentry->d_sb) || 767 handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
887 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 768
769 if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
888 return -EOPNOTSUPP; 770 return -EOPNOTSUPP;
889 771
890 reiserfs_write_lock_xattr_i(dentry->d_inode); 772 return handler->set(inode, name, value, size, flags);
891 lock = !has_xattr_dir(dentry->d_inode);
892 if (lock)
893 reiserfs_write_lock_xattrs(dentry->d_sb);
894 else
895 reiserfs_read_lock_xattrs(dentry->d_sb);
896 err = xah->set(dentry->d_inode, name, value, size, flags);
897 if (lock)
898 reiserfs_write_unlock_xattrs(dentry->d_sb);
899 else
900 reiserfs_read_unlock_xattrs(dentry->d_sb);
901 reiserfs_write_unlock_xattr_i(dentry->d_inode);
902 return err;
903} 773}
904 774
905/* 775/*
@@ -909,86 +779,66 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
909 */ 779 */
910int reiserfs_removexattr(struct dentry *dentry, const char *name) 780int reiserfs_removexattr(struct dentry *dentry, const char *name)
911{ 781{
912 int err; 782 struct inode *inode = dentry->d_inode;
913 struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name); 783 struct xattr_handler *handler;
784 handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
914 785
915 if (!xah || !reiserfs_xattrs(dentry->d_sb) || 786 if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
916 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
917 return -EOPNOTSUPP; 787 return -EOPNOTSUPP;
918 788
919 reiserfs_write_lock_xattr_i(dentry->d_inode); 789 return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
920 reiserfs_read_lock_xattrs(dentry->d_sb);
921
922 /* Deletion pre-operation */
923 if (xah->del) {
924 err = xah->del(dentry->d_inode, name);
925 if (err)
926 goto out;
927 }
928
929 err = reiserfs_xattr_del(dentry->d_inode, name);
930
931 dentry->d_inode->i_ctime = CURRENT_TIME_SEC;
932 mark_inode_dirty(dentry->d_inode);
933
934 out:
935 reiserfs_read_unlock_xattrs(dentry->d_sb);
936 reiserfs_write_unlock_xattr_i(dentry->d_inode);
937 return err;
938} 790}
939 791
940/* This is what filldir will use: 792struct listxattr_buf {
941 * r_pos will always contain the amount of space required for the entire 793 size_t size;
942 * list. If r_pos becomes larger than r_size, we need more space and we 794 size_t pos;
943 * return an error indicating this. If r_pos is less than r_size, then we've 795 char *buf;
944 * filled the buffer successfully and we return success */ 796 struct inode *inode;
945struct reiserfs_listxattr_buf {
946 int r_pos;
947 int r_size;
948 char *r_buf;
949 struct inode *r_inode;
950}; 797};
951 798
952static int 799static int listxattr_filler(void *buf, const char *name, int namelen,
953reiserfs_listxattr_filler(void *buf, const char *name, int namelen, 800 loff_t offset, u64 ino, unsigned int d_type)
954 loff_t offset, u64 ino, unsigned int d_type)
955{ 801{
956 struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf; 802 struct listxattr_buf *b = (struct listxattr_buf *)buf;
957 int len = 0; 803 size_t size;
958 if (name[0] != '.' 804 if (name[0] != '.' ||
959 || (namelen != 1 && (name[1] != '.' || namelen != 2))) { 805 (namelen != 1 && (name[1] != '.' || namelen != 2))) {
960 struct reiserfs_xattr_handler *xah = 806 struct xattr_handler *handler;
961 find_xattr_handler_prefix(name); 807 handler = find_xattr_handler_prefix(b->inode->i_sb->s_xattr,
962 if (!xah) 808 name);
963 return 0; /* Unsupported xattr name, skip it */ 809 if (!handler) /* Unsupported xattr name */
964 810 return 0;
965 /* We call ->list() twice because the operation isn't required to just 811 if (b->buf) {
966 * return the name back - we want to make sure we have enough space */ 812 size = handler->list(b->inode, b->buf + b->pos,
967 len += xah->list(b->r_inode, name, namelen, NULL); 813 b->size, name, namelen);
968 814 if (size > b->size)
969 if (len) { 815 return -ERANGE;
970 if (b->r_pos + len + 1 <= b->r_size) { 816 } else {
971 char *p = b->r_buf + b->r_pos; 817 size = handler->list(b->inode, NULL, 0, name, namelen);
972 p += xah->list(b->r_inode, name, namelen, p);
973 *p++ = '\0';
974 }
975 b->r_pos += len + 1;
976 } 818 }
977 }
978 819
820 b->pos += size;
821 }
979 return 0; 822 return 0;
980} 823}
981 824
982/* 825/*
983 * Inode operation listxattr() 826 * Inode operation listxattr()
984 * 827 *
985 * Preliminary locking: we down dentry->d_inode->i_mutex 828 * We totally ignore the generic listxattr here because it would be stupid
829 * not to. Since the xattrs are organized in a directory, we can just
830 * readdir to find them.
986 */ 831 */
987ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) 832ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
988{ 833{
989 struct dentry *dir; 834 struct dentry *dir;
990 int err = 0; 835 int err = 0;
991 struct reiserfs_listxattr_buf buf; 836 loff_t pos = 0;
837 struct listxattr_buf buf = {
838 .inode = dentry->d_inode,
839 .buf = buffer,
840 .size = buffer ? size : 0,
841 };
992 842
993 if (!dentry->d_inode) 843 if (!dentry->d_inode)
994 return -EINVAL; 844 return -EINVAL;
@@ -997,130 +847,104 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
997 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 847 get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
998 return -EOPNOTSUPP; 848 return -EOPNOTSUPP;
999 849
1000 reiserfs_read_lock_xattr_i(dentry->d_inode); 850 dir = open_xa_dir(dentry->d_inode, XATTR_REPLACE);
1001 reiserfs_read_lock_xattrs(dentry->d_sb);
1002 dir = open_xa_dir(dentry->d_inode, FL_READONLY);
1003 reiserfs_read_unlock_xattrs(dentry->d_sb);
1004 if (IS_ERR(dir)) { 851 if (IS_ERR(dir)) {
1005 err = PTR_ERR(dir); 852 err = PTR_ERR(dir);
1006 if (err == -ENODATA) 853 if (err == -ENODATA)
1007 err = 0; /* Not an error if there aren't any xattrs */ 854 err = 0; /* Not an error if there aren't any xattrs */
1008 goto out; 855 goto out;
1009 } 856 }
1010 857
1011 buf.r_buf = buffer; 858 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
1012 buf.r_size = buffer ? size : 0; 859 err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos);
1013 buf.r_pos = 0; 860 mutex_unlock(&dir->d_inode->i_mutex);
1014 buf.r_inode = dentry->d_inode;
1015 861
1016 REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir; 862 if (!err)
1017 863 err = buf.pos;
1018 err = xattr_readdir(dir->d_inode, reiserfs_listxattr_filler, &buf);
1019 if (err)
1020 goto out_dir;
1021
1022 if (buf.r_pos > buf.r_size && buffer != NULL)
1023 err = -ERANGE;
1024 else
1025 err = buf.r_pos;
1026 864
1027 out_dir:
1028 dput(dir); 865 dput(dir);
1029 866out:
1030 out:
1031 reiserfs_read_unlock_xattr_i(dentry->d_inode);
1032 return err; 867 return err;
1033} 868}
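/*
 * The NULL-buffer path above sizes the listing without copying anything,
 * which is what enables the usual two-pass listxattr() pattern from
 * userspace. A minimal consumer sketch (the path is made up and error
 * handling is trimmed):
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/reiserfs/file";
	char *names, *p;
	ssize_t size = listxattr(path, NULL, 0);	/* pass 1: size only */

	if (size <= 0)
		return 0;
	names = malloc(size);
	if (!names)
		return 1;
	size = listxattr(path, names, size);		/* pass 2: fill buffer */
	/* The buffer holds NUL-separated attribute names. */
	for (p = names; p < names + size; p += strlen(p) + 1)
		printf("%s\n", p);
	free(names);
	return 0;
}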
1034 869
1035/* This is the implementation for the xattr plugin infrastructure */ 870static int reiserfs_check_acl(struct inode *inode, int mask)
1036static LIST_HEAD(xattr_handlers);
1037static DEFINE_RWLOCK(handler_lock);
1038
1039static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char
1040 *prefix)
1041{ 871{
1042 struct reiserfs_xattr_handler *xah = NULL; 872 struct posix_acl *acl;
1043 struct list_head *p; 873 int error = -EAGAIN; /* do regular unix permission checks by default */
1044 874
1045 read_lock(&handler_lock); 875 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
1046 list_for_each(p, &xattr_handlers) { 876
1047 xah = list_entry(p, struct reiserfs_xattr_handler, handlers); 877 if (acl) {
1048 if (strncmp(xah->prefix, prefix, strlen(xah->prefix)) == 0) 878 if (!IS_ERR(acl)) {
1049 break; 879 error = posix_acl_permission(inode, acl, mask);
1050 xah = NULL; 880 posix_acl_release(acl);
881 } else if (PTR_ERR(acl) != -ENODATA)
882 error = PTR_ERR(acl);
1051 } 883 }
1052 884
1053 read_unlock(&handler_lock); 885 return error;
1054 return xah;
1055} 886}
1056 887
1057static void __unregister_handlers(void) 888int reiserfs_permission(struct inode *inode, int mask)
1058{ 889{
1059 struct reiserfs_xattr_handler *xah; 890 /*
1060 struct list_head *p, *tmp; 891 * We don't do permission checks on the internal objects.
1061 892 * Permissions are determined by the "owning" object.
1062 list_for_each_safe(p, tmp, &xattr_handlers) { 893 */
1063 xah = list_entry(p, struct reiserfs_xattr_handler, handlers); 894 if (IS_PRIVATE(inode))
1064 if (xah->exit) 895 return 0;
1065 xah->exit(); 896 /*
1066 897 * Stat data v1 doesn't support ACLs.
1067 list_del_init(p); 898 */
1068 } 899 if (get_inode_sd_version(inode) == STAT_DATA_V1)
1069 INIT_LIST_HEAD(&xattr_handlers); 900 return generic_permission(inode, mask, NULL);
901 else
902 return generic_permission(inode, mask, reiserfs_check_acl);
1070} 903}
1071 904
1072int __init reiserfs_xattr_register_handlers(void) 905static int create_privroot(struct dentry *dentry)
1073{ 906{
1074 int err = 0; 907 int err;
1075 struct reiserfs_xattr_handler *xah; 908 struct inode *inode = dentry->d_parent->d_inode;
1076 struct list_head *p; 909 mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR);
1077 910 err = xattr_mkdir(inode, dentry, 0700);
1078 write_lock(&handler_lock); 911 mutex_unlock(&inode->i_mutex);
1079 912 if (err) {
1080 /* If we're already initialized, nothing to do */ 913 dput(dentry);
1081 if (!list_empty(&xattr_handlers)) { 914 dentry = NULL;
1082 write_unlock(&handler_lock);
1083 return 0;
1084 }
1085
1086 /* Add the handlers */
1087 list_add_tail(&user_handler.handlers, &xattr_handlers);
1088 list_add_tail(&trusted_handler.handlers, &xattr_handlers);
1089#ifdef CONFIG_REISERFS_FS_SECURITY
1090 list_add_tail(&security_handler.handlers, &xattr_handlers);
1091#endif
1092#ifdef CONFIG_REISERFS_FS_POSIX_ACL
1093 list_add_tail(&posix_acl_access_handler.handlers, &xattr_handlers);
1094 list_add_tail(&posix_acl_default_handler.handlers, &xattr_handlers);
1095#endif
1096
1097 /* Run initializers, if available */
1098 list_for_each(p, &xattr_handlers) {
1099 xah = list_entry(p, struct reiserfs_xattr_handler, handlers);
1100 if (xah->init) {
1101 err = xah->init();
1102 if (err) {
1103 list_del_init(p);
1104 break;
1105 }
1106 }
1107 } 915 }
1108 916
1109 /* Clean up other handlers, if any failed */ 917 if (dentry && dentry->d_inode)
1110 if (err) 918 reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
1111 __unregister_handlers(); 919 "storage.\n", PRIVROOT_NAME);
1112 920
1113 write_unlock(&handler_lock);
1114 return err; 921 return err;
1115} 922}
1116 923
1117void reiserfs_xattr_unregister_handlers(void) 924static int xattr_mount_check(struct super_block *s)
1118{ 925{
1119 write_lock(&handler_lock); 926 /* We need generation numbers to ensure that the oid mapping is correct;
1120 __unregister_handlers(); 927 * v3.5 filesystems don't have them. */
1121 write_unlock(&handler_lock); 928 if (old_format_only(s)) {
929 if (reiserfs_xattrs_optional(s)) {
930 /* Old format filesystem, but optional xattrs have
931 * been enabled. Error out. */
932 reiserfs_warning(s, "jdm-2005",
933 "xattrs/ACLs not supported "
934 "on pre-v3.6 format filesystems. "
935 "Failing mount.");
936 return -EOPNOTSUPP;
937 }
938 }
939
940 return 0;
1122} 941}
1123 942
943#else
944int __init reiserfs_xattr_register_handlers(void) { return 0; }
945void reiserfs_xattr_unregister_handlers(void) {}
946#endif
947
1124/* This will catch lookups from the fs root to .reiserfs_priv */ 948/* This will catch lookups from the fs root to .reiserfs_priv */
1125static int 949static int
1126xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) 950xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
@@ -1136,7 +960,7 @@ xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
1136 return 1; 960 return 1;
1137} 961}
1138 962
1139static struct dentry_operations xattr_lookup_poison_ops = { 963static const struct dentry_operations xattr_lookup_poison_ops = {
1140 .d_compare = xattr_lookup_poison, 964 .d_compare = xattr_lookup_poison,
1141}; 965};
1142 966
@@ -1147,48 +971,23 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
1147{ 971{
1148 int err = 0; 972 int err = 0;
1149 973
1150 /* We need generation numbers to ensure that the oid mapping is correct 974#ifdef CONFIG_REISERFS_FS_XATTR
1151 * v3.5 filesystems don't have them. */ 975 err = xattr_mount_check(s);
1152 if (!old_format_only(s)) { 976 if (err)
1153 set_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
1154 } else if (reiserfs_xattrs_optional(s)) {
1155 /* Old format filesystem, but optional xattrs have been enabled
1156 * at mount time. Error out. */
1157 reiserfs_warning(s, "xattrs/ACLs not supported on pre v3.6 "
1158 "format filesystem. Failing mount.");
1159 err = -EOPNOTSUPP;
1160 goto error; 977 goto error;
1161 } else { 978#endif
1162 /* Old format filesystem, but no optional xattrs have been enabled. This
1163 * means we silently disable xattrs on the filesystem. */
1164 clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
1165 }
1166 979
1167 /* If we don't have the privroot located yet - go find it */ 980 /* If we don't have the privroot located yet - go find it */
1168 if (reiserfs_xattrs(s) && !REISERFS_SB(s)->priv_root) { 981 if (!REISERFS_SB(s)->priv_root) {
1169 struct dentry *dentry; 982 struct dentry *dentry;
1170 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, 983 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
1171 strlen(PRIVROOT_NAME)); 984 strlen(PRIVROOT_NAME));
1172 if (!IS_ERR(dentry)) { 985 if (!IS_ERR(dentry)) {
1173 if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { 986#ifdef CONFIG_REISERFS_FS_XATTR
1174 struct inode *inode = dentry->d_parent->d_inode; 987 if (!(mount_flags & MS_RDONLY) && !dentry->d_inode)
1175 mutex_lock_nested(&inode->i_mutex, 988 err = create_privroot(dentry);
1176 I_MUTEX_XATTR); 989#endif
1177 err = inode->i_op->mkdir(inode, dentry, 0700); 990 if (!dentry->d_inode) {
1178 mutex_unlock(&inode->i_mutex);
1179 if (err) {
1180 dput(dentry);
1181 dentry = NULL;
1182 }
1183
1184 if (dentry && dentry->d_inode)
1185 reiserfs_warning(s,
1186 "Created %s on %s - reserved for "
1187 "xattr storage.",
1188 PRIVROOT_NAME,
1189 reiserfs_bdevname
1190 (inode->i_sb));
1191 } else if (!dentry->d_inode) {
1192 dput(dentry); 991 dput(dentry);
1193 dentry = NULL; 992 dentry = NULL;
1194 } 993 }
@@ -1197,73 +996,41 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
1197 996
1198 if (!err && dentry) { 997 if (!err && dentry) {
1199 s->s_root->d_op = &xattr_lookup_poison_ops; 998 s->s_root->d_op = &xattr_lookup_poison_ops;
1200 reiserfs_mark_inode_private(dentry->d_inode); 999 dentry->d_inode->i_flags |= S_PRIVATE;
1201 REISERFS_SB(s)->priv_root = dentry; 1000 REISERFS_SB(s)->priv_root = dentry;
1202 } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */ 1001#ifdef CONFIG_REISERFS_FS_XATTR
1203 /* If we're read-only it just means that the dir hasn't been 1002 /* xattrs are unavailable */
1204 * created. Not an error -- just no xattrs on the fs. We'll 1003 } else if (!(mount_flags & MS_RDONLY)) {
1205 * check again if we go read-write */ 1004 /* If we're read-only it just means that the dir
1206 reiserfs_warning(s, "xattrs/ACLs enabled and couldn't " 1005 * hasn't been created. Not an error -- just no
1207 "find/create .reiserfs_priv. Failing mount."); 1006 * xattrs on the fs. We'll check again if we
1007 * go read-write */
1008 reiserfs_warning(s, "jdm-20006",
1009 "xattrs/ACLs enabled and couldn't "
1010 "find/create .reiserfs_priv. "
1011 "Failing mount.");
1208 err = -EOPNOTSUPP; 1012 err = -EOPNOTSUPP;
1013#endif
1209 } 1014 }
1210 } 1015 }
1211 1016
1212 error: 1017#ifdef CONFIG_REISERFS_FS_XATTR
1213 /* This is only nonzero if there was an error initializing the xattr 1018 if (!err)
1214 * directory or if there is a condition where we don't support them. */ 1019 s->s_xattr = reiserfs_xattr_handlers;
1020
1021error:
1215 if (err) { 1022 if (err) {
1216 clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
1217 clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); 1023 clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt));
1218 clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); 1024 clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt));
1219 } 1025 }
1026#endif
1220 1027
1221 /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ 1028 /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
1222 s->s_flags = s->s_flags & ~MS_POSIXACL; 1029 s->s_flags = s->s_flags & ~MS_POSIXACL;
1030#ifdef CONFIG_REISERFS_FS_POSIX_ACL
1223 if (reiserfs_posixacl(s)) 1031 if (reiserfs_posixacl(s))
1224 s->s_flags |= MS_POSIXACL; 1032 s->s_flags |= MS_POSIXACL;
1033#endif
1225 1034
1226 return err; 1035 return err;
1227} 1036}
1228
1229static int reiserfs_check_acl(struct inode *inode, int mask)
1230{
1231 struct posix_acl *acl;
1232 int error = -EAGAIN; /* do regular unix permission checks by default */
1233
1234 reiserfs_read_lock_xattr_i(inode);
1235 reiserfs_read_lock_xattrs(inode->i_sb);
1236
1237 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
1238
1239 reiserfs_read_unlock_xattrs(inode->i_sb);
1240 reiserfs_read_unlock_xattr_i(inode);
1241
1242 if (acl) {
1243 if (!IS_ERR(acl)) {
1244 error = posix_acl_permission(inode, acl, mask);
1245 posix_acl_release(acl);
1246 } else if (PTR_ERR(acl) != -ENODATA)
1247 error = PTR_ERR(acl);
1248 }
1249
1250 return error;
1251}
1252
1253int reiserfs_permission(struct inode *inode, int mask)
1254{
1255 /*
1256 * We don't do permission checks on the internal objects.
1257 * Permissions are determined by the "owning" object.
1258 */
1259 if (is_reiserfs_priv_object(inode))
1260 return 0;
1261
1262 /*
1263 * Stat data v1 doesn't support ACLs.
1264 */
1265 if (get_inode_sd_version(inode) == STAT_DATA_V1)
1266 return generic_permission(inode, mask, NULL);
1267 else
1268 return generic_permission(inode, mask, reiserfs_check_acl);
1269}
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index b7e4fa4539de..c303c426fe2b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -10,15 +10,17 @@
10#include <linux/reiserfs_acl.h> 10#include <linux/reiserfs_acl.h>
11#include <asm/uaccess.h> 11#include <asm/uaccess.h>
12 12
13static int reiserfs_set_acl(struct inode *inode, int type, 13static int reiserfs_set_acl(struct reiserfs_transaction_handle *th,
14 struct inode *inode, int type,
14 struct posix_acl *acl); 15 struct posix_acl *acl);
15 16
16static int 17static int
17xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) 18xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
18{ 19{
19 struct posix_acl *acl; 20 struct posix_acl *acl;
20 int error; 21 int error, error2;
21 22 struct reiserfs_transaction_handle th;
23 size_t jcreate_blocks;
22 if (!reiserfs_posixacl(inode->i_sb)) 24 if (!reiserfs_posixacl(inode->i_sb))
23 return -EOPNOTSUPP; 25 return -EOPNOTSUPP;
24 if (!is_owner_or_cap(inode)) 26 if (!is_owner_or_cap(inode))
@@ -36,7 +38,21 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
36 } else 38 } else
37 acl = NULL; 39 acl = NULL;
38 40
39 error = reiserfs_set_acl(inode, type, acl); 41 /* Pessimism: We can't assume that anything from the xattr root up
42 * has been created. */
43
44 jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) +
45 reiserfs_xattr_nblocks(inode, size) * 2;
46
47 reiserfs_write_lock(inode->i_sb);
48 error = journal_begin(&th, inode->i_sb, jcreate_blocks);
49 if (error == 0) {
50 error = reiserfs_set_acl(&th, inode, type, acl);
51 error2 = journal_end(&th, inode->i_sb, jcreate_blocks);
52 if (error2)
53 error = error2;
54 }
55 reiserfs_write_unlock(inode->i_sb);
40 56
41 release_and_out: 57 release_and_out:
42 posix_acl_release(acl); 58 posix_acl_release(acl);
@@ -172,6 +188,29 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
172 return ERR_PTR(-EINVAL); 188 return ERR_PTR(-EINVAL);
173} 189}
174 190
191static inline void iset_acl(struct inode *inode, struct posix_acl **i_acl,
192 struct posix_acl *acl)
193{
194 spin_lock(&inode->i_lock);
195 if (*i_acl != ERR_PTR(-ENODATA))
196 posix_acl_release(*i_acl);
197 *i_acl = posix_acl_dup(acl);
198 spin_unlock(&inode->i_lock);
199}
200
201static inline struct posix_acl *iget_acl(struct inode *inode,
202 struct posix_acl **i_acl)
203{
204 struct posix_acl *acl = ERR_PTR(-ENODATA);
205
206 spin_lock(&inode->i_lock);
207 if (*i_acl != ERR_PTR(-ENODATA))
208 acl = posix_acl_dup(*i_acl);
209 spin_unlock(&inode->i_lock);
210
211 return acl;
212}
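/*
 * Sketch of the caching protocol above (not part of the patch): the
 * cached slot (*p_acl, selected by ACL type in reiserfs_get_acl() below)
 * holds ERR_PTR(-ENODATA) while nothing is cached, so a reader sees
 * three states:
 *
 *	acl = iget_acl(inode, p_acl);
 *	if (acl && !IS_ERR(acl))
 *		return acl;		// cached, already dup'ed
 *	else if (PTR_ERR(acl) == -ENODATA)
 *		...			// not cached yet; read from disk
 *
 * while iset_acl() releases any previous non-sentinel value under
 * i_lock before caching the new one.
 */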
213
175/* 214/*
176 * Inode operation get_posix_acl(). 215 * Inode operation get_posix_acl().
177 * 216 *
@@ -199,11 +238,11 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
199 return ERR_PTR(-EINVAL); 238 return ERR_PTR(-EINVAL);
200 } 239 }
201 240
202 if (IS_ERR(*p_acl)) { 241 acl = iget_acl(inode, p_acl);
203 if (PTR_ERR(*p_acl) == -ENODATA) 242 if (acl && !IS_ERR(acl))
204 return NULL; 243 return acl;
205 } else if (*p_acl != NULL) 244 else if (PTR_ERR(acl) == -ENODATA)
206 return posix_acl_dup(*p_acl); 245 return NULL;
207 246
208 size = reiserfs_xattr_get(inode, name, NULL, 0); 247 size = reiserfs_xattr_get(inode, name, NULL, 0);
209 if (size < 0) { 248 if (size < 0) {
@@ -229,7 +268,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
229 } else { 268 } else {
230 acl = posix_acl_from_disk(value, retval); 269 acl = posix_acl_from_disk(value, retval);
231 if (!IS_ERR(acl)) 270 if (!IS_ERR(acl))
232 *p_acl = posix_acl_dup(acl); 271 iset_acl(inode, p_acl, acl);
233 } 272 }
234 273
235 kfree(value); 274 kfree(value);
@@ -243,12 +282,13 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
243 * BKL held [before 2.5.x] 282 * BKL held [before 2.5.x]
244 */ 283 */
245static int 284static int
246reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) 285reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
286 int type, struct posix_acl *acl)
247{ 287{
248 char *name; 288 char *name;
249 void *value = NULL; 289 void *value = NULL;
250 struct posix_acl **p_acl; 290 struct posix_acl **p_acl;
251 size_t size; 291 size_t size = 0;
252 int error; 292 int error;
253 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); 293 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
254 294
@@ -285,31 +325,28 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
285 value = posix_acl_to_disk(acl, &size); 325 value = posix_acl_to_disk(acl, &size);
286 if (IS_ERR(value)) 326 if (IS_ERR(value))
287 return (int)PTR_ERR(value); 327 return (int)PTR_ERR(value);
288 error = reiserfs_xattr_set(inode, name, value, size, 0); 328 }
289 } else { 329
290 error = reiserfs_xattr_del(inode, name); 330 error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0);
291 if (error == -ENODATA) { 331
292 /* This may seem odd here, but it means that the ACL was set 332 /*
293 * with a value representable with mode bits. If there was 333 * Ensure that the inode gets dirtied if we're only using
294 * an ACL before, reiserfs_xattr_del already dirtied the inode. 334 * the mode bits and an old ACL didn't exist. We don't need
295 */ 335 * to check if the inode is hashed here since we won't get
336 * called by reiserfs_inherit_default_acl().
337 */
338 if (error == -ENODATA) {
339 error = 0;
340 if (type == ACL_TYPE_ACCESS) {
341 inode->i_ctime = CURRENT_TIME_SEC;
296 mark_inode_dirty(inode); 342 mark_inode_dirty(inode);
297 error = 0;
298 } 343 }
299 } 344 }
300 345
301 kfree(value); 346 kfree(value);
302 347
303 if (!error) { 348 if (!error)
304 /* Release the old one */ 349 iset_acl(inode, p_acl, acl);
305 if (!IS_ERR(*p_acl) && *p_acl)
306 posix_acl_release(*p_acl);
307
308 if (acl == NULL)
309 *p_acl = ERR_PTR(-ENODATA);
310 else
311 *p_acl = posix_acl_dup(acl);
312 }
313 350
314 return error; 351 return error;
315} 352}
@@ -317,7 +354,8 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
317/* dir->i_mutex: locked, 354/* dir->i_mutex: locked,
318 * inode is new and not released into the wild yet */ 355 * inode is new and not released into the wild yet */
319int 356int
320reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, 357reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
358 struct inode *dir, struct dentry *dentry,
321 struct inode *inode) 359 struct inode *inode)
322{ 360{
323 struct posix_acl *acl; 361 struct posix_acl *acl;
@@ -335,8 +373,8 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
335 /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This 373 /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This
336 * would be useless since permissions are ignored, and a pain because 374 * would be useless since permissions are ignored, and a pain because
337 * it introduces locking cycles */ 375 * it introduces locking cycles */
338 if (is_reiserfs_priv_object(dir)) { 376 if (IS_PRIVATE(dir)) {
339 reiserfs_mark_inode_private(inode); 377 inode->i_flags |= S_PRIVATE;
340 goto apply_umask; 378 goto apply_umask;
341 } 379 }
342 380
@@ -354,7 +392,8 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
354 392
355 /* Copy the default ACL to the default ACL of a new directory */ 393 /* Copy the default ACL to the default ACL of a new directory */
356 if (S_ISDIR(inode->i_mode)) { 394 if (S_ISDIR(inode->i_mode)) {
357 err = reiserfs_set_acl(inode, ACL_TYPE_DEFAULT, acl); 395 err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
396 acl);
358 if (err) 397 if (err)
359 goto cleanup; 398 goto cleanup;
360 } 399 }
@@ -375,9 +414,9 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
375 414
376 /* If we need an ACL.. */ 415 /* If we need an ACL.. */
377 if (need_acl > 0) { 416 if (need_acl > 0) {
378 err = 417 err = reiserfs_set_acl(th, inode,
379 reiserfs_set_acl(inode, ACL_TYPE_ACCESS, 418 ACL_TYPE_ACCESS,
380 acl_copy); 419 acl_copy);
381 if (err) 420 if (err)
382 goto cleanup_copy; 421 goto cleanup_copy;
383 } 422 }
@@ -389,31 +428,51 @@ reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
389 } else { 428 } else {
390 apply_umask: 429 apply_umask:
391 /* no ACL, apply umask */ 430 /* no ACL, apply umask */
392 inode->i_mode &= ~current->fs->umask; 431 inode->i_mode &= ~current_umask();
393 } 432 }
394 433
395 return err; 434 return err;
396} 435}
397 436
398/* Looks up and caches the result of the default ACL. 437/* This is used to cache the default acl before a new object is created.
399 * We do this so that we don't need to carry the xattr_sem into 438 * The biggest reason for this is to get an idea of how many blocks will
400 * reiserfs_new_inode if we don't need to */ 439 * actually be required for the create operation if we must inherit an ACL.
440 * An ACL write can add up to 3 object creations and an additional file write
441 * so we'd prefer not to reserve that many blocks in the journal if we can.
442 * It also has the advantage of not loading the ACL with a transaction open;
443 * this may seem silly, but if the owner of the directory is doing the
444 * creation, the ACL may not be loaded since the permissions wouldn't require
445 * it.
446 * We return the number of blocks required for the transaction.
447 */
401int reiserfs_cache_default_acl(struct inode *inode) 448int reiserfs_cache_default_acl(struct inode *inode)
402{ 449{
403 int ret = 0; 450 struct posix_acl *acl;
404 if (reiserfs_posixacl(inode->i_sb) && !is_reiserfs_priv_object(inode)) { 451 int nblocks = 0;
405 struct posix_acl *acl; 452
406 reiserfs_read_lock_xattr_i(inode); 453 if (IS_PRIVATE(inode))
407 reiserfs_read_lock_xattrs(inode->i_sb); 454 return 0;
408 acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); 455
409 reiserfs_read_unlock_xattrs(inode->i_sb); 456 acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT);
410 reiserfs_read_unlock_xattr_i(inode); 457
411 ret = (acl && !IS_ERR(acl)); 458 if (acl && !IS_ERR(acl)) {
412 if (ret) 459 int size = reiserfs_acl_size(acl->a_count);
413 posix_acl_release(acl); 460
461 /* Other xattrs can be created during inode creation. We don't
462 * want to claim too many blocks, so we check to see if we
463 * need to create the tree for the xattrs, and then we
464 * just want two files. */
465 nblocks = reiserfs_xattr_jcreate_nblocks(inode);
466 nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
467
468 REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
469
470 /* We need to account for writes + bitmaps for two files */
471 nblocks += reiserfs_xattr_nblocks(inode, size) * 4;
472 posix_acl_release(acl);
414 } 473 }
415 474
416 return ret; 475 return nblocks;
417} 476}
418 477
419int reiserfs_acl_chmod(struct inode *inode) 478int reiserfs_acl_chmod(struct inode *inode)
@@ -429,9 +488,7 @@ int reiserfs_acl_chmod(struct inode *inode)
429 return 0; 488 return 0;
430 } 489 }
431 490
432 reiserfs_read_lock_xattrs(inode->i_sb);
433 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); 491 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
434 reiserfs_read_unlock_xattrs(inode->i_sb);
435 if (!acl) 492 if (!acl)
436 return 0; 493 return 0;
437 if (IS_ERR(acl)) 494 if (IS_ERR(acl))
@@ -442,18 +499,20 @@ int reiserfs_acl_chmod(struct inode *inode)
442 return -ENOMEM; 499 return -ENOMEM;
443 error = posix_acl_chmod_masq(clone, inode->i_mode); 500 error = posix_acl_chmod_masq(clone, inode->i_mode);
444 if (!error) { 501 if (!error) {
445 int lock = !has_xattr_dir(inode); 502 struct reiserfs_transaction_handle th;
446 reiserfs_write_lock_xattr_i(inode); 503 size_t size = reiserfs_xattr_nblocks(inode,
447 if (lock) 504 reiserfs_acl_size(clone->a_count));
448 reiserfs_write_lock_xattrs(inode->i_sb); 505 reiserfs_write_lock(inode->i_sb);
449 else 506 error = journal_begin(&th, inode->i_sb, size * 2);
450 reiserfs_read_lock_xattrs(inode->i_sb); 507 if (!error) {
451 error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone); 508 int error2;
452 if (lock) 509 error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS,
453 reiserfs_write_unlock_xattrs(inode->i_sb); 510 clone);
454 else 511 error2 = journal_end(&th, inode->i_sb, size * 2);
455 reiserfs_read_unlock_xattrs(inode->i_sb); 512 if (error2)
456 reiserfs_write_unlock_xattr_i(inode); 513 error = error2;
514 }
515 reiserfs_write_unlock(inode->i_sb);
457 } 516 }
458 posix_acl_release(clone); 517 posix_acl_release(clone);
459 return error; 518 return error;
@@ -477,38 +536,22 @@ posix_acl_access_set(struct inode *inode, const char *name,
477 return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); 536 return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
478} 537}
479 538
480static int posix_acl_access_del(struct inode *inode, const char *name) 539static size_t posix_acl_access_list(struct inode *inode, char *list,
540 size_t list_size, const char *name,
541 size_t name_len)
481{ 542{
482 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); 543 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
483 struct posix_acl **acl = &reiserfs_i->i_acl_access;
484 if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1)
485 return -EINVAL;
486 if (!IS_ERR(*acl) && *acl) {
487 posix_acl_release(*acl);
488 *acl = ERR_PTR(-ENODATA);
489 }
490
491 return 0;
492}
493
494static int
495posix_acl_access_list(struct inode *inode, const char *name, int namelen,
496 char *out)
497{
498 int len = namelen;
499 if (!reiserfs_posixacl(inode->i_sb)) 544 if (!reiserfs_posixacl(inode->i_sb))
500 return 0; 545 return 0;
501 if (out) 546 if (list && size <= list_size)
502 memcpy(out, name, len); 547 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
503 548 return size;
504 return len;
505} 549}
506 550
507struct reiserfs_xattr_handler posix_acl_access_handler = { 551struct xattr_handler reiserfs_posix_acl_access_handler = {
508 .prefix = POSIX_ACL_XATTR_ACCESS, 552 .prefix = POSIX_ACL_XATTR_ACCESS,
509 .get = posix_acl_access_get, 553 .get = posix_acl_access_get,
510 .set = posix_acl_access_set, 554 .set = posix_acl_access_set,
511 .del = posix_acl_access_del,
512 .list = posix_acl_access_list, 555 .list = posix_acl_access_list,
513}; 556};
514 557
@@ -530,37 +573,21 @@ posix_acl_default_set(struct inode *inode, const char *name,
530 return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); 573 return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
531} 574}
532 575
533static int posix_acl_default_del(struct inode *inode, const char *name) 576static size_t posix_acl_default_list(struct inode *inode, char *list,
577 size_t list_size, const char *name,
578 size_t name_len)
534{ 579{
535 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); 580 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
536 struct posix_acl **acl = &reiserfs_i->i_acl_default;
537 if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1)
538 return -EINVAL;
539 if (!IS_ERR(*acl) && *acl) {
540 posix_acl_release(*acl);
541 *acl = ERR_PTR(-ENODATA);
542 }
543
544 return 0;
545}
546
547static int
548posix_acl_default_list(struct inode *inode, const char *name, int namelen,
549 char *out)
550{
551 int len = namelen;
552 if (!reiserfs_posixacl(inode->i_sb)) 581 if (!reiserfs_posixacl(inode->i_sb))
553 return 0; 582 return 0;
554 if (out) 583 if (list && size <= list_size)
555 memcpy(out, name, len); 584 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
556 585 return size;
557 return len;
558} 586}
559 587
560struct reiserfs_xattr_handler posix_acl_default_handler = { 588struct xattr_handler reiserfs_posix_acl_default_handler = {
561 .prefix = POSIX_ACL_XATTR_DEFAULT, 589 .prefix = POSIX_ACL_XATTR_DEFAULT,
562 .get = posix_acl_default_get, 590 .get = posix_acl_default_get,
563 .set = posix_acl_default_set, 591 .set = posix_acl_default_set,
564 .del = posix_acl_default_del,
565 .list = posix_acl_default_list, 592 .list = posix_acl_default_list,
566}; 593};
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 056008db1377..4d3c20e787c3 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -4,6 +4,7 @@
4#include <linux/pagemap.h> 4#include <linux/pagemap.h>
5#include <linux/xattr.h> 5#include <linux/xattr.h>
6#include <linux/reiserfs_xattr.h> 6#include <linux/reiserfs_xattr.h>
7#include <linux/security.h>
7#include <asm/uaccess.h> 8#include <asm/uaccess.h>
8 9
9static int 10static int
@@ -12,7 +13,7 @@ security_get(struct inode *inode, const char *name, void *buffer, size_t size)
12 if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) 13 if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
13 return -EINVAL; 14 return -EINVAL;
14 15
15 if (is_reiserfs_priv_object(inode)) 16 if (IS_PRIVATE(inode))
16 return -EPERM; 17 return -EPERM;
17 18
18 return reiserfs_xattr_get(inode, name, buffer, size); 19 return reiserfs_xattr_get(inode, name, buffer, size);
@@ -25,41 +26,84 @@ security_set(struct inode *inode, const char *name, const void *buffer,
25 if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) 26 if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
26 return -EINVAL; 27 return -EINVAL;
27 28
28 if (is_reiserfs_priv_object(inode)) 29 if (IS_PRIVATE(inode))
29 return -EPERM; 30 return -EPERM;
30 31
31 return reiserfs_xattr_set(inode, name, buffer, size, flags); 32 return reiserfs_xattr_set(inode, name, buffer, size, flags);
32} 33}
33 34
34static int security_del(struct inode *inode, const char *name) 35static size_t security_list(struct inode *inode, char *list, size_t list_len,
36 const char *name, size_t namelen)
35{ 37{
36 if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) 38 const size_t len = namelen + 1;
37 return -EINVAL;
38 39
39 if (is_reiserfs_priv_object(inode)) 40 if (IS_PRIVATE(inode))
40 return -EPERM; 41 return 0;
42
43 if (list && len <= list_len) {
44 memcpy(list, name, namelen);
45 list[namelen] = '\0';
46 }
41 47
42 return 0; 48 return len;
43} 49}
44 50
45static int 51/* Initializes the security context for a new inode and returns the number
46security_list(struct inode *inode, const char *name, int namelen, char *out) 52 * of blocks needed for the transaction. If successful, reiserfs_security
53 * must be released using reiserfs_security_free when the caller is done. */
54int reiserfs_security_init(struct inode *dir, struct inode *inode,
55 struct reiserfs_security_handle *sec)
47{ 56{
48 int len = namelen; 57 int blocks = 0;
58 int error = security_inode_init_security(inode, dir, &sec->name,
59 &sec->value, &sec->length);
60 if (error) {
61 if (error == -EOPNOTSUPP)
62 error = 0;
49 63
50 if (is_reiserfs_priv_object(inode)) 64 sec->name = NULL;
51 return 0; 65 sec->value = NULL;
66 sec->length = 0;
67 return error;
68 }
52 69
53 if (out) 70 if (sec->length) {
54 memcpy(out, name, len); 71 blocks = reiserfs_xattr_jcreate_nblocks(inode) +
72 reiserfs_xattr_nblocks(inode, sec->length);
73 /* We don't want to count the directories twice if we have
74 * a default ACL. */
75 REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
76 }
77 return blocks;
78}
55 79
56 return len; 80int reiserfs_security_write(struct reiserfs_transaction_handle *th,
81 struct inode *inode,
82 struct reiserfs_security_handle *sec)
83{
84 int error;
85 if (strlen(sec->name) < sizeof(XATTR_SECURITY_PREFIX))
86 return -EINVAL;
87
88 error = reiserfs_xattr_set_handle(th, inode, sec->name, sec->value,
89 sec->length, XATTR_CREATE);
90 if (error == -ENODATA || error == -EOPNOTSUPP)
91 error = 0;
92
93 return error;
94}
95
96void reiserfs_security_free(struct reiserfs_security_handle *sec)
97{
98 kfree(sec->name);
99 kfree(sec->value);
100 sec->name = NULL;
101 sec->value = NULL;
57} 102}
58 103
59struct reiserfs_xattr_handler security_handler = { 104struct xattr_handler reiserfs_xattr_security_handler = {
60 .prefix = XATTR_SECURITY_PREFIX, 105 .prefix = XATTR_SECURITY_PREFIX,
61 .get = security_get, 106 .get = security_get,
62 .set = security_set, 107 .set = security_set,
63 .del = security_del,
64 .list = security_list, 108 .list = security_list,
65}; 109};
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 60abe2bb1f98..a865042f75e2 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -13,10 +13,7 @@ trusted_get(struct inode *inode, const char *name, void *buffer, size_t size)
13 if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) 13 if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
14 return -EINVAL; 14 return -EINVAL;
15 15
16 if (!reiserfs_xattrs(inode->i_sb)) 16 if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
17 return -EOPNOTSUPP;
18
19 if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
20 return -EPERM; 17 return -EPERM;
21 18
22 return reiserfs_xattr_get(inode, name, buffer, size); 19 return reiserfs_xattr_get(inode, name, buffer, size);
@@ -29,50 +26,30 @@ trusted_set(struct inode *inode, const char *name, const void *buffer,
29 if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) 26 if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
30 return -EINVAL; 27 return -EINVAL;
31 28
32 if (!reiserfs_xattrs(inode->i_sb)) 29 if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
33 return -EOPNOTSUPP;
34
35 if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
36 return -EPERM; 30 return -EPERM;
37 31
38 return reiserfs_xattr_set(inode, name, buffer, size, flags); 32 return reiserfs_xattr_set(inode, name, buffer, size, flags);
39} 33}
40 34
41static int trusted_del(struct inode *inode, const char *name) 35static size_t trusted_list(struct inode *inode, char *list, size_t list_size,
36 const char *name, size_t name_len)
42{ 37{
43 if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) 38 const size_t len = name_len + 1;
44 return -EINVAL;
45 39
46 if (!reiserfs_xattrs(inode->i_sb)) 40 if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
47 return -EOPNOTSUPP;
48
49 if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
50 return -EPERM;
51
52 return 0;
53}
54
55static int
56trusted_list(struct inode *inode, const char *name, int namelen, char *out)
57{
58 int len = namelen;
59
60 if (!reiserfs_xattrs(inode->i_sb))
61 return 0; 41 return 0;
62 42
63 if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) 43 if (list && len <= list_size) {
64 return 0; 44 memcpy(list, name, name_len);
65 45 list[name_len] = '\0';
66 if (out) 46 }
67 memcpy(out, name, len);
68
69 return len; 47 return len;
70} 48}
71 49
72struct reiserfs_xattr_handler trusted_handler = { 50struct xattr_handler reiserfs_xattr_trusted_handler = {
73 .prefix = XATTR_TRUSTED_PREFIX, 51 .prefix = XATTR_TRUSTED_PREFIX,
74 .get = trusted_get, 52 .get = trusted_get,
75 .set = trusted_set, 53 .set = trusted_set,
76 .del = trusted_del,
77 .list = trusted_list, 54 .list = trusted_list,
78}; 55};
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 1384efcb938e..e3238dc4f3db 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -6,10 +6,6 @@
6#include <linux/reiserfs_xattr.h> 6#include <linux/reiserfs_xattr.h>
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
8 8
9#ifdef CONFIG_REISERFS_FS_POSIX_ACL
10# include <linux/reiserfs_acl.h>
11#endif
12
13static int 9static int
14user_get(struct inode *inode, const char *name, void *buffer, size_t size) 10user_get(struct inode *inode, const char *name, void *buffer, size_t size)
15{ 11{
@@ -25,7 +21,6 @@ static int
25user_set(struct inode *inode, const char *name, const void *buffer, 21user_set(struct inode *inode, const char *name, const void *buffer,
26 size_t size, int flags) 22 size_t size, int flags)
27{ 23{
28
29 if (strlen(name) < sizeof(XATTR_USER_PREFIX)) 24 if (strlen(name) < sizeof(XATTR_USER_PREFIX))
30 return -EINVAL; 25 return -EINVAL;
31 26
@@ -34,33 +29,23 @@ user_set(struct inode *inode, const char *name, const void *buffer,
34 return reiserfs_xattr_set(inode, name, buffer, size, flags); 29 return reiserfs_xattr_set(inode, name, buffer, size, flags);
35} 30}
36 31
37static int user_del(struct inode *inode, const char *name) 32static size_t user_list(struct inode *inode, char *list, size_t list_size,
33 const char *name, size_t name_len)
38{ 34{
39 if (strlen(name) < sizeof(XATTR_USER_PREFIX)) 35 const size_t len = name_len + 1;
40 return -EINVAL;
41
42 if (!reiserfs_xattrs_user(inode->i_sb))
43 return -EOPNOTSUPP;
44 return 0;
45}
46 36
47static int
48user_list(struct inode *inode, const char *name, int namelen, char *out)
49{
50 int len = namelen;
51 if (!reiserfs_xattrs_user(inode->i_sb)) 37 if (!reiserfs_xattrs_user(inode->i_sb))
52 return 0; 38 return 0;
53 39 if (list && len <= list_size) {
54 if (out) 40 memcpy(list, name, name_len);
55 memcpy(out, name, len); 41 list[name_len] = '\0';
56 42 }
57 return len; 43 return len;
58} 44}
59 45
60struct reiserfs_xattr_handler user_handler = { 46struct xattr_handler reiserfs_xattr_user_handler = {
61 .prefix = XATTR_USER_PREFIX, 47 .prefix = XATTR_USER_PREFIX,
62 .get = user_get, 48 .get = user_get,
63 .set = user_set, 49 .set = user_set,
64 .del = user_del,
65 .list = user_list, 50 .list = user_list,
66}; 51};
diff --git a/fs/seq_file.c b/fs/seq_file.c
index a1a4cfe19210..7f40f30c55c5 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -513,7 +513,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits,
513} 513}
514EXPORT_SYMBOL(seq_bitmap); 514EXPORT_SYMBOL(seq_bitmap);
515 515
516int seq_bitmap_list(struct seq_file *m, unsigned long *bits, 516int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
517 unsigned int nr_bits) 517 unsigned int nr_bits)
518{ 518{
519 if (m->count < m->size) { 519 if (m->count < m->size) {
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index e7ddd0328ddc..3e4803b4427e 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -277,7 +277,7 @@ static int smb_hash_dentry(struct dentry *, struct qstr *);
277static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *); 277static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
278static int smb_delete_dentry(struct dentry *); 278static int smb_delete_dentry(struct dentry *);
279 279
280static struct dentry_operations smbfs_dentry_operations = 280static const struct dentry_operations smbfs_dentry_operations =
281{ 281{
282 .d_revalidate = smb_lookup_validate, 282 .d_revalidate = smb_lookup_validate,
283 .d_hash = smb_hash_dentry, 283 .d_hash = smb_hash_dentry,
@@ -285,7 +285,7 @@ static struct dentry_operations smbfs_dentry_operations =
285 .d_delete = smb_delete_dentry, 285 .d_delete = smb_delete_dentry,
286}; 286};
287 287
288static struct dentry_operations smbfs_dentry_operations_case = 288static const struct dentry_operations smbfs_dentry_operations_case =
289{ 289{
290 .d_revalidate = smb_lookup_validate, 290 .d_revalidate = smb_lookup_validate,
291 .d_delete = smb_delete_dentry, 291 .d_delete = smb_delete_dentry,
diff --git a/fs/splice.c b/fs/splice.c
index 4ed0ba44a966..dd727d43e5b7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
59 */ 59 */
60 wait_on_page_writeback(page); 60 wait_on_page_writeback(page);
61 61
62 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 62 if (page_has_private(page) &&
63 !try_to_release_page(page, GFP_KERNEL))
63 goto out_unlock; 64 goto out_unlock;
64 65
65 /* 66 /*
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index c837dfc2b3c6..2a7960310349 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -80,7 +80,7 @@ static struct buffer_head *get_block_length(struct super_block *sb,
80 * generated a larger block - this does occasionally happen with zlib). 80 * generated a larger block - this does occasionally happen with zlib).
81 */ 81 */
82int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, 82int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
83 int length, u64 *next_index, int srclength) 83 int length, u64 *next_index, int srclength, int pages)
84{ 84{
85 struct squashfs_sb_info *msblk = sb->s_fs_info; 85 struct squashfs_sb_info *msblk = sb->s_fs_info;
86 struct buffer_head **bh; 86 struct buffer_head **bh;
@@ -184,7 +184,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
184 offset = 0; 184 offset = 0;
185 } 185 }
186 186
187 if (msblk->stream.avail_out == 0) { 187 if (msblk->stream.avail_out == 0 && page < pages) {
188 msblk->stream.next_out = buffer[page++]; 188 msblk->stream.next_out = buffer[page++];
189 msblk->stream.avail_out = PAGE_CACHE_SIZE; 189 msblk->stream.avail_out = PAGE_CACHE_SIZE;
190 } 190 }
@@ -201,25 +201,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
201 zlib_init = 1; 201 zlib_init = 1;
202 } 202 }
203 203
204 zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH); 204 zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
205 205
206 if (msblk->stream.avail_in == 0 && k < b) 206 if (msblk->stream.avail_in == 0 && k < b)
207 put_bh(bh[k++]); 207 put_bh(bh[k++]);
208 } while (zlib_err == Z_OK); 208 } while (zlib_err == Z_OK);
209 209
210 if (zlib_err != Z_STREAM_END) { 210 if (zlib_err != Z_STREAM_END) {
211 ERROR("zlib_inflate returned unexpected result" 211 ERROR("zlib_inflate error, data probably corrupt\n");
212 " 0x%x, srclength %d, avail_in %d,"
213 " avail_out %d\n", zlib_err, srclength,
214 msblk->stream.avail_in,
215 msblk->stream.avail_out);
216 goto release_mutex; 212 goto release_mutex;
217 } 213 }
218 214
219 zlib_err = zlib_inflateEnd(&msblk->stream); 215 zlib_err = zlib_inflateEnd(&msblk->stream);
220 if (zlib_err != Z_OK) { 216 if (zlib_err != Z_OK) {
221 ERROR("zlib_inflateEnd returned unexpected result 0x%x," 217 ERROR("zlib_inflate error, data probably corrupt\n");
222 " srclength %d\n", zlib_err, srclength);
223 goto release_mutex; 218 goto release_mutex;
224 } 219 }
225 length = msblk->stream.total_out; 220 length = msblk->stream.total_out;
@@ -268,7 +263,8 @@ block_release:
268 put_bh(bh[k]); 263 put_bh(bh[k]);
269 264
270read_failure: 265read_failure:
271 ERROR("sb_bread failed reading block 0x%llx\n", cur_index); 266 ERROR("squashfs_read_data failed to read block 0x%llx\n",
267 (unsigned long long) index);
272 kfree(bh); 268 kfree(bh);
273 return -EIO; 269 return -EIO;
274} 270}
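The block.c changes switch zlib to Z_SYNC_FLUSH and, more importantly, bound the output refill with page < pages, so corrupt input can no longer push the decompressor past the caller's buffer array. A userspace analogue of that bounded multi-buffer inflate loop (plain zlib, not kernel code; PAGE_SZ and the helper name are invented for illustration):

#include <zlib.h>
#include <string.h>

#define PAGE_SZ 4096

/* Inflate src into at most npages fixed-size buffers; returns the
 * number of bytes produced, or -1 on error or overflow. */
static int inflate_to_pages(const unsigned char *src, size_t srclen,
			    unsigned char **pages, int npages)
{
	z_stream zs;
	int page = 0, err;

	memset(&zs, 0, sizeof(zs));
	if (inflateInit(&zs) != Z_OK)
		return -1;
	zs.next_in = (unsigned char *)src;
	zs.avail_in = (uInt)srclen;
	do {
		/* refill the output window only while buffers remain */
		if (zs.avail_out == 0 && page < npages) {
			zs.next_out = pages[page++];
			zs.avail_out = PAGE_SZ;
		}
		err = inflate(&zs, Z_SYNC_FLUSH);
	} while (err == Z_OK);
	inflateEnd(&zs);
	return err == Z_STREAM_END ? (int)zs.total_out : -1;
}

If the data claims to inflate to more than npages buffers, inflate() simply runs out of output space and returns Z_BUF_ERROR instead of writing past the array.
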
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f29eda16d25e..1c4739e33af6 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -119,7 +119,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
119 119
120 entry->length = squashfs_read_data(sb, entry->data, 120 entry->length = squashfs_read_data(sb, entry->data,
121 block, length, &entry->next_index, 121 block, length, &entry->next_index,
122 cache->block_size); 122 cache->block_size, cache->pages);
123 123
124 spin_lock(&cache->lock); 124 spin_lock(&cache->lock);
125 125
@@ -406,7 +406,7 @@ int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
406 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) 406 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
407 data[i] = buffer; 407 data[i] = buffer;
408 res = squashfs_read_data(sb, data, block, length | 408 res = squashfs_read_data(sb, data, block, length |
409 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length); 409 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
410 kfree(data); 410 kfree(data);
411 return res; 411 return res;
412} 412}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 7a63398bb855..9101dbde39ec 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -133,7 +133,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
133 type = le16_to_cpu(sqshb_ino->inode_type); 133 type = le16_to_cpu(sqshb_ino->inode_type);
134 switch (type) { 134 switch (type) {
135 case SQUASHFS_REG_TYPE: { 135 case SQUASHFS_REG_TYPE: {
136 unsigned int frag_offset, frag_size, frag; 136 unsigned int frag_offset, frag;
137 int frag_size;
137 u64 frag_blk; 138 u64 frag_blk;
138 struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; 139 struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
139 140
@@ -175,7 +176,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
175 break; 176 break;
176 } 177 }
177 case SQUASHFS_LREG_TYPE: { 178 case SQUASHFS_LREG_TYPE: {
178 unsigned int frag_offset, frag_size, frag; 179 unsigned int frag_offset, frag;
180 int frag_size;
179 u64 frag_blk; 181 u64 frag_blk;
180 struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; 182 struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
181 183
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 6b2515d027d5..0e9feb6adf7e 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -34,7 +34,7 @@ static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
34 34
35/* block.c */ 35/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, 36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int); 37 int, int);
38 38
39/* cache.c */ 39/* cache.c */
40extern struct squashfs_cache *squashfs_cache_init(char *, int, int); 40extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 071df5b5b491..ffa6edcd2d0c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -301,6 +301,7 @@ failure:
301static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) 301static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
302{ 302{
303 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; 303 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
304 u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
304 305
305 TRACE("Entered squashfs_statfs\n"); 306 TRACE("Entered squashfs_statfs\n");
306 307
@@ -311,6 +312,8 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
311 buf->f_files = msblk->inodes; 312 buf->f_files = msblk->inodes;
312 buf->f_ffree = 0; 313 buf->f_ffree = 0;
313 buf->f_namelen = SQUASHFS_NAME_LEN; 314 buf->f_namelen = SQUASHFS_NAME_LEN;
315 buf->f_fsid.val[0] = (u32)id;
316 buf->f_fsid.val[1] = (u32)(id >> 32);
314 317
315 return 0; 318 return 0;
316} 319}
@@ -389,7 +392,7 @@ static int __init init_squashfs_fs(void)
389 return err; 392 return err;
390 } 393 }
391 394
392 printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) " 395 printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) "
393 "Phillip Lougher\n"); 396 "Phillip Lougher\n");
394 397
395 return 0; 398 return 0;
diff --git a/fs/super.c b/fs/super.c
index 8349ed6b1412..77cb4ec919b9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -197,7 +197,7 @@ void deactivate_super(struct super_block *s)
197 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 197 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
198 s->s_count -= S_BIAS-1; 198 s->s_count -= S_BIAS-1;
199 spin_unlock(&sb_lock); 199 spin_unlock(&sb_lock);
200 DQUOT_OFF(s, 0); 200 vfs_dq_off(s, 0);
201 down_write(&s->s_umount); 201 down_write(&s->s_umount);
202 fs->kill_sb(s); 202 fs->kill_sb(s);
203 put_filesystem(fs); 203 put_filesystem(fs);
@@ -266,7 +266,7 @@ EXPORT_SYMBOL(unlock_super);
266void __fsync_super(struct super_block *sb) 266void __fsync_super(struct super_block *sb)
267{ 267{
268 sync_inodes_sb(sb, 0); 268 sync_inodes_sb(sb, 0);
269 DQUOT_SYNC(sb); 269 vfs_dq_sync(sb);
270 lock_super(sb); 270 lock_super(sb);
271 if (sb->s_dirt && sb->s_op->write_super) 271 if (sb->s_dirt && sb->s_op->write_super)
272 sb->s_op->write_super(sb); 272 sb->s_op->write_super(sb);
@@ -287,6 +287,7 @@ int fsync_super(struct super_block *sb)
287 __fsync_super(sb); 287 __fsync_super(sb);
288 return sync_blockdev(sb->s_bdev); 288 return sync_blockdev(sb->s_bdev);
289} 289}
290EXPORT_SYMBOL_GPL(fsync_super);
290 291
291/** 292/**
292 * generic_shutdown_super - common helper for ->kill_sb() 293 * generic_shutdown_super - common helper for ->kill_sb()
@@ -371,8 +372,10 @@ retry:
371 continue; 372 continue;
372 if (!grab_super(old)) 373 if (!grab_super(old))
373 goto retry; 374 goto retry;
374 if (s) 375 if (s) {
376 up_write(&s->s_umount);
375 destroy_super(s); 377 destroy_super(s);
378 }
376 return old; 379 return old;
377 } 380 }
378 } 381 }
@@ -387,6 +390,7 @@ retry:
387 err = set(s, data); 390 err = set(s, data);
388 if (err) { 391 if (err) {
389 spin_unlock(&sb_lock); 392 spin_unlock(&sb_lock);
393 up_write(&s->s_umount);
390 destroy_super(s); 394 destroy_super(s);
391 return ERR_PTR(err); 395 return ERR_PTR(err);
392 } 396 }
@@ -652,7 +656,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
652 mark_files_ro(sb); 656 mark_files_ro(sb);
653 else if (!fs_may_remount_ro(sb)) 657 else if (!fs_may_remount_ro(sb))
654 return -EBUSY; 658 return -EBUSY;
655 retval = DQUOT_OFF(sb, 1); 659 retval = vfs_dq_off(sb, 1);
656 if (retval < 0 && retval != -ENOSYS) 660 if (retval < 0 && retval != -ENOSYS)
657 return -EBUSY; 661 return -EBUSY;
658 } 662 }
@@ -667,11 +671,11 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
667 } 671 }
668 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 672 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
669 if (remount_rw) 673 if (remount_rw)
670 DQUOT_ON_REMOUNT(sb); 674 vfs_dq_quota_on_remount(sb);
671 return 0; 675 return 0;
672} 676}
673 677
674static void do_emergency_remount(unsigned long foo) 678static void do_emergency_remount(struct work_struct *work)
675{ 679{
676 struct super_block *sb; 680 struct super_block *sb;
677 681
@@ -694,12 +698,19 @@ static void do_emergency_remount(unsigned long foo)
694 spin_lock(&sb_lock); 698 spin_lock(&sb_lock);
695 } 699 }
696 spin_unlock(&sb_lock); 700 spin_unlock(&sb_lock);
701 kfree(work);
697 printk("Emergency Remount complete\n"); 702 printk("Emergency Remount complete\n");
698} 703}
699 704
700void emergency_remount(void) 705void emergency_remount(void)
701{ 706{
702 pdflush_operation(do_emergency_remount, 0); 707 struct work_struct *work;
708
709 work = kmalloc(sizeof(*work), GFP_ATOMIC);
710 if (work) {
711 INIT_WORK(work, do_emergency_remount);
712 schedule_work(work);
713 }
703} 714}
704 715
705/* 716/*
@@ -828,7 +839,8 @@ int get_sb_bdev(struct file_system_type *fs_type,
828 bdev->bd_super = s; 839 bdev->bd_super = s;
829 } 840 }
830 841
831 return simple_set_mnt(mnt, s); 842 simple_set_mnt(mnt, s);
843 return 0;
832 844
833error_s: 845error_s:
834 error = PTR_ERR(s); 846 error = PTR_ERR(s);
@@ -874,7 +886,8 @@ int get_sb_nodev(struct file_system_type *fs_type,
874 return error; 886 return error;
875 } 887 }
876 s->s_flags |= MS_ACTIVE; 888 s->s_flags |= MS_ACTIVE;
877 return simple_set_mnt(mnt, s); 889 simple_set_mnt(mnt, s);
890 return 0;
878} 891}
879 892
880EXPORT_SYMBOL(get_sb_nodev); 893EXPORT_SYMBOL(get_sb_nodev);
@@ -906,7 +919,8 @@ int get_sb_single(struct file_system_type *fs_type,
906 s->s_flags |= MS_ACTIVE; 919 s->s_flags |= MS_ACTIVE;
907 } 920 }
908 do_remount_sb(s, flags, data, 0); 921 do_remount_sb(s, flags, data, 0);
909 return simple_set_mnt(mnt, s); 922 simple_set_mnt(mnt, s);
923 return 0;
910} 924}
911 925
912EXPORT_SYMBOL(get_sb_single); 926EXPORT_SYMBOL(get_sb_single);
diff --git a/fs/sync.c b/fs/sync.c
index a16d53e5fe9d..7abc65fbf21d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -25,7 +25,7 @@ static void do_sync(unsigned long wait)
25{ 25{
26 wakeup_pdflush(0); 26 wakeup_pdflush(0);
27 sync_inodes(0); /* All mappings, inodes and their blockdevs */ 27 sync_inodes(0); /* All mappings, inodes and their blockdevs */
28 DQUOT_SYNC(NULL); 28 vfs_dq_sync(NULL);
29 sync_supers(); /* Write the superblocks */ 29 sync_supers(); /* Write the superblocks */
30 sync_filesystems(0); /* Start syncing the filesystems */ 30 sync_filesystems(0); /* Start syncing the filesystems */
31 sync_filesystems(wait); /* Waitingly sync the filesystems */ 31 sync_filesystems(wait); /* Waitingly sync the filesystems */
@@ -42,9 +42,21 @@ SYSCALL_DEFINE0(sync)
42 return 0; 42 return 0;
43} 43}
44 44
45static void do_sync_work(struct work_struct *work)
46{
47 do_sync(0);
48 kfree(work);
49}
50
45void emergency_sync(void) 51void emergency_sync(void)
46{ 52{
47 pdflush_operation(do_sync, 0); 53 struct work_struct *work;
54
55 work = kmalloc(sizeof(*work), GFP_ATOMIC);
56 if (work) {
57 INIT_WORK(work, do_sync_work);
58 schedule_work(work);
59 }
48} 60}
49 61
50/* 62/*
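Both emergency_remount() above and emergency_sync() here replace pdflush_operation() with the same idiom: allocate a work_struct atomically, queue it, and let the handler free it. As a standalone sketch of the pattern (names are illustrative):

#include <linux/workqueue.h>
#include <linux/slab.h>

static void one_shot_fn(struct work_struct *work)
{
	/* ... perform the deferred operation ... */
	kfree(work);	/* the handler owns and frees the work item */
}

static void kick_one_shot(void)
{
	struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

	if (work) {	/* best-effort: silently dropped on OOM, as above */
		INIT_WORK(work, one_shot_fn);
		schedule_work(work);
	}
}

GFP_ATOMIC matters because these entry points can be reached from contexts (such as sysrq) that must not sleep.
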
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index f2c478c3424e..93e0c0281d45 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -21,15 +21,28 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/mm.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26 27
27#include "sysfs.h" 28#include "sysfs.h"
28 29
30/*
31 * There's one bin_buffer for each open file.
32 *
 33 * filp->private_data points to the bin_buffer, and
 34 * sysfs_dirent->s_bin_attr.buffers points to the list of bin_buffers;
 35 * the list is protected by sysfs_bin_lock.
36 */
37static DEFINE_MUTEX(sysfs_bin_lock);
38
29struct bin_buffer { 39struct bin_buffer {
30 struct mutex mutex; 40 struct mutex mutex;
31 void *buffer; 41 void *buffer;
32 int mmapped; 42 int mmapped;
43 struct vm_operations_struct *vm_ops;
44 struct file *file;
45 struct hlist_node list;
33}; 46};
34 47
35static int 48static int
@@ -168,6 +181,175 @@ out_free:
168 return count; 181 return count;
169} 182}
170 183
184static void bin_vma_open(struct vm_area_struct *vma)
185{
186 struct file *file = vma->vm_file;
187 struct bin_buffer *bb = file->private_data;
188 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
189
190 if (!bb->vm_ops || !bb->vm_ops->open)
191 return;
192
193 if (!sysfs_get_active_two(attr_sd))
194 return;
195
196 bb->vm_ops->open(vma);
197
198 sysfs_put_active_two(attr_sd);
199}
200
201static void bin_vma_close(struct vm_area_struct *vma)
202{
203 struct file *file = vma->vm_file;
204 struct bin_buffer *bb = file->private_data;
205 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
206
207 if (!bb->vm_ops || !bb->vm_ops->close)
208 return;
209
210 if (!sysfs_get_active_two(attr_sd))
211 return;
212
213 bb->vm_ops->close(vma);
214
215 sysfs_put_active_two(attr_sd);
216}
217
218static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
219{
220 struct file *file = vma->vm_file;
221 struct bin_buffer *bb = file->private_data;
222 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
223 int ret;
224
225 if (!bb->vm_ops || !bb->vm_ops->fault)
226 return VM_FAULT_SIGBUS;
227
228 if (!sysfs_get_active_two(attr_sd))
229 return VM_FAULT_SIGBUS;
230
231 ret = bb->vm_ops->fault(vma, vmf);
232
233 sysfs_put_active_two(attr_sd);
234 return ret;
235}
236
237static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
238{
239 struct file *file = vma->vm_file;
240 struct bin_buffer *bb = file->private_data;
241 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
242 int ret;
243
244 if (!bb->vm_ops)
245 return VM_FAULT_SIGBUS;
246
247 if (!bb->vm_ops->page_mkwrite)
248 return 0;
249
250 if (!sysfs_get_active_two(attr_sd))
251 return VM_FAULT_SIGBUS;
252
253 ret = bb->vm_ops->page_mkwrite(vma, vmf);
254
255 sysfs_put_active_two(attr_sd);
256 return ret;
257}
258
259static int bin_access(struct vm_area_struct *vma, unsigned long addr,
260 void *buf, int len, int write)
261{
262 struct file *file = vma->vm_file;
263 struct bin_buffer *bb = file->private_data;
264 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
265 int ret;
266
267 if (!bb->vm_ops || !bb->vm_ops->access)
268 return -EINVAL;
269
270 if (!sysfs_get_active_two(attr_sd))
271 return -EINVAL;
272
273 ret = bb->vm_ops->access(vma, addr, buf, len, write);
274
275 sysfs_put_active_two(attr_sd);
276 return ret;
277}
278
279#ifdef CONFIG_NUMA
280static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
281{
282 struct file *file = vma->vm_file;
283 struct bin_buffer *bb = file->private_data;
284 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
285 int ret;
286
287 if (!bb->vm_ops || !bb->vm_ops->set_policy)
288 return 0;
289
290 if (!sysfs_get_active_two(attr_sd))
291 return -EINVAL;
292
293 ret = bb->vm_ops->set_policy(vma, new);
294
295 sysfs_put_active_two(attr_sd);
296 return ret;
297}
298
299static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
300 unsigned long addr)
301{
302 struct file *file = vma->vm_file;
303 struct bin_buffer *bb = file->private_data;
304 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
305 struct mempolicy *pol;
306
307 if (!bb->vm_ops || !bb->vm_ops->get_policy)
308 return vma->vm_policy;
309
310 if (!sysfs_get_active_two(attr_sd))
311 return vma->vm_policy;
312
313 pol = bb->vm_ops->get_policy(vma, addr);
314
315 sysfs_put_active_two(attr_sd);
316 return pol;
317}
318
319static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
320 const nodemask_t *to, unsigned long flags)
321{
322 struct file *file = vma->vm_file;
323 struct bin_buffer *bb = file->private_data;
324 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
325 int ret;
326
327 if (!bb->vm_ops || !bb->vm_ops->migrate)
328 return 0;
329
330 if (!sysfs_get_active_two(attr_sd))
331 return 0;
332
333 ret = bb->vm_ops->migrate(vma, from, to, flags);
334
335 sysfs_put_active_two(attr_sd);
336 return ret;
337}
338#endif
339
340static struct vm_operations_struct bin_vm_ops = {
341 .open = bin_vma_open,
342 .close = bin_vma_close,
343 .fault = bin_fault,
344 .page_mkwrite = bin_page_mkwrite,
345 .access = bin_access,
346#ifdef CONFIG_NUMA
347 .set_policy = bin_set_policy,
348 .get_policy = bin_get_policy,
349 .migrate = bin_migrate,
350#endif
351};
352
171static int mmap(struct file *file, struct vm_area_struct *vma) 353static int mmap(struct file *file, struct vm_area_struct *vma)
172{ 354{
173 struct bin_buffer *bb = file->private_data; 355 struct bin_buffer *bb = file->private_data;
@@ -179,18 +361,37 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
179 mutex_lock(&bb->mutex); 361 mutex_lock(&bb->mutex);
180 362
181 /* need attr_sd for attr, its parent for kobj */ 363 /* need attr_sd for attr, its parent for kobj */
364 rc = -ENODEV;
182 if (!sysfs_get_active_two(attr_sd)) 365 if (!sysfs_get_active_two(attr_sd))
183 return -ENODEV; 366 goto out_unlock;
184 367
185 rc = -EINVAL; 368 rc = -EINVAL;
186 if (attr->mmap) 369 if (!attr->mmap)
187 rc = attr->mmap(kobj, attr, vma); 370 goto out_put;
371
372 rc = attr->mmap(kobj, attr, vma);
373 if (rc)
374 goto out_put;
188 375
189 if (rc == 0 && !bb->mmapped) 376 /*
190 bb->mmapped = 1; 377 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
191 else 378 * to satisfy versions of X which crash if the mmap fails: that
192 sysfs_put_active_two(attr_sd); 379 * substitutes a new vm_file, and we don't then want bin_vm_ops.
380 */
381 if (vma->vm_file != file)
382 goto out_put;
193 383
384 rc = -EINVAL;
385 if (bb->mmapped && bb->vm_ops != vma->vm_ops)
386 goto out_put;
387
388 rc = 0;
389 bb->mmapped = 1;
390 bb->vm_ops = vma->vm_ops;
391 vma->vm_ops = &bin_vm_ops;
392out_put:
393 sysfs_put_active_two(attr_sd);
394out_unlock:
194 mutex_unlock(&bb->mutex); 395 mutex_unlock(&bb->mutex);
195 396
196 return rc; 397 return rc;
@@ -223,8 +424,13 @@ static int open(struct inode * inode, struct file * file)
223 goto err_out; 424 goto err_out;
224 425
225 mutex_init(&bb->mutex); 426 mutex_init(&bb->mutex);
427 bb->file = file;
226 file->private_data = bb; 428 file->private_data = bb;
227 429
430 mutex_lock(&sysfs_bin_lock);
431 hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers);
432 mutex_unlock(&sysfs_bin_lock);
433
228 /* open succeeded, put active references */ 434 /* open succeeded, put active references */
229 sysfs_put_active_two(attr_sd); 435 sysfs_put_active_two(attr_sd);
230 return 0; 436 return 0;
@@ -237,11 +443,12 @@ static int open(struct inode * inode, struct file * file)
237 443
238static int release(struct inode * inode, struct file * file) 444static int release(struct inode * inode, struct file * file)
239{ 445{
240 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
241 struct bin_buffer *bb = file->private_data; 446 struct bin_buffer *bb = file->private_data;
242 447
243 if (bb->mmapped) 448 mutex_lock(&sysfs_bin_lock);
244 sysfs_put_active_two(attr_sd); 449 hlist_del(&bb->list);
450 mutex_unlock(&sysfs_bin_lock);
451
245 kfree(bb->buffer); 452 kfree(bb->buffer);
246 kfree(bb); 453 kfree(bb);
247 return 0; 454 return 0;
@@ -256,6 +463,26 @@ const struct file_operations bin_fops = {
256 .release = release, 463 .release = release,
257}; 464};
258 465
466
467void unmap_bin_file(struct sysfs_dirent *attr_sd)
468{
469 struct bin_buffer *bb;
470 struct hlist_node *tmp;
471
472 if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR)
473 return;
474
475 mutex_lock(&sysfs_bin_lock);
476
477 hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) {
478 struct inode *inode = bb->file->f_path.dentry->d_inode;
479
480 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
481 }
482
483 mutex_unlock(&sysfs_bin_lock);
484}
485
259/** 486/**
260 * sysfs_create_bin_file - create binary file for object. 487 * sysfs_create_bin_file - create binary file for object.
261 * @kobj: object. 488 * @kobj: object.
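The bin.c rework keeps an hlist of open bin_buffers per attribute so unmap_bin_file() can find and revoke every live mapping when the attribute is removed. The bookkeeping itself is a small, reusable pattern; a standalone sketch (type and names invented for illustration):

#include <linux/list.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(track_lock);
static HLIST_HEAD(open_buffers);

struct tracked {
	struct hlist_node list;
	/* ... per-open-file state ... */
};

/* open(): register the buffer so teardown can find it later */
static void track_open(struct tracked *t)
{
	mutex_lock(&track_lock);
	hlist_add_head(&t->list, &open_buffers);
	mutex_unlock(&track_lock);
}

/* release(): unregister under the same lock */
static void track_release(struct tracked *t)
{
	mutex_lock(&track_lock);
	hlist_del(&t->list);
	mutex_unlock(&track_lock);
}

The interposed bin_vm_ops above follow from the same idea: each callback first checks that the sysfs dirent is still active before delegating to the implementation's saved vm_ops.
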
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 82d3b79d0e08..d88d0fac9fa5 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -302,7 +302,7 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
302 iput(inode); 302 iput(inode);
303} 303}
304 304
305static struct dentry_operations sysfs_dentry_ops = { 305static const struct dentry_operations sysfs_dentry_ops = {
306 .d_iput = sysfs_d_iput, 306 .d_iput = sysfs_d_iput,
307}; 307};
308 308
@@ -434,6 +434,26 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
434} 434}
435 435
436/** 436/**
437 * sysfs_pathname - return full path to sysfs dirent
438 * @sd: sysfs_dirent whose path we want
439 * @path: caller allocated buffer
440 *
441 * Gives the name "/" to the sysfs_root entry; any path returned
442 * is relative to wherever sysfs is mounted.
443 *
444 * XXX: does no error checking on @path size
445 */
446static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
447{
448 if (sd->s_parent) {
449 sysfs_pathname(sd->s_parent, path);
450 strcat(path, "/");
451 }
452 strcat(path, sd->s_name);
453 return path;
454}
455
456/**
437 * sysfs_add_one - add sysfs_dirent to parent 457 * sysfs_add_one - add sysfs_dirent to parent
438 * @acxt: addrm context to use 458 * @acxt: addrm context to use
439 * @sd: sysfs_dirent to be added 459 * @sd: sysfs_dirent to be added
@@ -458,8 +478,16 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
458 int ret; 478 int ret;
459 479
460 ret = __sysfs_add_one(acxt, sd); 480 ret = __sysfs_add_one(acxt, sd);
461 WARN(ret == -EEXIST, KERN_WARNING "sysfs: duplicate filename '%s' " 481 if (ret == -EEXIST) {
462 "can not be created\n", sd->s_name); 482 char *path = kzalloc(PATH_MAX, GFP_KERNEL);
483 WARN(1, KERN_WARNING
484 "sysfs: cannot create duplicate filename '%s'\n",
485 (path == NULL) ? sd->s_name :
486 strcat(strcat(sysfs_pathname(acxt->parent_sd, path), "/"),
487 sd->s_name));
488 kfree(path);
489 }
490
463 return ret; 491 return ret;
464} 492}
465 493
@@ -581,6 +609,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
581 609
582 sysfs_drop_dentry(sd); 610 sysfs_drop_dentry(sd);
583 sysfs_deactivate(sd); 611 sysfs_deactivate(sd);
612 unmap_bin_file(sd);
584 sysfs_put(sd); 613 sysfs_put(sd);
585 } 614 }
586} 615}
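sysfs_pathname() above flags its own limitation ("does no error checking on @path size"), which the caller compensates for with a PATH_MAX allocation. A hedged variant that threads the buffer size through the recursion -- not part of the patch, purely illustrative -- could use strlcat():

/* Bounds-checked variant of sysfs_pathname(); illustrative only. */
static char *sysfs_pathname_safe(struct sysfs_dirent *sd,
				 char *path, size_t size)
{
	if (sd->s_parent) {
		sysfs_pathname_safe(sd->s_parent, path, size);
		strlcat(path, "/", size);
	}
	strlcat(path, sd->s_name, size);
	return path;
}

strlcat() truncates rather than overflowing, an acceptable trade-off for a diagnostic message.
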
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1f4a3f877262..289c43a47263 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -659,13 +659,16 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
659EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); 659EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
660 660
661struct sysfs_schedule_callback_struct { 661struct sysfs_schedule_callback_struct {
662 struct kobject *kobj; 662 struct list_head workq_list;
663 struct kobject *kobj;
663 void (*func)(void *); 664 void (*func)(void *);
664 void *data; 665 void *data;
665 struct module *owner; 666 struct module *owner;
666 struct work_struct work; 667 struct work_struct work;
667}; 668};
668 669
670static DEFINE_MUTEX(sysfs_workq_mutex);
671static LIST_HEAD(sysfs_workq);
669static void sysfs_schedule_callback_work(struct work_struct *work) 672static void sysfs_schedule_callback_work(struct work_struct *work)
670{ 673{
671 struct sysfs_schedule_callback_struct *ss = container_of(work, 674 struct sysfs_schedule_callback_struct *ss = container_of(work,
@@ -674,6 +677,9 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
674 (ss->func)(ss->data); 677 (ss->func)(ss->data);
675 kobject_put(ss->kobj); 678 kobject_put(ss->kobj);
676 module_put(ss->owner); 679 module_put(ss->owner);
680 mutex_lock(&sysfs_workq_mutex);
681 list_del(&ss->workq_list);
682 mutex_unlock(&sysfs_workq_mutex);
677 kfree(ss); 683 kfree(ss);
678} 684}
679 685
@@ -695,15 +701,25 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
695 * until @func returns. 701 * until @func returns.
696 * 702 *
697 * Returns 0 if the request was submitted, -ENOMEM if storage could not 703 * Returns 0 if the request was submitted, -ENOMEM if storage could not
698 * be allocated, -ENODEV if a reference to @owner isn't available. 704 * be allocated, -ENODEV if a reference to @owner isn't available,
705 * -EAGAIN if a callback has already been scheduled for @kobj.
699 */ 706 */
700int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), 707int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
701 void *data, struct module *owner) 708 void *data, struct module *owner)
702{ 709{
703 struct sysfs_schedule_callback_struct *ss; 710 struct sysfs_schedule_callback_struct *ss, *tmp;
704 711
705 if (!try_module_get(owner)) 712 if (!try_module_get(owner))
706 return -ENODEV; 713 return -ENODEV;
714
715 mutex_lock(&sysfs_workq_mutex);
716 list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
717 if (ss->kobj == kobj) {
718 mutex_unlock(&sysfs_workq_mutex);
719 return -EAGAIN;
720 }
721 mutex_unlock(&sysfs_workq_mutex);
722
707 ss = kmalloc(sizeof(*ss), GFP_KERNEL); 723 ss = kmalloc(sizeof(*ss), GFP_KERNEL);
708 if (!ss) { 724 if (!ss) {
709 module_put(owner); 725 module_put(owner);
@@ -715,6 +731,10 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
715 ss->data = data; 731 ss->data = data;
716 ss->owner = owner; 732 ss->owner = owner;
717 INIT_WORK(&ss->work, sysfs_schedule_callback_work); 733 INIT_WORK(&ss->work, sysfs_schedule_callback_work);
734 INIT_LIST_HEAD(&ss->workq_list);
735 mutex_lock(&sysfs_workq_mutex);
736 list_add_tail(&ss->workq_list, &sysfs_workq);
737 mutex_unlock(&sysfs_workq_mutex);
718 schedule_work(&ss->work); 738 schedule_work(&ss->work);
719 return 0; 739 return 0;
720} 740}
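With the sysfs_workq list above, a second scheduling request for the same kobject now fails fast instead of queueing a duplicate (which could lead to a double unregister). A hedged caller sketch, where my_cleanup and data stand in for whatever the real caller passes:

int err = sysfs_schedule_callback(kobj, my_cleanup, data, THIS_MODULE);

if (err == -EAGAIN)
	/* a callback for this kobject is already pending; don't requeue */
	pr_debug("callback already scheduled for %s\n", kobject_name(kobj));
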
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index dfa3d94cfc74..555f0ff988df 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -147,6 +147,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
147{ 147{
148 struct bin_attribute *bin_attr; 148 struct bin_attribute *bin_attr;
149 149
150 inode->i_private = sysfs_get(sd);
150 inode->i_mapping->a_ops = &sysfs_aops; 151 inode->i_mapping->a_ops = &sysfs_aops;
151 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 152 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
152 inode->i_op = &sysfs_inode_operations; 153 inode->i_op = &sysfs_inode_operations;
@@ -214,6 +215,22 @@ struct inode * sysfs_get_inode(struct sysfs_dirent *sd)
214 return inode; 215 return inode;
215} 216}
216 217
218/*
219 * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
220 * To prevent the sysfs inode numbers from being freed prematurely we take a
221 * reference to sysfs_dirent from the sysfs inode. A
222 * super_operations.delete_inode() implementation is needed to drop that
223 * reference upon inode destruction.
224 */
225void sysfs_delete_inode(struct inode *inode)
226{
227 struct sysfs_dirent *sd = inode->i_private;
228
229 truncate_inode_pages(&inode->i_data, 0);
230 clear_inode(inode);
231 sysfs_put(sd);
232}
233
217int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 234int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
218{ 235{
219 struct sysfs_addrm_cxt acxt; 236 struct sysfs_addrm_cxt acxt;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index ab343e371d64..49749955ccaf 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -17,11 +17,10 @@
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/magic.h>
20 21
21#include "sysfs.h" 22#include "sysfs.h"
22 23
23/* Random magic number */
24#define SYSFS_MAGIC 0x62656572
25 24
26static struct vfsmount *sysfs_mount; 25static struct vfsmount *sysfs_mount;
27struct super_block * sysfs_sb = NULL; 26struct super_block * sysfs_sb = NULL;
@@ -30,6 +29,7 @@ struct kmem_cache *sysfs_dir_cachep;
30static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
31 .statfs = simple_statfs, 30 .statfs = simple_statfs,
32 .drop_inode = generic_delete_inode, 31 .drop_inode = generic_delete_inode,
32 .delete_inode = sysfs_delete_inode,
33}; 33};
34 34
35struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
@@ -53,7 +53,9 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
53 sysfs_sb = sb; 53 sysfs_sb = sb;
54 54
55 /* get root inode, initialize and unlock it */ 55 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex);
56 inode = sysfs_get_inode(&sysfs_root); 57 inode = sysfs_get_inode(&sysfs_root);
58 mutex_unlock(&sysfs_mutex);
57 if (!inode) { 59 if (!inode) {
58 pr_debug("sysfs: could not get root inode\n"); 60 pr_debug("sysfs: could not get root inode\n");
59 return -ENOMEM; 61 return -ENOMEM;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 93c6d6b27c4d..3fa0d98481e2 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -28,6 +28,7 @@ struct sysfs_elem_attr {
28 28
29struct sysfs_elem_bin_attr { 29struct sysfs_elem_bin_attr {
30 struct bin_attribute *bin_attr; 30 struct bin_attribute *bin_attr;
31 struct hlist_head buffers;
31}; 32};
32 33
33/* 34/*
@@ -145,6 +146,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
145 * inode.c 146 * inode.c
146 */ 147 */
147struct inode *sysfs_get_inode(struct sysfs_dirent *sd); 148struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
149void sysfs_delete_inode(struct inode *inode);
148int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 150int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
149int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 151int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
150int sysfs_inode_init(void); 152int sysfs_inode_init(void);
@@ -163,6 +165,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
163 * bin.c 165 * bin.c
164 */ 166 */
165extern const struct file_operations bin_fops; 167extern const struct file_operations bin_fops;
168void unmap_bin_file(struct sysfs_dirent *attr_sd);
166 169
167/* 170/*
168 * symlink.c 171 * symlink.c
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3d81bf58dae2..da20b48d350f 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -90,6 +90,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
90{ 90{
91 struct super_block *sb = dentry->d_sb; 91 struct super_block *sb = dentry->d_sb;
92 struct sysv_sb_info *sbi = SYSV_SB(sb); 92 struct sysv_sb_info *sbi = SYSV_SB(sb);
93 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
93 94
94 buf->f_type = sb->s_magic; 95 buf->f_type = sb->s_magic;
95 buf->f_bsize = sb->s_blocksize; 96 buf->f_bsize = sb->s_blocksize;
@@ -98,6 +99,8 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
98 buf->f_files = sbi->s_ninodes; 99 buf->f_files = sbi->s_ninodes;
99 buf->f_ffree = sysv_count_free_inodes(sb); 100 buf->f_ffree = sysv_count_free_inodes(sb);
100 buf->f_namelen = SYSV_NAMELEN; 101 buf->f_namelen = SYSV_NAMELEN;
102 buf->f_fsid.val[0] = (u32)id;
103 buf->f_fsid.val[1] = (u32)(id >> 32);
101 return 0; 104 return 0;
102} 105}
103 106
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index a1f1ef33e81c..33e047b59b8d 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -38,7 +38,7 @@ static int sysv_hash(struct dentry *dentry, struct qstr *qstr)
38 return 0; 38 return 0;
39} 39}
40 40
41struct dentry_operations sysv_dentry_operations = { 41const struct dentry_operations sysv_dentry_operations = {
42 .d_hash = sysv_hash, 42 .d_hash = sysv_hash,
43}; 43};
44 44
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 38ebe3f85b3d..5784a318c883 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -170,7 +170,7 @@ extern const struct file_operations sysv_file_operations;
170extern const struct file_operations sysv_dir_operations; 170extern const struct file_operations sysv_dir_operations;
171extern const struct address_space_operations sysv_aops; 171extern const struct address_space_operations sysv_aops;
172extern const struct super_operations sysv_sops; 172extern const struct super_operations sysv_sops;
173extern struct dentry_operations sysv_dentry_operations; 173extern const struct dentry_operations sysv_dentry_operations;
174 174
175 175
176enum { 176enum {
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index e35b54d5059d..830e3f76f442 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -22,7 +22,7 @@ config UBIFS_FS_ADVANCED_COMPR
22 depends on UBIFS_FS 22 depends on UBIFS_FS
23 help 23 help
24 This option allows to explicitly choose which compressions, if any, 24 This option allows to explicitly choose which compressions, if any,
25 are enabled in UBIFS. Removing compressors means inbility to read 25 are enabled in UBIFS. Removing compressors means inability to read
26 existing file systems. 26 existing file systems.
27 27
28 If unsure, say 'N'. 28 If unsure, say 'N'.
@@ -32,7 +32,7 @@ config UBIFS_FS_LZO
32 depends on UBIFS_FS 32 depends on UBIFS_FS
33 default y 33 default y
34 help 34 help
35 LZO compressor is generally faster then zlib but compresses worse. 35 LZO compressor is generally faster than zlib but compresses worse.
36 Say 'Y' if unsure. 36 Say 'Y' if unsure.
37 37
38config UBIFS_FS_ZLIB 38config UBIFS_FS_ZLIB
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 93b6de51f261..0ff89fe71e51 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
1434 * mmap()d file has taken write protection fault and is being made 1434 * mmap()d file has taken write protection fault and is being made
1435 * writable. UBIFS must ensure page is budgeted for. 1435 * writable. UBIFS must ensure page is budgeted for.
1436 */ 1436 */
1437static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) 1437static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1438{ 1438{
1439 struct page *page = vmf->page;
1439 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1440 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1440 struct ubifs_info *c = inode->i_sb->s_fs_info; 1441 struct ubifs_info *c = inode->i_sb->s_fs_info;
1441 struct timespec now = ubifs_current_time(inode); 1442 struct timespec now = ubifs_current_time(inode);
@@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1447 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); 1448 ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
1448 1449
1449 if (unlikely(c->ro_media)) 1450 if (unlikely(c->ro_media))
1450 return -EROFS; 1451 return VM_FAULT_SIGBUS; /* -EROFS */
1451 1452
1452 /* 1453 /*
1453 * We have not locked @page so far so we may budget for changing the 1454 * We have not locked @page so far so we may budget for changing the
@@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1480 if (err == -ENOSPC) 1481 if (err == -ENOSPC)
1481 ubifs_warn("out of space for mmapped file " 1482 ubifs_warn("out of space for mmapped file "
1482 "(inode number %lu)", inode->i_ino); 1483 "(inode number %lu)", inode->i_ino);
1483 return err; 1484 return VM_FAULT_SIGBUS;
1484 } 1485 }
1485 1486
1486 lock_page(page); 1487 lock_page(page);
@@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
1520out_unlock: 1521out_unlock:
1521 unlock_page(page); 1522 unlock_page(page);
1522 ubifs_release_budget(c, &req); 1523 ubifs_release_budget(c, &req);
1524 if (err)
1525 err = VM_FAULT_SIGBUS;
1523 return err; 1526 return err;
1524} 1527}
1525 1528
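UBIFS is adapting here to the new ->page_mkwrite() prototype: the page arrives via vmf->page, and failures must be reported as VM_FAULT_* codes rather than -errno values (hence the -EROFS and -ENOSPC conversions above). The shape of the new contract, as a minimal sketch with a hypothetical error predicate:

static int example_page_mkwrite(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	if (example_cannot_budget(page))	/* hypothetical check */
		return VM_FAULT_SIGBUS;		/* was: return -EIO */
	return 0;				/* page may be made writable */
}
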
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1182b66a5491..c5c98355459a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2034,7 +2034,8 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 	/* 'fill_super()' opens ubi again so we must close it here */
 	ubi_close_volume(ubi);
 
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 
 out_deact:
 	up_write(&sb->s_umount);
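The ubifs_get_sb() hunk follows from another API change in this merge: simple_set_mnt() used to return 0 unconditionally, so "return simple_set_mnt(mnt, sb);" was a common idiom; once the function was made to return void, each caller has to report success itself. The pattern, under that assumption:

	/* old: return simple_set_mnt(mnt, sb);  -- relied on the constant 0 */
	simple_set_mnt(mnt, sb);	/* attach the superblock to the vfsmount */
	return 0;			/* success is now reported explicitly */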
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 1b809bd494bd..e48e9a3af763 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -87,12 +87,12 @@ static int read_block_bitmap(struct super_block *sb,
 {
 	struct buffer_head *bh = NULL;
 	int retval = 0;
-	kernel_lb_addr loc;
+	struct kernel_lb_addr loc;
 
 	loc.logicalBlockNum = bitmap->s_extPosition;
 	loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
 
-	bh = udf_tread(sb, udf_get_lb_pblock(sb, loc, block));
+	bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block));
 	if (!bh)
 		retval = -EIO;
 
@@ -140,27 +140,29 @@ static inline int load_block_bitmap(struct super_block *sb,
 	return slot;
 }
 
-static bool udf_add_free_space(struct udf_sb_info *sbi,
-			       u16 partition, u32 cnt)
+static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
 {
+	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct logicalVolIntegrityDesc *lvid;
 
-	if (sbi->s_lvid_bh == NULL)
-		return false;
+	if (!sbi->s_lvid_bh)
+		return;
 
 	lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
 	le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
-	return true;
+	udf_updated_lvid(sb);
 }
 
 static void udf_bitmap_free_blocks(struct super_block *sb,
 				   struct inode *inode,
 				   struct udf_bitmap *bitmap,
-				   kernel_lb_addr bloc, uint32_t offset,
+				   struct kernel_lb_addr *bloc,
+				   uint32_t offset,
 				   uint32_t count)
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct buffer_head *bh = NULL;
+	struct udf_part_map *partmap;
 	unsigned long block;
 	unsigned long block_group;
 	unsigned long bit;
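The udf_add_free_space() rework above changes its contract: it now takes the super_block, returns nothing, and finishes with udf_updated_lvid(sb), so callers no longer test a boolean result and mark sbi->s_lvid_bh dirty (plus set sb->s_dirt) by hand; the remaining hunks in this file delete exactly those call-site lines. The helper itself is defined outside this diff; judging only by the bookkeeping it absorbs, a minimal sketch would be:

	/* Assumed shape of the new helper, reconstructed from the call-site
	 * lines it replaces; the real definition is not in this diff. */
	void udf_updated_lvid(struct super_block *sb)
	{
		mark_buffer_dirty(UDF_SB(sb)->s_lvid_bh);	/* queue the LVID block */
		sb->s_dirt = 1;		/* have the VFS call write_super() later */
	}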
@@ -169,17 +171,17 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 	unsigned long overflow;
 
 	mutex_lock(&sbi->s_alloc_mutex);
-	if (bloc.logicalBlockNum < 0 ||
-	    (bloc.logicalBlockNum + count) >
-		sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) {
+	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
+	if (bloc->logicalBlockNum < 0 ||
+	    (bloc->logicalBlockNum + count) >
+		partmap->s_partition_len) {
 		udf_debug("%d < %d || %d + %d > %d\n",
-			  bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count,
-			  sbi->s_partmaps[bloc.partitionReferenceNum].
-							s_partition_len);
+			  bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
+			  count, partmap->s_partition_len);
 		goto error_return;
 	}
 
-	block = bloc.logicalBlockNum + offset +
+	block = bloc->logicalBlockNum + offset +
 		(sizeof(struct spaceBitmapDesc) << 3);
 
 	do {
@@ -206,8 +208,8 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 				  ((char *)bh->b_data)[(bit + i) >> 3]);
 		} else {
 			if (inode)
-				DQUOT_FREE_BLOCK(inode, 1);
-			udf_add_free_space(sbi, sbi->s_partition, 1);
+				vfs_dq_free_block(inode, 1);
+			udf_add_free_space(sb, sbi->s_partition, 1);
 		}
 	}
 	mark_buffer_dirty(bh);
@@ -218,9 +220,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 	} while (overflow);
 
 error_return:
-	sb->s_dirt = 1;
-	if (sbi->s_lvid_bh)
-		mark_buffer_dirty(sbi->s_lvid_bh);
 	mutex_unlock(&sbi->s_alloc_mutex);
 }
 
@@ -261,11 +260,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
 	while (bit < (sb->s_blocksize << 3) && block_count > 0) {
 		if (!udf_test_bit(bit, bh->b_data))
 			goto out;
-		else if (DQUOT_PREALLOC_BLOCK(inode, 1))
+		else if (vfs_dq_prealloc_block(inode, 1))
 			goto out;
 		else if (!udf_clear_bit(bit, bh->b_data)) {
 			udf_debug("bit already cleared for block %d\n", bit);
-			DQUOT_FREE_BLOCK(inode, 1);
+			vfs_dq_free_block(inode, 1);
 			goto out;
 		}
 		block_count--;
@@ -277,9 +276,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
 	} while (block_count > 0);
 
 out:
-	if (udf_add_free_space(sbi, partition, -alloc_count))
-		mark_buffer_dirty(sbi->s_lvid_bh);
-	sb->s_dirt = 1;
+	udf_add_free_space(sb, partition, -alloc_count);
 	mutex_unlock(&sbi->s_alloc_mutex);
 	return alloc_count;
 }
@@ -393,7 +390,7 @@ got_block:
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (inode && DQUOT_ALLOC_BLOCK(inode, 1)) {
+	if (inode && vfs_dq_alloc_block(inode, 1)) {
 		mutex_unlock(&sbi->s_alloc_mutex);
 		*err = -EDQUOT;
 		return 0;
@@ -409,9 +406,7 @@ got_block:
 
 	mark_buffer_dirty(bh);
 
-	if (udf_add_free_space(sbi, partition, -1))
-		mark_buffer_dirty(sbi->s_lvid_bh);
-	sb->s_dirt = 1;
+	udf_add_free_space(sb, partition, -1);
 	mutex_unlock(&sbi->s_alloc_mutex);
 	*err = 0;
 	return newblock;
@@ -425,26 +420,28 @@ error_return:
 static void udf_table_free_blocks(struct super_block *sb,
 				  struct inode *inode,
 				  struct inode *table,
-				  kernel_lb_addr bloc, uint32_t offset,
+				  struct kernel_lb_addr *bloc,
+				  uint32_t offset,
 				  uint32_t count)
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *partmap;
 	uint32_t start, end;
 	uint32_t elen;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	struct extent_position oepos, epos;
 	int8_t etype;
 	int i;
 	struct udf_inode_info *iinfo;
 
 	mutex_lock(&sbi->s_alloc_mutex);
-	if (bloc.logicalBlockNum < 0 ||
-	    (bloc.logicalBlockNum + count) >
-	    sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) {
+	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
+	if (bloc->logicalBlockNum < 0 ||
+	    (bloc->logicalBlockNum + count) >
+		partmap->s_partition_len) {
 		udf_debug("%d < %d || %d + %d > %d\n",
 			  bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count,
-			  sbi->s_partmaps[bloc.partitionReferenceNum].
-							s_partition_len);
+			  partmap->s_partition_len);
 		goto error_return;
 	}
 
@@ -452,12 +449,11 @@ static void udf_table_free_blocks(struct super_block *sb,
 	/* We do this up front - There are some error conditions that
 	   could occure, but.. oh well */
 	if (inode)
-		DQUOT_FREE_BLOCK(inode, count);
-	if (udf_add_free_space(sbi, sbi->s_partition, count))
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		vfs_dq_free_block(inode, count);
+	udf_add_free_space(sb, sbi->s_partition, count);
 
-	start = bloc.logicalBlockNum + offset;
-	end = bloc.logicalBlockNum + offset + count - 1;
+	start = bloc->logicalBlockNum + offset;
+	end = bloc->logicalBlockNum + offset + count - 1;
 
 	epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
 	elen = 0;
@@ -483,7 +479,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 				start += count;
 				count = 0;
 			}
-			udf_write_aext(table, &oepos, eloc, elen, 1);
+			udf_write_aext(table, &oepos, &eloc, elen, 1);
 		} else if (eloc.logicalBlockNum == (end + 1)) {
 			if ((0x3FFFFFFF - elen) <
 					(count << sb->s_blocksize_bits)) {
@@ -502,7 +498,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 				end -= count;
 				count = 0;
 			}
-			udf_write_aext(table, &oepos, eloc, elen, 1);
+			udf_write_aext(table, &oepos, &eloc, elen, 1);
 		}
 
 		if (epos.bh != oepos.bh) {
@@ -532,8 +528,8 @@ static void udf_table_free_blocks(struct super_block *sb,
 		 */
 
 		int adsize;
-		short_ad *sad = NULL;
-		long_ad *lad = NULL;
+		struct short_ad *sad = NULL;
+		struct long_ad *lad = NULL;
 		struct allocExtDesc *aed;
 
 		eloc.logicalBlockNum = start;
@@ -541,9 +537,9 @@ static void udf_table_free_blocks(struct super_block *sb,
 			(count << sb->s_blocksize_bits);
 
 		if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-			adsize = sizeof(short_ad);
+			adsize = sizeof(struct short_ad);
 		else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-			adsize = sizeof(long_ad);
+			adsize = sizeof(struct long_ad);
 		else {
 			brelse(oepos.bh);
 			brelse(epos.bh);
@@ -563,7 +559,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 			elen -= sb->s_blocksize;
 
 			epos.bh = udf_tread(sb,
-					udf_get_lb_pblock(sb, epos.block, 0));
+					udf_get_lb_pblock(sb, &epos.block, 0));
 			if (!epos.bh) {
 				brelse(oepos.bh);
 				goto error_return;
@@ -601,15 +597,15 @@ static void udf_table_free_blocks(struct super_block *sb,
 			if (sbi->s_udfrev >= 0x0200)
 				udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
 					    3, 1, epos.block.logicalBlockNum,
-					    sizeof(tag));
+					    sizeof(struct tag));
 			else
 				udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
 					    2, 1, epos.block.logicalBlockNum,
-					    sizeof(tag));
+					    sizeof(struct tag));
 
 			switch (iinfo->i_alloc_type) {
 			case ICBTAG_FLAG_AD_SHORT:
-				sad = (short_ad *)sptr;
+				sad = (struct short_ad *)sptr;
 				sad->extLength = cpu_to_le32(
 					EXT_NEXT_EXTENT_ALLOCDECS |
 					sb->s_blocksize);
@@ -617,7 +613,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 					cpu_to_le32(epos.block.logicalBlockNum);
 				break;
 			case ICBTAG_FLAG_AD_LONG:
-				lad = (long_ad *)sptr;
+				lad = (struct long_ad *)sptr;
 				lad->extLength = cpu_to_le32(
 					EXT_NEXT_EXTENT_ALLOCDECS |
 					sb->s_blocksize);
@@ -635,7 +631,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 
 		/* It's possible that stealing the block emptied the extent */
 		if (elen) {
-			udf_write_aext(table, &epos, eloc, elen, 1);
+			udf_write_aext(table, &epos, &eloc, elen, 1);
 
 			if (!epos.bh) {
 				iinfo->i_lenAlloc += adsize;
@@ -653,7 +649,6 @@ static void udf_table_free_blocks(struct super_block *sb,
 	brelse(oepos.bh);
 
 error_return:
-	sb->s_dirt = 1;
 	mutex_unlock(&sbi->s_alloc_mutex);
 	return;
 }
@@ -666,7 +661,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	int alloc_count = 0;
 	uint32_t elen, adsize;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	struct extent_position epos;
 	int8_t etype = -1;
 	struct udf_inode_info *iinfo;
@@ -677,9 +672,9 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 
 	iinfo = UDF_I(table);
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		return 0;
 
@@ -700,14 +695,14 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 		epos.offset -= adsize;
 
 		alloc_count = (elen >> sb->s_blocksize_bits);
-		if (inode && DQUOT_PREALLOC_BLOCK(inode,
+		if (inode && vfs_dq_prealloc_block(inode,
 			alloc_count > block_count ? block_count : alloc_count))
 			alloc_count = 0;
 		else if (alloc_count > block_count) {
 			alloc_count = block_count;
 			eloc.logicalBlockNum += alloc_count;
 			elen -= (alloc_count << sb->s_blocksize_bits);
-			udf_write_aext(table, &epos, eloc,
+			udf_write_aext(table, &epos, &eloc,
 				       (etype << 30) | elen, 1);
 		} else
 			udf_delete_aext(table, epos, eloc,
@@ -718,10 +713,8 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 
 	brelse(epos.bh);
 
-	if (alloc_count && udf_add_free_space(sbi, partition, -alloc_count)) {
-		mark_buffer_dirty(sbi->s_lvid_bh);
-		sb->s_dirt = 1;
-	}
+	if (alloc_count)
+		udf_add_free_space(sb, partition, -alloc_count);
 	mutex_unlock(&sbi->s_alloc_mutex);
 	return alloc_count;
 }
@@ -735,7 +728,7 @@ static int udf_table_new_block(struct super_block *sb,
 	uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF;
 	uint32_t newblock = 0, adsize;
 	uint32_t elen, goal_elen = 0;
-	kernel_lb_addr eloc, uninitialized_var(goal_eloc);
+	struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
 	struct extent_position epos, goal_epos;
 	int8_t etype;
 	struct udf_inode_info *iinfo = UDF_I(table);
@@ -743,9 +736,9 @@ static int udf_table_new_block(struct super_block *sb,
 	*err = -ENOSPC;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-		adsize = sizeof(short_ad);
+		adsize = sizeof(struct short_ad);
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-		adsize = sizeof(long_ad);
+		adsize = sizeof(struct long_ad);
 	else
 		return newblock;
 
@@ -806,7 +799,7 @@ static int udf_table_new_block(struct super_block *sb,
 	goal_eloc.logicalBlockNum++;
 	goal_elen -= sb->s_blocksize;
 
-	if (inode && DQUOT_ALLOC_BLOCK(inode, 1)) {
+	if (inode && vfs_dq_alloc_block(inode, 1)) {
 		brelse(goal_epos.bh);
 		mutex_unlock(&sbi->s_alloc_mutex);
 		*err = -EDQUOT;
@@ -814,46 +807,37 @@
 	}
 
 	if (goal_elen)
-		udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1);
+		udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
 	else
 		udf_delete_aext(table, goal_epos, goal_eloc, goal_elen);
 	brelse(goal_epos.bh);
 
-	if (udf_add_free_space(sbi, partition, -1))
-		mark_buffer_dirty(sbi->s_lvid_bh);
+	udf_add_free_space(sb, partition, -1);
 
-	sb->s_dirt = 1;
 	mutex_unlock(&sbi->s_alloc_mutex);
 	*err = 0;
 	return newblock;
 }
 
-inline void udf_free_blocks(struct super_block *sb,
-			    struct inode *inode,
-			    kernel_lb_addr bloc, uint32_t offset,
-			    uint32_t count)
+void udf_free_blocks(struct super_block *sb, struct inode *inode,
+		     struct kernel_lb_addr *bloc, uint32_t offset,
		     uint32_t count)
 {
-	uint16_t partition = bloc.partitionReferenceNum;
+	uint16_t partition = bloc->partitionReferenceNum;
 	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
 
 	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
-		return udf_bitmap_free_blocks(sb, inode,
-					      map->s_uspace.s_bitmap,
-					      bloc, offset, count);
+		udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap,
+				       bloc, offset, count);
 	} else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
-		return udf_table_free_blocks(sb, inode,
-					     map->s_uspace.s_table,
-					     bloc, offset, count);
+		udf_table_free_blocks(sb, inode, map->s_uspace.s_table,
+				      bloc, offset, count);
 	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
-		return udf_bitmap_free_blocks(sb, inode,
-					      map->s_fspace.s_bitmap,
-					      bloc, offset, count);
+		udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap,
				       bloc, offset, count);
 	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
-		return udf_table_free_blocks(sb, inode,
-					     map->s_fspace.s_table,
-					     bloc, offset, count);
-	} else {
-		return;
+		udf_table_free_blocks(sb, inode, map->s_fspace.s_table,
+				      bloc, offset, count);
 	}
 }
 
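Taken together, the balloc.c hunks make two mechanical passes: every kernel_lb_addr that used to be copied by value into udf_free_blocks(), udf_get_lb_pblock(), udf_write_aext() and the internal free/prealloc helpers is now passed by pointer, and the per-call-site LVID bookkeeping collapses into udf_add_free_space(). The pointer change is visible at every caller; the fs/udf/ialloc.c hunk further down, for example, becomes:

	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);	/* address, not a by-value struct copy */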
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 62dc270c69d1..2efd4d5291b6 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -51,7 +51,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 	uint8_t lfi;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
 	struct buffer_head *tmp, *bha[16];
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	int i, num, ret = 0;
@@ -80,13 +80,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 			ret = -ENOENT;
 			goto out;
 		}
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (iinfo->i_alloc_type ==
 					ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else {
 			offset = 0;
 		}
@@ -101,7 +101,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 		if (i + offset > (elen >> dir->i_sb->s_blocksize_bits))
 			i = (elen >> dir->i_sb->s_blocksize_bits) - offset;
 		for (num = 0; i > 0; i--) {
-			block = udf_get_lb_pblock(dir->i_sb, eloc, offset + i);
+			block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i);
 			tmp = udf_tgetblk(dir->i_sb, block);
 			if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
 				bha[num++] = tmp;
@@ -161,9 +161,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 			memcpy(fname, "..", flen);
 			dt_type = DT_DIR;
 		} else {
-			kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
+			struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
 
-			iblock = udf_get_lb_pblock(dir->i_sb, tloc, 0);
+			iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
 			flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
 			dt_type = DT_UNKNOWN;
 		}
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2820f8fcf4cc..1d2c570704c8 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -20,7 +20,7 @@
 
 #if 0
 static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
-				uint8_t ad_size, kernel_lb_addr fe_loc,
+				uint8_t ad_size, struct kernel_lb_addr fe_loc,
 				int *pos, int *offset, struct buffer_head **bh,
 				int *error)
 {
@@ -75,7 +75,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 					 struct udf_fileident_bh *fibh,
 					 struct fileIdentDesc *cfi,
 					 struct extent_position *epos,
-					 kernel_lb_addr *eloc, uint32_t *elen,
+					 struct kernel_lb_addr *eloc, uint32_t *elen,
 					 sector_t *offset)
 {
 	struct fileIdentDesc *fi;
@@ -111,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 				(EXT_RECORDED_ALLOCATED >> 30))
 			return NULL;
 
-		block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset);
+		block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
 
 		(*offset)++;
 
@@ -131,7 +131,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 			if (i + *offset > (*elen >> blocksize_bits))
 				i = (*elen >> blocksize_bits)-*offset;
 			for (num = 0; i > 0; i--) {
-				block = udf_get_lb_pblock(dir->i_sb, *eloc,
+				block = udf_get_lb_pblock(dir->i_sb, eloc,
 							  *offset + i);
 				tmp = udf_tgetblk(dir->i_sb, block);
 				if (tmp && !buffer_uptodate(tmp) &&
@@ -169,7 +169,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 				(EXT_RECORDED_ALLOCATED >> 30))
 			return NULL;
 
-		block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset);
+		block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
 
 		(*offset)++;
 
@@ -249,9 +249,9 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
 }
 
 #if 0
-static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
+static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
 {
-	extent_ad *ext;
+	struct extent_ad *ext;
 	struct fileEntry *fe;
 	uint8_t *ptr;
 
@@ -274,54 +274,54 @@ static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
 	if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
 		ptr += *offset;
 
-	ext = (extent_ad *)ptr;
+	ext = (struct extent_ad *)ptr;
 
-	*offset = *offset + sizeof(extent_ad);
+	*offset = *offset + sizeof(struct extent_ad);
 	return ext;
 }
 #endif
 
-short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
+struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
 			      int inc)
 {
-	short_ad *sa;
+	struct short_ad *sa;
 
 	if ((!ptr) || (!offset)) {
 		printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n");
 		return NULL;
 	}
 
-	if ((*offset + sizeof(short_ad)) > maxoffset)
+	if ((*offset + sizeof(struct short_ad)) > maxoffset)
 		return NULL;
 	else {
-		sa = (short_ad *)ptr;
+		sa = (struct short_ad *)ptr;
 		if (sa->extLength == 0)
 			return NULL;
 	}
 
 	if (inc)
-		*offset += sizeof(short_ad);
+		*offset += sizeof(struct short_ad);
 	return sa;
 }
 
-long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
+struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
 {
-	long_ad *la;
+	struct long_ad *la;
 
 	if ((!ptr) || (!offset)) {
 		printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n");
 		return NULL;
 	}
 
-	if ((*offset + sizeof(long_ad)) > maxoffset)
+	if ((*offset + sizeof(struct long_ad)) > maxoffset)
 		return NULL;
 	else {
-		la = (long_ad *)ptr;
+		la = (struct long_ad *)ptr;
 		if (la->extLength == 0)
 			return NULL;
 	}
 
 	if (inc)
-		*offset += sizeof(long_ad);
+		*offset += sizeof(struct long_ad);
 	return la;
 }
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index a0974df82b31..4792b771aa80 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -38,10 +38,10 @@
 #define _ECMA_167_H 1
 
 /* Character set specification (ECMA 167r3 1/7.2.1) */
-typedef struct {
+struct charspec {
 	uint8_t charSetType;
 	uint8_t charSetInfo[63];
-} __attribute__ ((packed)) charspec;
+} __attribute__ ((packed));
 
 /* Character Set Type (ECMA 167r3 1/7.2.1.1) */
 #define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */
@@ -57,7 +57,7 @@ typedef struct {
 typedef uint8_t dstring;
 
 /* Timestamp (ECMA 167r3 1/7.3) */
-typedef struct {
+struct timestamp {
 	__le16 typeAndTimezone;
 	__le16 year;
 	uint8_t month;
@@ -68,7 +68,7 @@ typedef struct {
 	uint8_t centiseconds;
 	uint8_t hundredsOfMicroseconds;
 	uint8_t microseconds;
-} __attribute__ ((packed)) timestamp;
+} __attribute__ ((packed));
 
 /* Type and Time Zone (ECMA 167r3 1/7.3.1) */
 #define TIMESTAMP_TYPE_MASK 0xF000
@@ -78,11 +78,11 @@ typedef struct {
 #define TIMESTAMP_TIMEZONE_MASK 0x0FFF
 
 /* Entity identifier (ECMA 167r3 1/7.4) */
-typedef struct {
+struct regid {
 	uint8_t flags;
 	uint8_t ident[23];
 	uint8_t identSuffix[8];
-} __attribute__ ((packed)) regid;
+} __attribute__ ((packed));
 
 /* Flags (ECMA 167r3 1/7.4.1) */
 #define ENTITYID_FLAGS_DIRTY 0x00
@@ -126,38 +126,38 @@ struct terminatingExtendedAreaDesc {
 
 /* Boot Descriptor (ECMA 167r3 2/9.4) */
 struct bootDesc {
 	uint8_t structType;
 	uint8_t stdIdent[VSD_STD_ID_LEN];
 	uint8_t structVersion;
 	uint8_t reserved1;
-	regid archType;
-	regid bootIdent;
+	struct regid archType;
+	struct regid bootIdent;
 	__le32 bootExtLocation;
 	__le32 bootExtLength;
 	__le64 loadAddress;
 	__le64 startAddress;
-	timestamp descCreationDateAndTime;
+	struct timestamp descCreationDateAndTime;
 	__le16 flags;
 	uint8_t reserved2[32];
 	uint8_t bootUse[1906];
 } __attribute__ ((packed));
 
 /* Flags (ECMA 167r3 2/9.4.12) */
 #define BOOT_FLAGS_ERASE 0x01
 
 /* Extent Descriptor (ECMA 167r3 3/7.1) */
-typedef struct {
+struct extent_ad {
 	__le32 extLength;
 	__le32 extLocation;
-} __attribute__ ((packed)) extent_ad;
+} __attribute__ ((packed));
 
-typedef struct {
+struct kernel_extent_ad {
 	uint32_t extLength;
 	uint32_t extLocation;
-} kernel_extent_ad;
+};
 
 /* Descriptor Tag (ECMA 167r3 3/7.2) */
-typedef struct {
+struct tag {
 	__le16 tagIdent;
 	__le16 descVersion;
 	uint8_t tagChecksum;
@@ -166,7 +166,7 @@ typedef struct {
 	__le16 descCRC;
 	__le16 descCRCLength;
 	__le32 tagLocation;
-} __attribute__ ((packed)) tag;
+} __attribute__ ((packed));
 
 /* Tag Identifier (ECMA 167r3 3/7.2.1) */
 #define TAG_IDENT_PVD 0x0001
@@ -190,28 +190,28 @@ struct NSRDesc {
 
 /* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
 struct primaryVolDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 volDescSeqNum;
 	__le32 primaryVolDescNum;
 	dstring volIdent[32];
 	__le16 volSeqNum;
 	__le16 maxVolSeqNum;
 	__le16 interchangeLvl;
 	__le16 maxInterchangeLvl;
 	__le32 charSetList;
 	__le32 maxCharSetList;
 	dstring volSetIdent[128];
-	charspec descCharSet;
-	charspec explanatoryCharSet;
-	extent_ad volAbstract;
-	extent_ad volCopyright;
-	regid appIdent;
-	timestamp recordingDateAndTime;
-	regid impIdent;
+	struct charspec descCharSet;
+	struct charspec explanatoryCharSet;
+	struct extent_ad volAbstract;
+	struct extent_ad volCopyright;
+	struct regid appIdent;
+	struct timestamp recordingDateAndTime;
+	struct regid impIdent;
 	uint8_t impUse[64];
 	__le32 predecessorVolDescSeqLocation;
 	__le16 flags;
 	uint8_t reserved[22];
 } __attribute__ ((packed));
 
 /* Flags (ECMA 167r3 3/10.1.21) */
@@ -219,40 +219,40 @@ struct primaryVolDesc {
 
 /* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */
 struct anchorVolDescPtr {
-	tag descTag;
-	extent_ad mainVolDescSeqExt;
-	extent_ad reserveVolDescSeqExt;
+	struct tag descTag;
+	struct extent_ad mainVolDescSeqExt;
+	struct extent_ad reserveVolDescSeqExt;
 	uint8_t reserved[480];
 } __attribute__ ((packed));
 
 /* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */
 struct volDescPtr {
-	tag descTag;
+	struct tag descTag;
 	__le32 volDescSeqNum;
-	extent_ad nextVolDescSeqExt;
+	struct extent_ad nextVolDescSeqExt;
 	uint8_t reserved[484];
 } __attribute__ ((packed));
 
 /* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */
 struct impUseVolDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 volDescSeqNum;
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[460];
 } __attribute__ ((packed));
 
 /* Partition Descriptor (ECMA 167r3 3/10.5) */
 struct partitionDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 volDescSeqNum;
 	__le16 partitionFlags;
 	__le16 partitionNumber;
-	regid partitionContents;
+	struct regid partitionContents;
 	uint8_t partitionContentsUse[128];
 	__le32 accessType;
 	__le32 partitionStartingLocation;
 	__le32 partitionLength;
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[128];
 	uint8_t reserved[156];
 } __attribute__ ((packed));
@@ -278,19 +278,19 @@ struct partitionDesc {
 
 /* Logical Volume Descriptor (ECMA 167r3 3/10.6) */
 struct logicalVolDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 volDescSeqNum;
-	charspec descCharSet;
+	struct charspec descCharSet;
 	dstring logicalVolIdent[128];
 	__le32 logicalBlockSize;
-	regid domainIdent;
+	struct regid domainIdent;
 	uint8_t logicalVolContentsUse[16];
 	__le32 mapTableLength;
 	__le32 numPartitionMaps;
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[128];
-	extent_ad integritySeqExt;
+	struct extent_ad integritySeqExt;
 	uint8_t partitionMaps[0];
 } __attribute__ ((packed));
 
 /* Generic Partition Map (ECMA 167r3 3/10.7.1) */
@@ -322,30 +322,30 @@ struct genericPartitionMap2 {
 
 /* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */
 struct unallocSpaceDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 volDescSeqNum;
 	__le32 numAllocDescs;
-	extent_ad allocDescs[0];
+	struct extent_ad allocDescs[0];
 } __attribute__ ((packed));
 
 /* Terminating Descriptor (ECMA 167r3 3/10.9) */
 struct terminatingDesc {
-	tag descTag;
+	struct tag descTag;
 	uint8_t reserved[496];
 } __attribute__ ((packed));
 
 /* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */
 struct logicalVolIntegrityDesc {
-	tag descTag;
-	timestamp recordingDateAndTime;
+	struct tag descTag;
+	struct timestamp recordingDateAndTime;
 	__le32 integrityType;
-	extent_ad nextIntegrityExt;
+	struct extent_ad nextIntegrityExt;
 	uint8_t logicalVolContentsUse[32];
 	__le32 numOfPartitions;
 	__le32 lengthOfImpUse;
 	__le32 freeSpaceTable[0];
 	__le32 sizeTable[0];
 	uint8_t impUse[0];
 } __attribute__ ((packed));
 
 /* Integrity Type (ECMA 167r3 3/10.10.3) */
@@ -353,50 +353,50 @@ struct logicalVolIntegrityDesc {
 #define LVID_INTEGRITY_TYPE_CLOSE 0x00000001
 
 /* Recorded Address (ECMA 167r3 4/7.1) */
-typedef struct {
+struct lb_addr {
 	__le32 logicalBlockNum;
 	__le16 partitionReferenceNum;
-} __attribute__ ((packed)) lb_addr;
+} __attribute__ ((packed));
 
 /* ... and its in-core analog */
-typedef struct {
+struct kernel_lb_addr {
 	uint32_t logicalBlockNum;
 	uint16_t partitionReferenceNum;
-} kernel_lb_addr;
+};
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
-typedef struct {
+struct short_ad {
 	__le32 extLength;
 	__le32 extPosition;
-} __attribute__ ((packed)) short_ad;
+} __attribute__ ((packed));
 
 /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
-typedef struct {
+struct long_ad {
 	__le32 extLength;
-	lb_addr extLocation;
+	struct lb_addr extLocation;
 	uint8_t impUse[6];
-} __attribute__ ((packed)) long_ad;
+} __attribute__ ((packed));
 
-typedef struct {
+struct kernel_long_ad {
 	uint32_t extLength;
-	kernel_lb_addr extLocation;
+	struct kernel_lb_addr extLocation;
 	uint8_t impUse[6];
-} kernel_long_ad;
+};
 
 /* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */
-typedef struct {
+struct ext_ad {
 	__le32 extLength;
 	__le32 recordedLength;
 	__le32 informationLength;
-	lb_addr extLocation;
-} __attribute__ ((packed)) ext_ad;
+	struct lb_addr extLocation;
+} __attribute__ ((packed));
 
-typedef struct {
+struct kernel_ext_ad {
 	uint32_t extLength;
 	uint32_t recordedLength;
 	uint32_t informationLength;
-	kernel_lb_addr extLocation;
-} kernel_ext_ad;
+	struct kernel_lb_addr extLocation;
+};
 
 /* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */
 
@@ -415,44 +415,44 @@ typedef struct {
 
 /* File Set Descriptor (ECMA 167r3 4/14.1) */
 struct fileSetDesc {
-	tag descTag;
-	timestamp recordingDateAndTime;
+	struct tag descTag;
+	struct timestamp recordingDateAndTime;
 	__le16 interchangeLvl;
 	__le16 maxInterchangeLvl;
 	__le32 charSetList;
 	__le32 maxCharSetList;
 	__le32 fileSetNum;
 	__le32 fileSetDescNum;
-	charspec logicalVolIdentCharSet;
+	struct charspec logicalVolIdentCharSet;
 	dstring logicalVolIdent[128];
-	charspec fileSetCharSet;
+	struct charspec fileSetCharSet;
 	dstring fileSetIdent[32];
 	dstring copyrightFileIdent[32];
 	dstring abstractFileIdent[32];
-	long_ad rootDirectoryICB;
-	regid domainIdent;
-	long_ad nextExt;
-	long_ad streamDirectoryICB;
+	struct long_ad rootDirectoryICB;
+	struct regid domainIdent;
+	struct long_ad nextExt;
+	struct long_ad streamDirectoryICB;
 	uint8_t reserved[32];
 } __attribute__ ((packed));
 
 /* Partition Header Descriptor (ECMA 167r3 4/14.3) */
 struct partitionHeaderDesc {
-	short_ad unallocSpaceTable;
-	short_ad unallocSpaceBitmap;
-	short_ad partitionIntegrityTable;
-	short_ad freedSpaceTable;
-	short_ad freedSpaceBitmap;
+	struct short_ad unallocSpaceTable;
+	struct short_ad unallocSpaceBitmap;
+	struct short_ad partitionIntegrityTable;
+	struct short_ad freedSpaceTable;
+	struct short_ad freedSpaceBitmap;
 	uint8_t reserved[88];
 } __attribute__ ((packed));
 
 /* File Identifier Descriptor (ECMA 167r3 4/14.4) */
 struct fileIdentDesc {
-	tag descTag;
+	struct tag descTag;
 	__le16 fileVersionNum;
 	uint8_t fileCharacteristics;
 	uint8_t lengthFileIdent;
-	long_ad icb;
+	struct long_ad icb;
 	__le16 lengthOfImpUse;
 	uint8_t impUse[0];
 	uint8_t fileIdent[0];
@@ -468,22 +468,22 @@ struct fileIdentDesc {
 
 /* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */
 struct allocExtDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 previousAllocExtLocation;
 	__le32 lengthAllocDescs;
 } __attribute__ ((packed));
 
 /* ICB Tag (ECMA 167r3 4/14.6) */
-typedef struct {
+struct icbtag {
 	__le32 priorRecordedNumDirectEntries;
 	__le16 strategyType;
 	__le16 strategyParameter;
 	__le16 numEntries;
 	uint8_t reserved;
 	uint8_t fileType;
-	lb_addr parentICBLocation;
+	struct lb_addr parentICBLocation;
 	__le16 flags;
-} __attribute__ ((packed)) icbtag;
+} __attribute__ ((packed));
 
 /* Strategy Type (ECMA 167r3 4/14.6.2) */
 #define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000
@@ -528,41 +528,41 @@ typedef struct {
 
 /* Indirect Entry (ECMA 167r3 4/14.7) */
 struct indirectEntry {
-	tag descTag;
-	icbtag icbTag;
-	long_ad indirectICB;
+	struct tag descTag;
+	struct icbtag icbTag;
+	struct long_ad indirectICB;
 } __attribute__ ((packed));
 
 /* Terminal Entry (ECMA 167r3 4/14.8) */
 struct terminalEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 } __attribute__ ((packed));
 
 /* File Entry (ECMA 167r3 4/14.9) */
 struct fileEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 uid;
 	__le32 gid;
 	__le32 permissions;
 	__le16 fileLinkCount;
 	uint8_t recordFormat;
 	uint8_t recordDisplayAttr;
 	__le32 recordLength;
 	__le64 informationLength;
 	__le64 logicalBlocksRecorded;
-	timestamp accessTime;
-	timestamp modificationTime;
-	timestamp attrTime;
+	struct timestamp accessTime;
+	struct timestamp modificationTime;
+	struct timestamp attrTime;
 	__le32 checkpoint;
-	long_ad extendedAttrICB;
-	regid impIdent;
+	struct long_ad extendedAttrICB;
+	struct regid impIdent;
 	__le64 uniqueID;
 	__le32 lengthExtendedAttr;
 	__le32 lengthAllocDescs;
 	uint8_t extendedAttr[0];
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 /* Permissions (ECMA 167r3 4/14.9.5) */
@@ -604,7 +604,7 @@ struct fileEntry {
 
 /* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */
 struct extendedAttrHeaderDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 impAttrLocation;
 	__le32 appAttrLocation;
 } __attribute__ ((packed));
@@ -687,7 +687,7 @@ struct impUseExtAttr {
 	uint8_t reserved[3];
 	__le32 attrLength;
 	__le32 impUseLength;
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[0];
 } __attribute__ ((packed));
 
@@ -698,7 +698,7 @@ struct appUseExtAttr {
 	uint8_t reserved[3];
 	__le32 attrLength;
 	__le32 appUseLength;
-	regid appIdent;
+	struct regid appIdent;
 	uint8_t appUse[0];
 } __attribute__ ((packed));
 
@@ -712,15 +712,15 @@ struct appUseExtAttr {
 
 /* Unallocated Space Entry (ECMA 167r3 4/14.11) */
 struct unallocSpaceEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 lengthAllocDescs;
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
 struct spaceBitmapDesc {
-	tag descTag;
+	struct tag descTag;
 	__le32 numOfBits;
 	__le32 numOfBytes;
 	uint8_t bitmap[0];
@@ -728,13 +728,13 @@ struct spaceBitmapDesc {
 
 /* Partition Integrity Entry (ECMA 167r3 4/14.13) */
 struct partitionIntegrityEntry {
-	tag descTag;
-	icbtag icbTag;
-	timestamp recordingDateAndTime;
+	struct tag descTag;
+	struct icbtag icbTag;
+	struct timestamp recordingDateAndTime;
 	uint8_t integrityType;
 	uint8_t reserved[175];
-	regid impIdent;
+	struct regid impIdent;
 	uint8_t impUse[256];
 } __attribute__ ((packed));
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
@@ -765,32 +765,32 @@ struct pathComponent {
 
 /* File Entry (ECMA 167r3 4/14.17) */
 struct extendedFileEntry {
-	tag descTag;
-	icbtag icbTag;
+	struct tag descTag;
+	struct icbtag icbTag;
 	__le32 uid;
 	__le32 gid;
 	__le32 permissions;
 	__le16 fileLinkCount;
 	uint8_t recordFormat;
 	uint8_t recordDisplayAttr;
 	__le32 recordLength;
 	__le64 informationLength;
 	__le64 objectSize;
 	__le64 logicalBlocksRecorded;
-	timestamp accessTime;
-	timestamp modificationTime;
-	timestamp createTime;
-	timestamp attrTime;
+	struct timestamp accessTime;
+	struct timestamp modificationTime;
+	struct timestamp createTime;
+	struct timestamp attrTime;
 	__le32 checkpoint;
 	__le32 reserved;
-	long_ad extendedAttrICB;
-	long_ad streamDirectoryICB;
-	regid impIdent;
+	struct long_ad extendedAttrICB;
+	struct long_ad streamDirectoryICB;
+	struct regid impIdent;
 	__le64 uniqueID;
 	__le32 lengthExtendedAttr;
 	__le32 lengthAllocDescs;
 	uint8_t extendedAttr[0];
 	uint8_t allocDescs[0];
 } __attribute__ ((packed));
 
 #endif /* _ECMA_167_H */
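The whole ecma_167.h diff is one mechanical conversion: every "typedef struct { ... } name;" describing an on-disk or in-core UDF record becomes a plain "struct name { ... };", matching the kernel coding style's preference for explicit struct tags over typedefs. Besides readability, a tagged struct can be forward-declared by headers that only handle pointers, which an anonymous-struct typedef cannot; illustratively (the prototype below is hypothetical, not part of this patch):

	struct tag;				/* forward declaration now possible */
	int udf_check_tag(struct tag *t);	/* hypothetical user needing no ecma_167.h */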
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 31fc84297ddb..c10fa39f97e2 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	DQUOT_FREE_INODE(inode);
-	DQUOT_DROP(inode);
+	vfs_dq_free_inode(inode);
+	vfs_dq_drop(inode);
 
 	clear_inode(inode);
 
@@ -49,12 +49,11 @@ void udf_free_inode(struct inode *inode)
 			le32_add_cpu(&lvidiu->numDirs, -1);
 		else
 			le32_add_cpu(&lvidiu->numFiles, -1);
-
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 
-	udf_free_blocks(sb, NULL, UDF_I(inode)->i_location, 0, 1);
+	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
 
 struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
@@ -122,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		if (!(++uniqueID & 0x00000000FFFFFFFFUL))
 			uniqueID += 16;
 		lvhd->uniqueID = cpu_to_le64(uniqueID);
-		mark_buffer_dirty(sbi->s_lvid_bh);
+		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
 	inode->i_mode = mode;
@@ -138,7 +137,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	iinfo->i_location.logicalBlockNum = block;
 	iinfo->i_location.partitionReferenceNum =
 				dinfo->i_location.partitionReferenceNum;
-	inode->i_ino = udf_get_lb_pblock(sb, iinfo->i_location, 0);
+	inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0);
 	inode->i_blocks = 0;
 	iinfo->i_lenEAttr = 0;
 	iinfo->i_lenAlloc = 0;
@@ -154,8 +153,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
 
-	if (DQUOT_ALLOC_INODE(inode)) {
-		DQUOT_DROP(inode);
+	if (vfs_dq_alloc_inode(inode)) {
+		vfs_dq_drop(inode);
 		inode->i_flags |= S_NOQUOTA;
 		inode->i_nlink = 0;
 		iput(inode);
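The quota calls in this file follow the tree-wide rename of the DQUOT_* macros to vfs_dq_*() functions. The pairs visible in this diff are DQUOT_FREE_INODE/vfs_dq_free_inode, DQUOT_DROP/vfs_dq_drop, DQUOT_ALLOC_INODE/vfs_dq_alloc_inode, and, in balloc.c above, DQUOT_ALLOC_BLOCK, DQUOT_PREALLOC_BLOCK and DQUOT_FREE_BLOCK with their vfs_dq_* counterparts. The semantics do not change: a non-zero return still means the allocation would exceed quota, so the surrounding error handling keeps its shape:

	if (vfs_dq_alloc_inode(inode)) {	/* non-zero: over quota */
		vfs_dq_drop(inode);
		inode->i_flags |= S_NOQUOTA;
		inode->i_nlink = 0;
		iput(inode);
		/* the caller presumably reports -EDQUOT, as the truncated
		 * hunk above suggests */
	}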
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 30ebde490f7f..e7533f785636 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -55,15 +55,15 @@ static int udf_alloc_i_data(struct inode *inode, size_t size);
 static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
 					sector_t *, int *);
 static int8_t udf_insert_aext(struct inode *, struct extent_position,
-			      kernel_lb_addr, uint32_t);
+			      struct kernel_lb_addr, uint32_t);
 static void udf_split_extents(struct inode *, int *, int, int,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_prealloc_extents(struct inode *, int, int,
-				 kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+				 struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_merge_extents(struct inode *,
-			      kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_update_extents(struct inode *,
-			       kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
+			       struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
 			       struct extent_position *);
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
@@ -200,7 +200,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 {
 	int newblock;
 	struct buffer_head *dbh = NULL;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	uint8_t alloctype;
 	struct extent_position epos;
@@ -281,7 +281,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 	epos.bh = NULL;
 	epos.block = iinfo->i_location;
 	epos.offset = udf_file_entry_alloc_offset(inode);
-	udf_add_aext(inode, &epos, eloc, elen, 0);
+	udf_add_aext(inode, &epos, &eloc, elen, 0);
 	/* UniqueID stuff */
 
 	brelse(epos.bh);
@@ -359,12 +359,12 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
 
 /* Extend the file by 'blocks' blocks, return the number of extents added */
 int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
-		    kernel_long_ad *last_ext, sector_t blocks)
+		    struct kernel_long_ad *last_ext, sector_t blocks)
 {
 	sector_t add;
 	int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
 	struct super_block *sb = inode->i_sb;
-	kernel_lb_addr prealloc_loc = {};
+	struct kernel_lb_addr prealloc_loc = {};
 	int prealloc_len = 0;
 	struct udf_inode_info *iinfo;
 
@@ -411,11 +411,11 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	}
 
 	if (fake) {
-		udf_add_aext(inode, last_pos, last_ext->extLocation,
+		udf_add_aext(inode, last_pos, &last_ext->extLocation,
 			     last_ext->extLength, 1);
 		count++;
 	} else
-		udf_write_aext(inode, last_pos, last_ext->extLocation,
+		udf_write_aext(inode, last_pos, &last_ext->extLocation,
 			       last_ext->extLength, 1);
 
 	/* Managed to do everything necessary? */
@@ -432,7 +432,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
432 /* Create enough extents to cover the whole hole */ 432 /* Create enough extents to cover the whole hole */
433 while (blocks > add) { 433 while (blocks > add) {
434 blocks -= add; 434 blocks -= add;
435 if (udf_add_aext(inode, last_pos, last_ext->extLocation, 435 if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
436 last_ext->extLength, 1) == -1) 436 last_ext->extLength, 1) == -1)
437 return -1; 437 return -1;
438 count++; 438 count++;
@@ -440,7 +440,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
440 if (blocks) { 440 if (blocks) {
441 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | 441 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
442 (blocks << sb->s_blocksize_bits); 442 (blocks << sb->s_blocksize_bits);
443 if (udf_add_aext(inode, last_pos, last_ext->extLocation, 443 if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
444 last_ext->extLength, 1) == -1) 444 last_ext->extLength, 1) == -1)
445 return -1; 445 return -1;
446 count++; 446 count++;
@@ -449,7 +449,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
449out: 449out:
450 /* Do we have some preallocated blocks saved? */ 450 /* Do we have some preallocated blocks saved? */
451 if (prealloc_len) { 451 if (prealloc_len) {
452 if (udf_add_aext(inode, last_pos, prealloc_loc, 452 if (udf_add_aext(inode, last_pos, &prealloc_loc,
453 prealloc_len, 1) == -1) 453 prealloc_len, 1) == -1)
454 return -1; 454 return -1;
455 last_ext->extLocation = prealloc_loc; 455 last_ext->extLocation = prealloc_loc;
@@ -459,9 +459,9 @@ out:
459 459
460 /* last_pos should point to the last written extent... */ 460 /* last_pos should point to the last written extent... */
461 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 461 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
462 last_pos->offset -= sizeof(short_ad); 462 last_pos->offset -= sizeof(struct short_ad);
463 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 463 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
464 last_pos->offset -= sizeof(long_ad); 464 last_pos->offset -= sizeof(struct long_ad);
465 else 465 else
466 return -1; 466 return -1;
467 467
@@ -473,11 +473,11 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
473{ 473{
474 static sector_t last_block; 474 static sector_t last_block;
475 struct buffer_head *result = NULL; 475 struct buffer_head *result = NULL;
476 kernel_long_ad laarr[EXTENT_MERGE_SIZE]; 476 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
477 struct extent_position prev_epos, cur_epos, next_epos; 477 struct extent_position prev_epos, cur_epos, next_epos;
478 int count = 0, startnum = 0, endnum = 0; 478 int count = 0, startnum = 0, endnum = 0;
479 uint32_t elen = 0, tmpelen; 479 uint32_t elen = 0, tmpelen;
480 kernel_lb_addr eloc, tmpeloc; 480 struct kernel_lb_addr eloc, tmpeloc;
481 int c = 1; 481 int c = 1;
482 loff_t lbcount = 0, b_off = 0; 482 loff_t lbcount = 0, b_off = 0;
483 uint32_t newblocknum, newblock; 483 uint32_t newblocknum, newblock;
@@ -550,12 +550,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
550 elen = EXT_RECORDED_ALLOCATED | 550 elen = EXT_RECORDED_ALLOCATED |
551 ((elen + inode->i_sb->s_blocksize - 1) & 551 ((elen + inode->i_sb->s_blocksize - 1) &
552 ~(inode->i_sb->s_blocksize - 1)); 552 ~(inode->i_sb->s_blocksize - 1));
553 etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1); 553 etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
554 } 554 }
555 brelse(prev_epos.bh); 555 brelse(prev_epos.bh);
556 brelse(cur_epos.bh); 556 brelse(cur_epos.bh);
557 brelse(next_epos.bh); 557 brelse(next_epos.bh);
558 newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset); 558 newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
559 *phys = newblock; 559 *phys = newblock;
560 return NULL; 560 return NULL;
561 } 561 }
@@ -572,7 +572,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
572 } else { 572 } else {
573 /* Create a fake extent when there's not one */ 573 /* Create a fake extent when there's not one */
574 memset(&laarr[0].extLocation, 0x00, 574 memset(&laarr[0].extLocation, 0x00,
575 sizeof(kernel_lb_addr)); 575 sizeof(struct kernel_lb_addr));
576 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; 576 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
577 /* Will udf_extend_file() create real extent from 577 /* Will udf_extend_file() create real extent from
578 a fake one? */ 578 a fake one? */
@@ -602,7 +602,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
602 laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | 602 laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
603 inode->i_sb->s_blocksize; 603 inode->i_sb->s_blocksize;
604 memset(&laarr[c].extLocation, 0x00, 604 memset(&laarr[c].extLocation, 0x00,
605 sizeof(kernel_lb_addr)); 605 sizeof(struct kernel_lb_addr));
606 count++; 606 count++;
607 endnum++; 607 endnum++;
608 } 608 }
@@ -699,7 +699,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
699 699
700static void udf_split_extents(struct inode *inode, int *c, int offset, 700static void udf_split_extents(struct inode *inode, int *c, int offset,
701 int newblocknum, 701 int newblocknum,
702 kernel_long_ad laarr[EXTENT_MERGE_SIZE], 702 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
703 int *endnum) 703 int *endnum)
704{ 704{
705 unsigned long blocksize = inode->i_sb->s_blocksize; 705 unsigned long blocksize = inode->i_sb->s_blocksize;
@@ -726,7 +726,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
726 if (offset) { 726 if (offset) {
727 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { 727 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
728 udf_free_blocks(inode->i_sb, inode, 728 udf_free_blocks(inode->i_sb, inode,
729 laarr[curr].extLocation, 729 &laarr[curr].extLocation,
730 0, offset); 730 0, offset);
731 laarr[curr].extLength = 731 laarr[curr].extLength =
732 EXT_NOT_RECORDED_NOT_ALLOCATED | 732 EXT_NOT_RECORDED_NOT_ALLOCATED |
@@ -763,7 +763,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
763} 763}
764 764
765static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, 765static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
766 kernel_long_ad laarr[EXTENT_MERGE_SIZE], 766 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
767 int *endnum) 767 int *endnum)
768{ 768{
769 int start, length = 0, currlength = 0, i; 769 int start, length = 0, currlength = 0, i;
@@ -817,7 +817,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
817 inode->i_sb->s_blocksize_bits); 817 inode->i_sb->s_blocksize_bits);
818 else { 818 else {
819 memmove(&laarr[c + 2], &laarr[c + 1], 819 memmove(&laarr[c + 2], &laarr[c + 1],
820 sizeof(long_ad) * (*endnum - (c + 1))); 820 sizeof(struct long_ad) * (*endnum - (c + 1)));
821 (*endnum)++; 821 (*endnum)++;
822 laarr[c + 1].extLocation.logicalBlockNum = next; 822 laarr[c + 1].extLocation.logicalBlockNum = next;
823 laarr[c + 1].extLocation.partitionReferenceNum = 823 laarr[c + 1].extLocation.partitionReferenceNum =
@@ -846,7 +846,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
846 if (*endnum > (i + 1)) 846 if (*endnum > (i + 1))
847 memmove(&laarr[i], 847 memmove(&laarr[i],
848 &laarr[i + 1], 848 &laarr[i + 1],
849 sizeof(long_ad) * 849 sizeof(struct long_ad) *
850 (*endnum - (i + 1))); 850 (*endnum - (i + 1)));
851 i--; 851 i--;
852 (*endnum)--; 852 (*endnum)--;
@@ -859,7 +859,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
859} 859}
860 860
861static void udf_merge_extents(struct inode *inode, 861static void udf_merge_extents(struct inode *inode,
862 kernel_long_ad laarr[EXTENT_MERGE_SIZE], 862 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
863 int *endnum) 863 int *endnum)
864{ 864{
865 int i; 865 int i;
@@ -867,8 +867,8 @@ static void udf_merge_extents(struct inode *inode,
867 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 867 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
868 868
869 for (i = 0; i < (*endnum - 1); i++) { 869 for (i = 0; i < (*endnum - 1); i++) {
870 kernel_long_ad *li /*l[i]*/ = &laarr[i]; 870 struct kernel_long_ad *li /*l[i]*/ = &laarr[i];
871 kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; 871 struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
872 872
873 if (((li->extLength >> 30) == (lip1->extLength >> 30)) && 873 if (((li->extLength >> 30) == (lip1->extLength >> 30)) &&
874 (((li->extLength >> 30) == 874 (((li->extLength >> 30) ==
@@ -902,7 +902,7 @@ static void udf_merge_extents(struct inode *inode,
902 blocksize - 1) & ~(blocksize - 1)); 902 blocksize - 1) & ~(blocksize - 1));
903 if (*endnum > (i + 2)) 903 if (*endnum > (i + 2))
904 memmove(&laarr[i + 1], &laarr[i + 2], 904 memmove(&laarr[i + 1], &laarr[i + 2],
905 sizeof(long_ad) * 905 sizeof(struct long_ad) *
906 (*endnum - (i + 2))); 906 (*endnum - (i + 2)));
907 i--; 907 i--;
908 (*endnum)--; 908 (*endnum)--;
@@ -911,7 +911,7 @@ static void udf_merge_extents(struct inode *inode,
911 (EXT_NOT_RECORDED_ALLOCATED >> 30)) && 911 (EXT_NOT_RECORDED_ALLOCATED >> 30)) &&
912 ((lip1->extLength >> 30) == 912 ((lip1->extLength >> 30) ==
913 (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) { 913 (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) {
914 udf_free_blocks(inode->i_sb, inode, li->extLocation, 0, 914 udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0,
915 ((li->extLength & 915 ((li->extLength &
916 UDF_EXTENT_LENGTH_MASK) + 916 UDF_EXTENT_LENGTH_MASK) +
917 blocksize - 1) >> blocksize_bits); 917 blocksize - 1) >> blocksize_bits);
@@ -937,7 +937,7 @@ static void udf_merge_extents(struct inode *inode,
937 blocksize - 1) & ~(blocksize - 1)); 937 blocksize - 1) & ~(blocksize - 1));
938 if (*endnum > (i + 2)) 938 if (*endnum > (i + 2))
939 memmove(&laarr[i + 1], &laarr[i + 2], 939 memmove(&laarr[i + 1], &laarr[i + 2],
940 sizeof(long_ad) * 940 sizeof(struct long_ad) *
941 (*endnum - (i + 2))); 941 (*endnum - (i + 2)));
942 i--; 942 i--;
943 (*endnum)--; 943 (*endnum)--;
@@ -945,7 +945,7 @@ static void udf_merge_extents(struct inode *inode,
945 } else if ((li->extLength >> 30) == 945 } else if ((li->extLength >> 30) ==
946 (EXT_NOT_RECORDED_ALLOCATED >> 30)) { 946 (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
947 udf_free_blocks(inode->i_sb, inode, 947 udf_free_blocks(inode->i_sb, inode,
948 li->extLocation, 0, 948 &li->extLocation, 0,
949 ((li->extLength & 949 ((li->extLength &
950 UDF_EXTENT_LENGTH_MASK) + 950 UDF_EXTENT_LENGTH_MASK) +
951 blocksize - 1) >> blocksize_bits); 951 blocksize - 1) >> blocksize_bits);
@@ -959,12 +959,12 @@ static void udf_merge_extents(struct inode *inode,
959} 959}
960 960
961static void udf_update_extents(struct inode *inode, 961static void udf_update_extents(struct inode *inode,
962 kernel_long_ad laarr[EXTENT_MERGE_SIZE], 962 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
963 int startnum, int endnum, 963 int startnum, int endnum,
964 struct extent_position *epos) 964 struct extent_position *epos)
965{ 965{
966 int start = 0, i; 966 int start = 0, i;
967 kernel_lb_addr tmploc; 967 struct kernel_lb_addr tmploc;
968 uint32_t tmplen; 968 uint32_t tmplen;
969 969
970 if (startnum > endnum) { 970 if (startnum > endnum) {
@@ -983,7 +983,7 @@ static void udf_update_extents(struct inode *inode,
983 983
984 for (i = start; i < endnum; i++) { 984 for (i = start; i < endnum; i++) {
985 udf_next_aext(inode, epos, &tmploc, &tmplen, 0); 985 udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
986 udf_write_aext(inode, epos, laarr[i].extLocation, 986 udf_write_aext(inode, epos, &laarr[i].extLocation,
987 laarr[i].extLength, 1); 987 laarr[i].extLength, 1);
988 } 988 }
989} 989}
@@ -1076,7 +1076,7 @@ static void __udf_read_inode(struct inode *inode)
1076 * i_nlink = 1 1076 * i_nlink = 1
1077 * i_op = NULL; 1077 * i_op = NULL;
1078 */ 1078 */
1079 bh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 0, &ident); 1079 bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
1080 if (!bh) { 1080 if (!bh) {
1081 printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", 1081 printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n",
1082 inode->i_ino); 1082 inode->i_ino);
@@ -1098,24 +1098,24 @@ static void __udf_read_inode(struct inode *inode)
1098 if (fe->icbTag.strategyType == cpu_to_le16(4096)) { 1098 if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
1099 struct buffer_head *ibh; 1099 struct buffer_head *ibh;
1100 1100
1101 ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1, 1101 ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
1102 &ident); 1102 &ident);
1103 if (ident == TAG_IDENT_IE && ibh) { 1103 if (ident == TAG_IDENT_IE && ibh) {
1104 struct buffer_head *nbh = NULL; 1104 struct buffer_head *nbh = NULL;
1105 kernel_lb_addr loc; 1105 struct kernel_lb_addr loc;
1106 struct indirectEntry *ie; 1106 struct indirectEntry *ie;
1107 1107
1108 ie = (struct indirectEntry *)ibh->b_data; 1108 ie = (struct indirectEntry *)ibh->b_data;
1109 loc = lelb_to_cpu(ie->indirectICB.extLocation); 1109 loc = lelb_to_cpu(ie->indirectICB.extLocation);
1110 1110
1111 if (ie->indirectICB.extLength && 1111 if (ie->indirectICB.extLength &&
1112 (nbh = udf_read_ptagged(inode->i_sb, loc, 0, 1112 (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
1113 &ident))) { 1113 &ident))) {
1114 if (ident == TAG_IDENT_FE || 1114 if (ident == TAG_IDENT_FE ||
1115 ident == TAG_IDENT_EFE) { 1115 ident == TAG_IDENT_EFE) {
1116 memcpy(&iinfo->i_location, 1116 memcpy(&iinfo->i_location,
1117 &loc, 1117 &loc,
1118 sizeof(kernel_lb_addr)); 1118 sizeof(struct kernel_lb_addr));
1119 brelse(bh); 1119 brelse(bh);
1120 brelse(ibh); 1120 brelse(ibh);
1121 brelse(nbh); 1121 brelse(nbh);
@@ -1222,8 +1222,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1222 inode->i_size = le64_to_cpu(fe->informationLength); 1222 inode->i_size = le64_to_cpu(fe->informationLength);
1223 iinfo->i_lenExtents = inode->i_size; 1223 iinfo->i_lenExtents = inode->i_size;
1224 1224
1225 inode->i_mode = udf_convert_permissions(fe); 1225 if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
1226 inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask; 1226 sbi->s_fmode != UDF_INVALID_MODE)
1227 inode->i_mode = sbi->s_fmode;
1228 else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
1229 sbi->s_dmode != UDF_INVALID_MODE)
1230 inode->i_mode = sbi->s_dmode;
1231 else
1232 inode->i_mode = udf_convert_permissions(fe);
1233 inode->i_mode &= ~sbi->s_umask;
1227 1234
1228 if (iinfo->i_efe == 0) { 1235 if (iinfo->i_efe == 0) {
1229 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << 1236 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1396,7 +1403,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1396 1403
1397 bh = udf_tread(inode->i_sb, 1404 bh = udf_tread(inode->i_sb,
1398 udf_get_lb_pblock(inode->i_sb, 1405 udf_get_lb_pblock(inode->i_sb,
1399 iinfo->i_location, 0)); 1406 &iinfo->i_location, 0));
1400 if (!bh) { 1407 if (!bh) {
1401 udf_debug("bread failure\n"); 1408 udf_debug("bread failure\n");
1402 return -EIO; 1409 return -EIO;
@@ -1416,13 +1423,13 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1416 iinfo->i_ext.i_data, inode->i_sb->s_blocksize - 1423 iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
1417 sizeof(struct unallocSpaceEntry)); 1424 sizeof(struct unallocSpaceEntry));
1418 crclen = sizeof(struct unallocSpaceEntry) + 1425 crclen = sizeof(struct unallocSpaceEntry) +
1419 iinfo->i_lenAlloc - sizeof(tag); 1426 iinfo->i_lenAlloc - sizeof(struct tag);
1420 use->descTag.tagLocation = cpu_to_le32( 1427 use->descTag.tagLocation = cpu_to_le32(
1421 iinfo->i_location. 1428 iinfo->i_location.
1422 logicalBlockNum); 1429 logicalBlockNum);
1423 use->descTag.descCRCLength = cpu_to_le16(crclen); 1430 use->descTag.descCRCLength = cpu_to_le16(crclen);
1424 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + 1431 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
1425 sizeof(tag), 1432 sizeof(struct tag),
1426 crclen)); 1433 crclen));
1427 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); 1434 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
1428 1435
@@ -1459,23 +1466,23 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1459 fe->informationLength = cpu_to_le64(inode->i_size); 1466 fe->informationLength = cpu_to_le64(inode->i_size);
1460 1467
1461 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 1468 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
1462 regid *eid; 1469 struct regid *eid;
1463 struct deviceSpec *dsea = 1470 struct deviceSpec *dsea =
1464 (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); 1471 (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
1465 if (!dsea) { 1472 if (!dsea) {
1466 dsea = (struct deviceSpec *) 1473 dsea = (struct deviceSpec *)
1467 udf_add_extendedattr(inode, 1474 udf_add_extendedattr(inode,
1468 sizeof(struct deviceSpec) + 1475 sizeof(struct deviceSpec) +
1469 sizeof(regid), 12, 0x3); 1476 sizeof(struct regid), 12, 0x3);
1470 dsea->attrType = cpu_to_le32(12); 1477 dsea->attrType = cpu_to_le32(12);
1471 dsea->attrSubtype = 1; 1478 dsea->attrSubtype = 1;
1472 dsea->attrLength = cpu_to_le32( 1479 dsea->attrLength = cpu_to_le32(
1473 sizeof(struct deviceSpec) + 1480 sizeof(struct deviceSpec) +
1474 sizeof(regid)); 1481 sizeof(struct regid));
1475 dsea->impUseLength = cpu_to_le32(sizeof(regid)); 1482 dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
1476 } 1483 }
1477 eid = (regid *)dsea->impUse; 1484 eid = (struct regid *)dsea->impUse;
1478 memset(eid, 0, sizeof(regid)); 1485 memset(eid, 0, sizeof(struct regid));
1479 strcpy(eid->ident, UDF_ID_DEVELOPER); 1486 strcpy(eid->ident, UDF_ID_DEVELOPER);
1480 eid->identSuffix[0] = UDF_OS_CLASS_UNIX; 1487 eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
1481 eid->identSuffix[1] = UDF_OS_ID_LINUX; 1488 eid->identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1494,7 +1501,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1494 udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); 1501 udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
1495 udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); 1502 udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
1496 udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); 1503 udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
1497 memset(&(fe->impIdent), 0, sizeof(regid)); 1504 memset(&(fe->impIdent), 0, sizeof(struct regid));
1498 strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); 1505 strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
1499 fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1506 fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1500 fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1507 fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1533,7 +1540,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1533 udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); 1540 udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
1534 udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); 1541 udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
1535 1542
1536 memset(&(efe->impIdent), 0, sizeof(regid)); 1543 memset(&(efe->impIdent), 0, sizeof(struct regid));
1537 strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); 1544 strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
1538 efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1545 efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1539 efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1546 efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1584,9 +1591,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1584 fe->descTag.tagLocation = cpu_to_le32( 1591 fe->descTag.tagLocation = cpu_to_le32(
1585 iinfo->i_location.logicalBlockNum); 1592 iinfo->i_location.logicalBlockNum);
1586 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - 1593 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
1587 sizeof(tag); 1594 sizeof(struct tag);
1588 fe->descTag.descCRCLength = cpu_to_le16(crclen); 1595 fe->descTag.descCRCLength = cpu_to_le16(crclen);
1589 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag), 1596 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
1590 crclen)); 1597 crclen));
1591 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); 1598 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
1592 1599
@@ -1606,7 +1613,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1606 return err; 1613 return err;
1607} 1614}
1608 1615
1609struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino) 1616struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
1610{ 1617{
1611 unsigned long block = udf_get_lb_pblock(sb, ino, 0); 1618 unsigned long block = udf_get_lb_pblock(sb, ino, 0);
1612 struct inode *inode = iget_locked(sb, block); 1619 struct inode *inode = iget_locked(sb, block);
@@ -1615,7 +1622,7 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
1615 return NULL; 1622 return NULL;
1616 1623
1617 if (inode->i_state & I_NEW) { 1624 if (inode->i_state & I_NEW) {
1618 memcpy(&UDF_I(inode)->i_location, &ino, sizeof(kernel_lb_addr)); 1625 memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
1619 __udf_read_inode(inode); 1626 __udf_read_inode(inode);
1620 unlock_new_inode(inode); 1627 unlock_new_inode(inode);
1621 } 1628 }
@@ -1623,10 +1630,10 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
1623 if (is_bad_inode(inode)) 1630 if (is_bad_inode(inode))
1624 goto out_iput; 1631 goto out_iput;
1625 1632
1626 if (ino.logicalBlockNum >= UDF_SB(sb)-> 1633 if (ino->logicalBlockNum >= UDF_SB(sb)->
1627 s_partmaps[ino.partitionReferenceNum].s_partition_len) { 1634 s_partmaps[ino->partitionReferenceNum].s_partition_len) {
1628 udf_debug("block=%d, partition=%d out of range\n", 1635 udf_debug("block=%d, partition=%d out of range\n",
1629 ino.logicalBlockNum, ino.partitionReferenceNum); 1636 ino->logicalBlockNum, ino->partitionReferenceNum);
1630 make_bad_inode(inode); 1637 make_bad_inode(inode);
1631 goto out_iput; 1638 goto out_iput;
1632 } 1639 }
@@ -1639,11 +1646,11 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
1639} 1646}
1640 1647
1641int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, 1648int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1642 kernel_lb_addr eloc, uint32_t elen, int inc) 1649 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1643{ 1650{
1644 int adsize; 1651 int adsize;
1645 short_ad *sad = NULL; 1652 struct short_ad *sad = NULL;
1646 long_ad *lad = NULL; 1653 struct long_ad *lad = NULL;
1647 struct allocExtDesc *aed; 1654 struct allocExtDesc *aed;
1648 int8_t etype; 1655 int8_t etype;
1649 uint8_t *ptr; 1656 uint8_t *ptr;
@@ -1657,9 +1664,9 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1657 ptr = epos->bh->b_data + epos->offset; 1664 ptr = epos->bh->b_data + epos->offset;
1658 1665
1659 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 1666 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
1660 adsize = sizeof(short_ad); 1667 adsize = sizeof(struct short_ad);
1661 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 1668 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
1662 adsize = sizeof(long_ad); 1669 adsize = sizeof(struct long_ad);
1663 else 1670 else
1664 return -1; 1671 return -1;
1665 1672
@@ -1667,7 +1674,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1667 char *sptr, *dptr; 1674 char *sptr, *dptr;
1668 struct buffer_head *nbh; 1675 struct buffer_head *nbh;
1669 int err, loffset; 1676 int err, loffset;
1670 kernel_lb_addr obloc = epos->block; 1677 struct kernel_lb_addr obloc = epos->block;
1671 1678
1672 epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL, 1679 epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
1673 obloc.partitionReferenceNum, 1680 obloc.partitionReferenceNum,
@@ -1675,7 +1682,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1675 if (!epos->block.logicalBlockNum) 1682 if (!epos->block.logicalBlockNum)
1676 return -1; 1683 return -1;
1677 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, 1684 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
1678 epos->block, 1685 &epos->block,
1679 0)); 1686 0));
1680 if (!nbh) 1687 if (!nbh)
1681 return -1; 1688 return -1;
@@ -1712,20 +1719,20 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1712 } 1719 }
1713 if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200) 1720 if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
1714 udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1, 1721 udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
1715 epos->block.logicalBlockNum, sizeof(tag)); 1722 epos->block.logicalBlockNum, sizeof(struct tag));
1716 else 1723 else
1717 udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1, 1724 udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
1718 epos->block.logicalBlockNum, sizeof(tag)); 1725 epos->block.logicalBlockNum, sizeof(struct tag));
1719 switch (iinfo->i_alloc_type) { 1726 switch (iinfo->i_alloc_type) {
1720 case ICBTAG_FLAG_AD_SHORT: 1727 case ICBTAG_FLAG_AD_SHORT:
1721 sad = (short_ad *)sptr; 1728 sad = (struct short_ad *)sptr;
1722 sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | 1729 sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
1723 inode->i_sb->s_blocksize); 1730 inode->i_sb->s_blocksize);
1724 sad->extPosition = 1731 sad->extPosition =
1725 cpu_to_le32(epos->block.logicalBlockNum); 1732 cpu_to_le32(epos->block.logicalBlockNum);
1726 break; 1733 break;
1727 case ICBTAG_FLAG_AD_LONG: 1734 case ICBTAG_FLAG_AD_LONG:
1728 lad = (long_ad *)sptr; 1735 lad = (struct long_ad *)sptr;
1729 lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | 1736 lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
1730 inode->i_sb->s_blocksize); 1737 inode->i_sb->s_blocksize);
1731 lad->extLocation = cpu_to_lelb(epos->block); 1738 lad->extLocation = cpu_to_lelb(epos->block);
@@ -1769,12 +1776,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1769} 1776}
1770 1777
1771int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, 1778int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1772 kernel_lb_addr eloc, uint32_t elen, int inc) 1779 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1773{ 1780{
1774 int adsize; 1781 int adsize;
1775 uint8_t *ptr; 1782 uint8_t *ptr;
1776 short_ad *sad; 1783 struct short_ad *sad;
1777 long_ad *lad; 1784 struct long_ad *lad;
1778 struct udf_inode_info *iinfo = UDF_I(inode); 1785 struct udf_inode_info *iinfo = UDF_I(inode);
1779 1786
1780 if (!epos->bh) 1787 if (!epos->bh)
@@ -1786,17 +1793,17 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1786 1793
1787 switch (iinfo->i_alloc_type) { 1794 switch (iinfo->i_alloc_type) {
1788 case ICBTAG_FLAG_AD_SHORT: 1795 case ICBTAG_FLAG_AD_SHORT:
1789 sad = (short_ad *)ptr; 1796 sad = (struct short_ad *)ptr;
1790 sad->extLength = cpu_to_le32(elen); 1797 sad->extLength = cpu_to_le32(elen);
1791 sad->extPosition = cpu_to_le32(eloc.logicalBlockNum); 1798 sad->extPosition = cpu_to_le32(eloc->logicalBlockNum);
1792 adsize = sizeof(short_ad); 1799 adsize = sizeof(struct short_ad);
1793 break; 1800 break;
1794 case ICBTAG_FLAG_AD_LONG: 1801 case ICBTAG_FLAG_AD_LONG:
1795 lad = (long_ad *)ptr; 1802 lad = (struct long_ad *)ptr;
1796 lad->extLength = cpu_to_le32(elen); 1803 lad->extLength = cpu_to_le32(elen);
1797 lad->extLocation = cpu_to_lelb(eloc); 1804 lad->extLocation = cpu_to_lelb(*eloc);
1798 memset(lad->impUse, 0x00, sizeof(lad->impUse)); 1805 memset(lad->impUse, 0x00, sizeof(lad->impUse));
1799 adsize = sizeof(long_ad); 1806 adsize = sizeof(struct long_ad);
1800 break; 1807 break;
1801 default: 1808 default:
1802 return -1; 1809 return -1;
@@ -1823,7 +1830,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1823} 1830}
1824 1831
1825int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, 1832int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
1826 kernel_lb_addr *eloc, uint32_t *elen, int inc) 1833 struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
1827{ 1834{
1828 int8_t etype; 1835 int8_t etype;
1829 1836
@@ -1833,7 +1840,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
1833 epos->block = *eloc; 1840 epos->block = *eloc;
1834 epos->offset = sizeof(struct allocExtDesc); 1841 epos->offset = sizeof(struct allocExtDesc);
1835 brelse(epos->bh); 1842 brelse(epos->bh);
1836 block = udf_get_lb_pblock(inode->i_sb, epos->block, 0); 1843 block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0);
1837 epos->bh = udf_tread(inode->i_sb, block); 1844 epos->bh = udf_tread(inode->i_sb, block);
1838 if (!epos->bh) { 1845 if (!epos->bh) {
1839 udf_debug("reading block %d failed!\n", block); 1846 udf_debug("reading block %d failed!\n", block);
@@ -1845,13 +1852,13 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
1845} 1852}
1846 1853
1847int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, 1854int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
1848 kernel_lb_addr *eloc, uint32_t *elen, int inc) 1855 struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
1849{ 1856{
1850 int alen; 1857 int alen;
1851 int8_t etype; 1858 int8_t etype;
1852 uint8_t *ptr; 1859 uint8_t *ptr;
1853 short_ad *sad; 1860 struct short_ad *sad;
1854 long_ad *lad; 1861 struct long_ad *lad;
1855 struct udf_inode_info *iinfo = UDF_I(inode); 1862 struct udf_inode_info *iinfo = UDF_I(inode);
1856 1863
1857 if (!epos->bh) { 1864 if (!epos->bh) {
@@ -1900,9 +1907,9 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
1900} 1907}
1901 1908
1902static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, 1909static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
1903 kernel_lb_addr neloc, uint32_t nelen) 1910 struct kernel_lb_addr neloc, uint32_t nelen)
1904{ 1911{
1905 kernel_lb_addr oeloc; 1912 struct kernel_lb_addr oeloc;
1906 uint32_t oelen; 1913 uint32_t oelen;
1907 int8_t etype; 1914 int8_t etype;
1908 1915
@@ -1910,18 +1917,18 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
1910 get_bh(epos.bh); 1917 get_bh(epos.bh);
1911 1918
1912 while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) { 1919 while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) {
1913 udf_write_aext(inode, &epos, neloc, nelen, 1); 1920 udf_write_aext(inode, &epos, &neloc, nelen, 1);
1914 neloc = oeloc; 1921 neloc = oeloc;
1915 nelen = (etype << 30) | oelen; 1922 nelen = (etype << 30) | oelen;
1916 } 1923 }
1917 udf_add_aext(inode, &epos, neloc, nelen, 1); 1924 udf_add_aext(inode, &epos, &neloc, nelen, 1);
1918 brelse(epos.bh); 1925 brelse(epos.bh);
1919 1926
1920 return (nelen >> 30); 1927 return (nelen >> 30);
1921} 1928}
1922 1929
1923int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, 1930int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
1924 kernel_lb_addr eloc, uint32_t elen) 1931 struct kernel_lb_addr eloc, uint32_t elen)
1925{ 1932{
1926 struct extent_position oepos; 1933 struct extent_position oepos;
1927 int adsize; 1934 int adsize;
@@ -1936,9 +1943,9 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
1936 1943
1937 iinfo = UDF_I(inode); 1944 iinfo = UDF_I(inode);
1938 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 1945 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
1939 adsize = sizeof(short_ad); 1946 adsize = sizeof(struct short_ad);
1940 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 1947 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
1941 adsize = sizeof(long_ad); 1948 adsize = sizeof(struct long_ad);
1942 else 1949 else
1943 adsize = 0; 1950 adsize = 0;
1944 1951
@@ -1947,7 +1954,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
1947 return -1; 1954 return -1;
1948 1955
1949 while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { 1956 while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
1950 udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1); 1957 udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1);
1951 if (oepos.bh != epos.bh) { 1958 if (oepos.bh != epos.bh) {
1952 oepos.block = epos.block; 1959 oepos.block = epos.block;
1953 brelse(oepos.bh); 1960 brelse(oepos.bh);
@@ -1956,13 +1963,13 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
1956 oepos.offset = epos.offset - adsize; 1963 oepos.offset = epos.offset - adsize;
1957 } 1964 }
1958 } 1965 }
1959 memset(&eloc, 0x00, sizeof(kernel_lb_addr)); 1966 memset(&eloc, 0x00, sizeof(struct kernel_lb_addr));
1960 elen = 0; 1967 elen = 0;
1961 1968
1962 if (epos.bh != oepos.bh) { 1969 if (epos.bh != oepos.bh) {
1963 udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1); 1970 udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1);
1964 udf_write_aext(inode, &oepos, eloc, elen, 1); 1971 udf_write_aext(inode, &oepos, &eloc, elen, 1);
1965 udf_write_aext(inode, &oepos, eloc, elen, 1); 1972 udf_write_aext(inode, &oepos, &eloc, elen, 1);
1966 if (!oepos.bh) { 1973 if (!oepos.bh) {
1967 iinfo->i_lenAlloc -= (adsize * 2); 1974 iinfo->i_lenAlloc -= (adsize * 2);
1968 mark_inode_dirty(inode); 1975 mark_inode_dirty(inode);
@@ -1979,7 +1986,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
1979 mark_buffer_dirty_inode(oepos.bh, inode); 1986 mark_buffer_dirty_inode(oepos.bh, inode);
1980 } 1987 }
1981 } else { 1988 } else {
1982 udf_write_aext(inode, &oepos, eloc, elen, 1); 1989 udf_write_aext(inode, &oepos, &eloc, elen, 1);
1983 if (!oepos.bh) { 1990 if (!oepos.bh) {
1984 iinfo->i_lenAlloc -= adsize; 1991 iinfo->i_lenAlloc -= adsize;
1985 mark_inode_dirty(inode); 1992 mark_inode_dirty(inode);
@@ -2004,7 +2011,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
2004} 2011}
2005 2012
2006int8_t inode_bmap(struct inode *inode, sector_t block, 2013int8_t inode_bmap(struct inode *inode, sector_t block,
2007 struct extent_position *pos, kernel_lb_addr *eloc, 2014 struct extent_position *pos, struct kernel_lb_addr *eloc,
2008 uint32_t *elen, sector_t *offset) 2015 uint32_t *elen, sector_t *offset)
2009{ 2016{
2010 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 2017 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -2036,7 +2043,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
2036 2043
2037long udf_block_map(struct inode *inode, sector_t block) 2044long udf_block_map(struct inode *inode, sector_t block)
2038{ 2045{
2039 kernel_lb_addr eloc; 2046 struct kernel_lb_addr eloc;
2040 uint32_t elen; 2047 uint32_t elen;
2041 sector_t offset; 2048 sector_t offset;
2042 struct extent_position epos = {}; 2049 struct extent_position epos = {};
@@ -2046,7 +2053,7 @@ long udf_block_map(struct inode *inode, sector_t block)
2046 2053
2047 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == 2054 if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
2048 (EXT_RECORDED_ALLOCATED >> 30)) 2055 (EXT_RECORDED_ALLOCATED >> 30))
2049 ret = udf_get_lb_pblock(inode->i_sb, eloc, offset); 2056 ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
2050 else 2057 else
2051 ret = 0; 2058 ret = 0;
2052 2059
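
Two mechanical patterns account for every inode.c hunk: the typedef'd on-disk types (kernel_lb_addr, kernel_long_ad, short_ad, long_ad, tag, regid) gain explicit struct keywords, and kernel_lb_addr parameters move from pass-by-value to pass-by-pointer. A before/after sketch of one affected prototype (illustrative only; the full signatures are in the hunks above):

	/* Before: each extent update copies the location struct into the callee. */
	int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
			      kernel_lb_addr eloc, uint32_t elen, int inc);

	/* After: callers pass &eloc and the callee dereferences it
	 * (eloc->logicalBlockNum, cpu_to_lelb(*eloc)), so the struct is
	 * no longer copied at every call. */
	int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
			      struct kernel_lb_addr *eloc, uint32_t elen, int inc);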
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 84bf0fd4a4f1..9215700c00a4 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -134,10 +134,10 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
 		}
 	}
 	/* rewrite CRC + checksum of eahd */
-	crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag);
+	crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag);
 	eahd->descTag.descCRCLength = cpu_to_le16(crclen);
 	eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
-					sizeof(tag), crclen));
+					sizeof(struct tag), crclen));
 	eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
 	iinfo->i_lenEAttr += size;
 	return (struct genericFormat *)&ea[offset];
@@ -202,7 +202,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
 struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 				    uint32_t location, uint16_t *ident)
 {
-	tag *tag_p;
+	struct tag *tag_p;
 	struct buffer_head *bh = NULL;
 
 	/* Read the block */
@@ -216,7 +216,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 		return NULL;
 	}
 
-	tag_p = (tag *)(bh->b_data);
+	tag_p = (struct tag *)(bh->b_data);
 
 	*ident = le16_to_cpu(tag_p->tagIdent);
 
@@ -241,9 +241,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 	}
 
 	/* Verify the descriptor CRC */
-	if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize ||
+	if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize ||
 	    le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
-					bh->b_data + sizeof(tag),
+					bh->b_data + sizeof(struct tag),
 					le16_to_cpu(tag_p->descCRCLength)))
 		return bh;
 
@@ -255,27 +255,28 @@ error_out:
 	return NULL;
 }
 
-struct buffer_head *udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc,
+struct buffer_head *udf_read_ptagged(struct super_block *sb,
+				     struct kernel_lb_addr *loc,
 				     uint32_t offset, uint16_t *ident)
 {
 	return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset),
-			       loc.logicalBlockNum + offset, ident);
+			       loc->logicalBlockNum + offset, ident);
 }
 
 void udf_update_tag(char *data, int length)
 {
-	tag *tptr = (tag *)data;
-	length -= sizeof(tag);
+	struct tag *tptr = (struct tag *)data;
+	length -= sizeof(struct tag);
 
 	tptr->descCRCLength = cpu_to_le16(length);
-	tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length));
+	tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length));
 	tptr->tagChecksum = udf_tag_checksum(tptr);
 }
 
 void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
 		 uint32_t loc, int length)
 {
-	tag *tptr = (tag *)data;
+	struct tag *tptr = (struct tag *)data;
 	tptr->tagIdent = cpu_to_le16(ident);
 	tptr->descVersion = cpu_to_le16(version);
 	tptr->tagSerialNum = cpu_to_le16(snum);
@@ -283,12 +284,12 @@ void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
 	udf_update_tag(data, length);
 }
 
-u8 udf_tag_checksum(const tag *t)
+u8 udf_tag_checksum(const struct tag *t)
 {
 	u8 *data = (u8 *)t;
 	u8 checksum = 0;
 	int i;
-	for (i = 0; i < sizeof(tag); ++i)
+	for (i = 0; i < sizeof(struct tag); ++i)
 		if (i != 4) /* position of checksum */
 			checksum += data[i];
 	return checksum;
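
udf_tag_checksum(), shown in full in the last hunk, sums the bytes of the descriptor tag while skipping byte 4, where the checksum itself is stored. A standalone restatement of the same arithmetic (sketch; the 16-byte size assumes the on-disk descriptor-tag layout that sizeof(struct tag) describes):

	#include <stdint.h>

	/* Mirrors the loop in udf_tag_checksum() above: byte 4 holds the
	 * checksum, so it is excluded from its own sum. */
	static uint8_t tag_checksum(const uint8_t tag_bytes[16])
	{
		uint8_t checksum = 0;
		int i;

		for (i = 0; i < 16; i++)
			if (i != 4)	/* position of checksum */
				checksum += tag_bytes[i];
		return checksum;
	}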
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f84bfaa8d941..6a29fa34c478 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -47,7 +47,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 		 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh,
 		 uint8_t *impuse, uint8_t *fileident)
 {
-	uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(tag);
+	uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag);
 	uint16_t crc;
 	int offset;
 	uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse);
@@ -99,18 +99,18 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 		memset(fibh->ebh->b_data, 0x00, padlen + offset);
 	}
 
-	crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag),
-			sizeof(struct fileIdentDesc) - sizeof(tag));
+	crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag),
+			sizeof(struct fileIdentDesc) - sizeof(struct tag));
 
 	if (fibh->sbh == fibh->ebh) {
 		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
-				crclen + sizeof(tag) -
+				crclen + sizeof(struct tag) -
 				sizeof(struct fileIdentDesc));
 	} else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
 		crc = crc_itu_t(crc, fibh->ebh->b_data +
 					sizeof(struct fileIdentDesc) +
 					fibh->soffset,
-				crclen + sizeof(tag) -
+				crclen + sizeof(struct tag) -
 				sizeof(struct fileIdentDesc));
 	} else {
 		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
@@ -154,7 +154,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 	uint8_t lfi;
 	uint16_t liu;
 	loff_t size;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -171,12 +171,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 		if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
 		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
 			goto out_err;
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -268,7 +268,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 #ifdef UDF_RECOVERY
 	/* temporary shorthand for specifying files by inode number */
 	if (!strncmp(dentry->d_name.name, ".B=", 3)) {
-		kernel_lb_addr lb = {
+		struct kernel_lb_addr lb = {
 			.logicalBlockNum = 0,
 			.partitionReferenceNum =
 				simple_strtoul(dentry->d_name.name + 3,
@@ -283,11 +283,14 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 #endif /* UDF_RECOVERY */
 
 	if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
+		struct kernel_lb_addr loc;
+
 		if (fibh.sbh != fibh.ebh)
 			brelse(fibh.ebh);
 		brelse(fibh.sbh);
 
-		inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation));
+		loc = lelb_to_cpu(cfi.icb.extLocation);
+		inode = udf_iget(dir->i_sb, &loc);
 		if (!inode) {
 			unlock_kernel();
 			return ERR_PTR(-EACCES);
@@ -313,7 +316,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 	uint8_t lfi;
 	uint16_t liu;
 	int block;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen = 0;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -351,16 +354,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 		if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
 		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
 			block = udf_get_lb_pblock(dir->i_sb,
-					dinfo->i_location, 0);
+					&dinfo->i_location, 0);
 			fibh->soffset = fibh->eoffset = sb->s_blocksize;
 			goto add;
 		}
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -409,10 +412,10 @@ add:
 	if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
 		elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
 		if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-			epos.offset -= sizeof(short_ad);
+			epos.offset -= sizeof(struct short_ad);
 		else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-			epos.offset -= sizeof(long_ad);
-		udf_write_aext(dir, &epos, eloc, elen, 1);
+			epos.offset -= sizeof(struct long_ad);
+		udf_write_aext(dir, &epos, &eloc, elen, 1);
 	}
 	f_pos += nfidlen;
 
@@ -494,10 +497,10 @@ add:
 	memset(cfi, 0, sizeof(struct fileIdentDesc));
 	if (UDF_SB(sb)->s_udfrev >= 0x0200)
 		udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block,
-			    sizeof(tag));
+			    sizeof(struct tag));
 	else
 		udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block,
-			    sizeof(tag));
+			    sizeof(struct tag));
 	cfi->fileVersionNum = cpu_to_le16(1);
 	cfi->lengthFileIdent = namelen;
 	cfi->lengthOfImpUse = cpu_to_le16(0);
@@ -530,7 +533,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
 	cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED;
 
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
-		memset(&(cfi->icb), 0x00, sizeof(long_ad));
+		memset(&(cfi->icb), 0x00, sizeof(struct long_ad));
 
 	return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
 }
@@ -710,7 +713,7 @@ static int empty_dir(struct inode *dir)
 	loff_t f_pos;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
 	int block;
-	kernel_lb_addr eloc;
+	struct kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
 	struct extent_position epos = {};
@@ -724,12 +727,12 @@ static int empty_dir(struct inode *dir)
 	else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
 			    &epos, &eloc, &elen, &offset) ==
 					(EXT_RECORDED_ALLOCATED >> 30)) {
-		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+		block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-				epos.offset -= sizeof(short_ad);
+				epos.offset -= sizeof(struct short_ad);
 			else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-				epos.offset -= sizeof(long_ad);
+				epos.offset -= sizeof(struct long_ad);
 		} else
 			offset = 0;
 
@@ -778,7 +781,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	struct udf_fileident_bh fibh;
 	struct fileIdentDesc *fi, cfi;
-	kernel_lb_addr tloc;
+	struct kernel_lb_addr tloc;
 
 	retval = -ENOENT;
 	lock_kernel();
@@ -788,7 +791,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 
 	retval = -EIO;
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
-	if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+	if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
 		goto end_rmdir;
 	retval = -ENOTEMPTY;
 	if (!empty_dir(inode))
@@ -824,7 +827,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 	struct udf_fileident_bh fibh;
 	struct fileIdentDesc *fi;
 	struct fileIdentDesc cfi;
-	kernel_lb_addr tloc;
+	struct kernel_lb_addr tloc;
 
 	retval = -ENOENT;
 	lock_kernel();
@@ -834,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 
 	retval = -EIO;
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
-	if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+	if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
 		goto end_unlink;
 
 	if (!inode->i_nlink) {
@@ -897,7 +900,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_op = &page_symlink_inode_operations;
 
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
-		kernel_lb_addr eloc;
+		struct kernel_lb_addr eloc;
 		uint32_t bsize;
 
 		block = udf_new_block(inode->i_sb, inode,
@@ -913,7 +916,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 				iinfo->i_location.partitionReferenceNum;
 		bsize = inode->i_sb->s_blocksize;
 		iinfo->i_lenExtents = bsize;
-		udf_add_aext(inode, &epos, eloc, bsize, 0);
+		udf_add_aext(inode, &epos, &eloc, bsize, 0);
 		brelse(epos.bh);
 
 		block = udf_get_pblock(inode->i_sb, block,
@@ -1108,7 +1111,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct fileIdentDesc ocfi, ncfi;
 	struct buffer_head *dir_bh = NULL;
 	int retval = -ENOENT;
-	kernel_lb_addr tloc;
+	struct kernel_lb_addr tloc;
 	struct udf_inode_info *old_iinfo = UDF_I(old_inode);
 
 	lock_kernel();
@@ -1119,7 +1122,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 		brelse(ofibh.sbh);
 	}
 	tloc = lelb_to_cpu(ocfi.icb.extLocation);
-	if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0)
+	if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
 					!= old_inode->i_ino)
 		goto end_rename;
 
@@ -1158,7 +1161,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (!dir_fi)
 			goto end_rename;
 		tloc = lelb_to_cpu(dir_fi->icb.extLocation);
-		if (udf_get_lb_pblock(old_inode->i_sb, tloc, 0) !=
+		if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
 				old_dir->i_ino)
 			goto end_rename;
 
@@ -1187,7 +1190,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 */
 	ncfi.fileVersionNum = ocfi.fileVersionNum;
 	ncfi.fileCharacteristics = ocfi.fileCharacteristics;
-	memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(long_ad));
+	memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad));
 	udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
 
 	/* The old fid may have moved - find it again */
@@ -1242,6 +1245,7 @@ end_rename:
 
 static struct dentry *udf_get_parent(struct dentry *child)
 {
+	struct kernel_lb_addr tloc;
 	struct inode *inode = NULL;
 	struct qstr dotdot = {.name = "..", .len = 2};
 	struct fileIdentDesc cfi;
@@ -1255,8 +1259,8 @@ static struct dentry *udf_get_parent(struct dentry *child)
 		brelse(fibh.ebh);
 	brelse(fibh.sbh);
 
-	inode = udf_iget(child->d_inode->i_sb,
-			 lelb_to_cpu(cfi.icb.extLocation));
+	tloc = lelb_to_cpu(cfi.icb.extLocation);
+	inode = udf_iget(child->d_inode->i_sb, &tloc);
 	if (!inode)
 		goto out_unlock;
 	unlock_kernel();
@@ -1272,14 +1276,14 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
1272 u16 partref, __u32 generation) 1276 u16 partref, __u32 generation)
1273{ 1277{
1274 struct inode *inode; 1278 struct inode *inode;
1275 kernel_lb_addr loc; 1279 struct kernel_lb_addr loc;
1276 1280
1277 if (block == 0) 1281 if (block == 0)
1278 return ERR_PTR(-ESTALE); 1282 return ERR_PTR(-ESTALE);
1279 1283
1280 loc.logicalBlockNum = block; 1284 loc.logicalBlockNum = block;
1281 loc.partitionReferenceNum = partref; 1285 loc.partitionReferenceNum = partref;
1282 inode = udf_iget(sb, loc); 1286 inode = udf_iget(sb, &loc);
1283 1287
1284 if (inode == NULL) 1288 if (inode == NULL)
1285 return ERR_PTR(-ENOMEM); 1289 return ERR_PTR(-ENOMEM);
@@ -1318,7 +1322,7 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1318{ 1322{
1319 int len = *lenp; 1323 int len = *lenp;
1320 struct inode *inode = de->d_inode; 1324 struct inode *inode = de->d_inode;
1321 kernel_lb_addr location = UDF_I(inode)->i_location; 1325 struct kernel_lb_addr location = UDF_I(inode)->i_location;
1322 struct fid *fid = (struct fid *)fh; 1326 struct fid *fid = (struct fid *)fh;
1323 int type = FILEID_UDF_WITHOUT_PARENT; 1327 int type = FILEID_UDF_WITHOUT_PARENT;
1324 1328
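
The namei.c hunks above are one mechanical conversion: the kernel_lb_addr typedef becomes struct kernel_lb_addr, and helpers such as udf_get_lb_pblock(), udf_add_aext() and udf_iget() now take the address by pointer instead of by value, so the small struct is no longer copied at every call site. A minimal userspace sketch of the calling-convention change (the lb_addr type and the get_pblock_*() helpers below are illustrative stand-ins, not the kernel's):

#include <stdint.h>
#include <stdio.h>

struct lb_addr {                        /* stand-in for struct kernel_lb_addr */
        uint32_t logicalBlockNum;
        uint16_t partitionReferenceNum;
};

/* Old style: the whole struct is copied into the callee on every call. */
static uint32_t get_pblock_byval(struct lb_addr loc, uint32_t offset)
{
        return loc.logicalBlockNum + offset;    /* toy mapping */
}

/* New style: only a pointer is passed, no per-call copy. */
static uint32_t get_pblock_byref(const struct lb_addr *loc, uint32_t offset)
{
        return loc->logicalBlockNum + offset;   /* toy mapping */
}

int main(void)
{
        struct lb_addr tloc = { .logicalBlockNum = 100,
                                .partitionReferenceNum = 0 };

        printf("by value: %u\n", get_pblock_byval(tloc, 5));
        printf("by ref:   %u\n", get_pblock_byref(&tloc, 5));
        return 0;
}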
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 65ff47902bd2..fbff74654df2 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -85,7 +85,7 @@ struct appIdentSuffix {
85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ 85/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
86/* Implementation Use (UDF 2.50 2.2.6.4) */ 86/* Implementation Use (UDF 2.50 2.2.6.4) */
87struct logicalVolIntegrityDescImpUse { 87struct logicalVolIntegrityDescImpUse {
88 regid impIdent; 88 struct regid impIdent;
89 __le32 numFiles; 89 __le32 numFiles;
90 __le32 numDirs; 90 __le32 numDirs;
91 __le16 minUDFReadRev; 91 __le16 minUDFReadRev;
@@ -97,12 +97,12 @@ struct logicalVolIntegrityDescImpUse {
97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ 97/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
98/* Implementation Use (UDF 2.50 2.2.7.2) */ 98/* Implementation Use (UDF 2.50 2.2.7.2) */
99struct impUseVolDescImpUse { 99struct impUseVolDescImpUse {
100 charspec LVICharset; 100 struct charspec LVICharset;
101 dstring logicalVolIdent[128]; 101 dstring logicalVolIdent[128];
102 dstring LVInfo1[36]; 102 dstring LVInfo1[36];
103 dstring LVInfo2[36]; 103 dstring LVInfo2[36];
104 dstring LVInfo3[36]; 104 dstring LVInfo3[36];
105 regid impIdent; 105 struct regid impIdent;
106 uint8_t impUse[128]; 106 uint8_t impUse[128];
107} __attribute__ ((packed)); 107} __attribute__ ((packed));
108 108
@@ -110,7 +110,7 @@ struct udfPartitionMap2 {
110 uint8_t partitionMapType; 110 uint8_t partitionMapType;
111 uint8_t partitionMapLength; 111 uint8_t partitionMapLength;
112 uint8_t reserved1[2]; 112 uint8_t reserved1[2];
113 regid partIdent; 113 struct regid partIdent;
114 __le16 volSeqNum; 114 __le16 volSeqNum;
115 __le16 partitionNum; 115 __le16 partitionNum;
116} __attribute__ ((packed)); 116} __attribute__ ((packed));
@@ -120,7 +120,7 @@ struct virtualPartitionMap {
120 uint8_t partitionMapType; 120 uint8_t partitionMapType;
121 uint8_t partitionMapLength; 121 uint8_t partitionMapLength;
122 uint8_t reserved1[2]; 122 uint8_t reserved1[2];
123 regid partIdent; 123 struct regid partIdent;
124 __le16 volSeqNum; 124 __le16 volSeqNum;
125 __le16 partitionNum; 125 __le16 partitionNum;
126 uint8_t reserved2[24]; 126 uint8_t reserved2[24];
@@ -131,7 +131,7 @@ struct sparablePartitionMap {
131 uint8_t partitionMapType; 131 uint8_t partitionMapType;
132 uint8_t partitionMapLength; 132 uint8_t partitionMapLength;
133 uint8_t reserved1[2]; 133 uint8_t reserved1[2];
134 regid partIdent; 134 struct regid partIdent;
135 __le16 volSeqNum; 135 __le16 volSeqNum;
136 __le16 partitionNum; 136 __le16 partitionNum;
137 __le16 packetLength; 137 __le16 packetLength;
@@ -146,7 +146,7 @@ struct metadataPartitionMap {
146 uint8_t partitionMapType; 146 uint8_t partitionMapType;
147 uint8_t partitionMapLength; 147 uint8_t partitionMapLength;
148 uint8_t reserved1[2]; 148 uint8_t reserved1[2];
149 regid partIdent; 149 struct regid partIdent;
150 __le16 volSeqNum; 150 __le16 volSeqNum;
151 __le16 partitionNum; 151 __le16 partitionNum;
152 __le32 metadataFileLoc; 152 __le32 metadataFileLoc;
@@ -161,7 +161,7 @@ struct metadataPartitionMap {
161/* Virtual Allocation Table (UDF 1.5 2.2.10) */ 161/* Virtual Allocation Table (UDF 1.5 2.2.10) */
162struct virtualAllocationTable15 { 162struct virtualAllocationTable15 {
163 __le32 VirtualSector[0]; 163 __le32 VirtualSector[0];
164 regid vatIdent; 164 struct regid vatIdent;
165 __le32 previousVATICBLoc; 165 __le32 previousVATICBLoc;
166} __attribute__ ((packed)); 166} __attribute__ ((packed));
167 167
@@ -192,8 +192,8 @@ struct sparingEntry {
192} __attribute__ ((packed)); 192} __attribute__ ((packed));
193 193
194struct sparingTable { 194struct sparingTable {
195 tag descTag; 195 struct tag descTag;
196 regid sparingIdent; 196 struct regid sparingIdent;
197 __le16 reallocationTableLen; 197 __le16 reallocationTableLen;
198 __le16 reserved; 198 __le16 reserved;
199 __le32 sequenceNum; 199 __le32 sequenceNum;
@@ -206,7 +206,7 @@ struct sparingTable {
206#define ICBTAG_FILE_TYPE_MIRROR 0xFB 206#define ICBTAG_FILE_TYPE_MIRROR 0xFB
207#define ICBTAG_FILE_TYPE_BITMAP 0xFC 207#define ICBTAG_FILE_TYPE_BITMAP 0xFC
208 208
209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ 209/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
210struct allocDescImpUse { 210struct allocDescImpUse {
211 __le16 flags; 211 __le16 flags;
212 uint8_t impUse[4]; 212 uint8_t impUse[4];
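
The osta_udf.h hunks only swap typedef names (regid, tag, charspec) for their struct-tagged equivalents; the on-disk layout is untouched: every descriptor stays __attribute__((packed)) with fixed-width little-endian (__le16/__le32) fields. A self-contained sketch of reading such fields portably, with an invented mini-descriptor in the same style (the byte-assembly readers stand in for the kernel's le16_to_cpu/le32_to_cpu):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented mini-descriptor, shaped like the partition maps above. */
struct demo_map {
        uint8_t  partitionMapType;
        uint8_t  partitionMapLength;
        uint16_t volSeqNum;             /* stored little-endian on disk */
        uint32_t partitionNum;          /* stored little-endian on disk */
} __attribute__((packed));

/* Endian-safe readers: assemble the value byte by byte, so the code
 * behaves the same on little- and big-endian hosts. */
static uint16_t read_le16(const uint8_t *p)
{
        return (uint16_t)(p[0] | (p[1] << 8));
}

static uint32_t read_le32(const uint8_t *p)
{
        return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
               ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main(void)
{
        /* Raw bytes as they would appear on disk. */
        uint8_t disk[8] = { 1, 8, 0x34, 0x12, 0x78, 0x56, 0x34, 0x12 };
        struct demo_map m;

        memcpy(&m, disk, sizeof(m));    /* sizeof(m) == 8 because packed */
        printf("type=%u len=%u seq=0x%04x part=0x%08x\n",
               m.partitionMapType, m.partitionMapLength,
               read_le16(disk + 2), read_le32(disk + 4));
        return 0;
}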
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 96dfd207c3d6..4b540ee632d5 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -273,7 +273,7 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
273{ 273{
274 struct super_block *sb = inode->i_sb; 274 struct super_block *sb = inode->i_sb;
275 struct udf_part_map *map; 275 struct udf_part_map *map;
276 kernel_lb_addr eloc; 276 struct kernel_lb_addr eloc;
277 uint32_t elen; 277 uint32_t elen;
278 sector_t ext_offset; 278 sector_t ext_offset;
279 struct extent_position epos = {}; 279 struct extent_position epos = {};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e25e7010627b..72348cc855a4 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -81,16 +81,13 @@ static char error_buf[1024];
81/* These are the "meat" - everything else is stuffing */ 81/* These are the "meat" - everything else is stuffing */
82static int udf_fill_super(struct super_block *, void *, int); 82static int udf_fill_super(struct super_block *, void *, int);
83static void udf_put_super(struct super_block *); 83static void udf_put_super(struct super_block *);
84static void udf_write_super(struct super_block *); 84static int udf_sync_fs(struct super_block *, int);
85static int udf_remount_fs(struct super_block *, int *, char *); 85static int udf_remount_fs(struct super_block *, int *, char *);
86static int udf_check_valid(struct super_block *, int, int); 86static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
87static int udf_vrs(struct super_block *sb, int silent); 87static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *,
88static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad); 88 struct kernel_lb_addr *);
89static void udf_find_anchor(struct super_block *);
90static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
91 kernel_lb_addr *);
92static void udf_load_fileset(struct super_block *, struct buffer_head *, 89static void udf_load_fileset(struct super_block *, struct buffer_head *,
93 kernel_lb_addr *); 90 struct kernel_lb_addr *);
94static void udf_open_lvid(struct super_block *); 91static void udf_open_lvid(struct super_block *);
95static void udf_close_lvid(struct super_block *); 92static void udf_close_lvid(struct super_block *);
96static unsigned int udf_count_free(struct super_block *); 93static unsigned int udf_count_free(struct super_block *);
@@ -181,7 +178,7 @@ static const struct super_operations udf_sb_ops = {
181 .delete_inode = udf_delete_inode, 178 .delete_inode = udf_delete_inode,
182 .clear_inode = udf_clear_inode, 179 .clear_inode = udf_clear_inode,
183 .put_super = udf_put_super, 180 .put_super = udf_put_super,
184 .write_super = udf_write_super, 181 .sync_fs = udf_sync_fs,
185 .statfs = udf_statfs, 182 .statfs = udf_statfs,
186 .remount_fs = udf_remount_fs, 183 .remount_fs = udf_remount_fs,
187 .show_options = udf_show_options, 184 .show_options = udf_show_options,
@@ -201,6 +198,8 @@ struct udf_options {
201 mode_t umask; 198 mode_t umask;
202 gid_t gid; 199 gid_t gid;
203 uid_t uid; 200 uid_t uid;
201 mode_t fmode;
202 mode_t dmode;
204 struct nls_table *nls_map; 203 struct nls_table *nls_map;
205}; 204};
206 205
@@ -258,7 +257,7 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
258 257
259 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) 258 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
260 seq_puts(seq, ",nostrict"); 259 seq_puts(seq, ",nostrict");
261 if (sb->s_blocksize != UDF_DEFAULT_BLOCKSIZE) 260 if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET))
262 seq_printf(seq, ",bs=%lu", sb->s_blocksize); 261 seq_printf(seq, ",bs=%lu", sb->s_blocksize);
263 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) 262 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
264 seq_puts(seq, ",unhide"); 263 seq_puts(seq, ",unhide");
@@ -282,18 +281,16 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
282 seq_printf(seq, ",gid=%u", sbi->s_gid); 281 seq_printf(seq, ",gid=%u", sbi->s_gid);
283 if (sbi->s_umask != 0) 282 if (sbi->s_umask != 0)
284 seq_printf(seq, ",umask=%o", sbi->s_umask); 283 seq_printf(seq, ",umask=%o", sbi->s_umask);
284 if (sbi->s_fmode != UDF_INVALID_MODE)
285 seq_printf(seq, ",mode=%o", sbi->s_fmode);
286 if (sbi->s_dmode != UDF_INVALID_MODE)
287 seq_printf(seq, ",dmode=%o", sbi->s_dmode);
285 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) 288 if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
286 seq_printf(seq, ",session=%u", sbi->s_session); 289 seq_printf(seq, ",session=%u", sbi->s_session);
287 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) 290 if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
288 seq_printf(seq, ",lastblock=%u", sbi->s_last_block); 291 seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
289 /* 292 if (sbi->s_anchor != 0)
290 * s_anchor[2] could be zeroed out in case there is no anchor 293 seq_printf(seq, ",anchor=%u", sbi->s_anchor);
291 * in the specified block, but then the "anchor=N" option
292 * originally given by the user wasn't effective, so it's OK
293 * if we don't show it.
294 */
295 if (sbi->s_anchor[2] != 0)
296 seq_printf(seq, ",anchor=%u", sbi->s_anchor[2]);
297 /* 294 /*
298 * volume, partition, fileset and rootdir seem to be ignored 295 * volume, partition, fileset and rootdir seem to be ignored
299 * currently 296 * currently
@@ -317,6 +314,8 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
317 * 314 *
318 * gid= Set the default group. 315 * gid= Set the default group.
319 * umask= Set the default umask. 316 * umask= Set the default umask.
317 * mode= Set the default file permissions.
318 * dmode= Set the default directory permissions.
320 * uid= Set the default user. 319 * uid= Set the default user.
321 * bs= Set the block size. 320 * bs= Set the block size.
322 * unhide Show otherwise hidden files. 321 * unhide Show otherwise hidden files.
@@ -366,7 +365,8 @@ enum {
366 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, 365 Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
367 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, 366 Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
368 Opt_rootdir, Opt_utf8, Opt_iocharset, 367 Opt_rootdir, Opt_utf8, Opt_iocharset,
369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore 368 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
369 Opt_fmode, Opt_dmode
370}; 370};
371 371
372static const match_table_t tokens = { 372static const match_table_t tokens = {
@@ -395,6 +395,8 @@ static const match_table_t tokens = {
395 {Opt_rootdir, "rootdir=%u"}, 395 {Opt_rootdir, "rootdir=%u"},
396 {Opt_utf8, "utf8"}, 396 {Opt_utf8, "utf8"},
397 {Opt_iocharset, "iocharset=%s"}, 397 {Opt_iocharset, "iocharset=%s"},
398 {Opt_fmode, "mode=%o"},
399 {Opt_dmode, "dmode=%o"},
398 {Opt_err, NULL} 400 {Opt_err, NULL}
399}; 401};
400 402
@@ -405,7 +407,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
405 int option; 407 int option;
406 408
407 uopt->novrs = 0; 409 uopt->novrs = 0;
408 uopt->blocksize = UDF_DEFAULT_BLOCKSIZE;
409 uopt->partition = 0xFFFF; 410 uopt->partition = 0xFFFF;
410 uopt->session = 0xFFFFFFFF; 411 uopt->session = 0xFFFFFFFF;
411 uopt->lastblock = 0; 412 uopt->lastblock = 0;
@@ -428,10 +429,12 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
428 switch (token) { 429 switch (token) {
429 case Opt_novrs: 430 case Opt_novrs:
430 uopt->novrs = 1; 431 uopt->novrs = 1;
432 break;
431 case Opt_bs: 433 case Opt_bs:
432 if (match_int(&args[0], &option)) 434 if (match_int(&args[0], &option))
433 return 0; 435 return 0;
434 uopt->blocksize = option; 436 uopt->blocksize = option;
437 uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
435 break; 438 break;
436 case Opt_unhide: 439 case Opt_unhide:
437 uopt->flags |= (1 << UDF_FLAG_UNHIDE); 440 uopt->flags |= (1 << UDF_FLAG_UNHIDE);
@@ -531,6 +534,16 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
531 case Opt_gforget: 534 case Opt_gforget:
532 uopt->flags |= (1 << UDF_FLAG_GID_FORGET); 535 uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
533 break; 536 break;
537 case Opt_fmode:
538 if (match_octal(args, &option))
539 return 0;
540 uopt->fmode = option & 0777;
541 break;
542 case Opt_dmode:
543 if (match_octal(args, &option))
544 return 0;
545 uopt->dmode = option & 0777;
546 break;
534 default: 547 default:
535 printk(KERN_ERR "udf: bad mount option \"%s\" " 548 printk(KERN_ERR "udf: bad mount option \"%s\" "
536 "or missing value\n", p); 549 "or missing value\n", p);
@@ -540,17 +553,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
540 return 1; 553 return 1;
541} 554}
542 555
543static void udf_write_super(struct super_block *sb)
544{
545 lock_kernel();
546
547 if (!(sb->s_flags & MS_RDONLY))
548 udf_open_lvid(sb);
549 sb->s_dirt = 0;
550
551 unlock_kernel();
552}
553
554static int udf_remount_fs(struct super_block *sb, int *flags, char *options) 556static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
555{ 557{
556 struct udf_options uopt; 558 struct udf_options uopt;
@@ -560,6 +562,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
560 uopt.uid = sbi->s_uid; 562 uopt.uid = sbi->s_uid;
561 uopt.gid = sbi->s_gid; 563 uopt.gid = sbi->s_gid;
562 uopt.umask = sbi->s_umask; 564 uopt.umask = sbi->s_umask;
565 uopt.fmode = sbi->s_fmode;
566 uopt.dmode = sbi->s_dmode;
563 567
564 if (!udf_parse_options(options, &uopt, true)) 568 if (!udf_parse_options(options, &uopt, true))
565 return -EINVAL; 569 return -EINVAL;
@@ -568,6 +572,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
568 sbi->s_uid = uopt.uid; 572 sbi->s_uid = uopt.uid;
569 sbi->s_gid = uopt.gid; 573 sbi->s_gid = uopt.gid;
570 sbi->s_umask = uopt.umask; 574 sbi->s_umask = uopt.umask;
575 sbi->s_fmode = uopt.fmode;
576 sbi->s_dmode = uopt.dmode;
571 577
572 if (sbi->s_lvid_bh) { 578 if (sbi->s_lvid_bh) {
573 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); 579 int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -585,22 +591,19 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
585 return 0; 591 return 0;
586} 592}
587 593
588static int udf_vrs(struct super_block *sb, int silent) 594/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
595/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
596static loff_t udf_check_vsd(struct super_block *sb)
589{ 597{
590 struct volStructDesc *vsd = NULL; 598 struct volStructDesc *vsd = NULL;
591 loff_t sector = 32768; 599 loff_t sector = 32768;
592 int sectorsize; 600 int sectorsize;
593 struct buffer_head *bh = NULL; 601 struct buffer_head *bh = NULL;
594 int iso9660 = 0;
595 int nsr02 = 0; 602 int nsr02 = 0;
596 int nsr03 = 0; 603 int nsr03 = 0;
597 struct udf_sb_info *sbi; 604 struct udf_sb_info *sbi;
598 605
599 /* Block size must be a multiple of 512 */
600 if (sb->s_blocksize & 511)
601 return 0;
602 sbi = UDF_SB(sb); 606 sbi = UDF_SB(sb);
603
604 if (sb->s_blocksize < sizeof(struct volStructDesc)) 607 if (sb->s_blocksize < sizeof(struct volStructDesc))
605 sectorsize = sizeof(struct volStructDesc); 608 sectorsize = sizeof(struct volStructDesc);
606 else 609 else
@@ -627,7 +630,6 @@ static int udf_vrs(struct super_block *sb, int silent)
627 break; 630 break;
628 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, 631 } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
629 VSD_STD_ID_LEN)) { 632 VSD_STD_ID_LEN)) {
630 iso9660 = sector;
631 switch (vsd->structType) { 633 switch (vsd->structType) {
632 case 0: 634 case 0:
633 udf_debug("ISO9660 Boot Record found\n"); 635 udf_debug("ISO9660 Boot Record found\n");
@@ -679,139 +681,9 @@ static int udf_vrs(struct super_block *sb, int silent)
679 return 0; 681 return 0;
680} 682}
681 683
682/*
683 * Check whether there is an anchor block in the given block
684 */
685static int udf_check_anchor_block(struct super_block *sb, sector_t block)
686{
687 struct buffer_head *bh;
688 uint16_t ident;
689
690 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
691 udf_fixed_to_variable(block) >=
692 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
693 return 0;
694
695 bh = udf_read_tagged(sb, block, block, &ident);
696 if (!bh)
697 return 0;
698 brelse(bh);
699
700 return ident == TAG_IDENT_AVDP;
701}
702
703/* Search for an anchor volume descriptor pointer */
704static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
705{
706 sector_t last[6];
707 int i;
708 struct udf_sb_info *sbi = UDF_SB(sb);
709
710 last[0] = lastblock;
711 last[1] = last[0] - 1;
712 last[2] = last[0] + 1;
713 last[3] = last[0] - 2;
714 last[4] = last[0] - 150;
715 last[5] = last[0] - 152;
716
717 /* according to spec, anchor is in either:
718 * block 256
719 * lastblock-256
720 * lastblock
721 * however, if the disc isn't closed, it could be 512 */
722
723 for (i = 0; i < ARRAY_SIZE(last); i++) {
724 if (last[i] < 0)
725 continue;
726 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
727 sb->s_blocksize_bits)
728 continue;
729
730 if (udf_check_anchor_block(sb, last[i])) {
731 sbi->s_anchor[0] = last[i];
732 sbi->s_anchor[1] = last[i] - 256;
733 return last[i];
734 }
735
736 if (last[i] < 256)
737 continue;
738
739 if (udf_check_anchor_block(sb, last[i] - 256)) {
740 sbi->s_anchor[1] = last[i] - 256;
741 return last[i];
742 }
743 }
744
745 if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
746 sbi->s_anchor[0] = sbi->s_session + 256;
747 return last[0];
748 }
749 if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
750 sbi->s_anchor[0] = sbi->s_session + 512;
751 return last[0];
752 }
753 return 0;
754}
755
756/*
757 * Find an anchor volume descriptor. The function expects sbi->s_lastblock to
758 * be the last block on the media.
759 *
760 * Return 1 if not found, 0 if ok
761 *
762 */
763static void udf_find_anchor(struct super_block *sb)
764{
765 sector_t lastblock;
766 struct buffer_head *bh = NULL;
767 uint16_t ident;
768 int i;
769 struct udf_sb_info *sbi = UDF_SB(sb);
770
771 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
772 if (lastblock)
773 goto check_anchor;
774
775 /* No anchor found? Try VARCONV conversion of block numbers */
776 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
777 /* Firstly, we try to not convert number of the last block */
778 lastblock = udf_scan_anchors(sb,
779 udf_variable_to_fixed(sbi->s_last_block));
780 if (lastblock)
781 goto check_anchor;
782
783 /* Secondly, we try with converted number of the last block */
784 lastblock = udf_scan_anchors(sb, sbi->s_last_block);
785 if (!lastblock) {
786 /* VARCONV didn't help. Clear it. */
787 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
788 }
789
790check_anchor:
791 /*
792 * Check located anchors and the anchor block supplied via
793 * mount options
794 */
795 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
796 if (!sbi->s_anchor[i])
797 continue;
798 bh = udf_read_tagged(sb, sbi->s_anchor[i],
799 sbi->s_anchor[i], &ident);
800 if (!bh)
801 sbi->s_anchor[i] = 0;
802 else {
803 brelse(bh);
804 if (ident != TAG_IDENT_AVDP)
805 sbi->s_anchor[i] = 0;
806 }
807 }
808
809 sbi->s_last_block = lastblock;
810}
811
812static int udf_find_fileset(struct super_block *sb, 684static int udf_find_fileset(struct super_block *sb,
813 kernel_lb_addr *fileset, 685 struct kernel_lb_addr *fileset,
814 kernel_lb_addr *root) 686 struct kernel_lb_addr *root)
815{ 687{
816 struct buffer_head *bh = NULL; 688 struct buffer_head *bh = NULL;
817 long lastblock; 689 long lastblock;
@@ -820,7 +692,7 @@ static int udf_find_fileset(struct super_block *sb,
820 692
821 if (fileset->logicalBlockNum != 0xFFFFFFFF || 693 if (fileset->logicalBlockNum != 0xFFFFFFFF ||
822 fileset->partitionReferenceNum != 0xFFFF) { 694 fileset->partitionReferenceNum != 0xFFFF) {
823 bh = udf_read_ptagged(sb, *fileset, 0, &ident); 695 bh = udf_read_ptagged(sb, fileset, 0, &ident);
824 696
825 if (!bh) { 697 if (!bh) {
826 return 1; 698 return 1;
@@ -834,7 +706,7 @@ static int udf_find_fileset(struct super_block *sb,
834 sbi = UDF_SB(sb); 706 sbi = UDF_SB(sb);
835 if (!bh) { 707 if (!bh) {
836 /* Search backwards through the partitions */ 708 /* Search backwards through the partitions */
837 kernel_lb_addr newfileset; 709 struct kernel_lb_addr newfileset;
838 710
839/* --> cvg: FIXME - is it reasonable? */ 711/* --> cvg: FIXME - is it reasonable? */
840 return 1; 712 return 1;
@@ -850,7 +722,7 @@ static int udf_find_fileset(struct super_block *sb,
850 newfileset.logicalBlockNum = 0; 722 newfileset.logicalBlockNum = 0;
851 723
852 do { 724 do {
853 bh = udf_read_ptagged(sb, newfileset, 0, 725 bh = udf_read_ptagged(sb, &newfileset, 0,
854 &ident); 726 &ident);
855 if (!bh) { 727 if (!bh) {
856 newfileset.logicalBlockNum++; 728 newfileset.logicalBlockNum++;
@@ -902,14 +774,23 @@ static int udf_find_fileset(struct super_block *sb,
902static int udf_load_pvoldesc(struct super_block *sb, sector_t block) 774static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
903{ 775{
904 struct primaryVolDesc *pvoldesc; 776 struct primaryVolDesc *pvoldesc;
905 struct ustr instr; 777 struct ustr *instr, *outstr;
906 struct ustr outstr;
907 struct buffer_head *bh; 778 struct buffer_head *bh;
908 uint16_t ident; 779 uint16_t ident;
780 int ret = 1;
781
782 instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
783 if (!instr)
784 return 1;
785
786 outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
787 if (!outstr)
788 goto out1;
909 789
910 bh = udf_read_tagged(sb, block, block, &ident); 790 bh = udf_read_tagged(sb, block, block, &ident);
911 if (!bh) 791 if (!bh)
912 return 1; 792 goto out2;
793
913 BUG_ON(ident != TAG_IDENT_PVD); 794 BUG_ON(ident != TAG_IDENT_PVD);
914 795
915 pvoldesc = (struct primaryVolDesc *)bh->b_data; 796 pvoldesc = (struct primaryVolDesc *)bh->b_data;
@@ -917,7 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
917 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, 798 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
918 pvoldesc->recordingDateAndTime)) { 799 pvoldesc->recordingDateAndTime)) {
919#ifdef UDFFS_DEBUG 800#ifdef UDFFS_DEBUG
920 timestamp *ts = &pvoldesc->recordingDateAndTime; 801 struct timestamp *ts = &pvoldesc->recordingDateAndTime;
921 udf_debug("recording time %04u/%02u/%02u" 802 udf_debug("recording time %04u/%02u/%02u"
922 " %02u:%02u (%x)\n", 803 " %02u:%02u (%x)\n",
923 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, 804 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
@@ -925,20 +806,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
925#endif 806#endif
926 } 807 }
927 808
928 if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32)) 809 if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
929 if (udf_CS0toUTF8(&outstr, &instr)) { 810 if (udf_CS0toUTF8(outstr, instr)) {
930 strncpy(UDF_SB(sb)->s_volume_ident, outstr.u_name, 811 strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
931 outstr.u_len > 31 ? 31 : outstr.u_len); 812 outstr->u_len > 31 ? 31 : outstr->u_len);
932 udf_debug("volIdent[] = '%s'\n", 813 udf_debug("volIdent[] = '%s'\n",
933 UDF_SB(sb)->s_volume_ident); 814 UDF_SB(sb)->s_volume_ident);
934 } 815 }
935 816
936 if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128)) 817 if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
937 if (udf_CS0toUTF8(&outstr, &instr)) 818 if (udf_CS0toUTF8(outstr, instr))
938 udf_debug("volSetIdent[] = '%s'\n", outstr.u_name); 819 udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
939 820
940 brelse(bh); 821 brelse(bh);
941 return 0; 822 ret = 0;
823out2:
824 kfree(outstr);
825out1:
826 kfree(instr);
827 return ret;
942} 828}
943 829
944static int udf_load_metadata_files(struct super_block *sb, int partition) 830static int udf_load_metadata_files(struct super_block *sb, int partition)
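
udf_load_pvoldesc() now kmalloc()s its two struct ustr scratch buffers instead of keeping them on the stack, and unwinds through the usual goto-cleanup ladder: each later failure path frees strictly more than the earlier one. A userspace sketch of that error-handling shape with malloc/free (the buffer size and read_ident() helper are made up for the demo):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUF_SIZE 256                    /* stand-in for sizeof(struct ustr) */

/* Stand-in for udf_read_tagged() + udf_CS0toUTF8(); always succeeds here. */
static int read_ident(char *buf)
{
        strcpy(buf, "DEMO_VOLUME");
        return 1;
}

static int load_volume_ident(char *ident, size_t len)
{
        char *instr, *outstr;
        int ret = 1;                    /* pessimistic default, as in the patch */

        instr = malloc(BUF_SIZE);
        if (!instr)
                return 1;               /* nothing to clean up yet */

        outstr = malloc(BUF_SIZE);
        if (!outstr)
                goto out1;              /* free instr only */

        if (!read_ident(instr))
                goto out2;              /* free both buffers */

        snprintf(outstr, BUF_SIZE, "%s", instr);
        snprintf(ident, len, "%s", outstr);
        ret = 0;                        /* success */
out2:
        free(outstr);
out1:
        free(instr);
        return ret;
}

int main(void)
{
        char ident[32];

        if (load_volume_ident(ident, sizeof(ident)) == 0)
                printf("volIdent = '%s'\n", ident);
        return 0;
}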
@@ -946,7 +832,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
946 struct udf_sb_info *sbi = UDF_SB(sb); 832 struct udf_sb_info *sbi = UDF_SB(sb);
947 struct udf_part_map *map; 833 struct udf_part_map *map;
948 struct udf_meta_data *mdata; 834 struct udf_meta_data *mdata;
949 kernel_lb_addr addr; 835 struct kernel_lb_addr addr;
950 int fe_error = 0; 836 int fe_error = 0;
951 837
952 map = &sbi->s_partmaps[partition]; 838 map = &sbi->s_partmaps[partition];
@@ -959,7 +845,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
959 udf_debug("Metadata file location: block = %d part = %d\n", 845 udf_debug("Metadata file location: block = %d part = %d\n",
960 addr.logicalBlockNum, addr.partitionReferenceNum); 846 addr.logicalBlockNum, addr.partitionReferenceNum);
961 847
962 mdata->s_metadata_fe = udf_iget(sb, addr); 848 mdata->s_metadata_fe = udf_iget(sb, &addr);
963 849
964 if (mdata->s_metadata_fe == NULL) { 850 if (mdata->s_metadata_fe == NULL) {
965 udf_warning(sb, __func__, "metadata inode efe not found, " 851 udf_warning(sb, __func__, "metadata inode efe not found, "
@@ -981,7 +867,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
981 udf_debug("Mirror metadata file location: block = %d part = %d\n", 867 udf_debug("Mirror metadata file location: block = %d part = %d\n",
982 addr.logicalBlockNum, addr.partitionReferenceNum); 868 addr.logicalBlockNum, addr.partitionReferenceNum);
983 869
984 mdata->s_mirror_fe = udf_iget(sb, addr); 870 mdata->s_mirror_fe = udf_iget(sb, &addr);
985 871
986 if (mdata->s_mirror_fe == NULL) { 872 if (mdata->s_mirror_fe == NULL) {
987 if (fe_error) { 873 if (fe_error) {
@@ -1013,7 +899,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
1013 udf_debug("Bitmap file location: block = %d part = %d\n", 899 udf_debug("Bitmap file location: block = %d part = %d\n",
1014 addr.logicalBlockNum, addr.partitionReferenceNum); 900 addr.logicalBlockNum, addr.partitionReferenceNum);
1015 901
1016 mdata->s_bitmap_fe = udf_iget(sb, addr); 902 mdata->s_bitmap_fe = udf_iget(sb, &addr);
1017 903
1018 if (mdata->s_bitmap_fe == NULL) { 904 if (mdata->s_bitmap_fe == NULL) {
1019 if (sb->s_flags & MS_RDONLY) 905 if (sb->s_flags & MS_RDONLY)
@@ -1037,7 +923,7 @@ error_exit:
1037} 923}
1038 924
1039static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, 925static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
1040 kernel_lb_addr *root) 926 struct kernel_lb_addr *root)
1041{ 927{
1042 struct fileSetDesc *fset; 928 struct fileSetDesc *fset;
1043 929
@@ -1119,13 +1005,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1119 1005
1120 phd = (struct partitionHeaderDesc *)p->partitionContentsUse; 1006 phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
1121 if (phd->unallocSpaceTable.extLength) { 1007 if (phd->unallocSpaceTable.extLength) {
1122 kernel_lb_addr loc = { 1008 struct kernel_lb_addr loc = {
1123 .logicalBlockNum = le32_to_cpu( 1009 .logicalBlockNum = le32_to_cpu(
1124 phd->unallocSpaceTable.extPosition), 1010 phd->unallocSpaceTable.extPosition),
1125 .partitionReferenceNum = p_index, 1011 .partitionReferenceNum = p_index,
1126 }; 1012 };
1127 1013
1128 map->s_uspace.s_table = udf_iget(sb, loc); 1014 map->s_uspace.s_table = udf_iget(sb, &loc);
1129 if (!map->s_uspace.s_table) { 1015 if (!map->s_uspace.s_table) {
1130 udf_debug("cannot load unallocSpaceTable (part %d)\n", 1016 udf_debug("cannot load unallocSpaceTable (part %d)\n",
1131 p_index); 1017 p_index);
@@ -1154,13 +1040,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1154 udf_debug("partitionIntegrityTable (part %d)\n", p_index); 1040 udf_debug("partitionIntegrityTable (part %d)\n", p_index);
1155 1041
1156 if (phd->freedSpaceTable.extLength) { 1042 if (phd->freedSpaceTable.extLength) {
1157 kernel_lb_addr loc = { 1043 struct kernel_lb_addr loc = {
1158 .logicalBlockNum = le32_to_cpu( 1044 .logicalBlockNum = le32_to_cpu(
1159 phd->freedSpaceTable.extPosition), 1045 phd->freedSpaceTable.extPosition),
1160 .partitionReferenceNum = p_index, 1046 .partitionReferenceNum = p_index,
1161 }; 1047 };
1162 1048
1163 map->s_fspace.s_table = udf_iget(sb, loc); 1049 map->s_fspace.s_table = udf_iget(sb, &loc);
1164 if (!map->s_fspace.s_table) { 1050 if (!map->s_fspace.s_table) {
1165 udf_debug("cannot load freedSpaceTable (part %d)\n", 1051 udf_debug("cannot load freedSpaceTable (part %d)\n",
1166 p_index); 1052 p_index);
@@ -1192,7 +1078,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1192{ 1078{
1193 struct udf_sb_info *sbi = UDF_SB(sb); 1079 struct udf_sb_info *sbi = UDF_SB(sb);
1194 struct udf_part_map *map = &sbi->s_partmaps[p_index]; 1080 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1195 kernel_lb_addr ino; 1081 struct kernel_lb_addr ino;
1196 struct buffer_head *bh = NULL; 1082 struct buffer_head *bh = NULL;
1197 struct udf_inode_info *vati; 1083 struct udf_inode_info *vati;
1198 uint32_t pos; 1084 uint32_t pos;
@@ -1201,7 +1087,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1201 /* VAT file entry is in the last recorded block */ 1087 /* VAT file entry is in the last recorded block */
1202 ino.partitionReferenceNum = type1_index; 1088 ino.partitionReferenceNum = type1_index;
1203 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; 1089 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
1204 sbi->s_vat_inode = udf_iget(sb, ino); 1090 sbi->s_vat_inode = udf_iget(sb, &ino);
1205 if (!sbi->s_vat_inode) 1091 if (!sbi->s_vat_inode)
1206 return 1; 1092 return 1;
1207 1093
@@ -1322,7 +1208,7 @@ out_bh:
1322} 1208}
1323 1209
1324static int udf_load_logicalvol(struct super_block *sb, sector_t block, 1210static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1325 kernel_lb_addr *fileset) 1211 struct kernel_lb_addr *fileset)
1326{ 1212{
1327 struct logicalVolDesc *lvd; 1213 struct logicalVolDesc *lvd;
1328 int i, j, offset; 1214 int i, j, offset;
@@ -1471,7 +1357,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1471 } 1357 }
1472 1358
1473 if (fileset) { 1359 if (fileset) {
1474 long_ad *la = (long_ad *)&(lvd->logicalVolContentsUse[0]); 1360 struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
1475 1361
1476 *fileset = lelb_to_cpu(la->extLocation); 1362 *fileset = lelb_to_cpu(la->extLocation);
1477 udf_debug("FileSet found in LogicalVolDesc at block=%d, " 1363 udf_debug("FileSet found in LogicalVolDesc at block=%d, "
@@ -1490,7 +1376,7 @@ out_bh:
1490 * udf_load_logicalvolint 1376 * udf_load_logicalvolint
1491 * 1377 *
1492 */ 1378 */
1493static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc) 1379static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
1494{ 1380{
1495 struct buffer_head *bh = NULL; 1381 struct buffer_head *bh = NULL;
1496 uint16_t ident; 1382 uint16_t ident;
@@ -1533,7 +1419,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
1533 * Written, tested, and released. 1419 * Written, tested, and released.
1534 */ 1420 */
1535static noinline int udf_process_sequence(struct super_block *sb, long block, 1421static noinline int udf_process_sequence(struct super_block *sb, long block,
1536 long lastblock, kernel_lb_addr *fileset) 1422 long lastblock, struct kernel_lb_addr *fileset)
1537{ 1423{
1538 struct buffer_head *bh = NULL; 1424 struct buffer_head *bh = NULL;
1539 struct udf_vds_record vds[VDS_POS_LENGTH]; 1425 struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1655,85 +1541,199 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
1655 return 0; 1541 return 0;
1656} 1542}
1657 1543
1544static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
1545 struct kernel_lb_addr *fileset)
1546{
1547 struct anchorVolDescPtr *anchor;
1548 long main_s, main_e, reserve_s, reserve_e;
1549 struct udf_sb_info *sbi;
1550
1551 sbi = UDF_SB(sb);
1552 anchor = (struct anchorVolDescPtr *)bh->b_data;
1553
1554 /* Locate the main sequence */
1555 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
1556 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
1557 main_e = main_e >> sb->s_blocksize_bits;
1558 main_e += main_s;
1559
1560 /* Locate the reserve sequence */
1561 reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
1562 reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength);
1563 reserve_e = reserve_e >> sb->s_blocksize_bits;
1564 reserve_e += reserve_s;
1565
1566 /* Process the main & reserve sequences */
1567 /* responsible for finding the PartitionDesc(s) */
1568 if (!udf_process_sequence(sb, main_s, main_e, fileset))
1569 return 1;
1570 return !udf_process_sequence(sb, reserve_s, reserve_e, fileset);
1571}
1572
1658/* 1573/*
1659 * udf_check_valid() 1574 * Check whether there is an anchor block in the given block and
1575 * load Volume Descriptor Sequence if so.
1660 */ 1576 */
1661static int udf_check_valid(struct super_block *sb, int novrs, int silent) 1577static int udf_check_anchor_block(struct super_block *sb, sector_t block,
1578 struct kernel_lb_addr *fileset)
1662{ 1579{
1663 long block; 1580 struct buffer_head *bh;
1664 struct udf_sb_info *sbi = UDF_SB(sb); 1581 uint16_t ident;
1582 int ret;
1665 1583
1666 if (novrs) { 1584 if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
1667 udf_debug("Validity check skipped because of novrs option\n"); 1585 udf_fixed_to_variable(block) >=
1586 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
1587 return 0;
1588
1589 bh = udf_read_tagged(sb, block, block, &ident);
1590 if (!bh)
1591 return 0;
1592 if (ident != TAG_IDENT_AVDP) {
1593 brelse(bh);
1668 return 0; 1594 return 0;
1669 } 1595 }
1670 /* Check that it is NSR02 compliant */ 1596 ret = udf_load_sequence(sb, bh, fileset);
1671 /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ 1597 brelse(bh);
1672 block = udf_vrs(sb, silent); 1598 return ret;
1673 if (block == -1)
1674 udf_debug("Failed to read byte 32768. Assuming open "
1675 "disc. Skipping validity check\n");
1676 if (block && !sbi->s_last_block)
1677 sbi->s_last_block = udf_get_last_block(sb);
1678 return !block;
1679} 1599}
1680 1600
1681static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset) 1601/* Search for an anchor volume descriptor pointer */
1602static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
1603 struct kernel_lb_addr *fileset)
1682{ 1604{
1683 struct anchorVolDescPtr *anchor; 1605 sector_t last[6];
1684 uint16_t ident;
1685 struct buffer_head *bh;
1686 long main_s, main_e, reserve_s, reserve_e;
1687 int i; 1606 int i;
1688 struct udf_sb_info *sbi; 1607 struct udf_sb_info *sbi = UDF_SB(sb);
1689 1608 int last_count = 0;
1690 if (!sb)
1691 return 1;
1692 sbi = UDF_SB(sb);
1693 1609
1694 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { 1610 /* First try user provided anchor */
1695 if (!sbi->s_anchor[i]) 1611 if (sbi->s_anchor) {
1612 if (udf_check_anchor_block(sb, sbi->s_anchor, fileset))
1613 return lastblock;
1614 }
1615 /*
1616 * according to spec, anchor is in either:
1617 * block 256
1618 * lastblock-256
1619 * lastblock
1620 * however, if the disc isn't closed, it could be 512.
1621 */
1622 if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset))
1623 return lastblock;
1624 /*
1625 * The trouble is which block is the last one. Drives often misreport
1626 * this so we try various possibilities.
1627 */
1628 last[last_count++] = lastblock;
1629 if (lastblock >= 1)
1630 last[last_count++] = lastblock - 1;
1631 last[last_count++] = lastblock + 1;
1632 if (lastblock >= 2)
1633 last[last_count++] = lastblock - 2;
1634 if (lastblock >= 150)
1635 last[last_count++] = lastblock - 150;
1636 if (lastblock >= 152)
1637 last[last_count++] = lastblock - 152;
1638
1639 for (i = 0; i < last_count; i++) {
1640 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
1641 sb->s_blocksize_bits)
1696 continue; 1642 continue;
1697 1643 if (udf_check_anchor_block(sb, last[i], fileset))
1698 bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i], 1644 return last[i];
1699 &ident); 1645 if (last[i] < 256)
1700 if (!bh)
1701 continue; 1646 continue;
1647 if (udf_check_anchor_block(sb, last[i] - 256, fileset))
1648 return last[i];
1649 }
1702 1650
1703 anchor = (struct anchorVolDescPtr *)bh->b_data; 1651 /* Finally try block 512 in case media is open */
1652 if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset))
1653 return last[0];
1654 return 0;
1655}
1704 1656
1705 /* Locate the main sequence */ 1657/*
1706 main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); 1658 * Find an anchor volume descriptor and load Volume Descriptor Sequence from
1707 main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); 1659 * area specified by it. The function expects sbi->s_lastblock to be the last
1708 main_e = main_e >> sb->s_blocksize_bits; 1660 * block on the media.
1709 main_e += main_s; 1661 *
1662 * Return 1 if ok, 0 if not found.
1663 *
1664 */
1665static int udf_find_anchor(struct super_block *sb,
1666 struct kernel_lb_addr *fileset)
1667{
1668 sector_t lastblock;
1669 struct udf_sb_info *sbi = UDF_SB(sb);
1710 1670
1711 /* Locate the reserve sequence */ 1671 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1712 reserve_s = le32_to_cpu( 1672 if (lastblock)
1713 anchor->reserveVolDescSeqExt.extLocation); 1673 goto out;
1714 reserve_e = le32_to_cpu(
1715 anchor->reserveVolDescSeqExt.extLength);
1716 reserve_e = reserve_e >> sb->s_blocksize_bits;
1717 reserve_e += reserve_s;
1718 1674
1719 brelse(bh); 1675 /* No anchor found? Try VARCONV conversion of block numbers */
1676 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
1677 /* Firstly, we try to not convert number of the last block */
1678 lastblock = udf_scan_anchors(sb,
1679 udf_variable_to_fixed(sbi->s_last_block),
1680 fileset);
1681 if (lastblock)
1682 goto out;
1720 1683
1721 /* Process the main & reserve sequences */ 1684 /* Secondly, we try with converted number of the last block */
1722 /* responsible for finding the PartitionDesc(s) */ 1685 lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
1723 if (!(udf_process_sequence(sb, main_s, main_e, 1686 if (!lastblock) {
1724 fileset) && 1687 /* VARCONV didn't help. Clear it. */
1725 udf_process_sequence(sb, reserve_s, reserve_e, 1688 UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
1726 fileset))) 1689 return 0;
1727 break;
1728 } 1690 }
1691out:
1692 sbi->s_last_block = lastblock;
1693 return 1;
1694}
1729 1695
1730 if (i == ARRAY_SIZE(sbi->s_anchor)) { 1696/*
1731 udf_debug("No Anchor block found\n"); 1697 * Check Volume Structure Descriptor, find Anchor block and load Volume
1732 return 1; 1698 * Descriptor Sequence
1699 */
1700static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
1701 int silent, struct kernel_lb_addr *fileset)
1702{
1703 struct udf_sb_info *sbi = UDF_SB(sb);
1704 loff_t nsr_off;
1705
1706 if (!sb_set_blocksize(sb, uopt->blocksize)) {
1707 if (!silent)
1708 printk(KERN_WARNING "UDF-fs: Bad block size\n");
1709 return 0;
1710 }
1711 sbi->s_last_block = uopt->lastblock;
1712 if (!uopt->novrs) {
1713 /* Check that it is NSR02 compliant */
1714 nsr_off = udf_check_vsd(sb);
1715 if (!nsr_off) {
1716 if (!silent)
1717 printk(KERN_WARNING "UDF-fs: No VRS found\n");
1718 return 0;
1719 }
1720 if (nsr_off == -1)
1721 udf_debug("Failed to read byte 32768. Assuming open "
1722 "disc. Skipping validity check\n");
1723 if (!sbi->s_last_block)
1724 sbi->s_last_block = udf_get_last_block(sb);
1725 } else {
1726 udf_debug("Validity check skipped because of novrs option\n");
1733 } 1727 }
1734 udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
1735 1728
1736 return 0; 1729 /* Look for anchor block and load Volume Descriptor Sequence */
1730 sbi->s_anchor = uopt->anchor;
1731 if (!udf_find_anchor(sb, fileset)) {
1732 if (!silent)
1733 printk(KERN_WARNING "UDF-fs: No anchor found\n");
1734 return 0;
1735 }
1736 return 1;
1737} 1737}
1738 1738
1739static void udf_open_lvid(struct super_block *sb) 1739static void udf_open_lvid(struct super_block *sb)
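
The rewritten udf_scan_anchors() replaces the old unguarded last[6] table, whose entries could underflow on small media, with a bounds-checked candidate list: block 256 of the session first, then lastblock and the offsets drives commonly misreport (-1, +1, -2, -150, -152), each candidate also probed at candidate-256, and finally block 512 for discs that were never closed. A sketch of just that candidate walk, with check_anchor() stubbed out and the user-supplied anchor check omitted:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Stub: the kernel reads the block and checks for TAG_IDENT_AVDP. */
static int check_anchor(sector_t block)
{
        (void)block;
        return 0;       /* pretend nothing is found, so every probe prints */
}

static sector_t scan_anchors(sector_t session, sector_t lastblock,
                             sector_t nr_blocks)
{
        sector_t last[6];
        int last_count = 0, i;

        if (check_anchor(session + 256))
                return lastblock;

        /* Guarded candidate list: no entry can wrap below zero. */
        last[last_count++] = lastblock;
        if (lastblock >= 1)
                last[last_count++] = lastblock - 1;
        last[last_count++] = lastblock + 1;
        if (lastblock >= 2)
                last[last_count++] = lastblock - 2;
        if (lastblock >= 150)
                last[last_count++] = lastblock - 150;
        if (lastblock >= 152)
                last[last_count++] = lastblock - 152;

        for (i = 0; i < last_count; i++) {
                if (last[i] >= nr_blocks)       /* past end of device */
                        continue;
                printf("probing block %llu\n", (unsigned long long)last[i]);
                if (check_anchor(last[i]))
                        return last[i];
                if (last[i] < 256)
                        continue;
                printf("probing block %llu\n",
                       (unsigned long long)(last[i] - 256));
                if (check_anchor(last[i] - 256))
                        return last[i];
        }

        /* Last resort for media that were never closed. */
        return check_anchor(session + 512) ? last[0] : 0;
}

int main(void)
{
        scan_anchors(0, 1000, 1100);
        return 0;
}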
@@ -1742,9 +1742,9 @@ static void udf_open_lvid(struct super_block *sb)
1742 struct buffer_head *bh = sbi->s_lvid_bh; 1742 struct buffer_head *bh = sbi->s_lvid_bh;
1743 struct logicalVolIntegrityDesc *lvid; 1743 struct logicalVolIntegrityDesc *lvid;
1744 struct logicalVolIntegrityDescImpUse *lvidiu; 1744 struct logicalVolIntegrityDescImpUse *lvidiu;
1745
1745 if (!bh) 1746 if (!bh)
1746 return; 1747 return;
1747
1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1748 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1749 lvidiu = udf_sb_lvidiu(sbi); 1749 lvidiu = udf_sb_lvidiu(sbi);
1750 1750
@@ -1752,14 +1752,15 @@ static void udf_open_lvid(struct super_block *sb)
1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1752 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, 1753 udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
1754 CURRENT_TIME); 1754 CURRENT_TIME);
1755 lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN; 1755 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
1756 1756
1757 lvid->descTag.descCRC = cpu_to_le16( 1757 lvid->descTag.descCRC = cpu_to_le16(
1758 crc_itu_t(0, (char *)lvid + sizeof(tag), 1758 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1759 le16_to_cpu(lvid->descTag.descCRCLength))); 1759 le16_to_cpu(lvid->descTag.descCRCLength)));
1760 1760
1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1761 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1762 mark_buffer_dirty(bh); 1762 mark_buffer_dirty(bh);
1763 sbi->s_lvid_dirty = 0;
1763} 1764}
1764 1765
1765static void udf_close_lvid(struct super_block *sb) 1766static void udf_close_lvid(struct super_block *sb)
@@ -1773,10 +1774,6 @@ static void udf_close_lvid(struct super_block *sb)
1773 return; 1774 return;
1774 1775
1775 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1776 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1776
1777 if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
1778 return;
1779
1780 lvidiu = udf_sb_lvidiu(sbi); 1777 lvidiu = udf_sb_lvidiu(sbi);
1781 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1778 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1782 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1779 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1790,11 +1787,12 @@ static void udf_close_lvid(struct super_block *sb)
1790 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); 1787 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
1791 1788
1792 lvid->descTag.descCRC = cpu_to_le16( 1789 lvid->descTag.descCRC = cpu_to_le16(
1793 crc_itu_t(0, (char *)lvid + sizeof(tag), 1790 crc_itu_t(0, (char *)lvid + sizeof(struct tag),
1794 le16_to_cpu(lvid->descTag.descCRCLength))); 1791 le16_to_cpu(lvid->descTag.descCRCLength)));
1795 1792
1796 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1793 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1797 mark_buffer_dirty(bh); 1794 mark_buffer_dirty(bh);
1795 sbi->s_lvid_dirty = 0;
1798} 1796}
1799 1797
1800static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1798static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
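
Both udf_open_lvid() and udf_close_lvid() above recompute the descriptor CRC with crc_itu_t(0, (char *)lvid + sizeof(struct tag), ...), i.e. over the payload after the 16-byte tag header, which the CRC protects rather than contains. crc_itu_t is the CCITT CRC-16 (polynomial 0x1021, MSB-first, here seeded with 0); a self-contained bitwise version, verified against the standard check value:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bitwise CRC-16/CCITT, MSB-first, poly 0x1021; the kernel's crc_itu_t
 * computes the same function with a lookup table. */
static uint16_t crc_itu_t(uint16_t crc, const uint8_t *buf, size_t len)
{
        size_t i;
        int bit;

        for (i = 0; i < len; i++) {
                crc ^= (uint16_t)buf[i] << 8;
                for (bit = 0; bit < 8; bit++)
                        crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x1021)
                                             : (uint16_t)(crc << 1);
        }
        return crc;
}

int main(void)
{
        const char *msg = "123456789";
        uint16_t crc = crc_itu_t(0, (const uint8_t *)msg, strlen(msg));

        printf("crc = 0x%04x\n", crc);  /* known check value: 0x31c3 */
        return 0;
}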
@@ -1846,15 +1844,18 @@ static void udf_free_partition(struct udf_part_map *map)
1846static int udf_fill_super(struct super_block *sb, void *options, int silent) 1844static int udf_fill_super(struct super_block *sb, void *options, int silent)
1847{ 1845{
1848 int i; 1846 int i;
1847 int ret;
1849 struct inode *inode = NULL; 1848 struct inode *inode = NULL;
1850 struct udf_options uopt; 1849 struct udf_options uopt;
1851 kernel_lb_addr rootdir, fileset; 1850 struct kernel_lb_addr rootdir, fileset;
1852 struct udf_sb_info *sbi; 1851 struct udf_sb_info *sbi;
1853 1852
1854 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 1853 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
1855 uopt.uid = -1; 1854 uopt.uid = -1;
1856 uopt.gid = -1; 1855 uopt.gid = -1;
1857 uopt.umask = 0; 1856 uopt.umask = 0;
1857 uopt.fmode = UDF_INVALID_MODE;
1858 uopt.dmode = UDF_INVALID_MODE;
1858 1859
1859 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); 1860 sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
1860 if (!sbi) 1861 if (!sbi)
@@ -1892,15 +1893,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1892 sbi->s_uid = uopt.uid; 1893 sbi->s_uid = uopt.uid;
1893 sbi->s_gid = uopt.gid; 1894 sbi->s_gid = uopt.gid;
1894 sbi->s_umask = uopt.umask; 1895 sbi->s_umask = uopt.umask;
1896 sbi->s_fmode = uopt.fmode;
1897 sbi->s_dmode = uopt.dmode;
1895 sbi->s_nls_map = uopt.nls_map; 1898 sbi->s_nls_map = uopt.nls_map;
1896 1899
1897 /* Set the block size for all transfers */
1898 if (!sb_min_blocksize(sb, uopt.blocksize)) {
1899 udf_debug("Bad block size (%d)\n", uopt.blocksize);
1900 printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
1901 goto error_out;
1902 }
1903
1904 if (uopt.session == 0xFFFFFFFF) 1900 if (uopt.session == 0xFFFFFFFF)
1905 sbi->s_session = udf_get_last_session(sb); 1901 sbi->s_session = udf_get_last_session(sb);
1906 else 1902 else
@@ -1908,18 +1904,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1908 1904
1909 udf_debug("Multi-session=%d\n", sbi->s_session); 1905 udf_debug("Multi-session=%d\n", sbi->s_session);
1910 1906
1911 sbi->s_last_block = uopt.lastblock;
1912 sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
1913 sbi->s_anchor[2] = uopt.anchor;
1914
1915 if (udf_check_valid(sb, uopt.novrs, silent)) {
1916 /* read volume recognition sequences */
1917 printk(KERN_WARNING "UDF-fs: No VRS found\n");
1918 goto error_out;
1919 }
1920
1921 udf_find_anchor(sb);
1922
1923 /* Fill in the rest of the superblock */ 1907 /* Fill in the rest of the superblock */
1924 sb->s_op = &udf_sb_ops; 1908 sb->s_op = &udf_sb_ops;
1925 sb->s_export_op = &udf_export_ops; 1909 sb->s_export_op = &udf_export_ops;
@@ -1928,7 +1912,21 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1928 sb->s_magic = UDF_SUPER_MAGIC; 1912 sb->s_magic = UDF_SUPER_MAGIC;
1929 sb->s_time_gran = 1000; 1913 sb->s_time_gran = 1000;
1930 1914
1931 if (udf_load_sequence(sb, &fileset)) { 1915 if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
1916 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1917 } else {
1918 uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
1919 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1920 if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
1921 if (!silent)
1922 printk(KERN_NOTICE
1923 "UDF-fs: Rescanning with blocksize "
1924 "%d\n", UDF_DEFAULT_BLOCKSIZE);
1925 uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
1926 ret = udf_load_vrs(sb, &uopt, silent, &fileset);
1927 }
1928 }
1929 if (!ret) {
1932 printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); 1930 printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
1933 goto error_out; 1931 goto error_out;
1934 } 1932 }
@@ -1978,7 +1976,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1978 } 1976 }
1979 1977
1980 if (!silent) { 1978 if (!silent) {
1981 timestamp ts; 1979 struct timestamp ts;
1982 udf_time_to_disk_stamp(&ts, sbi->s_record_time); 1980 udf_time_to_disk_stamp(&ts, sbi->s_record_time);
1983 udf_info("UDF: Mounting volume '%s', " 1981 udf_info("UDF: Mounting volume '%s', "
1984 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", 1982 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
@@ -1991,7 +1989,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1991 /* Assign the root inode */ 1989 /* Assign the root inode */
1992 /* assign inodes by physical block number */ 1990 /* assign inodes by physical block number */
1993 /* perhaps it's not extensible enough, but for now ... */ 1991 /* perhaps it's not extensible enough, but for now ... */
1994 inode = udf_iget(sb, rootdir); 1992 inode = udf_iget(sb, &rootdir);
1995 if (!inode) { 1993 if (!inode) {
1996 printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, " 1994 printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, "
1997 "partition=%d\n", 1995 "partition=%d\n",
@@ -2081,11 +2079,31 @@ static void udf_put_super(struct super_block *sb)
2081 sb->s_fs_info = NULL; 2079 sb->s_fs_info = NULL;
2082} 2080}
2083 2081
2082static int udf_sync_fs(struct super_block *sb, int wait)
2083{
2084 struct udf_sb_info *sbi = UDF_SB(sb);
2085
2086 mutex_lock(&sbi->s_alloc_mutex);
2087 if (sbi->s_lvid_dirty) {
2088 /*
2089 * Blockdevice will be synced later so we don't have to submit
2090 * the buffer for IO
2091 */
2092 mark_buffer_dirty(sbi->s_lvid_bh);
2093 sb->s_dirt = 0;
2094 sbi->s_lvid_dirty = 0;
2095 }
2096 mutex_unlock(&sbi->s_alloc_mutex);
2097
2098 return 0;
2099}
2100
2084static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) 2101static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2085{ 2102{
2086 struct super_block *sb = dentry->d_sb; 2103 struct super_block *sb = dentry->d_sb;
2087 struct udf_sb_info *sbi = UDF_SB(sb); 2104 struct udf_sb_info *sbi = UDF_SB(sb);
2088 struct logicalVolIntegrityDescImpUse *lvidiu; 2105 struct logicalVolIntegrityDescImpUse *lvidiu;
2106 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
2089 2107
2090 if (sbi->s_lvid_bh != NULL) 2108 if (sbi->s_lvid_bh != NULL)
2091 lvidiu = udf_sb_lvidiu(sbi); 2109 lvidiu = udf_sb_lvidiu(sbi);
@@ -2101,8 +2119,9 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2101 le32_to_cpu(lvidiu->numDirs)) : 0) 2119 le32_to_cpu(lvidiu->numDirs)) : 0)
2102 + buf->f_bfree; 2120 + buf->f_bfree;
2103 buf->f_ffree = buf->f_bfree; 2121 buf->f_ffree = buf->f_bfree;
2104 /* __kernel_fsid_t f_fsid */
2105 buf->f_namelen = UDF_NAME_LEN - 2; 2122 buf->f_namelen = UDF_NAME_LEN - 2;
2123 buf->f_fsid.val[0] = (u32)id;
2124 buf->f_fsid.val[1] = (u32)(id >> 32);
2106 2125
2107 return 0; 2126 return 0;
2108} 2127}
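
udf_statfs() now reports a filesystem id: huge_encode_dev() packs the backing device's dev_t into a u64, and the two u32 halves land in f_fsid.val[0]/val[1]. The split itself is plain shift-and-truncate, sketched below with a made-up encoded value (huge_encode_dev/dev_t internals elided):

#include <stdint.h>
#include <stdio.h>

struct fsid {
        uint32_t val[2];
};

/* Split a 64-bit device identifier across the two fsid words,
 * exactly as the statfs hunk does. */
static struct fsid encode_fsid(uint64_t id)
{
        struct fsid f;

        f.val[0] = (uint32_t)id;                /* low 32 bits */
        f.val[1] = (uint32_t)(id >> 32);        /* high 32 bits */
        return f;
}

int main(void)
{
        uint64_t id = 0x0000000800000021ULL;    /* made-up encoded dev_t */
        struct fsid f = encode_fsid(id);

        printf("val[0]=0x%08x val[1]=0x%08x\n", f.val[0], f.val[1]);
        return 0;
}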
@@ -2114,7 +2133,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2114 unsigned int accum = 0; 2133 unsigned int accum = 0;
2115 int index; 2134 int index;
2116 int block = 0, newblock; 2135 int block = 0, newblock;
2117 kernel_lb_addr loc; 2136 struct kernel_lb_addr loc;
2118 uint32_t bytes; 2137 uint32_t bytes;
2119 uint8_t *ptr; 2138 uint8_t *ptr;
2120 uint16_t ident; 2139 uint16_t ident;
@@ -2124,7 +2143,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2124 2143
2125 loc.logicalBlockNum = bitmap->s_extPosition; 2144 loc.logicalBlockNum = bitmap->s_extPosition;
2126 loc.partitionReferenceNum = UDF_SB(sb)->s_partition; 2145 loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
2127 bh = udf_read_ptagged(sb, loc, 0, &ident); 2146 bh = udf_read_ptagged(sb, &loc, 0, &ident);
2128 2147
2129 if (!bh) { 2148 if (!bh) {
2130 printk(KERN_ERR "udf: udf_count_free failed\n"); 2149 printk(KERN_ERR "udf: udf_count_free failed\n");
@@ -2147,7 +2166,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2147 bytes -= cur_bytes; 2166 bytes -= cur_bytes;
2148 if (bytes) { 2167 if (bytes) {
2149 brelse(bh); 2168 brelse(bh);
2150 newblock = udf_get_lb_pblock(sb, loc, ++block); 2169 newblock = udf_get_lb_pblock(sb, &loc, ++block);
2151 bh = udf_tread(sb, newblock); 2170 bh = udf_tread(sb, newblock);
2152 if (!bh) { 2171 if (!bh) {
2153 udf_debug("read failed\n"); 2172 udf_debug("read failed\n");
@@ -2170,7 +2189,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
2170{ 2189{
2171 unsigned int accum = 0; 2190 unsigned int accum = 0;
2172 uint32_t elen; 2191 uint32_t elen;
2173 kernel_lb_addr eloc; 2192 struct kernel_lb_addr eloc;
2174 int8_t etype; 2193 int8_t etype;
2175 struct extent_position epos; 2194 struct extent_position epos;
2176 2195
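
The udf_statfs() hunk above fills f_fsid by splitting a 64-bit encoded block-device number into two 32-bit halves. Below is a minimal userspace sketch of that split; encode_dev() is a simplified stand-in and does not reproduce the kernel's actual huge_encode_dev() bit layout.

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-in for huge_encode_dev(): pack major/minor into
     * one 64-bit value. The real kernel layout differs; this only
     * illustrates the two-halves split used for f_fsid. */
    static uint64_t encode_dev(uint32_t major, uint32_t minor)
    {
        return ((uint64_t)major << 32) | minor;
    }

    int main(void)
    {
        uint64_t id = encode_dev(8, 1);     /* e.g. an sda1-like device */
        uint32_t fsid[2];

        fsid[0] = (uint32_t)id;             /* low 32 bits  */
        fsid[1] = (uint32_t)(id >> 32);     /* high 32 bits */
        printf("f_fsid = { %u, %u }\n", fsid[0], fsid[1]);
        return 0;
    }
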
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 65e19b4f9424..225527cdc885 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -28,10 +28,10 @@
28#include "udf_sb.h" 28#include "udf_sb.h"
29 29
30static void extent_trunc(struct inode *inode, struct extent_position *epos, 30static void extent_trunc(struct inode *inode, struct extent_position *epos,
31 kernel_lb_addr eloc, int8_t etype, uint32_t elen, 31 struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen,
32 uint32_t nelen) 32 uint32_t nelen)
33{ 33{
34 kernel_lb_addr neloc = {}; 34 struct kernel_lb_addr neloc = {};
35 int last_block = (elen + inode->i_sb->s_blocksize - 1) >> 35 int last_block = (elen + inode->i_sb->s_blocksize - 1) >>
36 inode->i_sb->s_blocksize_bits; 36 inode->i_sb->s_blocksize_bits;
37 int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> 37 int first_block = (nelen + inode->i_sb->s_blocksize - 1) >>
@@ -43,12 +43,12 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
43 last_block); 43 last_block);
44 etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); 44 etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30);
45 } else 45 } else
46 neloc = eloc; 46 neloc = *eloc;
47 nelen = (etype << 30) | nelen; 47 nelen = (etype << 30) | nelen;
48 } 48 }
49 49
50 if (elen != nelen) { 50 if (elen != nelen) {
51 udf_write_aext(inode, epos, neloc, nelen, 0); 51 udf_write_aext(inode, epos, &neloc, nelen, 0);
52 if (last_block - first_block > 0) { 52 if (last_block - first_block > 0) {
53 if (etype == (EXT_RECORDED_ALLOCATED >> 30)) 53 if (etype == (EXT_RECORDED_ALLOCATED >> 30))
54 mark_inode_dirty(inode); 54 mark_inode_dirty(inode);
@@ -68,7 +68,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
68void udf_truncate_tail_extent(struct inode *inode) 68void udf_truncate_tail_extent(struct inode *inode)
69{ 69{
70 struct extent_position epos = {}; 70 struct extent_position epos = {};
71 kernel_lb_addr eloc; 71 struct kernel_lb_addr eloc;
72 uint32_t elen, nelen; 72 uint32_t elen, nelen;
73 uint64_t lbcount = 0; 73 uint64_t lbcount = 0;
74 int8_t etype = -1, netype; 74 int8_t etype = -1, netype;
@@ -83,9 +83,9 @@ void udf_truncate_tail_extent(struct inode *inode)
83 return; 83 return;
84 84
85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
86 adsize = sizeof(short_ad); 86 adsize = sizeof(struct short_ad);
87 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 87 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
88 adsize = sizeof(long_ad); 88 adsize = sizeof(struct long_ad);
89 else 89 else
90 BUG(); 90 BUG();
91 91
@@ -106,7 +106,7 @@ void udf_truncate_tail_extent(struct inode *inode)
106 (unsigned)elen); 106 (unsigned)elen);
107 nelen = elen - (lbcount - inode->i_size); 107 nelen = elen - (lbcount - inode->i_size);
108 epos.offset -= adsize; 108 epos.offset -= adsize;
109 extent_trunc(inode, &epos, eloc, etype, elen, nelen); 109 extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
110 epos.offset += adsize; 110 epos.offset += adsize;
111 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) 111 if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
112 printk(KERN_ERR "udf_truncate_tail_extent(): " 112 printk(KERN_ERR "udf_truncate_tail_extent(): "
@@ -124,7 +124,7 @@ void udf_truncate_tail_extent(struct inode *inode)
124void udf_discard_prealloc(struct inode *inode) 124void udf_discard_prealloc(struct inode *inode)
125{ 125{
126 struct extent_position epos = { NULL, 0, {0, 0} }; 126 struct extent_position epos = { NULL, 0, {0, 0} };
127 kernel_lb_addr eloc; 127 struct kernel_lb_addr eloc;
128 uint32_t elen; 128 uint32_t elen;
129 uint64_t lbcount = 0; 129 uint64_t lbcount = 0;
130 int8_t etype = -1, netype; 130 int8_t etype = -1, netype;
@@ -136,9 +136,9 @@ void udf_discard_prealloc(struct inode *inode)
136 return; 136 return;
137 137
138 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 138 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
139 adsize = sizeof(short_ad); 139 adsize = sizeof(struct short_ad);
140 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 140 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
141 adsize = sizeof(long_ad); 141 adsize = sizeof(struct long_ad);
142 else 142 else
143 adsize = 0; 143 adsize = 0;
144 144
@@ -152,7 +152,7 @@ void udf_discard_prealloc(struct inode *inode)
152 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { 152 if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
153 epos.offset -= adsize; 153 epos.offset -= adsize;
154 lbcount -= elen; 154 lbcount -= elen;
155 extent_trunc(inode, &epos, eloc, etype, elen, 0); 155 extent_trunc(inode, &epos, &eloc, etype, elen, 0);
156 if (!epos.bh) { 156 if (!epos.bh) {
157 iinfo->i_lenAlloc = 157 iinfo->i_lenAlloc =
158 epos.offset - 158 epos.offset -
@@ -200,7 +200,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
200void udf_truncate_extents(struct inode *inode) 200void udf_truncate_extents(struct inode *inode)
201{ 201{
202 struct extent_position epos; 202 struct extent_position epos;
203 kernel_lb_addr eloc, neloc = {}; 203 struct kernel_lb_addr eloc, neloc = {};
204 uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; 204 uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
205 int8_t etype; 205 int8_t etype;
206 struct super_block *sb = inode->i_sb; 206 struct super_block *sb = inode->i_sb;
@@ -210,9 +210,9 @@ void udf_truncate_extents(struct inode *inode)
210 struct udf_inode_info *iinfo = UDF_I(inode); 210 struct udf_inode_info *iinfo = UDF_I(inode);
211 211
212 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 212 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
213 adsize = sizeof(short_ad); 213 adsize = sizeof(struct short_ad);
214 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 214 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
215 adsize = sizeof(long_ad); 215 adsize = sizeof(struct long_ad);
216 else 216 else
217 BUG(); 217 BUG();
218 218
@@ -221,7 +221,7 @@ void udf_truncate_extents(struct inode *inode)
221 (inode->i_size & (sb->s_blocksize - 1)); 221 (inode->i_size & (sb->s_blocksize - 1));
222 if (etype != -1) { 222 if (etype != -1) {
223 epos.offset -= adsize; 223 epos.offset -= adsize;
224 extent_trunc(inode, &epos, eloc, etype, elen, byte_offset); 224 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
225 epos.offset += adsize; 225 epos.offset += adsize;
226 if (byte_offset) 226 if (byte_offset)
227 lenalloc = epos.offset; 227 lenalloc = epos.offset;
@@ -236,12 +236,12 @@ void udf_truncate_extents(struct inode *inode)
236 while ((etype = udf_current_aext(inode, &epos, &eloc, 236 while ((etype = udf_current_aext(inode, &epos, &eloc,
237 &elen, 0)) != -1) { 237 &elen, 0)) != -1) {
238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { 238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
239 udf_write_aext(inode, &epos, neloc, nelen, 0); 239 udf_write_aext(inode, &epos, &neloc, nelen, 0);
240 if (indirect_ext_len) { 240 if (indirect_ext_len) {
241 /* We managed to free all extents in the 241 /* We managed to free all extents in the
242 * indirect extent - free it too */ 242 * indirect extent - free it too */
243 BUG_ON(!epos.bh); 243 BUG_ON(!epos.bh);
244 udf_free_blocks(sb, inode, epos.block, 244 udf_free_blocks(sb, inode, &epos.block,
245 0, indirect_ext_len); 245 0, indirect_ext_len);
246 } else if (!epos.bh) { 246 } else if (!epos.bh) {
247 iinfo->i_lenAlloc = lenalloc; 247 iinfo->i_lenAlloc = lenalloc;
@@ -253,7 +253,7 @@ void udf_truncate_extents(struct inode *inode)
253 epos.offset = sizeof(struct allocExtDesc); 253 epos.offset = sizeof(struct allocExtDesc);
254 epos.block = eloc; 254 epos.block = eloc;
255 epos.bh = udf_tread(sb, 255 epos.bh = udf_tread(sb,
256 udf_get_lb_pblock(sb, eloc, 0)); 256 udf_get_lb_pblock(sb, &eloc, 0));
257 if (elen) 257 if (elen)
258 indirect_ext_len = 258 indirect_ext_len =
259 (elen + sb->s_blocksize - 1) >> 259 (elen + sb->s_blocksize - 1) >>
@@ -261,7 +261,7 @@ void udf_truncate_extents(struct inode *inode)
261 else 261 else
262 indirect_ext_len = 1; 262 indirect_ext_len = 1;
263 } else { 263 } else {
264 extent_trunc(inode, &epos, eloc, etype, 264 extent_trunc(inode, &epos, &eloc, etype,
265 elen, 0); 265 elen, 0);
266 epos.offset += adsize; 266 epos.offset += adsize;
267 } 267 }
@@ -269,7 +269,7 @@ void udf_truncate_extents(struct inode *inode)
269 269
270 if (indirect_ext_len) { 270 if (indirect_ext_len) {
271 BUG_ON(!epos.bh); 271 BUG_ON(!epos.bh);
272 udf_free_blocks(sb, inode, epos.block, 0, 272 udf_free_blocks(sb, inode, &epos.block, 0,
273 indirect_ext_len); 273 indirect_ext_len);
274 } else if (!epos.bh) { 274 } else if (!epos.bh) {
275 iinfo->i_lenAlloc = lenalloc; 275 iinfo->i_lenAlloc = lenalloc;
@@ -278,7 +278,7 @@ void udf_truncate_extents(struct inode *inode)
278 udf_update_alloc_ext_desc(inode, &epos, lenalloc); 278 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
279 } else if (inode->i_size) { 279 } else if (inode->i_size) {
280 if (byte_offset) { 280 if (byte_offset) {
281 kernel_long_ad extent; 281 struct kernel_long_ad extent;
282 282
283 /* 283 /*
 284 * OK, there is no extent covering inode->i_size and 284 * OK, there is no extent covering inode->i_size and
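
The recurring edit through truncate.c is mechanical: kernel_lb_addr values that used to travel by value now travel by pointer, avoiding a struct copy at every call in deep extent-walking paths. A standalone sketch of the two calling conventions, with struct lb_addr as a hypothetical stand-in for the kernel type:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for struct kernel_lb_addr. */
    struct lb_addr {
        uint32_t logicalBlockNum;
        uint16_t partitionReferenceNum;
    };

    /* Before: every call copies the whole struct onto the stack. */
    static uint32_t pblock_byval(struct lb_addr loc, uint32_t offset)
    {
        return loc.logicalBlockNum + offset;
    }

    /* After: the callee reads through a pointer; no copy is made and
     * deep call chains (extent walks) use less stack. */
    static uint32_t pblock_byref(const struct lb_addr *loc, uint32_t offset)
    {
        return loc->logicalBlockNum + offset;
    }

    int main(void)
    {
        struct lb_addr loc = { 100, 0 };

        printf("%u %u\n", pblock_byval(loc, 5), pblock_byref(&loc, 5));
        return 0;
    }
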
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 4f86b1d98a5d..e58d1de41073 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -4,7 +4,7 @@
4struct udf_inode_info { 4struct udf_inode_info {
5 struct timespec i_crtime; 5 struct timespec i_crtime;
6 /* Physical address of inode */ 6 /* Physical address of inode */
7 kernel_lb_addr i_location; 7 struct kernel_lb_addr i_location;
8 __u64 i_unique; 8 __u64 i_unique;
9 __u32 i_lenEAttr; 9 __u32 i_lenEAttr;
10 __u32 i_lenAlloc; 10 __u32 i_lenAlloc;
@@ -17,8 +17,8 @@ struct udf_inode_info {
17 unsigned i_strat4096 : 1; 17 unsigned i_strat4096 : 1;
18 unsigned reserved : 26; 18 unsigned reserved : 26;
19 union { 19 union {
20 short_ad *i_sad; 20 struct short_ad *i_sad;
21 long_ad *i_lad; 21 struct long_ad *i_lad;
22 __u8 *i_data; 22 __u8 *i_data;
23 } i_ext; 23 } i_ext;
24 struct inode vfs_inode; 24 struct inode vfs_inode;
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 1c1c514a9725..d113b72c2768 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -30,6 +30,7 @@
30#define UDF_FLAG_GID_SET 16 30#define UDF_FLAG_GID_SET 16
31#define UDF_FLAG_SESSION_SET 17 31#define UDF_FLAG_SESSION_SET 17
32#define UDF_FLAG_LASTBLOCK_SET 18 32#define UDF_FLAG_LASTBLOCK_SET 18
33#define UDF_FLAG_BLOCKSIZE_SET 19
33 34
34#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 35#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001
35#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 36#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002
@@ -48,6 +49,8 @@
48#define UDF_SPARABLE_MAP15 0x1522U 49#define UDF_SPARABLE_MAP15 0x1522U
49#define UDF_METADATA_MAP25 0x2511U 50#define UDF_METADATA_MAP25 0x2511U
50 51
52#define UDF_INVALID_MODE ((mode_t)-1)
53
51#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ 54#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
52 55
53struct udf_meta_data { 56struct udf_meta_data {
@@ -114,7 +117,7 @@ struct udf_sb_info {
114 117
115 /* Sector headers */ 118 /* Sector headers */
116 __s32 s_session; 119 __s32 s_session;
117 __u32 s_anchor[3]; 120 __u32 s_anchor;
118 __u32 s_last_block; 121 __u32 s_last_block;
119 122
120 struct buffer_head *s_lvid_bh; 123 struct buffer_head *s_lvid_bh;
@@ -123,6 +126,8 @@ struct udf_sb_info {
123 mode_t s_umask; 126 mode_t s_umask;
124 gid_t s_gid; 127 gid_t s_gid;
125 uid_t s_uid; 128 uid_t s_uid;
129 mode_t s_fmode;
130 mode_t s_dmode;
126 131
127 /* Root Info */ 132 /* Root Info */
128 struct timespec s_record_time; 133 struct timespec s_record_time;
@@ -143,6 +148,8 @@ struct udf_sb_info {
143 struct inode *s_vat_inode; 148 struct inode *s_vat_inode;
144 149
145 struct mutex s_alloc_mutex; 150 struct mutex s_alloc_mutex;
151 /* Protected by s_alloc_mutex */
152 unsigned int s_lvid_dirty;
146}; 153};
147 154
148static inline struct udf_sb_info *UDF_SB(struct super_block *sb) 155static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
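
The new s_lvid_dirty field pairs with the udf_sync_fs() hunk earlier in this patch, which tests and clears it under s_alloc_mutex before writing the LVID buffer back. Below is a userspace sketch of that flag-under-mutex pattern using pthreads; the names are illustrative, and the exact lock coverage of the kernel setter (udf_updated_lvid()) is not shown in these hunks.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t alloc_mutex = PTHREAD_MUTEX_INITIALIZER;
    static bool lvid_dirty;

    static void mark_updated(void)      /* roughly udf_updated_lvid() */
    {
        pthread_mutex_lock(&alloc_mutex);
        lvid_dirty = true;
        pthread_mutex_unlock(&alloc_mutex);
    }

    static void sync_fs(void)           /* roughly udf_sync_fs() */
    {
        pthread_mutex_lock(&alloc_mutex);
        if (lvid_dirty) {
            puts("writing LVID back");  /* mark_buffer_dirty() in UDF */
            lvid_dirty = false;
        }
        pthread_mutex_unlock(&alloc_mutex);
    }

    int main(void)
    {
        mark_updated();
        sync_fs();      /* dirty: writes once */
        sync_fs();      /* clean: does nothing */
        return 0;
    }
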
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8ec865de5f13..cac51b77a5d1 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -62,10 +62,8 @@ static inline size_t udf_ext0_offset(struct inode *inode)
62 return 0; 62 return 0;
63} 63}
64 64
65#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
66
67/* computes tag checksum */ 65/* computes tag checksum */
68u8 udf_tag_checksum(const tag *t); 66u8 udf_tag_checksum(const struct tag *t);
69 67
70struct dentry; 68struct dentry;
71struct inode; 69struct inode;
@@ -95,7 +93,7 @@ struct udf_vds_record {
95}; 93};
96 94
97struct generic_desc { 95struct generic_desc {
98 tag descTag; 96 struct tag descTag;
99 __le32 volDescSeqNum; 97 __le32 volDescSeqNum;
100}; 98};
101 99
@@ -108,11 +106,22 @@ struct ustr {
108struct extent_position { 106struct extent_position {
109 struct buffer_head *bh; 107 struct buffer_head *bh;
110 uint32_t offset; 108 uint32_t offset;
111 kernel_lb_addr block; 109 struct kernel_lb_addr block;
112}; 110};
113 111
114/* super.c */ 112/* super.c */
115extern void udf_warning(struct super_block *, const char *, const char *, ...); 113extern void udf_warning(struct super_block *, const char *, const char *, ...);
114static inline void udf_updated_lvid(struct super_block *sb)
115{
116 struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
117
118 BUG_ON(!bh);
119 WARN_ON_ONCE(((struct logicalVolIntegrityDesc *)
120 bh->b_data)->integrityType !=
121 cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN));
122 sb->s_dirt = 1;
123 UDF_SB(sb)->s_lvid_dirty = 1;
124}
116 125
117/* namei.c */ 126/* namei.c */
118extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, 127extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -124,7 +133,7 @@ extern int udf_ioctl(struct inode *, struct file *, unsigned int,
124 unsigned long); 133 unsigned long);
125 134
126/* inode.c */ 135/* inode.c */
127extern struct inode *udf_iget(struct super_block *, kernel_lb_addr); 136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
128extern int udf_sync_inode(struct inode *); 137extern int udf_sync_inode(struct inode *);
129extern void udf_expand_file_adinicb(struct inode *, int, int *); 138extern void udf_expand_file_adinicb(struct inode *, int, int *);
130extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 139extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
@@ -136,19 +145,19 @@ extern void udf_clear_inode(struct inode *);
136extern int udf_write_inode(struct inode *, int); 145extern int udf_write_inode(struct inode *, int);
137extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
138extern int udf_extend_file(struct inode *, struct extent_position *, 147extern int udf_extend_file(struct inode *, struct extent_position *,
139 kernel_long_ad *, sector_t); 148 struct kernel_long_ad *, sector_t);
140extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, 149extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
141 kernel_lb_addr *, uint32_t *, sector_t *); 150 struct kernel_lb_addr *, uint32_t *, sector_t *);
142extern int8_t udf_add_aext(struct inode *, struct extent_position *, 151extern int8_t udf_add_aext(struct inode *, struct extent_position *,
143 kernel_lb_addr, uint32_t, int); 152 struct kernel_lb_addr *, uint32_t, int);
144extern int8_t udf_write_aext(struct inode *, struct extent_position *, 153extern int8_t udf_write_aext(struct inode *, struct extent_position *,
145 kernel_lb_addr, uint32_t, int); 154 struct kernel_lb_addr *, uint32_t, int);
146extern int8_t udf_delete_aext(struct inode *, struct extent_position, 155extern int8_t udf_delete_aext(struct inode *, struct extent_position,
147 kernel_lb_addr, uint32_t); 156 struct kernel_lb_addr, uint32_t);
148extern int8_t udf_next_aext(struct inode *, struct extent_position *, 157extern int8_t udf_next_aext(struct inode *, struct extent_position *,
149 kernel_lb_addr *, uint32_t *, int); 158 struct kernel_lb_addr *, uint32_t *, int);
150extern int8_t udf_current_aext(struct inode *, struct extent_position *, 159extern int8_t udf_current_aext(struct inode *, struct extent_position *,
151 kernel_lb_addr *, uint32_t *, int); 160 struct kernel_lb_addr *, uint32_t *, int);
152 161
153/* misc.c */ 162/* misc.c */
154extern struct buffer_head *udf_tgetblk(struct super_block *, int); 163extern struct buffer_head *udf_tgetblk(struct super_block *, int);
@@ -160,7 +169,7 @@ extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t,
160extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t, 169extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t,
161 uint32_t, uint16_t *); 170 uint32_t, uint16_t *);
162extern struct buffer_head *udf_read_ptagged(struct super_block *, 171extern struct buffer_head *udf_read_ptagged(struct super_block *,
163 kernel_lb_addr, uint32_t, 172 struct kernel_lb_addr *, uint32_t,
164 uint16_t *); 173 uint16_t *);
165extern void udf_update_tag(char *, int); 174extern void udf_update_tag(char *, int);
166extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int); 175extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);
@@ -182,6 +191,14 @@ extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
182 uint32_t); 191 uint32_t);
183extern int udf_relocate_blocks(struct super_block *, long, long *); 192extern int udf_relocate_blocks(struct super_block *, long, long *);
184 193
194static inline uint32_t
195udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
196 uint32_t offset)
197{
198 return udf_get_pblock(sb, loc->logicalBlockNum,
199 loc->partitionReferenceNum, offset);
200}
201
185/* unicode.c */ 202/* unicode.c */
186extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int); 203extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
187extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, 204extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
@@ -200,7 +217,7 @@ extern void udf_truncate_extents(struct inode *);
200 217
201/* balloc.c */ 218/* balloc.c */
202extern void udf_free_blocks(struct super_block *, struct inode *, 219extern void udf_free_blocks(struct super_block *, struct inode *,
203 kernel_lb_addr, uint32_t, uint32_t); 220 struct kernel_lb_addr *, uint32_t, uint32_t);
204extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, 221extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
205 uint32_t, uint32_t); 222 uint32_t, uint32_t);
206extern int udf_new_block(struct super_block *, struct inode *, uint16_t, 223extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
@@ -214,16 +231,16 @@ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
214 struct udf_fileident_bh *, 231 struct udf_fileident_bh *,
215 struct fileIdentDesc *, 232 struct fileIdentDesc *,
216 struct extent_position *, 233 struct extent_position *,
217 kernel_lb_addr *, uint32_t *, 234 struct kernel_lb_addr *, uint32_t *,
218 sector_t *); 235 sector_t *);
219extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, 236extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
220 int *offset); 237 int *offset);
221extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); 238extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
222extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); 239extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
223 240
224/* udftime.c */ 241/* udftime.c */
225extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest, 242extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
226 timestamp src); 243 struct timestamp src);
227extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src); 244extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src);
228 245
229#endif /* __UDF_DECL_H */ 246#endif /* __UDF_DECL_H */
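
Replacing the udf_get_lb_pblock() macro with the static inline shown above buys argument type checking and single evaluation, and lets the address travel by pointer like the rest of this patch. A self-contained sketch of the before/after, with a dummy get_pblock() standing in for the real mapping function:

    #include <stdint.h>
    #include <stdio.h>

    struct lb_addr { uint32_t block; uint16_t part; };

    /* Stand-in for udf_get_pblock(); the real one maps a partition-
     * relative block to a physical block. */
    static uint32_t get_pblock(uint32_t block, uint16_t part, uint32_t off)
    {
        return block + off + part;      /* placeholder arithmetic */
    }

    /* Old style: no argument type checking, 'loc' expanded twice. */
    #define GET_LB_PBLOCK(loc, off) \
        get_pblock((loc).block, (loc).part, (off))

    /* New style: type-checked, single evaluation, pointer argument. */
    static inline uint32_t get_lb_pblock(const struct lb_addr *loc,
                                         uint32_t off)
    {
        return get_pblock(loc->block, loc->part, off);
    }

    int main(void)
    {
        struct lb_addr loc = { 100, 1 };

        printf("%u %u\n", GET_LB_PBLOCK(loc, 2), get_lb_pblock(&loc, 2));
        return 0;
    }
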
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index 489f52fb428c..6a9f3a9cc428 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -4,9 +4,9 @@
4#include <asm/byteorder.h> 4#include <asm/byteorder.h>
5#include <linux/string.h> 5#include <linux/string.h>
6 6
7static inline kernel_lb_addr lelb_to_cpu(lb_addr in) 7static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
8{ 8{
9 kernel_lb_addr out; 9 struct kernel_lb_addr out;
10 10
11 out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum); 11 out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
12 out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum); 12 out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
@@ -14,9 +14,9 @@ static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
14 return out; 14 return out;
15} 15}
16 16
17static inline lb_addr cpu_to_lelb(kernel_lb_addr in) 17static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in)
18{ 18{
19 lb_addr out; 19 struct lb_addr out;
20 20
21 out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum); 21 out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum);
22 out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum); 22 out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum);
@@ -24,9 +24,9 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
24 return out; 24 return out;
25} 25}
26 26
27static inline short_ad lesa_to_cpu(short_ad in) 27static inline struct short_ad lesa_to_cpu(struct short_ad in)
28{ 28{
29 short_ad out; 29 struct short_ad out;
30 30
31 out.extLength = le32_to_cpu(in.extLength); 31 out.extLength = le32_to_cpu(in.extLength);
32 out.extPosition = le32_to_cpu(in.extPosition); 32 out.extPosition = le32_to_cpu(in.extPosition);
@@ -34,9 +34,9 @@ static inline short_ad lesa_to_cpu(short_ad in)
34 return out; 34 return out;
35} 35}
36 36
37static inline short_ad cpu_to_lesa(short_ad in) 37static inline struct short_ad cpu_to_lesa(struct short_ad in)
38{ 38{
39 short_ad out; 39 struct short_ad out;
40 40
41 out.extLength = cpu_to_le32(in.extLength); 41 out.extLength = cpu_to_le32(in.extLength);
42 out.extPosition = cpu_to_le32(in.extPosition); 42 out.extPosition = cpu_to_le32(in.extPosition);
@@ -44,9 +44,9 @@ static inline short_ad cpu_to_lesa(short_ad in)
44 return out; 44 return out;
45} 45}
46 46
47static inline kernel_long_ad lela_to_cpu(long_ad in) 47static inline struct kernel_long_ad lela_to_cpu(struct long_ad in)
48{ 48{
49 kernel_long_ad out; 49 struct kernel_long_ad out;
50 50
51 out.extLength = le32_to_cpu(in.extLength); 51 out.extLength = le32_to_cpu(in.extLength);
52 out.extLocation = lelb_to_cpu(in.extLocation); 52 out.extLocation = lelb_to_cpu(in.extLocation);
@@ -54,9 +54,9 @@ static inline kernel_long_ad lela_to_cpu(long_ad in)
54 return out; 54 return out;
55} 55}
56 56
57static inline long_ad cpu_to_lela(kernel_long_ad in) 57static inline struct long_ad cpu_to_lela(struct kernel_long_ad in)
58{ 58{
59 long_ad out; 59 struct long_ad out;
60 60
61 out.extLength = cpu_to_le32(in.extLength); 61 out.extLength = cpu_to_le32(in.extLength);
62 out.extLocation = cpu_to_lelb(in.extLocation); 62 out.extLocation = cpu_to_lelb(in.extLocation);
@@ -64,9 +64,9 @@ static inline long_ad cpu_to_lela(kernel_long_ad in)
64 return out; 64 return out;
65} 65}
66 66
67static inline kernel_extent_ad leea_to_cpu(extent_ad in) 67static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in)
68{ 68{
69 kernel_extent_ad out; 69 struct kernel_extent_ad out;
70 70
71 out.extLength = le32_to_cpu(in.extLength); 71 out.extLength = le32_to_cpu(in.extLength);
72 out.extLocation = le32_to_cpu(in.extLocation); 72 out.extLocation = le32_to_cpu(in.extLocation);
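
These helpers convert field-by-field between the packed little-endian on-disk layout and the natural in-core one; the patch only tags both sides with struct. The same round-trip in standalone C, using glibc's le32toh()/htole32() family in place of the kernel's le32_to_cpu()/cpu_to_le32():

    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>

    struct disk_lb_addr { uint32_t block; uint16_t part; }; /* on-disk LE */
    struct cpu_lb_addr  { uint32_t block; uint16_t part; }; /* host order */

    static struct cpu_lb_addr lelb_to_cpu(struct disk_lb_addr in)
    {
        struct cpu_lb_addr out;

        out.block = le32toh(in.block);
        out.part  = le16toh(in.part);
        return out;
    }

    static struct disk_lb_addr cpu_to_lelb(struct cpu_lb_addr in)
    {
        struct disk_lb_addr out;

        out.block = htole32(in.block);
        out.part  = htole16(in.part);
        return out;
    }

    int main(void)
    {
        struct cpu_lb_addr a = { 0x12345678, 0x9abc };
        struct cpu_lb_addr b = lelb_to_cpu(cpu_to_lelb(a));

        printf("round-trip: %#x %#x\n", (unsigned)b.block, (unsigned)b.part);
        return 0;
    }
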
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 5f811655c9b5..b8c828c4d200 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,7 +85,8 @@ extern struct timezone sys_tz;
85#define SECS_PER_HOUR (60 * 60) 85#define SECS_PER_HOUR (60 * 60)
86#define SECS_PER_DAY (SECS_PER_HOUR * 24) 86#define SECS_PER_DAY (SECS_PER_HOUR * 24)
87 87
88struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src) 88struct timespec *
89udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src)
89{ 90{
90 int yday; 91 int yday;
91 u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone); 92 u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
@@ -116,7 +117,8 @@ struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
116 return dest; 117 return dest;
117} 118}
118 119
119timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts) 120struct timestamp *
121udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts)
120{ 122{
121 long int days, rem, y; 123 long int days, rem, y;
122 const unsigned short int *ip; 124 const unsigned short int *ip;
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 9fdf8c93c58e..cefa8c8913e6 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -254,7 +254,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
254{ 254{
255 const uint8_t *ocu; 255 const uint8_t *ocu;
256 uint8_t cmp_id, ocu_len; 256 uint8_t cmp_id, ocu_len;
257 int i; 257 int i, len;
258 258
259 259
260 ocu_len = ocu_i->u_len; 260 ocu_len = ocu_i->u_len;
@@ -279,8 +279,13 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
279 if (cmp_id == 16) 279 if (cmp_id == 16)
280 c = (c << 8) | ocu[i++]; 280 c = (c << 8) | ocu[i++];
281 281
282 utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 282 len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
283 UDF_NAME_LEN - utf_o->u_len); 283 UDF_NAME_LEN - utf_o->u_len);
284 /* Valid character? */
285 if (len >= 0)
286 utf_o->u_len += len;
287 else
288 utf_o->u_name[utf_o->u_len++] = '?';
284 } 289 }
285 utf_o->u_cmpID = 8; 290 utf_o->u_cmpID = 8;
286 291
@@ -290,7 +295,8 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
290static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, 295static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
291 int length) 296 int length)
292{ 297{
293 unsigned len, i, max_val; 298 int len;
299 unsigned i, max_val;
294 uint16_t uni_char; 300 uint16_t uni_char;
295 int u_len; 301 int u_len;
296 302
@@ -302,8 +308,13 @@ try_again:
302 u_len = 0U; 308 u_len = 0U;
303 for (i = 0U; i < uni->u_len; i++) { 309 for (i = 0U; i < uni->u_len; i++) {
304 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); 310 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
305 if (len <= 0) 311 if (!len)
306 continue; 312 continue;
313 /* Invalid character, deal with it */
314 if (len < 0) {
315 len = 1;
316 uni_char = '?';
317 }
307 318
308 if (uni_char > max_val) { 319 if (uni_char > max_val) {
309 max_val = 0xffffU; 320 max_val = 0xffffU;
@@ -324,34 +335,43 @@ try_again:
324int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, 335int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
325 int flen) 336 int flen)
326{ 337{
327 struct ustr filename, unifilename; 338 struct ustr *filename, *unifilename;
328 int len; 339 int len = 0;
329 340
330 if (udf_build_ustr_exact(&unifilename, sname, flen)) 341 filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
342 if (!filename)
331 return 0; 343 return 0;
332 344
345 unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
346 if (!unifilename)
347 goto out1;
348
349 if (udf_build_ustr_exact(unifilename, sname, flen))
350 goto out2;
351
333 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 352 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
334 if (!udf_CS0toUTF8(&filename, &unifilename)) { 353 if (!udf_CS0toUTF8(filename, unifilename)) {
335 udf_debug("Failed in udf_get_filename: sname = %s\n", 354 udf_debug("Failed in udf_get_filename: sname = %s\n",
336 sname); 355 sname);
337 return 0; 356 goto out2;
338 } 357 }
339 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 358 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
340 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, 359 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
341 &unifilename)) { 360 unifilename)) {
342 udf_debug("Failed in udf_get_filename: sname = %s\n", 361 udf_debug("Failed in udf_get_filename: sname = %s\n",
343 sname); 362 sname);
344 return 0; 363 goto out2;
345 } 364 }
346 } else 365 } else
347 return 0; 366 goto out2;
348 367
349 len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, 368 len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
350 unifilename.u_name, unifilename.u_len); 369 unifilename->u_name, unifilename->u_len);
351 if (len) 370out2:
352 return len; 371 kfree(unifilename);
353 372out1:
354 return 0; 373 kfree(filename);
374 return len;
355} 375}
356 376
357int udf_put_filename(struct super_block *sb, const uint8_t *sname, 377int udf_put_filename(struct super_block *sb, const uint8_t *sname,
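
The unicode.c changes do two things: the two struct ustr buffers move from the stack to kmalloc() (they are large), and a negative return from nls->uni2char()/char2uni() — an unmappable character — is now replaced with '?' instead of corrupting the length accounting. A standalone sketch of that replacement logic; convert_one() is a hypothetical stand-in for the NLS conversion hook:

    #include <stdio.h>

    /* Stand-in for nls->uni2char(): returns bytes produced, or a
     * negative value when the character cannot be mapped. */
    static int convert_one(unsigned c, char *out, int space)
    {
        if (space < 1)
            return -1;              /* no room */
        if (c > 0x7f)
            return -1;              /* pretend only ASCII maps */
        *out = (char)c;
        return 1;
    }

    int main(void)
    {
        unsigned in[3] = { 'a', 0x2603, 'b' }; /* middle one is unmappable */
        char out[8];
        int i, pos = 0;

        for (i = 0; i < 3; i++) {
            int len = convert_one(in[i], &out[pos], (int)sizeof(out) - pos);

            if (len >= 0)
                pos += len;
            else
                out[pos++] = '?';   /* invalid character, keep going */
        }
        out[pos] = '\0';
        printf("%s\n", out);        /* prints "a?b" */
        return 0;
    }
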
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 0d9ada173739..54c16ec95dff 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 85 "bit already cleared for fragment %u", i);
86 } 86 }
87 87
88 DQUOT_FREE_BLOCK (inode, count); 88 vfs_dq_free_block(inode, count);
89 89
90 90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
@@ -195,7 +195,7 @@ do_more:
195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
197 ufs_clusteracct (sb, ucpi, blkno, 1); 197 ufs_clusteracct (sb, ucpi, blkno, 1);
198 DQUOT_FREE_BLOCK(inode, uspi->s_fpb); 198 vfs_dq_free_block(inode, uspi->s_fpb);
199 199
200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
201 uspi->cs_total.cs_nbfree++; 201 uspi->cs_total.cs_nbfree++;
@@ -556,7 +556,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
556 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 556 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
557 for (i = oldcount; i < newcount; i++) 557 for (i = oldcount; i < newcount; i++)
558 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); 558 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
559 if(DQUOT_ALLOC_BLOCK(inode, count)) { 559 if (vfs_dq_alloc_block(inode, count)) {
560 *err = -EDQUOT; 560 *err = -EDQUOT;
561 return 0; 561 return 0;
562 } 562 }
@@ -664,7 +664,7 @@ cg_found:
664 for (i = count; i < uspi->s_fpb; i++) 664 for (i = count; i < uspi->s_fpb; i++)
665 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); 665 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
666 i = uspi->s_fpb - count; 666 i = uspi->s_fpb - count;
667 DQUOT_FREE_BLOCK(inode, i); 667 vfs_dq_free_block(inode, i);
668 668
669 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 669 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
670 uspi->cs_total.cs_nffree += i; 670 uspi->cs_total.cs_nffree += i;
@@ -676,7 +676,7 @@ cg_found:
676 result = ufs_bitmap_search (sb, ucpi, goal, allocsize); 676 result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
677 if (result == INVBLOCK) 677 if (result == INVBLOCK)
678 return 0; 678 return 0;
679 if(DQUOT_ALLOC_BLOCK(inode, count)) { 679 if (vfs_dq_alloc_block(inode, count)) {
680 *err = -EDQUOT; 680 *err = -EDQUOT;
681 return 0; 681 return 0;
682 } 682 }
@@ -747,7 +747,7 @@ gotit:
747 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 747 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
748 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 748 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
749 ufs_clusteracct (sb, ucpi, blkno, -1); 749 ufs_clusteracct (sb, ucpi, blkno, -1);
750 if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) { 750 if (vfs_dq_alloc_block(inode, uspi->s_fpb)) {
751 *err = -EDQUOT; 751 *err = -EDQUOT;
752 return INVBLOCK; 752 return INVBLOCK;
753 } 753 }
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 6f5dcf006096..3527c00fef0d 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode)
95 95
96 is_directory = S_ISDIR(inode->i_mode); 96 is_directory = S_ISDIR(inode->i_mode);
97 97
98 DQUOT_FREE_INODE(inode); 98 vfs_dq_free_inode(inode);
99 DQUOT_DROP(inode); 99 vfs_dq_drop(inode);
100 100
101 clear_inode (inode); 101 clear_inode (inode);
102 102
@@ -355,8 +355,8 @@ cg_found:
355 355
356 unlock_super (sb); 356 unlock_super (sb);
357 357
358 if (DQUOT_ALLOC_INODE(inode)) { 358 if (vfs_dq_alloc_inode(inode)) {
359 DQUOT_DROP(inode); 359 vfs_dq_drop(inode);
360 err = -EDQUOT; 360 err = -EDQUOT;
361 goto fail_without_unlock; 361 goto fail_without_unlock;
362 } 362 }
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 39f877898565..3d2512c21f05 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -622,7 +622,6 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
622 struct ufs_inode_info *ufsi = UFS_I(inode); 622 struct ufs_inode_info *ufsi = UFS_I(inode);
623 struct super_block *sb = inode->i_sb; 623 struct super_block *sb = inode->i_sb;
624 mode_t mode; 624 mode_t mode;
625 unsigned i;
626 625
627 /* 626 /*
628 * Copy data to the in-core inode. 627 * Copy data to the in-core inode.
@@ -655,11 +654,12 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
655 654
656 655
657 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { 656 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
658 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) 657 memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr,
659 ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i]; 658 sizeof(ufs_inode->ui_u2.ui_addr));
660 } else { 659 } else {
661 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 660 memcpy(ufsi->i_u1.i_symlink, ufs_inode->ui_u2.ui_symlink,
662 ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i]; 661 sizeof(ufs_inode->ui_u2.ui_symlink) - 1);
662 ufsi->i_u1.i_symlink[sizeof(ufs_inode->ui_u2.ui_symlink) - 1] = 0;
663 } 663 }
664 return 0; 664 return 0;
665} 665}
@@ -669,7 +669,6 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
669 struct ufs_inode_info *ufsi = UFS_I(inode); 669 struct ufs_inode_info *ufsi = UFS_I(inode);
670 struct super_block *sb = inode->i_sb; 670 struct super_block *sb = inode->i_sb;
671 mode_t mode; 671 mode_t mode;
672 unsigned i;
673 672
674 UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino); 673 UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
675 /* 674 /*
@@ -704,12 +703,12 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
704 */ 703 */
705 704
706 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { 705 if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
707 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) 706 memcpy(ufsi->i_u1.u2_i_data, &ufs2_inode->ui_u2.ui_addr,
708 ufsi->i_u1.u2_i_data[i] = 707 sizeof(ufs2_inode->ui_u2.ui_addr));
709 ufs2_inode->ui_u2.ui_addr.ui_db[i];
710 } else { 708 } else {
711 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 709 memcpy(ufsi->i_u1.i_symlink, ufs2_inode->ui_u2.ui_symlink,
712 ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i]; 710 sizeof(ufs2_inode->ui_u2.ui_symlink) - 1);
711 ufsi->i_u1.i_symlink[sizeof(ufs2_inode->ui_u2.ui_symlink) - 1] = 0;
713 } 712 }
714 return 0; 713 return 0;
715} 714}
@@ -781,7 +780,6 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
781{ 780{
782 struct super_block *sb = inode->i_sb; 781 struct super_block *sb = inode->i_sb;
783 struct ufs_inode_info *ufsi = UFS_I(inode); 782 struct ufs_inode_info *ufsi = UFS_I(inode);
784 unsigned i;
785 783
786 ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); 784 ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode);
787 ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink); 785 ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink);
@@ -809,12 +807,12 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
809 /* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */ 807 /* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */
810 ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.i_data[0]; 808 ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.i_data[0];
811 } else if (inode->i_blocks) { 809 } else if (inode->i_blocks) {
812 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) 810 memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.i_data,
813 ufs_inode->ui_u2.ui_addr.ui_db[i] = ufsi->i_u1.i_data[i]; 811 sizeof(ufs_inode->ui_u2.ui_addr));
814 } 812 }
815 else { 813 else {
816 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 814 memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink,
817 ufs_inode->ui_u2.ui_symlink[i] = ufsi->i_u1.i_symlink[i]; 815 sizeof(ufs_inode->ui_u2.ui_symlink));
818 } 816 }
819 817
820 if (!inode->i_nlink) 818 if (!inode->i_nlink)
@@ -825,7 +823,6 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
825{ 823{
826 struct super_block *sb = inode->i_sb; 824 struct super_block *sb = inode->i_sb;
827 struct ufs_inode_info *ufsi = UFS_I(inode); 825 struct ufs_inode_info *ufsi = UFS_I(inode);
828 unsigned i;
829 826
830 UFSD("ENTER\n"); 827 UFSD("ENTER\n");
831 ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode); 828 ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode);
@@ -850,11 +847,11 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
850 /* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */ 847 /* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */
851 ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.u2_i_data[0]; 848 ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.u2_i_data[0];
852 } else if (inode->i_blocks) { 849 } else if (inode->i_blocks) {
853 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) 850 memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.u2_i_data,
854 ufs_inode->ui_u2.ui_addr.ui_db[i] = ufsi->i_u1.u2_i_data[i]; 851 sizeof(ufs_inode->ui_u2.ui_addr));
855 } else { 852 } else {
856 for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) 853 memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink,
857 ufs_inode->ui_u2.ui_symlink[i] = ufsi->i_u1.i_symlink[i]; 854 sizeof(ufs_inode->ui_u2.ui_symlink));
858 } 855 }
859 856
860 if (!inode->i_nlink) 857 if (!inode->i_nlink)
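
The ufs inode read/write paths above replace per-element copy loops with memcpy() sized from the copied field, and the read side now NUL-terminates i_symlink explicitly. A miniature of the rewrite; NADDR stands in for UFS_NDADDR + UFS_NINDIR:

    #include <string.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NADDR 15    /* stand-in for UFS_NDADDR + UFS_NINDIR */

    struct on_disk { uint32_t addr[NADDR]; };
    struct in_core { uint32_t addr[NADDR]; };

    /* Before: a hand-written loop with a hand-maintained bound. */
    static void copy_loop(struct in_core *dst, const struct on_disk *src)
    {
        int i;

        for (i = 0; i < NADDR; i++)
            dst->addr[i] = src->addr[i];
    }

    /* After: one memcpy(), sized from the field itself. */
    static void copy_memcpy(struct in_core *dst, const struct on_disk *src)
    {
        memcpy(dst->addr, src->addr, sizeof(src->addr));
    }

    int main(void)
    {
        struct on_disk d = { { 1, 2, 3 } };
        struct in_core a, b;

        copy_loop(&a, &d);
        copy_memcpy(&b, &d);
        printf("%d\n", memcmp(a.addr, b.addr, sizeof(a.addr)) == 0);
        return 0;
    }
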
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index e3a9b1fac75a..23119fe7ad62 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -147,7 +147,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
147 } else { 147 } else {
148 /* fast symlink */ 148 /* fast symlink */
149 inode->i_op = &ufs_fast_symlink_inode_operations; 149 inode->i_op = &ufs_fast_symlink_inode_operations;
150 memcpy((char*)&UFS_I(inode)->i_u1.i_data,symname,l); 150 memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l);
151 inode->i_size = l-1; 151 inode->i_size = l-1;
152 } 152 }
153 mark_inode_dirty(inode); 153 mark_inode_dirty(inode);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index e65212dfb60e..60359291761f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -41,7 +41,7 @@
41 * Stefan Reinauer <stepan@home.culture.mipt.ru> 41 * Stefan Reinauer <stepan@home.culture.mipt.ru>
42 * 42 *
43 * Module usage counts added on 96/04/29 by 43 * Module usage counts added on 96/04/29 by
44 * Gertjan van Wingerde <gertjan@cs.vu.nl> 44 * Gertjan van Wingerde <gwingerde@gmail.com>
45 * 45 *
46 * Clean swab support on 19970406 by 46 * Clean swab support on 19970406 by
47 * Francois-Rene Rideau <fare@tunes.org> 47 * Francois-Rene Rideau <fare@tunes.org>
@@ -636,6 +636,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
636 unsigned block_size, super_block_size; 636 unsigned block_size, super_block_size;
637 unsigned flags; 637 unsigned flags;
638 unsigned super_block_offset; 638 unsigned super_block_offset;
639 unsigned maxsymlen;
639 int ret = -EINVAL; 640 int ret = -EINVAL;
640 641
641 uspi = NULL; 642 uspi = NULL;
@@ -1069,6 +1070,16 @@ magic_found:
1069 uspi->s_maxsymlinklen = 1070 uspi->s_maxsymlinklen =
1070 fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen); 1071 fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
1071 1072
1073 if (uspi->fs_magic == UFS2_MAGIC)
1074 maxsymlen = 2 * 4 * (UFS_NDADDR + UFS_NINDIR);
1075 else
1076 maxsymlen = 4 * (UFS_NDADDR + UFS_NINDIR);
1077 if (uspi->s_maxsymlinklen > maxsymlen) {
1078 ufs_warning(sb, __func__, "ufs_read_super: excessive maximum "
1079 "fast symlink size (%u)\n", uspi->s_maxsymlinklen);
1080 uspi->s_maxsymlinklen = maxsymlen;
1081 }
1082
1072 inode = ufs_iget(sb, UFS_ROOTINO); 1083 inode = ufs_iget(sb, UFS_ROOTINO);
1073 if (IS_ERR(inode)) { 1084 if (IS_ERR(inode)) {
1074 ret = PTR_ERR(inode); 1085 ret = PTR_ERR(inode);
@@ -1257,6 +1268,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1257 struct ufs_super_block_first *usb1; 1268 struct ufs_super_block_first *usb1;
1258 struct ufs_super_block_second *usb2; 1269 struct ufs_super_block_second *usb2;
1259 struct ufs_super_block_third *usb3; 1270 struct ufs_super_block_third *usb3;
1271 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
1260 1272
1261 lock_kernel(); 1273 lock_kernel();
1262 1274
@@ -1279,6 +1291,8 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1279 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; 1291 ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
1280 buf->f_files = uspi->s_ncg * uspi->s_ipg; 1292 buf->f_files = uspi->s_ncg * uspi->s_ipg;
1281 buf->f_namelen = UFS_MAXNAMLEN; 1293 buf->f_namelen = UFS_MAXNAMLEN;
1294 buf->f_fsid.val[0] = (u32)id;
1295 buf->f_fsid.val[1] = (u32)(id >> 32);
1282 1296
1283 unlock_kernel(); 1297 unlock_kernel();
1284 1298
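
The new check in ufs_fill_super() clamps an on-disk value, fs_maxsymlinklen, to what the in-core i_symlink union can actually hold (which the fs/ufs/ufs.h hunk below doubles for UFS2). A standalone sketch of that validation; the UFS_NDADDR/UFS_NINDIR values here (12 and 3) match the usual UFS constants but are hard-coded for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define UFS_NDADDR 12
    #define UFS_NINDIR 3

    int main(void)
    {
        int is_ufs2 = 0;
        uint32_t maxsymlen = is_ufs2 ? 2 * 4 * (UFS_NDADDR + UFS_NINDIR)
                                     : 4 * (UFS_NDADDR + UFS_NINDIR);
        uint32_t s_maxsymlinklen = 4096;   /* bogus value read from disk */

        if (s_maxsymlinklen > maxsymlen) {
            fprintf(stderr, "excessive fast symlink size (%u), clamping\n",
                    s_maxsymlinklen);
            s_maxsymlinklen = maxsymlen;
        }
        printf("using %u\n", s_maxsymlinklen);
        return 0;
    }
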
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 11c035168ea6..69b3427d7885 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -23,7 +23,7 @@ struct ufs_sb_info {
23struct ufs_inode_info { 23struct ufs_inode_info {
24 union { 24 union {
25 __fs32 i_data[15]; 25 __fs32 i_data[15];
26 __u8 i_symlink[4*15]; 26 __u8 i_symlink[2 * 4 * 15];
27 __fs64 u2_i_data[15]; 27 __fs64 u2_i_data[15];
28 } i_u1; 28 } i_u1;
29 __u32 i_flags; 29 __u32 i_flags;
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c3dc491fff89..60f107e47fe9 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
33 xfs_qm_syscalls.o \ 33 xfs_qm_syscalls.o \
34 xfs_qm_bhv.o \ 34 xfs_qm_bhv.o \
35 xfs_qm.o) 35 xfs_qm.o)
36xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o
36 37
37ifeq ($(CONFIG_XFS_QUOTA),y) 38ifeq ($(CONFIG_XFS_QUOTA),y)
38xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o 39xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h
deleted file mode 100644
index 2a88d56c4dc2..000000000000
--- a/fs/xfs/linux-2.6/mutex.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_MUTEX_H__
19#define __XFS_SUPPORT_MUTEX_H__
20
21#include <linux/mutex.h>
22
23typedef struct mutex mutex_t;
24
25#endif /* __XFS_SUPPORT_MUTEX_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index de3a198f771e..c13f67300fe7 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1623,4 +1623,5 @@ const struct address_space_operations xfs_address_space_operations = {
1623 .bmap = xfs_vm_bmap, 1623 .bmap = xfs_vm_bmap,
1624 .direct_IO = xfs_vm_direct_IO, 1624 .direct_IO = xfs_vm_direct_IO,
1625 .migratepage = buffer_migrate_page, 1625 .migratepage = buffer_migrate_page,
1626 .is_partially_uptodate = block_is_partially_uptodate,
1626}; 1627};
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index cb329edc925b..aa1016bb9134 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -34,6 +34,12 @@
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36 36
37#include "xfs_sb.h"
38#include "xfs_inum.h"
39#include "xfs_ag.h"
40#include "xfs_dmapi.h"
41#include "xfs_mount.h"
42
37static kmem_zone_t *xfs_buf_zone; 43static kmem_zone_t *xfs_buf_zone;
38STATIC int xfsbufd(void *); 44STATIC int xfsbufd(void *);
39STATIC int xfsbufd_wakeup(int, gfp_t); 45STATIC int xfsbufd_wakeup(int, gfp_t);
@@ -1435,10 +1441,12 @@ xfs_unregister_buftarg(
1435 1441
1436void 1442void
1437xfs_free_buftarg( 1443xfs_free_buftarg(
1438 xfs_buftarg_t *btp) 1444 struct xfs_mount *mp,
1445 struct xfs_buftarg *btp)
1439{ 1446{
1440 xfs_flush_buftarg(btp, 1); 1447 xfs_flush_buftarg(btp, 1);
1441 xfs_blkdev_issue_flush(btp); 1448 if (mp->m_flags & XFS_MOUNT_BARRIER)
1449 xfs_blkdev_issue_flush(btp);
1442 xfs_free_bufhash(btp); 1450 xfs_free_bufhash(btp);
1443 iput(btp->bt_mapping->host); 1451 iput(btp->bt_mapping->host);
1444 1452
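
xfs_free_buftarg() now takes the mount so it can skip the block-device cache flush when the filesystem was mounted without barriers. A toy sketch of that conditional teardown; the names and flag value are illustrative, not the real XFS_MOUNT_BARRIER encoding:

    #include <stdio.h>

    #define MOUNT_BARRIER (1u << 4)    /* illustrative flag value */

    static void issue_flush(void)
    {
        puts("flush sent to block device");
    }

    static void free_buftarg(unsigned mount_flags)
    {
        /* write back dirty buffers first (elided) */
        if (mount_flags & MOUNT_BARRIER)
            issue_flush();
        /* tear down hashes, drop the backing inode (elided) */
    }

    int main(void)
    {
        free_buftarg(MOUNT_BARRIER);   /* flushes */
        free_buftarg(0);               /* skips the flush */
        return 0;
    }
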
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 288ae7c4c800..9b4d666ad31f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -413,7 +413,7 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
413 * Handling of buftargs. 413 * Handling of buftargs.
414 */ 414 */
415extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 415extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
416extern void xfs_free_buftarg(xfs_buftarg_t *); 416extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
417extern void xfs_wait_buftarg(xfs_buftarg_t *); 417extern void xfs_wait_buftarg(xfs_buftarg_t *);
418extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 418extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
419extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 419extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e14c4e3aea0c..f4e255441574 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -234,9 +234,9 @@ xfs_file_mmap(
234STATIC int 234STATIC int
235xfs_vm_page_mkwrite( 235xfs_vm_page_mkwrite(
236 struct vm_area_struct *vma, 236 struct vm_area_struct *vma,
237 struct page *page) 237 struct vm_fault *vmf)
238{ 238{
239 return block_page_mkwrite(vma, page, xfs_get_blocks); 239 return block_page_mkwrite(vma, vmf, xfs_get_blocks);
240} 240}
241 241
242const struct file_operations xfs_file_operations = { 242const struct file_operations xfs_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4bd112313f33..d0b499418a7d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -34,6 +34,7 @@
34#include "xfs_dir2_sf.h" 34#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_ioctl.h"
37#include "xfs_btree.h" 38#include "xfs_btree.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_rtalloc.h" 40#include "xfs_rtalloc.h"
@@ -78,92 +79,74 @@ xfs_find_handle(
78 int hsize; 79 int hsize;
79 xfs_handle_t handle; 80 xfs_handle_t handle;
80 struct inode *inode; 81 struct inode *inode;
82 struct file *file = NULL;
83 struct path path;
84 int error;
85 struct xfs_inode *ip;
81 86
82 memset((char *)&handle, 0, sizeof(handle)); 87 if (cmd == XFS_IOC_FD_TO_HANDLE) {
83 88 file = fget(hreq->fd);
84 switch (cmd) { 89 if (!file)
85 case XFS_IOC_PATH_TO_FSHANDLE: 90 return -EBADF;
86 case XFS_IOC_PATH_TO_HANDLE: { 91 inode = file->f_path.dentry->d_inode;
87 struct path path; 92 } else {
88 int error = user_lpath((const char __user *)hreq->path, &path); 93 error = user_lpath((const char __user *)hreq->path, &path);
89 if (error) 94 if (error)
90 return error; 95 return error;
91 96 inode = path.dentry->d_inode;
92 ASSERT(path.dentry);
93 ASSERT(path.dentry->d_inode);
94 inode = igrab(path.dentry->d_inode);
95 path_put(&path);
96 break;
97 } 97 }
98 ip = XFS_I(inode);
98 99
99 case XFS_IOC_FD_TO_HANDLE: { 100 /*
100 struct file *file; 101 * We can only generate handles for inodes residing on a XFS filesystem,
101 102 * and only for regular files, directories or symbolic links.
102 file = fget(hreq->fd); 103 */
103 if (!file) 104 error = -EINVAL;
104 return -EBADF; 105 if (inode->i_sb->s_magic != XFS_SB_MAGIC)
106 goto out_put;
105 107
106 ASSERT(file->f_path.dentry); 108 error = -EBADF;
107 ASSERT(file->f_path.dentry->d_inode); 109 if (!S_ISREG(inode->i_mode) &&
108 inode = igrab(file->f_path.dentry->d_inode); 110 !S_ISDIR(inode->i_mode) &&
109 fput(file); 111 !S_ISLNK(inode->i_mode))
110 break; 112 goto out_put;
111 }
112 113
113 default:
114 ASSERT(0);
115 return -XFS_ERROR(EINVAL);
116 }
117 114
118 if (inode->i_sb->s_magic != XFS_SB_MAGIC) { 115 memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
119 /* we're not in XFS anymore, Toto */
120 iput(inode);
121 return -XFS_ERROR(EINVAL);
122 }
123 116
124 switch (inode->i_mode & S_IFMT) { 117 if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
125 case S_IFREG: 118 /*
126 case S_IFDIR: 119 * This handle only contains an fsid, zero the rest.
127 case S_IFLNK: 120 */
128 break; 121 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
129 default: 122 hsize = sizeof(xfs_fsid_t);
130 iput(inode); 123 } else {
131 return -XFS_ERROR(EBADF);
132 }
133
134 /* now we can grab the fsid */
135 memcpy(&handle.ha_fsid, XFS_I(inode)->i_mount->m_fixedfsid,
136 sizeof(xfs_fsid_t));
137 hsize = sizeof(xfs_fsid_t);
138
139 if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
140 xfs_inode_t *ip = XFS_I(inode);
141 int lock_mode; 124 int lock_mode;
142 125
143 /* need to get access to the xfs_inode to read the generation */
144 lock_mode = xfs_ilock_map_shared(ip); 126 lock_mode = xfs_ilock_map_shared(ip);
145
146 /* fill in fid section of handle from inode */
147 handle.ha_fid.fid_len = sizeof(xfs_fid_t) - 127 handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
148 sizeof(handle.ha_fid.fid_len); 128 sizeof(handle.ha_fid.fid_len);
149 handle.ha_fid.fid_pad = 0; 129 handle.ha_fid.fid_pad = 0;
150 handle.ha_fid.fid_gen = ip->i_d.di_gen; 130 handle.ha_fid.fid_gen = ip->i_d.di_gen;
151 handle.ha_fid.fid_ino = ip->i_ino; 131 handle.ha_fid.fid_ino = ip->i_ino;
152
153 xfs_iunlock_map_shared(ip, lock_mode); 132 xfs_iunlock_map_shared(ip, lock_mode);
154 133
155 hsize = XFS_HSIZE(handle); 134 hsize = XFS_HSIZE(handle);
156 } 135 }
157 136
158 /* now copy our handle into the user buffer & write out the size */ 137 error = -EFAULT;
159 if (copy_to_user(hreq->ohandle, &handle, hsize) || 138 if (copy_to_user(hreq->ohandle, &handle, hsize) ||
160 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) { 139 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
161 iput(inode); 140 goto out_put;
162 return -XFS_ERROR(EFAULT);
163 }
164 141
165 iput(inode); 142 error = 0;
166 return 0; 143
144 out_put:
145 if (cmd == XFS_IOC_FD_TO_HANDLE)
146 fput(file);
147 else
148 path_put(&path);
149 return error;
167} 150}
168 151
169/* 152/*
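
The rewritten xfs_find_handle() collapses the old per-case error paths into one shape: take a reference first (fget() for the fd case, user_lpath() otherwise), run every check, and funnel all exits through a single out_put label that drops whichever reference type was taken. A toy model of that control flow:

    #include <stdio.h>

    static int find_handle(int from_fd, int bad_magic)
    {
        int error;

        /* take a reference up front, whatever the lookup method */
        printf("ref acquired via %s\n", from_fd ? "fget" : "user_lpath");

        error = -22;                       /* -EINVAL */
        if (bad_magic)                     /* not an XFS inode? bail */
            goto out_put;

        /* ... remaining checks and the copy-out would go here ... */
        error = 0;

    out_put:
        /* release exactly the reference that was taken */
        printf("ref dropped via %s\n", from_fd ? "fput" : "path_put");
        return error;
    }

    int main(void)
    {
        printf("ok case  -> %d\n", find_handle(1, 0));
        printf("bad case -> %d\n", find_handle(0, 1));
        return 0;
    }
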
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 7aa53fefc67f..6075382336d7 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -211,8 +211,13 @@ xfs_vn_mknod(
211 * Irix uses Missed'em'V split, but doesn't want to see 211 * Irix uses Missed'em'V split, but doesn't want to see
212 * the upper 5 bits of (14bit) major. 212 * the upper 5 bits of (14bit) major.
213 */ 213 */
214 if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) 214 if (S_ISCHR(mode) || S_ISBLK(mode)) {
215 return -EINVAL; 215 if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
216 return -EINVAL;
217 rdev = sysv_encode_dev(rdev);
218 } else {
219 rdev = 0;
220 }
216 221
217 if (test_default_acl && test_default_acl(dir)) { 222 if (test_default_acl && test_default_acl(dir)) {
218 if (!_ACL_ALLOC(default_acl)) { 223 if (!_ACL_ALLOC(default_acl)) {
@@ -224,28 +229,11 @@ xfs_vn_mknod(
224 } 229 }
225 } 230 }
226 231
227 xfs_dentry_to_name(&name, dentry);
228
229 if (IS_POSIXACL(dir) && !default_acl) 232 if (IS_POSIXACL(dir) && !default_acl)
230 mode &= ~current->fs->umask; 233 mode &= ~current_umask();
231
232 switch (mode & S_IFMT) {
233 case S_IFCHR:
234 case S_IFBLK:
235 case S_IFIFO:
236 case S_IFSOCK:
237 rdev = sysv_encode_dev(rdev);
238 case S_IFREG:
239 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
240 break;
241 case S_IFDIR:
242 error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
243 break;
244 default:
245 error = EINVAL;
246 break;
247 }
248 234
235 xfs_dentry_to_name(&name, dentry);
236 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
249 if (unlikely(error)) 237 if (unlikely(error))
250 goto out_free_acl; 238 goto out_free_acl;
251 239
@@ -416,7 +404,7 @@ xfs_vn_symlink(
416 mode_t mode; 404 mode_t mode;
417 405
418 mode = S_IFLNK | 406 mode = S_IFLNK |
419 (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); 407 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
420 xfs_dentry_to_name(&name, dentry); 408 xfs_dentry_to_name(&name, dentry);
421 409
422 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); 410 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
@@ -553,9 +541,6 @@ xfs_vn_getattr(
553 stat->uid = ip->i_d.di_uid; 541 stat->uid = ip->i_d.di_uid;
554 stat->gid = ip->i_d.di_gid; 542 stat->gid = ip->i_d.di_gid;
555 stat->ino = ip->i_ino; 543 stat->ino = ip->i_ino;
556#if XFS_BIG_INUMS
557 stat->ino += mp->m_inoadd;
558#endif
559 stat->atime = inode->i_atime; 544 stat->atime = inode->i_atime;
560 stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; 545 stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec;
561 stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; 546 stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
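
Two things change in xfs_vn_mknod() above: the umask is read through the new current_umask() helper rather than current->fs->umask, and rdev is only validated and encoded for character and block nodes — for every other file type it is forced to zero before the single xfs_create() call. A sketch of that rdev handling, with simplified types and a stand-in range check:

    #include <sys/stat.h>
    #include <stdio.h>

    static int prepare_rdev(mode_t mode, unsigned *rdev)
    {
        if (S_ISCHR(mode) || S_ISBLK(mode)) {
            if (*rdev > 0xffffff)     /* stand-in range check */
                return -1;            /* -EINVAL */
            /* on-disk encoding would happen here (sysv_encode_dev) */
        } else {
            *rdev = 0;                /* meaningless: don't store garbage */
        }
        return 0;
    }

    int main(void)
    {
        unsigned rdev = 1234;

        if (prepare_rdev(S_IFREG, &rdev) == 0)
            printf("regular file, rdev forced to %u\n", rdev);
        return 0;
    }
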
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 507492d6dccd..f65a53f8752f 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -38,7 +38,6 @@
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h> 40#include <sv.h>
41#include <mutex.h>
42#include <time.h> 41#include <time.h>
43 42
44#include <support/ktrace.h> 43#include <support/ktrace.h>
@@ -51,6 +50,7 @@
51#include <linux/blkdev.h> 50#include <linux/blkdev.h>
52#include <linux/slab.h> 51#include <linux/slab.h>
53#include <linux/module.h> 52#include <linux/module.h>
53#include <linux/mutex.h>
54#include <linux/file.h> 54#include <linux/file.h>
55#include <linux/swap.h> 55#include <linux/swap.h>
56#include <linux/errno.h> 56#include <linux/errno.h>
@@ -147,17 +147,6 @@
147#define SYNCHRONIZE() barrier() 147#define SYNCHRONIZE() barrier()
148#define __return_address __builtin_return_address(0) 148#define __return_address __builtin_return_address(0)
149 149
150/*
151 * IRIX (BSD) quotactl makes use of separate commands for user/group,
152 * whereas on Linux the syscall encodes this information into the cmd
153 * field (see the QCMD macro in quota.h). These macros help keep the
154 * code portable - they are not visible from the syscall interface.
155 */
156#define Q_XSETGQLIM XQM_CMD(8) /* set groups disk limits */
157#define Q_XGETGQUOTA XQM_CMD(9) /* get groups disk limits */
158#define Q_XSETPQLIM XQM_CMD(10) /* set projects disk limits */
159#define Q_XGETPQUOTA XQM_CMD(11) /* get projects disk limits */
160
161#define dfltprid 0 150#define dfltprid 0
162#define MAXPATHLEN 1024 151#define MAXPATHLEN 1024
163 152
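The xfs_linux.h hunks retire the private IRIX-compatibility <mutex.h>
wrapper in favor of including <linux/mutex.h> directly, matching the
mutex_t to struct mutex conversions later in this diff. Illustrative
kernel-style usage of the native primitive (example_lock is a made-up
name, kernel build context assumed):

#include <linux/mutex.h>

static DEFINE_MUTEX(example_lock);	/* was: mutex_t + mutex_init() */

static void example_critical_section(void)
{
	mutex_lock(&example_lock);
	/* ... touch data guarded by example_lock ... */
	mutex_unlock(&example_lock);
}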
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
new file mode 100644
index 000000000000..94d9a633d3d9
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -0,0 +1,157 @@
1/*
2 * Copyright (c) 2008, Christoph Hellwig
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_dmapi.h"
20#include "xfs_sb.h"
21#include "xfs_inum.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_log.h"
26#include "xfs_trans.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_inode.h"
29#include "quota/xfs_qm.h"
30#include <linux/quota.h>
31
32
33STATIC int
34xfs_quota_type(int type)
35{
36 switch (type) {
37 case USRQUOTA:
38 return XFS_DQ_USER;
39 case GRPQUOTA:
40 return XFS_DQ_GROUP;
41 default:
42 return XFS_DQ_PROJ;
43 }
44}
45
46STATIC int
47xfs_fs_quota_sync(
48 struct super_block *sb,
49 int type)
50{
51 struct xfs_mount *mp = XFS_M(sb);
52
53 if (!XFS_IS_QUOTA_RUNNING(mp))
54 return -ENOSYS;
55 return -xfs_sync_inodes(mp, SYNC_DELWRI);
56}
57
58STATIC int
59xfs_fs_get_xstate(
60 struct super_block *sb,
61 struct fs_quota_stat *fqs)
62{
63 struct xfs_mount *mp = XFS_M(sb);
64
65 if (!XFS_IS_QUOTA_RUNNING(mp))
66 return -ENOSYS;
67 return -xfs_qm_scall_getqstat(mp, fqs);
68}
69
70STATIC int
71xfs_fs_set_xstate(
72 struct super_block *sb,
73 unsigned int uflags,
74 int op)
75{
76 struct xfs_mount *mp = XFS_M(sb);
77 unsigned int flags = 0;
78
79 if (sb->s_flags & MS_RDONLY)
80 return -EROFS;
81 if (!XFS_IS_QUOTA_RUNNING(mp))
82 return -ENOSYS;
83 if (!capable(CAP_SYS_ADMIN))
84 return -EPERM;
85
86 if (uflags & XFS_QUOTA_UDQ_ACCT)
87 flags |= XFS_UQUOTA_ACCT;
88 if (uflags & XFS_QUOTA_PDQ_ACCT)
89 flags |= XFS_PQUOTA_ACCT;
90 if (uflags & XFS_QUOTA_GDQ_ACCT)
91 flags |= XFS_GQUOTA_ACCT;
92 if (uflags & XFS_QUOTA_UDQ_ENFD)
93 flags |= XFS_UQUOTA_ENFD;
94 if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
95 flags |= XFS_OQUOTA_ENFD;
96
97 switch (op) {
98 case Q_XQUOTAON:
99 return -xfs_qm_scall_quotaon(mp, flags);
100 case Q_XQUOTAOFF:
101 if (!XFS_IS_QUOTA_ON(mp))
102 return -EINVAL;
103 return -xfs_qm_scall_quotaoff(mp, flags);
104 case Q_XQUOTARM:
105 if (XFS_IS_QUOTA_ON(mp))
106 return -EINVAL;
107 return -xfs_qm_scall_trunc_qfiles(mp, flags);
108 }
109
110 return -EINVAL;
111}
112
113STATIC int
114xfs_fs_get_xquota(
115 struct super_block *sb,
116 int type,
117 qid_t id,
118 struct fs_disk_quota *fdq)
119{
120 struct xfs_mount *mp = XFS_M(sb);
121
122 if (!XFS_IS_QUOTA_RUNNING(mp))
123 return -ENOSYS;
124 if (!XFS_IS_QUOTA_ON(mp))
125 return -ESRCH;
126
127 return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq);
128}
129
130STATIC int
131xfs_fs_set_xquota(
132 struct super_block *sb,
133 int type,
134 qid_t id,
135 struct fs_disk_quota *fdq)
136{
137 struct xfs_mount *mp = XFS_M(sb);
138
139 if (sb->s_flags & MS_RDONLY)
140 return -EROFS;
141 if (!XFS_IS_QUOTA_RUNNING(mp))
142 return -ENOSYS;
143 if (!XFS_IS_QUOTA_ON(mp))
144 return -ESRCH;
145 if (!capable(CAP_SYS_ADMIN))
146 return -EPERM;
147
148 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
149}
150
151struct quotactl_ops xfs_quotactl_operations = {
152 .quota_sync = xfs_fs_quota_sync,
153 .get_xstate = xfs_fs_get_xstate,
154 .set_xstate = xfs_fs_set_xstate,
155 .get_xquota = xfs_fs_get_xquota,
156 .set_xquota = xfs_fs_set_xquota,
157};
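Note the sign flips throughout the new file: the xfs_qm_scall_* helpers
follow the XFS-internal convention of returning positive errno values,
while the VFS quotactl layer expects negative ones, so each operation
returns the negated result. A self-contained sketch of that boundary
convention (function names here are hypothetical):

#include <errno.h>
#include <stdio.h>

/* Stand-in for an internal routine using the XFS convention of
 * positive errno values (0 on success). */
static int internal_op(int fail)
{
	return fail ? EINVAL : 0;
}

/* The boundary function flips the sign, as xfs_fs_set_xstate() does
 * with -xfs_qm_scall_quotaon() and friends. */
static int vfs_facing_op(int fail)
{
	return -internal_op(fail);
}

int main(void)
{
	printf("success: %d, failure: %d\n", vfs_facing_op(0), vfs_facing_op(1));
	return 0;
}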
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c71e226da7f5..bb685269f832 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -68,7 +68,6 @@
68#include <linux/freezer.h> 68#include <linux/freezer.h>
69#include <linux/parser.h> 69#include <linux/parser.h>
70 70
71static struct quotactl_ops xfs_quotactl_operations;
72static struct super_operations xfs_super_operations; 71static struct super_operations xfs_super_operations;
73static kmem_zone_t *xfs_ioend_zone; 72static kmem_zone_t *xfs_ioend_zone;
74mempool_t *xfs_ioend_pool; 73mempool_t *xfs_ioend_pool;
@@ -79,7 +78,6 @@ mempool_t *xfs_ioend_pool;
79#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ 78#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */
80#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ 79#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */
81#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ 80#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */
82#define MNTOPT_INO64 "ino64" /* force inodes into 64-bit range */
83#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ 81#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */
84#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ 82#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */
85#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ 83#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */
@@ -180,7 +178,7 @@ xfs_parseargs(
180 int dswidth = 0; 178 int dswidth = 0;
181 int iosize = 0; 179 int iosize = 0;
182 int dmapi_implies_ikeep = 1; 180 int dmapi_implies_ikeep = 1;
183 uchar_t iosizelog = 0; 181 __uint8_t iosizelog = 0;
184 182
185 /* 183 /*
186 * Copy binary VFS mount flags we are interested in. 184 * Copy binary VFS mount flags we are interested in.
@@ -291,16 +289,6 @@ xfs_parseargs(
291 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; 289 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
292 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { 290 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
293 mp->m_flags |= XFS_MOUNT_NORECOVERY; 291 mp->m_flags |= XFS_MOUNT_NORECOVERY;
294 } else if (!strcmp(this_char, MNTOPT_INO64)) {
295#if XFS_BIG_INUMS
296 mp->m_flags |= XFS_MOUNT_INO64;
297 mp->m_inoadd = XFS_INO64_OFFSET;
298#else
299 cmn_err(CE_WARN,
300 "XFS: %s option not allowed on this system",
301 this_char);
302 return EINVAL;
303#endif
304 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { 292 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
305 mp->m_flags |= XFS_MOUNT_NOALIGN; 293 mp->m_flags |= XFS_MOUNT_NOALIGN;
306 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { 294 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
@@ -529,7 +517,6 @@ xfs_showargs(
529 /* the few simple ones we can get from the mount struct */ 517 /* the few simple ones we can get from the mount struct */
530 { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, 518 { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP },
531 { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, 519 { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC },
532 { XFS_MOUNT_INO64, "," MNTOPT_INO64 },
533 { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, 520 { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN },
534 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, 521 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC },
535 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, 522 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
@@ -634,7 +621,7 @@ xfs_max_file_offset(
634 return (((__uint64_t)pagefactor) << bitshift) - 1; 621 return (((__uint64_t)pagefactor) << bitshift) - 1;
635} 622}
636 623
637int 624STATIC int
638xfs_blkdev_get( 625xfs_blkdev_get(
639 xfs_mount_t *mp, 626 xfs_mount_t *mp,
640 const char *name, 627 const char *name,
@@ -651,7 +638,7 @@ xfs_blkdev_get(
651 return -error; 638 return -error;
652} 639}
653 640
654void 641STATIC void
655xfs_blkdev_put( 642xfs_blkdev_put(
656 struct block_device *bdev) 643 struct block_device *bdev)
657{ 644{
@@ -734,15 +721,15 @@ xfs_close_devices(
734{ 721{
735 if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { 722 if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
736 struct block_device *logdev = mp->m_logdev_targp->bt_bdev; 723 struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
737 xfs_free_buftarg(mp->m_logdev_targp); 724 xfs_free_buftarg(mp, mp->m_logdev_targp);
738 xfs_blkdev_put(logdev); 725 xfs_blkdev_put(logdev);
739 } 726 }
740 if (mp->m_rtdev_targp) { 727 if (mp->m_rtdev_targp) {
741 struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; 728 struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
742 xfs_free_buftarg(mp->m_rtdev_targp); 729 xfs_free_buftarg(mp, mp->m_rtdev_targp);
743 xfs_blkdev_put(rtdev); 730 xfs_blkdev_put(rtdev);
744 } 731 }
745 xfs_free_buftarg(mp->m_ddev_targp); 732 xfs_free_buftarg(mp, mp->m_ddev_targp);
746} 733}
747 734
748/* 735/*
@@ -811,9 +798,9 @@ xfs_open_devices(
811 798
812 out_free_rtdev_targ: 799 out_free_rtdev_targ:
813 if (mp->m_rtdev_targp) 800 if (mp->m_rtdev_targp)
814 xfs_free_buftarg(mp->m_rtdev_targp); 801 xfs_free_buftarg(mp, mp->m_rtdev_targp);
815 out_free_ddev_targ: 802 out_free_ddev_targ:
816 xfs_free_buftarg(mp->m_ddev_targp); 803 xfs_free_buftarg(mp, mp->m_ddev_targp);
817 out_close_rtdev: 804 out_close_rtdev:
818 if (rtdev) 805 if (rtdev)
819 xfs_blkdev_put(rtdev); 806 xfs_blkdev_put(rtdev);
@@ -872,7 +859,7 @@ xfsaild_wakeup(
872 wake_up_process(ailp->xa_task); 859 wake_up_process(ailp->xa_task);
873} 860}
874 861
875int 862STATIC int
876xfsaild( 863xfsaild(
877 void *data) 864 void *data)
878{ 865{
@@ -990,26 +977,57 @@ xfs_fs_write_inode(
990 int sync) 977 int sync)
991{ 978{
992 struct xfs_inode *ip = XFS_I(inode); 979 struct xfs_inode *ip = XFS_I(inode);
980 struct xfs_mount *mp = ip->i_mount;
993 int error = 0; 981 int error = 0;
994 int flags = 0;
995 982
996 xfs_itrace_entry(ip); 983 xfs_itrace_entry(ip);
984
985 if (XFS_FORCED_SHUTDOWN(mp))
986 return XFS_ERROR(EIO);
987
997 if (sync) { 988 if (sync) {
998 error = xfs_wait_on_pages(ip, 0, -1); 989 error = xfs_wait_on_pages(ip, 0, -1);
999 if (error) 990 if (error)
1000 goto out_error; 991 goto out;
1001 flags |= FLUSH_SYNC;
1002 } 992 }
1003 error = xfs_inode_flush(ip, flags);
1004 993
1005out_error: 994 /*
995 * Bypass inodes which have already been cleaned by
996 * the inode flush clustering code inside xfs_iflush
997 */
998 if (xfs_inode_clean(ip))
999 goto out;
1000
1001 /*
 1002 * We make this non-blocking if the inode is contended and return
 1003 * EAGAIN to indicate to the caller that the flush did not succeed.
 1004 * This prevents the flush path from blocking on inodes inside
 1005 * another operation right now; they get caught later by xfs_sync.
1006 */
1007 if (sync) {
1008 xfs_ilock(ip, XFS_ILOCK_SHARED);
1009 xfs_iflock(ip);
1010
1011 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
1012 } else {
1013 error = EAGAIN;
1014 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1015 goto out;
1016 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1017 goto out_unlock;
1018
1019 error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
1020 }
1021
1022 out_unlock:
1023 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1024 out:
1006 /* 1025 /*
1007 * if we failed to write out the inode then mark 1026 * if we failed to write out the inode then mark
1008 * it dirty again so we'll try again later. 1027 * it dirty again so we'll try again later.
1009 */ 1028 */
1010 if (error) 1029 if (error)
1011 xfs_mark_inode_dirty_sync(ip); 1030 xfs_mark_inode_dirty_sync(ip);
1012
1013 return -error; 1031 return -error;
1014} 1032}
1015 1033
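The rewritten xfs_fs_write_inode() above turns the async case into a
trylock ladder: if the inode lock or flush lock cannot be taken
immediately, or the inode is pinned, it backs off with EAGAIN rather
than blocking. A user-space analogy of that back-off pattern,
collapsed to a single pthread mutex:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;

/* Try the lock; if the object is contended, report EAGAIN instead of
 * blocking so the caller can retry on a later writeback pass. */
static int flush_async(void)
{
	if (pthread_mutex_trylock(&obj_lock) != 0)
		return EAGAIN;	/* contended: caught later by a sync pass */
	/* ... write the object back ... */
	pthread_mutex_unlock(&obj_lock);
	return 0;
}

int main(void)
{
	printf("flush_async -> %d\n", flush_async());
	return 0;
}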
@@ -1169,18 +1187,12 @@ xfs_fs_statfs(
1169 statp->f_bfree = statp->f_bavail = 1187 statp->f_bfree = statp->f_bavail =
1170 sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); 1188 sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1171 fakeinos = statp->f_bfree << sbp->sb_inopblog; 1189 fakeinos = statp->f_bfree << sbp->sb_inopblog;
1172#if XFS_BIG_INUMS
1173 fakeinos += mp->m_inoadd;
1174#endif
1175 statp->f_files = 1190 statp->f_files =
1176 MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); 1191 MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
1177 if (mp->m_maxicount) 1192 if (mp->m_maxicount)
1178#if XFS_BIG_INUMS 1193 statp->f_files = min_t(typeof(statp->f_files),
1179 if (!mp->m_inoadd) 1194 statp->f_files,
1180#endif 1195 mp->m_maxicount);
1181 statp->f_files = min_t(typeof(statp->f_files),
1182 statp->f_files,
1183 mp->m_maxicount);
1184 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1196 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
1185 spin_unlock(&mp->m_sb_lock); 1197 spin_unlock(&mp->m_sb_lock);
1186 1198
@@ -1302,57 +1314,6 @@ xfs_fs_show_options(
1302 return -xfs_showargs(XFS_M(mnt->mnt_sb), m); 1314 return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
1303} 1315}
1304 1316
1305STATIC int
1306xfs_fs_quotasync(
1307 struct super_block *sb,
1308 int type)
1309{
1310 return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XQUOTASYNC, 0, NULL);
1311}
1312
1313STATIC int
1314xfs_fs_getxstate(
1315 struct super_block *sb,
1316 struct fs_quota_stat *fqs)
1317{
1318 return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XGETQSTAT, 0, (caddr_t)fqs);
1319}
1320
1321STATIC int
1322xfs_fs_setxstate(
1323 struct super_block *sb,
1324 unsigned int flags,
1325 int op)
1326{
1327 return -XFS_QM_QUOTACTL(XFS_M(sb), op, 0, (caddr_t)&flags);
1328}
1329
1330STATIC int
1331xfs_fs_getxquota(
1332 struct super_block *sb,
1333 int type,
1334 qid_t id,
1335 struct fs_disk_quota *fdq)
1336{
1337 return -XFS_QM_QUOTACTL(XFS_M(sb),
1338 (type == USRQUOTA) ? Q_XGETQUOTA :
1339 ((type == GRPQUOTA) ? Q_XGETGQUOTA :
1340 Q_XGETPQUOTA), id, (caddr_t)fdq);
1341}
1342
1343STATIC int
1344xfs_fs_setxquota(
1345 struct super_block *sb,
1346 int type,
1347 qid_t id,
1348 struct fs_disk_quota *fdq)
1349{
1350 return -XFS_QM_QUOTACTL(XFS_M(sb),
1351 (type == USRQUOTA) ? Q_XSETQLIM :
1352 ((type == GRPQUOTA) ? Q_XSETGQLIM :
1353 Q_XSETPQLIM), id, (caddr_t)fdq);
1354}
1355
1356/* 1317/*
1357 * This function fills in xfs_mount_t fields based on mount args. 1318 * This function fills in xfs_mount_t fields based on mount args.
1358 * Note: the superblock _has_ now been read in. 1319 * Note: the superblock _has_ now been read in.
@@ -1435,7 +1396,9 @@ xfs_fs_fill_super(
1435 sb_min_blocksize(sb, BBSIZE); 1396 sb_min_blocksize(sb, BBSIZE);
1436 sb->s_xattr = xfs_xattr_handlers; 1397 sb->s_xattr = xfs_xattr_handlers;
1437 sb->s_export_op = &xfs_export_operations; 1398 sb->s_export_op = &xfs_export_operations;
1399#ifdef CONFIG_XFS_QUOTA
1438 sb->s_qcop = &xfs_quotactl_operations; 1400 sb->s_qcop = &xfs_quotactl_operations;
1401#endif
1439 sb->s_op = &xfs_super_operations; 1402 sb->s_op = &xfs_super_operations;
1440 1403
1441 error = xfs_dmops_get(mp); 1404 error = xfs_dmops_get(mp);
@@ -1578,14 +1541,6 @@ static struct super_operations xfs_super_operations = {
1578 .show_options = xfs_fs_show_options, 1541 .show_options = xfs_fs_show_options,
1579}; 1542};
1580 1543
1581static struct quotactl_ops xfs_quotactl_operations = {
1582 .quota_sync = xfs_fs_quotasync,
1583 .get_xstate = xfs_fs_getxstate,
1584 .set_xstate = xfs_fs_setxstate,
1585 .get_xquota = xfs_fs_getxquota,
1586 .set_xquota = xfs_fs_setxquota,
1587};
1588
1589static struct file_system_type xfs_fs_type = { 1544static struct file_system_type xfs_fs_type = {
1590 .owner = THIS_MODULE, 1545 .owner = THIS_MODULE,
1591 .name = "xfs", 1546 .name = "xfs",
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index d5d776d4cd67..5a2ea3a21781 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,6 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
93 93
94extern const struct export_operations xfs_export_operations; 94extern const struct export_operations xfs_export_operations;
95extern struct xattr_handler *xfs_xattr_handlers[]; 95extern struct xattr_handler *xfs_xattr_handlers[];
96extern struct quotactl_ops xfs_quotactl_operations;
96 97
97#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 98#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
98 99
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 5f6de1efe1f6..04f058c848ae 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -19,6 +19,7 @@
19#define XFS_SYNC_H 1 19#define XFS_SYNC_H 1
20 20
21struct xfs_mount; 21struct xfs_mount;
22struct xfs_perag;
22 23
23typedef struct bhv_vfs_sync_work { 24typedef struct bhv_vfs_sync_work {
24 struct list_head w_list; 25 struct list_head w_list;
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index f65983a230d3..ad7fbead4c97 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -41,11 +41,6 @@ struct attrlist_cursor_kern;
41#define IO_INVIS 0x00020 /* don't update inode timestamps */ 41#define IO_INVIS 0x00020 /* don't update inode timestamps */
42 42
43/* 43/*
44 * Flags for xfs_inode_flush
45 */
46#define FLUSH_SYNC 1 /* wait for flush to complete */
47
48/*
49 * Flush/Invalidate options for vop_toss/flush/flushinval_pages. 44 * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
50 */ 45 */
51#define FI_NONE 0 /* none */ 46#define FI_NONE 0 /* none */
@@ -55,33 +50,6 @@ struct attrlist_cursor_kern;
55 the operation completes. */ 50 the operation completes. */
56 51
57/* 52/*
58 * Dealing with bad inodes
59 */
60static inline int VN_BAD(struct inode *vp)
61{
62 return is_bad_inode(vp);
63}
64
65/*
66 * Extracting atime values in various formats
67 */
68static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
69{
70 bs_atime->tv_sec = vp->i_atime.tv_sec;
71 bs_atime->tv_nsec = vp->i_atime.tv_nsec;
72}
73
74static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
75{
76 *ts = vp->i_atime;
77}
78
79static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
80{
81 *tt = vp->i_atime.tv_sec;
82}
83
84/*
85 * Some useful predicates. 53 * Some useful predicates.
86 */ 54 */
87#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) 55#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 6543c0b29753..e4babcc63423 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -804,7 +804,7 @@ xfs_qm_dqlookup(
804 uint flist_locked; 804 uint flist_locked;
805 xfs_dquot_t *d; 805 xfs_dquot_t *d;
806 806
807 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); 807 ASSERT(mutex_is_locked(&qh->qh_lock));
808 808
809 flist_locked = B_FALSE; 809 flist_locked = B_FALSE;
810 810
@@ -877,7 +877,7 @@ xfs_qm_dqlookup(
877 /* 877 /*
878 * move the dquot to the front of the hashchain 878 * move the dquot to the front of the hashchain
879 */ 879 */
880 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); 880 ASSERT(mutex_is_locked(&qh->qh_lock));
881 if (dqp->HL_PREVP != &qh->qh_next) { 881 if (dqp->HL_PREVP != &qh->qh_next) {
882 xfs_dqtrace_entry(dqp, 882 xfs_dqtrace_entry(dqp,
883 "DQLOOKUP: HASH MOVETOFRONT"); 883 "DQLOOKUP: HASH MOVETOFRONT");
@@ -892,13 +892,13 @@ xfs_qm_dqlookup(
892 } 892 }
893 xfs_dqtrace_entry(dqp, "LOOKUP END"); 893 xfs_dqtrace_entry(dqp, "LOOKUP END");
894 *O_dqpp = dqp; 894 *O_dqpp = dqp;
895 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); 895 ASSERT(mutex_is_locked(&qh->qh_lock));
896 return (0); 896 return (0);
897 } 897 }
898 } 898 }
899 899
900 *O_dqpp = NULL; 900 *O_dqpp = NULL;
901 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); 901 ASSERT(mutex_is_locked(&qh->qh_lock));
902 return (1); 902 return (1);
903} 903}
904 904
@@ -956,7 +956,7 @@ xfs_qm_dqget(
956 ASSERT(ip->i_gdquot == NULL); 956 ASSERT(ip->i_gdquot == NULL);
957 } 957 }
958#endif 958#endif
959 XFS_DQ_HASH_LOCK(h); 959 mutex_lock(&h->qh_lock);
960 960
961 /* 961 /*
962 * Look in the cache (hashtable). 962 * Look in the cache (hashtable).
@@ -971,7 +971,7 @@ xfs_qm_dqget(
971 */ 971 */
972 ASSERT(*O_dqpp); 972 ASSERT(*O_dqpp);
973 ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); 973 ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
974 XFS_DQ_HASH_UNLOCK(h); 974 mutex_unlock(&h->qh_lock);
975 xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)"); 975 xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
976 return (0); /* success */ 976 return (0); /* success */
977 } 977 }
@@ -991,7 +991,7 @@ xfs_qm_dqget(
991 * we don't keep the lock across a disk read 991 * we don't keep the lock across a disk read
992 */ 992 */
993 version = h->qh_version; 993 version = h->qh_version;
994 XFS_DQ_HASH_UNLOCK(h); 994 mutex_unlock(&h->qh_lock);
995 995
996 /* 996 /*
997 * Allocate the dquot on the kernel heap, and read the ondisk 997 * Allocate the dquot on the kernel heap, and read the ondisk
@@ -1056,7 +1056,7 @@ xfs_qm_dqget(
1056 /* 1056 /*
1057 * Hashlock comes after ilock in lock order 1057 * Hashlock comes after ilock in lock order
1058 */ 1058 */
1059 XFS_DQ_HASH_LOCK(h); 1059 mutex_lock(&h->qh_lock);
1060 if (version != h->qh_version) { 1060 if (version != h->qh_version) {
1061 xfs_dquot_t *tmpdqp; 1061 xfs_dquot_t *tmpdqp;
1062 /* 1062 /*
@@ -1072,7 +1072,7 @@ xfs_qm_dqget(
1072 * and start over. 1072 * and start over.
1073 */ 1073 */
1074 xfs_qm_dqput(tmpdqp); 1074 xfs_qm_dqput(tmpdqp);
1075 XFS_DQ_HASH_UNLOCK(h); 1075 mutex_unlock(&h->qh_lock);
1076 xfs_qm_dqdestroy(dqp); 1076 xfs_qm_dqdestroy(dqp);
1077 XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); 1077 XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
1078 goto again; 1078 goto again;
@@ -1083,7 +1083,7 @@ xfs_qm_dqget(
1083 * Put the dquot at the beginning of the hash-chain and mp's list 1083 * Put the dquot at the beginning of the hash-chain and mp's list
1084 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock .. 1084 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
1085 */ 1085 */
1086 ASSERT(XFS_DQ_IS_HASH_LOCKED(h)); 1086 ASSERT(mutex_is_locked(&h->qh_lock));
1087 dqp->q_hash = h; 1087 dqp->q_hash = h;
1088 XQM_HASHLIST_INSERT(h, dqp); 1088 XQM_HASHLIST_INSERT(h, dqp);
1089 1089
@@ -1102,7 +1102,7 @@ xfs_qm_dqget(
1102 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); 1102 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
1103 1103
1104 xfs_qm_mplist_unlock(mp); 1104 xfs_qm_mplist_unlock(mp);
1105 XFS_DQ_HASH_UNLOCK(h); 1105 mutex_unlock(&h->qh_lock);
1106 dqret: 1106 dqret:
1107 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1107 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
1108 xfs_dqtrace_entry(dqp, "DQGET DONE"); 1108 xfs_dqtrace_entry(dqp, "DQGET DONE");
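The qh_version checks above implement an optimistic re-lookup:
xfs_qm_dqget() records the hash-chain version, drops qh_lock across the
blocking disk read, and restarts the search if the version moved in the
meantime (counting a duplicate in xqmstats when it loses the race). A
self-contained skeleton of that pattern, with a one-slot "chain"
standing in for the real hash list:

#include <pthread.h>

struct chain {
	pthread_mutex_t lock;
	unsigned int	version;	/* bumped by every insert */
	int		obj;		/* 0 = empty slot */
};

static int get_obj(struct chain *c, int id)
{
	unsigned int version;

again:
	pthread_mutex_lock(&c->lock);
	if (c->obj == id) {			/* cache hit */
		pthread_mutex_unlock(&c->lock);
		return 0;
	}
	version = c->version;
	pthread_mutex_unlock(&c->lock);	/* don't hold lock while blocking */

	/* ... slow path: read/allocate the object, may sleep ... */

	pthread_mutex_lock(&c->lock);
	if (version != c->version) {	/* lost a race: discard and retry */
		pthread_mutex_unlock(&c->lock);
		goto again;
	}
	c->obj = id;
	c->version++;
	pthread_mutex_unlock(&c->lock);
	return 0;
}

int main(void)
{
	struct chain c = { .lock = PTHREAD_MUTEX_INITIALIZER };
	return get_obj(&c, 42);
}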
@@ -1440,7 +1440,7 @@ xfs_qm_dqpurge(
1440 xfs_mount_t *mp = dqp->q_mount; 1440 xfs_mount_t *mp = dqp->q_mount;
1441 1441
1442 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 1442 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
1443 ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash)); 1443 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
1444 1444
1445 xfs_dqlock(dqp); 1445 xfs_dqlock(dqp);
1446 /* 1446 /*
@@ -1453,7 +1453,7 @@ xfs_qm_dqpurge(
1453 */ 1453 */
1454 if (dqp->q_nrefs != 0) { 1454 if (dqp->q_nrefs != 0) {
1455 xfs_dqunlock(dqp); 1455 xfs_dqunlock(dqp);
1456 XFS_DQ_HASH_UNLOCK(dqp->q_hash); 1456 mutex_unlock(&dqp->q_hash->qh_lock);
1457 return (1); 1457 return (1);
1458 } 1458 }
1459 1459
@@ -1517,7 +1517,7 @@ xfs_qm_dqpurge(
1517 memset(&dqp->q_core, 0, sizeof(dqp->q_core)); 1517 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
1518 xfs_dqfunlock(dqp); 1518 xfs_dqfunlock(dqp);
1519 xfs_dqunlock(dqp); 1519 xfs_dqunlock(dqp);
1520 XFS_DQ_HASH_UNLOCK(thishash); 1520 mutex_unlock(&thishash->qh_lock);
1521 return (0); 1521 return (0);
1522} 1522}
1523 1523
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index d443e93b4331..de0f402ddb4c 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -34,7 +34,7 @@
34 */ 34 */
35typedef struct xfs_dqhash { 35typedef struct xfs_dqhash {
36 struct xfs_dquot *qh_next; 36 struct xfs_dquot *qh_next;
37 mutex_t qh_lock; 37 struct mutex qh_lock;
38 uint qh_version; /* ever increasing version */ 38 uint qh_version; /* ever increasing version */
39 uint qh_nelems; /* number of dquots on the list */ 39 uint qh_nelems; /* number of dquots on the list */
40} xfs_dqhash_t; 40} xfs_dqhash_t;
@@ -81,7 +81,7 @@ typedef struct xfs_dquot {
81 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ 81 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
82 xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ 82 xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ 83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
84 mutex_t q_qlock; /* quota lock */ 84 struct mutex q_qlock; /* quota lock */
85 struct completion q_flush; /* flush completion queue */ 85 struct completion q_flush; /* flush completion queue */
86 atomic_t q_pincount; /* dquot pin count */ 86 atomic_t q_pincount; /* dquot pin count */
87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ 87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
@@ -109,19 +109,6 @@ enum {
109 109
110#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) 110#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++)
111 111
112#ifdef DEBUG
113static inline int
114XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
115{
116 if (mutex_trylock(&dqp->q_qlock)) {
117 mutex_unlock(&dqp->q_qlock);
118 return 0;
119 }
120 return 1;
121}
122#endif
123
124
125/* 112/*
126 * Manage the q_flush completion queue embedded in the dquot. This completion 113 * Manage the q_flush completion queue embedded in the dquot. This completion
127 * queue synchronizes processes attempting to flush the in-core dquot back to 114 * queue synchronizes processes attempting to flush the in-core dquot back to
@@ -142,6 +129,7 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
142 complete(&dqp->q_flush); 129 complete(&dqp->q_flush);
143} 130}
144 131
132#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
145#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) 133#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
146#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 134#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
147#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 135#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
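The XFS_DQ_IS_LOCKED change above is more than a spelling fix: the old
DEBUG-only helper probed the mutex with a trylock, briefly acquiring it
whenever it happened to be free, whereas mutex_is_locked() is a
read-only predicate available in all builds. Restated side by side
(the helper name dq_is_locked_probe is illustrative):

#include <linux/mutex.h>

/* Old DEBUG-only probe: answers "is it locked?" by briefly taking
 * the mutex when it happens to be free - a side effect, and the
 * helper simply did not exist in non-DEBUG builds. */
static inline int dq_is_locked_probe(struct mutex *m)
{
	if (mutex_trylock(m)) {		/* returns 1 on success */
		mutex_unlock(m);
		return 0;
	}
	return 1;
}

/* New form: a read-only predicate, usable unconditionally. */
#define dq_is_locked(m)		mutex_is_locked(m)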
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7a2beb64314f..5b6695049e00 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,7 +55,7 @@
55 * quota functionality, including maintaining the freelist and hash 55 * quota functionality, including maintaining the freelist and hash
56 * tables of dquots. 56 * tables of dquots.
57 */ 57 */
58mutex_t xfs_Gqm_lock; 58struct mutex xfs_Gqm_lock;
59struct xfs_qm *xfs_Gqm; 59struct xfs_qm *xfs_Gqm;
60uint ndquot; 60uint ndquot;
61 61
@@ -69,8 +69,6 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
69 69
70STATIC void xfs_qm_freelist_init(xfs_frlist_t *); 70STATIC void xfs_qm_freelist_init(xfs_frlist_t *);
71STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *); 71STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *);
72STATIC int xfs_qm_mplist_nowait(xfs_mount_t *);
73STATIC int xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
74 72
75STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 73STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
76STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 74STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
@@ -82,7 +80,7 @@ static struct shrinker xfs_qm_shaker = {
82}; 80};
83 81
84#ifdef DEBUG 82#ifdef DEBUG
85extern mutex_t qcheck_lock; 83extern struct mutex qcheck_lock;
86#endif 84#endif
87 85
88#ifdef QUOTADEBUG 86#ifdef QUOTADEBUG
@@ -219,7 +217,7 @@ xfs_qm_hold_quotafs_ref(
219 * the structure could disappear between the entry to this routine and 217 * the structure could disappear between the entry to this routine and
220 * a HOLD operation if not locked. 218 * a HOLD operation if not locked.
221 */ 219 */
222 XFS_QM_LOCK(xfs_Gqm); 220 mutex_lock(&xfs_Gqm_lock);
223 221
224 if (xfs_Gqm == NULL) 222 if (xfs_Gqm == NULL)
225 xfs_Gqm = xfs_Gqm_init(); 223 xfs_Gqm = xfs_Gqm_init();
@@ -228,8 +226,8 @@ xfs_qm_hold_quotafs_ref(
228 * debugging and statistical purposes, but ... 226 * debugging and statistical purposes, but ...
229 * Just take a reference and get out. 227 * Just take a reference and get out.
230 */ 228 */
231 XFS_QM_HOLD(xfs_Gqm); 229 xfs_Gqm->qm_nrefs++;
232 XFS_QM_UNLOCK(xfs_Gqm); 230 mutex_unlock(&xfs_Gqm_lock);
233 231
234 return 0; 232 return 0;
235} 233}
@@ -277,13 +275,12 @@ xfs_qm_rele_quotafs_ref(
277 * Destroy the entire XQM. If somebody mounts with quotaon, this'll 275 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
278 * be restarted. 276 * be restarted.
279 */ 277 */
280 XFS_QM_LOCK(xfs_Gqm); 278 mutex_lock(&xfs_Gqm_lock);
281 XFS_QM_RELE(xfs_Gqm); 279 if (--xfs_Gqm->qm_nrefs == 0) {
282 if (xfs_Gqm->qm_nrefs == 0) {
283 xfs_qm_destroy(xfs_Gqm); 280 xfs_qm_destroy(xfs_Gqm);
284 xfs_Gqm = NULL; 281 xfs_Gqm = NULL;
285 } 282 }
286 XFS_QM_UNLOCK(xfs_Gqm); 283 mutex_unlock(&xfs_Gqm_lock);
287} 284}
288 285
289/* 286/*
@@ -577,10 +574,10 @@ xfs_qm_dqpurge_int(
577 continue; 574 continue;
578 } 575 }
579 576
580 if (! xfs_qm_dqhashlock_nowait(dqp)) { 577 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
581 nrecl = XFS_QI_MPLRECLAIMS(mp); 578 nrecl = XFS_QI_MPLRECLAIMS(mp);
582 xfs_qm_mplist_unlock(mp); 579 xfs_qm_mplist_unlock(mp);
583 XFS_DQ_HASH_LOCK(dqp->q_hash); 580 mutex_lock(&dqp->q_hash->qh_lock);
584 xfs_qm_mplist_lock(mp); 581 xfs_qm_mplist_lock(mp);
585 582
586 /* 583 /*
@@ -590,7 +587,7 @@ xfs_qm_dqpurge_int(
590 * this point, but somebody might be taking things off. 587 * this point, but somebody might be taking things off.
591 */ 588 */
592 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { 589 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
593 XFS_DQ_HASH_UNLOCK(dqp->q_hash); 590 mutex_unlock(&dqp->q_hash->qh_lock);
594 goto again; 591 goto again;
595 } 592 }
596 } 593 }
@@ -632,7 +629,6 @@ xfs_qm_dqattach_one(
632 xfs_dqid_t id, 629 xfs_dqid_t id,
633 uint type, 630 uint type,
634 uint doalloc, 631 uint doalloc,
635 uint dolock,
636 xfs_dquot_t *udqhint, /* hint */ 632 xfs_dquot_t *udqhint, /* hint */
637 xfs_dquot_t **IO_idqpp) 633 xfs_dquot_t **IO_idqpp)
638{ 634{
@@ -641,16 +637,16 @@ xfs_qm_dqattach_one(
641 637
642 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 638 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
643 error = 0; 639 error = 0;
640
644 /* 641 /*
645 * See if we already have it in the inode itself. IO_idqpp is 642 * See if we already have it in the inode itself. IO_idqpp is
646 * &i_udquot or &i_gdquot. This made the code look weird, but 643 * &i_udquot or &i_gdquot. This made the code look weird, but
647 * made the logic a lot simpler. 644 * made the logic a lot simpler.
648 */ 645 */
649 if ((dqp = *IO_idqpp)) { 646 dqp = *IO_idqpp;
650 if (dolock) 647 if (dqp) {
651 xfs_dqlock(dqp);
652 xfs_dqtrace_entry(dqp, "DQATTACH: found in ip"); 648 xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
653 goto done; 649 return 0;
654 } 650 }
655 651
656 /* 652 /*
@@ -659,38 +655,38 @@ xfs_qm_dqattach_one(
659 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside 655 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
660 * the user dquot. 656 * the user dquot.
661 */ 657 */
662 ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); 658 if (udqhint) {
663 if (udqhint && !dolock) 659 ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
664 xfs_dqlock(udqhint); 660 xfs_dqlock(udqhint);
665 661
666 /* 662 /*
667 * No need to take dqlock to look at the id. 663 * No need to take dqlock to look at the id.
668 * The ID can't change until it gets reclaimed, and it won't 664 *
669 * be reclaimed as long as we have a ref from inode and we hold 665 * The ID can't change until it gets reclaimed, and it won't
670 * the ilock. 666 * be reclaimed as long as we have a ref from inode and we
671 */ 667 * hold the ilock.
672 if (udqhint && 668 */
673 (dqp = udqhint->q_gdquot) && 669 dqp = udqhint->q_gdquot;
674 (be32_to_cpu(dqp->q_core.d_id) == id)) { 670 if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
675 ASSERT(XFS_DQ_IS_LOCKED(udqhint)); 671 xfs_dqlock(dqp);
676 xfs_dqlock(dqp); 672 XFS_DQHOLD(dqp);
677 XFS_DQHOLD(dqp); 673 ASSERT(*IO_idqpp == NULL);
678 ASSERT(*IO_idqpp == NULL); 674 *IO_idqpp = dqp;
679 *IO_idqpp = dqp; 675
680 if (!dolock) {
681 xfs_dqunlock(dqp); 676 xfs_dqunlock(dqp);
682 xfs_dqunlock(udqhint); 677 xfs_dqunlock(udqhint);
678 return 0;
683 } 679 }
684 goto done; 680
685 } 681 /*
686 /* 682 * We can't hold a dquot lock when we call the dqget code.
687 * We can't hold a dquot lock when we call the dqget code. 683 * We'll deadlock in no time, because of (not conforming to)
688 * We'll deadlock in no time, because of (not conforming to) 684 * lock ordering - the inodelock comes before any dquot lock,
689 * lock ordering - the inodelock comes before any dquot lock, 685 * and we may drop and reacquire the ilock in xfs_qm_dqget().
690 * and we may drop and reacquire the ilock in xfs_qm_dqget(). 686 */
691 */
692 if (udqhint)
693 xfs_dqunlock(udqhint); 687 xfs_dqunlock(udqhint);
688 }
689
694 /* 690 /*
695 * Find the dquot from somewhere. This bumps the 691 * Find the dquot from somewhere. This bumps the
696 * reference count of dquot and returns it locked. 692 * reference count of dquot and returns it locked.
@@ -698,48 +694,19 @@ xfs_qm_dqattach_one(
698 * disk and we didn't ask it to allocate; 694 * disk and we didn't ask it to allocate;
699 * ESRCH if quotas got turned off suddenly. 695 * ESRCH if quotas got turned off suddenly.
700 */ 696 */
701 if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type, 697 error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
702 doalloc|XFS_QMOPT_DOWARN, &dqp))) { 698 if (error)
703 if (udqhint && dolock) 699 return error;
704 xfs_dqlock(udqhint);
705 goto done;
706 }
707 700
708 xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget"); 701 xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
702
709 /* 703 /*
710 * dqget may have dropped and re-acquired the ilock, but it guarantees 704 * dqget may have dropped and re-acquired the ilock, but it guarantees
711 * that the dquot returned is the one that should go in the inode. 705 * that the dquot returned is the one that should go in the inode.
712 */ 706 */
713 *IO_idqpp = dqp; 707 *IO_idqpp = dqp;
714 ASSERT(dqp); 708 xfs_dqunlock(dqp);
715 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 709 return 0;
716 if (! dolock) {
717 xfs_dqunlock(dqp);
718 goto done;
719 }
720 if (! udqhint)
721 goto done;
722
723 ASSERT(udqhint);
724 ASSERT(dolock);
725 ASSERT(XFS_DQ_IS_LOCKED(dqp));
726 if (! xfs_qm_dqlock_nowait(udqhint)) {
727 xfs_dqunlock(dqp);
728 xfs_dqlock(udqhint);
729 xfs_dqlock(dqp);
730 }
731 done:
732#ifdef QUOTADEBUG
733 if (udqhint) {
734 if (dolock)
735 ASSERT(XFS_DQ_IS_LOCKED(udqhint));
736 }
737 if (! error) {
738 if (dolock)
739 ASSERT(XFS_DQ_IS_LOCKED(dqp));
740 }
741#endif
742 return error;
743} 710}
744 711
745 712
@@ -754,24 +721,15 @@ xfs_qm_dqattach_one(
754STATIC void 721STATIC void
755xfs_qm_dqattach_grouphint( 722xfs_qm_dqattach_grouphint(
756 xfs_dquot_t *udq, 723 xfs_dquot_t *udq,
757 xfs_dquot_t *gdq, 724 xfs_dquot_t *gdq)
758 uint locked)
759{ 725{
760 xfs_dquot_t *tmp; 726 xfs_dquot_t *tmp;
761 727
762#ifdef QUOTADEBUG 728 xfs_dqlock(udq);
763 if (locked) {
764 ASSERT(XFS_DQ_IS_LOCKED(udq));
765 ASSERT(XFS_DQ_IS_LOCKED(gdq));
766 }
767#endif
768 if (! locked)
769 xfs_dqlock(udq);
770 729
771 if ((tmp = udq->q_gdquot)) { 730 if ((tmp = udq->q_gdquot)) {
772 if (tmp == gdq) { 731 if (tmp == gdq) {
773 if (! locked) 732 xfs_dqunlock(udq);
774 xfs_dqunlock(udq);
775 return; 733 return;
776 } 734 }
777 735
@@ -781,8 +739,6 @@ xfs_qm_dqattach_grouphint(
781 * because the freelist lock comes before dqlocks. 739 * because the freelist lock comes before dqlocks.
782 */ 740 */
783 xfs_dqunlock(udq); 741 xfs_dqunlock(udq);
784 if (locked)
785 xfs_dqunlock(gdq);
786 /* 742 /*
787 * we took a hard reference once upon a time in dqget, 743 * we took a hard reference once upon a time in dqget,
788 * so give it back when the udquot no longer points at it 744 * so give it back when the udquot no longer points at it
@@ -795,9 +751,7 @@ xfs_qm_dqattach_grouphint(
795 751
796 } else { 752 } else {
797 ASSERT(XFS_DQ_IS_LOCKED(udq)); 753 ASSERT(XFS_DQ_IS_LOCKED(udq));
798 if (! locked) { 754 xfs_dqlock(gdq);
799 xfs_dqlock(gdq);
800 }
801 } 755 }
802 756
803 ASSERT(XFS_DQ_IS_LOCKED(udq)); 757 ASSERT(XFS_DQ_IS_LOCKED(udq));
@@ -810,10 +764,9 @@ xfs_qm_dqattach_grouphint(
810 XFS_DQHOLD(gdq); 764 XFS_DQHOLD(gdq);
811 udq->q_gdquot = gdq; 765 udq->q_gdquot = gdq;
812 } 766 }
813 if (! locked) { 767
814 xfs_dqunlock(gdq); 768 xfs_dqunlock(gdq);
815 xfs_dqunlock(udq); 769 xfs_dqunlock(udq);
816 }
817} 770}
818 771
819 772
@@ -821,8 +774,6 @@ xfs_qm_dqattach_grouphint(
821 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON 774 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
822 * into account. 775 * into account.
823 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. 776 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
824 * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
825 * much made this code a complete mess, but it has been pretty useful.
826 * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL. 777 * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
827 * Inode may get unlocked and relocked in here, and the caller must deal with 778 * Inode may get unlocked and relocked in here, and the caller must deal with
828 * the consequences. 779 * the consequences.
@@ -851,7 +802,6 @@ xfs_qm_dqattach(
851 if (XFS_IS_UQUOTA_ON(mp)) { 802 if (XFS_IS_UQUOTA_ON(mp)) {
852 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, 803 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
853 flags & XFS_QMOPT_DQALLOC, 804 flags & XFS_QMOPT_DQALLOC,
854 flags & XFS_QMOPT_DQLOCK,
855 NULL, &ip->i_udquot); 805 NULL, &ip->i_udquot);
856 if (error) 806 if (error)
857 goto done; 807 goto done;
@@ -863,11 +813,9 @@ xfs_qm_dqattach(
863 error = XFS_IS_GQUOTA_ON(mp) ? 813 error = XFS_IS_GQUOTA_ON(mp) ?
864 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, 814 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
865 flags & XFS_QMOPT_DQALLOC, 815 flags & XFS_QMOPT_DQALLOC,
866 flags & XFS_QMOPT_DQLOCK,
867 ip->i_udquot, &ip->i_gdquot) : 816 ip->i_udquot, &ip->i_gdquot) :
868 xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, 817 xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
869 flags & XFS_QMOPT_DQALLOC, 818 flags & XFS_QMOPT_DQALLOC,
870 flags & XFS_QMOPT_DQLOCK,
871 ip->i_udquot, &ip->i_gdquot); 819 ip->i_udquot, &ip->i_gdquot);
872 /* 820 /*
873 * Don't worry about the udquot that we may have 821 * Don't worry about the udquot that we may have
@@ -898,22 +846,13 @@ xfs_qm_dqattach(
898 /* 846 /*
899 * Attach i_gdquot to the gdquot hint inside the i_udquot. 847 * Attach i_gdquot to the gdquot hint inside the i_udquot.
900 */ 848 */
901 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot, 849 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
902 flags & XFS_QMOPT_DQLOCK);
903 } 850 }
904 851
905 done: 852 done:
906 853
907#ifdef QUOTADEBUG 854#ifdef QUOTADEBUG
908 if (! error) { 855 if (! error) {
909 if (ip->i_udquot) {
910 if (flags & XFS_QMOPT_DQLOCK)
911 ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
912 }
913 if (ip->i_gdquot) {
914 if (flags & XFS_QMOPT_DQLOCK)
915 ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
916 }
917 if (XFS_IS_UQUOTA_ON(mp)) 856 if (XFS_IS_UQUOTA_ON(mp))
918 ASSERT(ip->i_udquot); 857 ASSERT(ip->i_udquot);
919 if (XFS_IS_OQUOTA_ON(mp)) 858 if (XFS_IS_OQUOTA_ON(mp))
@@ -2086,7 +2025,7 @@ xfs_qm_shake_freelist(
2086 * a dqlookup process that holds the hashlock that is 2025 * a dqlookup process that holds the hashlock that is
2087 * waiting for the freelist lock. 2026 * waiting for the freelist lock.
2088 */ 2027 */
2089 if (! xfs_qm_dqhashlock_nowait(dqp)) { 2028 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
2090 xfs_dqfunlock(dqp); 2029 xfs_dqfunlock(dqp);
2091 xfs_dqunlock(dqp); 2030 xfs_dqunlock(dqp);
2092 dqp = dqp->dq_flnext; 2031 dqp = dqp->dq_flnext;
@@ -2103,7 +2042,7 @@ xfs_qm_shake_freelist(
2103 /* XXX put a sentinel so that we can come back here */ 2042 /* XXX put a sentinel so that we can come back here */
2104 xfs_dqfunlock(dqp); 2043 xfs_dqfunlock(dqp);
2105 xfs_dqunlock(dqp); 2044 xfs_dqunlock(dqp);
2106 XFS_DQ_HASH_UNLOCK(hash); 2045 mutex_unlock(&hash->qh_lock);
2107 xfs_qm_freelist_unlock(xfs_Gqm); 2046 xfs_qm_freelist_unlock(xfs_Gqm);
2108 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2047 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2109 return nreclaimed; 2048 return nreclaimed;
@@ -2120,7 +2059,7 @@ xfs_qm_shake_freelist(
2120 XQM_HASHLIST_REMOVE(hash, dqp); 2059 XQM_HASHLIST_REMOVE(hash, dqp);
2121 xfs_dqfunlock(dqp); 2060 xfs_dqfunlock(dqp);
2122 xfs_qm_mplist_unlock(dqp->q_mount); 2061 xfs_qm_mplist_unlock(dqp->q_mount);
2123 XFS_DQ_HASH_UNLOCK(hash); 2062 mutex_unlock(&hash->qh_lock);
2124 2063
2125 off_freelist: 2064 off_freelist:
2126 XQM_FREELIST_REMOVE(dqp); 2065 XQM_FREELIST_REMOVE(dqp);
@@ -2262,7 +2201,7 @@ xfs_qm_dqreclaim_one(void)
2262 continue; 2201 continue;
2263 } 2202 }
2264 2203
2265 if (! xfs_qm_dqhashlock_nowait(dqp)) 2204 if (!mutex_trylock(&dqp->q_hash->qh_lock))
2266 goto mplistunlock; 2205 goto mplistunlock;
2267 2206
2268 ASSERT(dqp->q_nrefs == 0); 2207 ASSERT(dqp->q_nrefs == 0);
@@ -2271,7 +2210,7 @@ xfs_qm_dqreclaim_one(void)
2271 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); 2210 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2272 XQM_FREELIST_REMOVE(dqp); 2211 XQM_FREELIST_REMOVE(dqp);
2273 dqpout = dqp; 2212 dqpout = dqp;
2274 XFS_DQ_HASH_UNLOCK(dqp->q_hash); 2213 mutex_unlock(&dqp->q_hash->qh_lock);
2275 mplistunlock: 2214 mplistunlock:
2276 xfs_qm_mplist_unlock(dqp->q_mount); 2215 xfs_qm_mplist_unlock(dqp->q_mount);
2277 xfs_dqfunlock(dqp); 2216 xfs_dqfunlock(dqp);
@@ -2774,34 +2713,3 @@ xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2774{ 2713{
2775 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq); 2714 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2776} 2715}
2777
2778STATIC int
2779xfs_qm_dqhashlock_nowait(
2780 xfs_dquot_t *dqp)
2781{
2782 int locked;
2783
2784 locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
2785 return locked;
2786}
2787
2788int
2789xfs_qm_freelist_lock_nowait(
2790 xfs_qm_t *xqm)
2791{
2792 int locked;
2793
2794 locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
2795 return locked;
2796}
2797
2798STATIC int
2799xfs_qm_mplist_nowait(
2800 xfs_mount_t *mp)
2801{
2802 int locked;
2803
2804 ASSERT(mp->m_quotainfo);
2805 locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
2806 return locked;
2807}
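With the XFS_QM_HOLD/XFS_QM_RELE macros gone, xfs_qm_rele_quotafs_ref()
folds the decrement into the teardown test under xfs_Gqm_lock, so the
last reference and the destruction are decided atomically. A runnable
user-space sketch of that lock-protected singleton refcount (names are
stand-ins for xfs_Gqm and xfs_Gqm_lock):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
static struct qm { int nrefs; } *g_qm;

static void qm_hold(void)
{
	pthread_mutex_lock(&g_lock);
	if (!g_qm)
		g_qm = calloc(1, sizeof(*g_qm));
	g_qm->nrefs++;
	pthread_mutex_unlock(&g_lock);
}

static void qm_rele(void)
{
	pthread_mutex_lock(&g_lock);
	if (--g_qm->nrefs == 0) {	/* predecrement, as in the diff */
		free(g_qm);
		g_qm = NULL;
	}
	pthread_mutex_unlock(&g_lock);
}

int main(void)
{
	qm_hold();
	qm_rele();
	return 0;
}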
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index ddf09166387c..a371954cae1b 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -27,7 +27,7 @@ struct xfs_qm;
27struct xfs_inode; 27struct xfs_inode;
28 28
29extern uint ndquot; 29extern uint ndquot;
30extern mutex_t xfs_Gqm_lock; 30extern struct mutex xfs_Gqm_lock;
31extern struct xfs_qm *xfs_Gqm; 31extern struct xfs_qm *xfs_Gqm;
32extern kmem_zone_t *qm_dqzone; 32extern kmem_zone_t *qm_dqzone;
33extern kmem_zone_t *qm_dqtrxzone; 33extern kmem_zone_t *qm_dqtrxzone;
@@ -79,7 +79,7 @@ typedef xfs_dqhash_t xfs_dqlist_t;
79typedef struct xfs_frlist { 79typedef struct xfs_frlist {
80 struct xfs_dquot *qh_next; 80 struct xfs_dquot *qh_next;
81 struct xfs_dquot *qh_prev; 81 struct xfs_dquot *qh_prev;
82 mutex_t qh_lock; 82 struct mutex qh_lock;
83 uint qh_version; 83 uint qh_version;
84 uint qh_nelems; 84 uint qh_nelems;
85} xfs_frlist_t; 85} xfs_frlist_t;
@@ -115,7 +115,7 @@ typedef struct xfs_quotainfo {
115 xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ 115 xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */
116 xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ 116 xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */
117 xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ 117 xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */
118 mutex_t qi_quotaofflock;/* to serialize quotaoff */ 118 struct mutex qi_quotaofflock;/* to serialize quotaoff */
119 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ 119 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
120 uint qi_dqperchunk; /* # ondisk dqs in above chunk */ 120 uint qi_dqperchunk; /* # ondisk dqs in above chunk */
121 xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ 121 xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */
@@ -158,11 +158,6 @@ typedef struct xfs_dquot_acct {
158#define XFS_QM_IWARNLIMIT 5 158#define XFS_QM_IWARNLIMIT 5
159#define XFS_QM_RTBWARNLIMIT 5 159#define XFS_QM_RTBWARNLIMIT 5
160 160
161#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock))
162#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock))
163#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++)
164#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
165
166extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 161extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
167extern void xfs_qm_mount_quotas(xfs_mount_t *); 162extern void xfs_qm_mount_quotas(xfs_mount_t *);
168extern int xfs_qm_quotacheck(xfs_mount_t *); 163extern int xfs_qm_quotacheck(xfs_mount_t *);
@@ -178,6 +173,16 @@ extern void xfs_qm_dqdetach(xfs_inode_t *);
178extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); 173extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
179extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); 174extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
180 175
176/* quota ops */
177extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
178extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
179 fs_disk_quota_t *);
180extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
181 fs_disk_quota_t *);
182extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
183extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
184extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
185
181/* vop stuff */ 186/* vop stuff */
182extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *, 187extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
183 uid_t, gid_t, prid_t, uint, 188 uid_t, gid_t, prid_t, uint,
@@ -194,11 +199,6 @@ extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
194/* list stuff */ 199/* list stuff */
195extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); 200extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
196extern void xfs_qm_freelist_unlink(xfs_dquot_t *); 201extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
197extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *);
198
199/* system call interface */
200extern int xfs_qm_quotactl(struct xfs_mount *, int, int,
201 xfs_caddr_t);
202 202
203#ifdef DEBUG 203#ifdef DEBUG
204extern int xfs_qm_internalqcheck(xfs_mount_t *); 204extern int xfs_qm_internalqcheck(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bc6c5cca3e12..63037c689a4b 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -235,7 +235,6 @@ struct xfs_qmops xfs_qmcore_xfs = {
235 .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve, 235 .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve,
236 .xfs_dqstatvfs = xfs_qm_statvfs, 236 .xfs_dqstatvfs = xfs_qm_statvfs,
237 .xfs_dqsync = xfs_qm_sync, 237 .xfs_dqsync = xfs_qm_sync,
238 .xfs_quotactl = xfs_qm_quotactl,
239 .xfs_dqtrxops = &xfs_trans_dquot_ops, 238 .xfs_dqtrxops = &xfs_trans_dquot_ops,
240}; 239};
241EXPORT_SYMBOL(xfs_qmcore_xfs); 240EXPORT_SYMBOL(xfs_qmcore_xfs);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 68139b38aede..c7b66f6506ce 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -57,135 +57,16 @@
57# define qdprintk(s, args...) do { } while (0) 57# define qdprintk(s, args...) do { } while (0)
58#endif 58#endif
59 59
60STATIC int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
61STATIC int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
62 fs_disk_quota_t *);
63STATIC int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
64STATIC int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
65 fs_disk_quota_t *);
66STATIC int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
67STATIC int xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
68STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 60STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
69STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 61STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
70 uint); 62 uint);
71STATIC uint xfs_qm_import_flags(uint);
72STATIC uint xfs_qm_export_flags(uint); 63STATIC uint xfs_qm_export_flags(uint);
73STATIC uint xfs_qm_import_qtype_flags(uint);
74STATIC uint xfs_qm_export_qtype_flags(uint); 64STATIC uint xfs_qm_export_qtype_flags(uint);
75STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *, 65STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
76 fs_disk_quota_t *); 66 fs_disk_quota_t *);
77 67
78 68
79/* 69/*
80 * The main distribution switch of all XFS quotactl system calls.
81 */
82int
83xfs_qm_quotactl(
84 xfs_mount_t *mp,
85 int cmd,
86 int id,
87 xfs_caddr_t addr)
88{
89 int error;
90
91 ASSERT(addr != NULL || cmd == Q_XQUOTASYNC);
92
93 /*
94 * The following commands are valid even when quotaoff.
95 */
96 switch (cmd) {
97 case Q_XQUOTARM:
98 /*
99 * Truncate quota files. quota must be off.
100 */
101 if (XFS_IS_QUOTA_ON(mp))
102 return XFS_ERROR(EINVAL);
103 if (mp->m_flags & XFS_MOUNT_RDONLY)
104 return XFS_ERROR(EROFS);
105 return (xfs_qm_scall_trunc_qfiles(mp,
106 xfs_qm_import_qtype_flags(*(uint *)addr)));
107
108 case Q_XGETQSTAT:
109 /*
110 * Get quota status information.
111 */
112 return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr));
113
114 case Q_XQUOTAON:
115 /*
116 * QUOTAON - enabling quota enforcement.
117 * Quota accounting must be turned on at mount time.
118 */
119 if (mp->m_flags & XFS_MOUNT_RDONLY)
120 return XFS_ERROR(EROFS);
121 return (xfs_qm_scall_quotaon(mp,
122 xfs_qm_import_flags(*(uint *)addr)));
123
124 case Q_XQUOTAOFF:
125 if (mp->m_flags & XFS_MOUNT_RDONLY)
126 return XFS_ERROR(EROFS);
127 break;
128
129 case Q_XQUOTASYNC:
130 return xfs_sync_inodes(mp, SYNC_DELWRI);
131
132 default:
133 break;
134 }
135
136 if (! XFS_IS_QUOTA_ON(mp))
137 return XFS_ERROR(ESRCH);
138
139 switch (cmd) {
140 case Q_XQUOTAOFF:
141 if (mp->m_flags & XFS_MOUNT_RDONLY)
142 return XFS_ERROR(EROFS);
143 error = xfs_qm_scall_quotaoff(mp,
144 xfs_qm_import_flags(*(uint *)addr),
145 B_FALSE);
146 break;
147
148 case Q_XGETQUOTA:
149 error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER,
150 (fs_disk_quota_t *)addr);
151 break;
152 case Q_XGETGQUOTA:
153 error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
154 (fs_disk_quota_t *)addr);
155 break;
156 case Q_XGETPQUOTA:
157 error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
158 (fs_disk_quota_t *)addr);
159 break;
160
161 case Q_XSETQLIM:
162 if (mp->m_flags & XFS_MOUNT_RDONLY)
163 return XFS_ERROR(EROFS);
164 error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER,
165 (fs_disk_quota_t *)addr);
166 break;
167 case Q_XSETGQLIM:
168 if (mp->m_flags & XFS_MOUNT_RDONLY)
169 return XFS_ERROR(EROFS);
170 error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
171 (fs_disk_quota_t *)addr);
172 break;
173 case Q_XSETPQLIM:
174 if (mp->m_flags & XFS_MOUNT_RDONLY)
175 return XFS_ERROR(EROFS);
176 error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
177 (fs_disk_quota_t *)addr);
178 break;
179
180 default:
181 error = XFS_ERROR(EINVAL);
182 break;
183 }
184
185 return (error);
186}
187
188/*
189 * Turn off quota accounting and/or enforcement for all udquots and/or 70 * Turn off quota accounting and/or enforcement for all udquots and/or
190 * gdquots. Called only at unmount time. 71 * gdquots. Called only at unmount time.
191 * 72 *
@@ -193,11 +74,10 @@ xfs_qm_quotactl(
193 * incore, and modifies the ondisk dquot directly. Therefore, for example, 74 * incore, and modifies the ondisk dquot directly. Therefore, for example,
194 * it is an error to call this twice, without purging the cache. 75 * it is an error to call this twice, without purging the cache.
195 */ 76 */
196STATIC int 77int
197xfs_qm_scall_quotaoff( 78xfs_qm_scall_quotaoff(
198 xfs_mount_t *mp, 79 xfs_mount_t *mp,
199 uint flags, 80 uint flags)
200 boolean_t force)
201{ 81{
202 uint dqtype; 82 uint dqtype;
203 int error; 83 int error;
@@ -205,8 +85,6 @@ xfs_qm_scall_quotaoff(
205 xfs_qoff_logitem_t *qoffstart; 85 xfs_qoff_logitem_t *qoffstart;
206 int nculprits; 86 int nculprits;
207 87
208 if (!force && !capable(CAP_SYS_ADMIN))
209 return XFS_ERROR(EPERM);
210 /* 88 /*
211 * No file system can have quotas enabled on disk but not in core. 89 * No file system can have quotas enabled on disk but not in core.
212 * Note that quota utilities (like quotaoff) _expect_ 90 * Note that quota utilities (like quotaoff) _expect_
@@ -375,7 +253,7 @@ out_error:
375 return (error); 253 return (error);
376} 254}
377 255
378STATIC int 256int
379xfs_qm_scall_trunc_qfiles( 257xfs_qm_scall_trunc_qfiles(
380 xfs_mount_t *mp, 258 xfs_mount_t *mp,
381 uint flags) 259 uint flags)
@@ -383,8 +261,6 @@ xfs_qm_scall_trunc_qfiles(
383 int error = 0, error2 = 0; 261 int error = 0, error2 = 0;
384 xfs_inode_t *qip; 262 xfs_inode_t *qip;
385 263
386 if (!capable(CAP_SYS_ADMIN))
387 return XFS_ERROR(EPERM);
388 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 264 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
389 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 265 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
390 return XFS_ERROR(EINVAL); 266 return XFS_ERROR(EINVAL);
@@ -416,7 +292,7 @@ xfs_qm_scall_trunc_qfiles(
416 * effect immediately. 292 * effect immediately.
417 * (Switching on quota accounting must be done at mount time.) 293 * (Switching on quota accounting must be done at mount time.)
418 */ 294 */
419STATIC int 295int
420xfs_qm_scall_quotaon( 296xfs_qm_scall_quotaon(
421 xfs_mount_t *mp, 297 xfs_mount_t *mp,
422 uint flags) 298 uint flags)
@@ -426,9 +302,6 @@ xfs_qm_scall_quotaon(
426 uint accflags; 302 uint accflags;
427 __int64_t sbflags; 303 __int64_t sbflags;
428 304
429 if (!capable(CAP_SYS_ADMIN))
430 return XFS_ERROR(EPERM);
431
432 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); 305 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
433 /* 306 /*
434 * Switching on quota accounting must be done at mount time. 307 * Switching on quota accounting must be done at mount time.
@@ -517,7 +390,7 @@ xfs_qm_scall_quotaon(
517/* 390/*
518 * Return quota status information, such as uquota-off, enforcements, etc. 391 * Return quota status information, such as uquota-off, enforcements, etc.
519 */ 392 */
520STATIC int 393int
521xfs_qm_scall_getqstat( 394xfs_qm_scall_getqstat(
522 xfs_mount_t *mp, 395 xfs_mount_t *mp,
523 fs_quota_stat_t *out) 396 fs_quota_stat_t *out)
@@ -582,7 +455,7 @@ xfs_qm_scall_getqstat(
582/* 455/*
583 * Adjust quota limits, and start/stop timers accordingly. 456 * Adjust quota limits, and start/stop timers accordingly.
584 */ 457 */
585STATIC int 458int
586xfs_qm_scall_setqlim( 459xfs_qm_scall_setqlim(
587 xfs_mount_t *mp, 460 xfs_mount_t *mp,
588 xfs_dqid_t id, 461 xfs_dqid_t id,
@@ -595,9 +468,6 @@ xfs_qm_scall_setqlim(
595 int error; 468 int error;
596 xfs_qcnt_t hard, soft; 469 xfs_qcnt_t hard, soft;
597 470
598 if (!capable(CAP_SYS_ADMIN))
599 return XFS_ERROR(EPERM);
600
601 if ((newlim->d_fieldmask & 471 if ((newlim->d_fieldmask &
602 (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) 472 (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0)
603 return (0); 473 return (0);
@@ -742,7 +612,7 @@ xfs_qm_scall_setqlim(
742 return error; 612 return error;
743} 613}
744 614
745STATIC int 615int
746xfs_qm_scall_getquota( 616xfs_qm_scall_getquota(
747 xfs_mount_t *mp, 617 xfs_mount_t *mp,
748 xfs_dqid_t id, 618 xfs_dqid_t id,
@@ -935,30 +805,6 @@ xfs_qm_export_dquot(
935} 805}
936 806
937STATIC uint 807STATIC uint
938xfs_qm_import_qtype_flags(
939 uint uflags)
940{
941 uint oflags = 0;
942
943 /*
944 * Can't be more than one, or none.
945 */
946 if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
947 (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
948 ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ==
949 (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ||
950 ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ==
951 (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ||
952 ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0))
953 return (0);
954
955 oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0;
956 oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0;
957 oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0;
958 return oflags;
959}
960
961STATIC uint
962xfs_qm_export_qtype_flags( 808xfs_qm_export_qtype_flags(
963 uint flags) 809 uint flags)
964{ 810{
@@ -979,26 +825,6 @@ xfs_qm_export_qtype_flags(
979} 825}
980 826
981STATIC uint 827STATIC uint
982xfs_qm_import_flags(
983 uint uflags)
984{
985 uint flags = 0;
986
987 if (uflags & XFS_QUOTA_UDQ_ACCT)
988 flags |= XFS_UQUOTA_ACCT;
989 if (uflags & XFS_QUOTA_PDQ_ACCT)
990 flags |= XFS_PQUOTA_ACCT;
991 if (uflags & XFS_QUOTA_GDQ_ACCT)
992 flags |= XFS_GQUOTA_ACCT;
993 if (uflags & XFS_QUOTA_UDQ_ENFD)
994 flags |= XFS_UQUOTA_ENFD;
995 if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
996 flags |= XFS_OQUOTA_ENFD;
997 return (flags);
998}
999
1000
1001STATIC uint
1002xfs_qm_export_flags( 828xfs_qm_export_flags(
1003 uint flags) 829 uint flags)
1004{ 830{
@@ -1134,7 +960,7 @@ xfs_dqhash_t *qmtest_udqtab;
1134xfs_dqhash_t *qmtest_gdqtab; 960xfs_dqhash_t *qmtest_gdqtab;
1135int qmtest_hashmask; 961int qmtest_hashmask;
1136int qmtest_nfails; 962int qmtest_nfails;
1137mutex_t qcheck_lock; 963struct mutex qcheck_lock;
1138 964
1139#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ 965#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
1140 (__psunsigned_t)(id)) & \ 966 (__psunsigned_t)(id)) & \
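
The hunks above drop the per-operation capable(CAP_SYS_ADMIN) checks and remove STATIC from the scall entry points, which suggests the privilege check now lives at a single dispatch point in a caller outside this excerpt. That caller is not shown here, so the shape below is only a sketch with invented names (capable_sketch, quotactl_dispatch), not the kernel's actual quotactl path:

#include <errno.h>
#include <stdbool.h>

static bool capable_sketch(void)        /* stand-in for capable(CAP_SYS_ADMIN) */
{
        return true;
}

static int scall_quotaon(void) { return 0; }    /* permission check removed */
static int scall_setqlim(void) { return 0; }    /* permission check removed */

static int quotactl_dispatch(int cmd)
{
        /* one privilege check up front instead of one per operation */
        if (!capable_sketch())
                return -EPERM;

        switch (cmd) {
        case 0:
                return scall_quotaon();
        case 1:
                return scall_setqlim();
        default:
                return -EINVAL;
        }
}

int main(void)
{
        return quotactl_dispatch(0);
}
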
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index c4fcea600bc2..8286b2842b6b 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -42,34 +42,24 @@
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) 42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43 43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) 44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock)
46#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) 45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
47#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) 46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
48 47
49#define XQMLCK(h) (mutex_lock(&((h)->qh_lock))) 48#define xfs_qm_mplist_lock(mp) \
50#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock))) 49 mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
51#ifdef DEBUG 50#define xfs_qm_mplist_nowait(mp) \
52struct xfs_dqhash; 51 mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
53static inline int XQMISLCKD(struct xfs_dqhash *h) 52#define xfs_qm_mplist_unlock(mp) \
54{ 53 mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
55 if (mutex_trylock(&h->qh_lock)) { 54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
56 mutex_unlock(&h->qh_lock); 55 mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
57 return 0; 56
58 } 57#define xfs_qm_freelist_lock(qm) \
59 return 1; 58 mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
60} 59#define xfs_qm_freelist_lock_nowait(qm) \
61#endif 60 mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
62 61#define xfs_qm_freelist_unlock(qm) \
63#define XFS_DQ_HASH_LOCK(h) XQMLCK(h) 62 mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
64#define XFS_DQ_HASH_UNLOCK(h) XQMUNLCK(h)
65#define XFS_DQ_IS_HASH_LOCKED(h) XQMISLCKD(h)
66
67#define xfs_qm_mplist_lock(mp) XQMLCK(&(XFS_QI_MPL_LIST(mp)))
68#define xfs_qm_mplist_unlock(mp) XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
69#define XFS_QM_IS_MPLIST_LOCKED(mp) XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
70
71#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist))
72#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist))
73 63
74/* 64/*
75 * Hash into a bucket in the dquot hash table, based on <mp, id>. 65 * Hash into a bucket in the dquot hash table, based on <mp, id>.
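
Worth noting in the hunk above: the old XQMISLCKD() probe inferred "locked" by taking and releasing the mutex when it happened to be free, briefly acquiring the lock as a side effect, while the replacement mutex_is_locked() is a plain read. A minimal userspace pthreads sketch of the old idiom (names are illustrative, not XFS APIs):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

/* Old XQMISLCKD() style: infer "held" from a failed trylock. If the lock
 * is free, this briefly owns and releases it, a probe with side effects,
 * and racy, since the answer can change the moment it returns. */
static bool is_locked_by_probe(pthread_mutex_t *m)
{
        if (pthread_mutex_trylock(m) == 0) {
                pthread_mutex_unlock(m);
                return false;
        }
        return true;
}

int main(void)
{
        pthread_mutex_lock(&demo_lock);
        /* Default (non-recursive) mutex: trylock from the owner fails,
         * so the probe correctly reports "held" here. */
        bool held = is_locked_by_probe(&demo_lock);
        pthread_mutex_unlock(&demo_lock);
        return held ? 0 : 1;
}
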
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 99611381e740..447173bcf96d 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -624,10 +624,9 @@ xfs_trans_dqresv(
624 xfs_qcnt_t *resbcountp; 624 xfs_qcnt_t *resbcountp;
625 xfs_quotainfo_t *q = mp->m_quotainfo; 625 xfs_quotainfo_t *q = mp->m_quotainfo;
626 626
627 if (! (flags & XFS_QMOPT_DQLOCK)) { 627
628 xfs_dqlock(dqp); 628 xfs_dqlock(dqp);
629 } 629
630 ASSERT(XFS_DQ_IS_LOCKED(dqp));
631 if (flags & XFS_TRANS_DQ_RES_BLKS) { 630 if (flags & XFS_TRANS_DQ_RES_BLKS) {
632 hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); 631 hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
633 if (!hardlimit) 632 if (!hardlimit)
@@ -740,10 +739,8 @@ xfs_trans_dqresv(
740 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); 739 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
741 740
742error_return: 741error_return:
743 if (! (flags & XFS_QMOPT_DQLOCK)) { 742 xfs_dqunlock(dqp);
744 xfs_dqunlock(dqp); 743 return error;
745 }
746 return (error);
747} 744}
748 745
749 746
@@ -753,8 +750,7 @@ error_return:
753 * grp/prj quotas is important, because this follows a both-or-nothing 750 * grp/prj quotas is important, because this follows a both-or-nothing
754 * approach. 751 * approach.
755 * 752 *
756 * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked. 753 * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
757 * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
758 * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. 754 * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota.
759 * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks 755 * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
760 * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks 756 * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
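
With the XFS_QMOPT_DQLOCK flag gone, xfs_trans_dqresv() unconditionally takes the dquot lock on entry and drops it at the single error_return exit, so callers can no longer hand in a pre-locked dquot. A compact pthreads model of that invariant (illustrative names, not the XFS locking API):

#include <pthread.h>

static pthread_mutex_t dq_lock = PTHREAD_MUTEX_INITIALIZER;

static int dqresv_sketch(int over_limit)
{
        int error = 0;

        pthread_mutex_lock(&dq_lock);           /* unconditional xfs_dqlock() */

        if (over_limit) {
                error = -1;                     /* EDQUOT analogue */
                goto error_return;
        }
        /* ... apply the reservation under the lock ... */

error_return:
        pthread_mutex_unlock(&dq_lock);         /* unconditional xfs_dqunlock() */
        return error;
}

int main(void)
{
        return dqresv_sketch(0);
}
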
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index ae5482965424..3f3610a7ee05 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -24,6 +24,7 @@
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dmapi.h" 25#include "xfs_dmapi.h"
26#include "xfs_mount.h" 26#include "xfs_mount.h"
27#include "xfs_error.h"
27 28
28static char message[1024]; /* keep it off the stack */ 29static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock); 30static DEFINE_SPINLOCK(xfs_err_lock);
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
index 5830c040ea7e..b83f76b6d410 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/support/uuid.c
@@ -17,10 +17,6 @@
17 */ 17 */
18#include <xfs.h> 18#include <xfs.h>
19 19
20static DEFINE_MUTEX(uuid_monitor);
21static int uuid_table_size;
22static uuid_t *uuid_table;
23
24/* IRIX interpretation of an uuid_t */ 20/* IRIX interpretation of an uuid_t */
25typedef struct { 21typedef struct {
26 __be32 uu_timelow; 22 __be32 uu_timelow;
@@ -46,12 +42,6 @@ uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
46 fsid[1] = be32_to_cpu(uup->uu_timelow); 42 fsid[1] = be32_to_cpu(uup->uu_timelow);
47} 43}
48 44
49void
50uuid_create_nil(uuid_t *uuid)
51{
52 memset(uuid, 0, sizeof(*uuid));
53}
54
55int 45int
56uuid_is_nil(uuid_t *uuid) 46uuid_is_nil(uuid_t *uuid)
57{ 47{
@@ -71,64 +61,3 @@ uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
71{ 61{
72 return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; 62 return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
73} 63}
74
75/*
76 * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
77 * 64-bit words. NOTE: This function can not be changed EVER. Although
78 * brain-dead, some applications depend on this 64-bit value remaining
79 * persistent. Specifically, DMI vendors store the value as a persistent
80 * filehandle.
81 */
82__uint64_t
83uuid_hash64(uuid_t *uuid)
84{
85 __uint64_t *sp = (__uint64_t *)uuid;
86
87 return sp[0] + sp[1];
88}
89
90int
91uuid_table_insert(uuid_t *uuid)
92{
93 int i, hole;
94
95 mutex_lock(&uuid_monitor);
96 for (i = 0, hole = -1; i < uuid_table_size; i++) {
97 if (uuid_is_nil(&uuid_table[i])) {
98 hole = i;
99 continue;
100 }
101 if (uuid_equal(uuid, &uuid_table[i])) {
102 mutex_unlock(&uuid_monitor);
103 return 0;
104 }
105 }
106 if (hole < 0) {
107 uuid_table = kmem_realloc(uuid_table,
108 (uuid_table_size + 1) * sizeof(*uuid_table),
109 uuid_table_size * sizeof(*uuid_table),
110 KM_SLEEP);
111 hole = uuid_table_size++;
112 }
113 uuid_table[hole] = *uuid;
114 mutex_unlock(&uuid_monitor);
115 return 1;
116}
117
118void
119uuid_table_remove(uuid_t *uuid)
120{
121 int i;
122
123 mutex_lock(&uuid_monitor);
124 for (i = 0; i < uuid_table_size; i++) {
125 if (uuid_is_nil(&uuid_table[i]))
126 continue;
127 if (!uuid_equal(uuid, &uuid_table[i]))
128 continue;
129 uuid_create_nil(&uuid_table[i]);
130 break;
131 }
132 ASSERT(i < uuid_table_size);
133 mutex_unlock(&uuid_monitor);
134}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
index cff5b607d445..4732d71262cc 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/support/uuid.h
@@ -22,12 +22,8 @@ typedef struct {
22 unsigned char __u_bits[16]; 22 unsigned char __u_bits[16];
23} uuid_t; 23} uuid_t;
24 24
25extern void uuid_create_nil(uuid_t *uuid);
26extern int uuid_is_nil(uuid_t *uuid); 25extern int uuid_is_nil(uuid_t *uuid);
27extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); 26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
28extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); 27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
29extern __uint64_t uuid_hash64(uuid_t *uuid);
30extern int uuid_table_insert(uuid_t *uuid);
31extern void uuid_table_remove(uuid_t *uuid);
32 28
33#endif /* __XFS_SUPPORT_UUID_H__ */ 29#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 143d63ecb20a..c8641f713caa 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -223,8 +223,8 @@ typedef struct xfs_perag
223 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) 223 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
224#define XFS_MIN_FREELIST_PAG(pag,mp) \ 224#define XFS_MIN_FREELIST_PAG(pag,mp) \
225 (XFS_MIN_FREELIST_RAW( \ 225 (XFS_MIN_FREELIST_RAW( \
226 (uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ 226 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
227 (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) 227 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
228 228
229#define XFS_AGB_TO_FSB(mp,agno,agbno) \ 229#define XFS_AGB_TO_FSB(mp,agno,agbno) \
230 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) 230 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 028e44e58ea9..2cf944eb796d 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1872,6 +1872,25 @@ xfs_alloc_compute_maxlevels(
1872} 1872}
1873 1873
1874/* 1874/*
1875 * Find the length of the longest extent in an AG.
1876 */
1877xfs_extlen_t
1878xfs_alloc_longest_free_extent(
1879 struct xfs_mount *mp,
1880 struct xfs_perag *pag)
1881{
1882 xfs_extlen_t need, delta = 0;
1883
1884 need = XFS_MIN_FREELIST_PAG(pag, mp);
1885 if (need > pag->pagf_flcount)
1886 delta = need - pag->pagf_flcount;
1887
1888 if (pag->pagf_longest > delta)
1889 return pag->pagf_longest - delta;
1890 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
1891}
1892
1893/*
1875 * Decide whether to use this allocation group for this allocation. 1894 * Decide whether to use this allocation group for this allocation.
1876 * If so, fix up the btree freelist's size. 1895 * If so, fix up the btree freelist's size.
1877 */ 1896 */
@@ -1923,15 +1942,12 @@ xfs_alloc_fix_freelist(
1923 } 1942 }
1924 1943
1925 if (!(flags & XFS_ALLOC_FLAG_FREEING)) { 1944 if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
1926 need = XFS_MIN_FREELIST_PAG(pag, mp);
1927 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
1928 /* 1945 /*
1929 * If it looks like there isn't a long enough extent, or enough 1946 * If it looks like there isn't a long enough extent, or enough
1930 * total blocks, reject it. 1947 * total blocks, reject it.
1931 */ 1948 */
1932 longest = (pag->pagf_longest > delta) ? 1949 need = XFS_MIN_FREELIST_PAG(pag, mp);
1933 (pag->pagf_longest - delta) : 1950 longest = xfs_alloc_longest_free_extent(mp, pag);
1934 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
1935 if ((args->minlen + args->alignment + args->minalignslop - 1) > 1951 if ((args->minlen + args->alignment + args->minalignslop - 1) >
1936 longest || 1952 longest ||
1937 ((int)(pag->pagf_freeblks + pag->pagf_flcount - 1953 ((int)(pag->pagf_freeblks + pag->pagf_flcount -
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 588172796f7b..e704caee10df 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -100,6 +100,12 @@ typedef struct xfs_alloc_arg {
100#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ 100#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
101#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ 101#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
102 102
103/*
104 * Find the length of the longest extent in an AG.
105 */
106xfs_extlen_t
107xfs_alloc_longest_free_extent(struct xfs_mount *mp,
108 struct xfs_perag *pag);
103 109
104#ifdef __KERNEL__ 110#ifdef __KERNEL__
105 111
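
The helper factored out above subtracts the allocation group's outstanding freelist requirement from its longest recorded free extent, since fixing up the freelist can consume part of that extent. A standalone userspace model with simplified types (perag_sketch and the numbers are assumptions; the arithmetic mirrors the patch):

#include <assert.h>

typedef unsigned int xfs_extlen_t;

struct perag_sketch {
        xfs_extlen_t pagf_longest;      /* longest free extent in the AG */
        xfs_extlen_t pagf_flcount;      /* blocks already on the freelist */
};

static xfs_extlen_t
longest_free_extent(struct perag_sketch *pag, xfs_extlen_t need)
{
        xfs_extlen_t delta = 0;

        /* Blocks the freelist still needs may come out of the longest
         * extent, so subtract the shortfall first. */
        if (need > pag->pagf_flcount)
                delta = need - pag->pagf_flcount;

        if (pag->pagf_longest > delta)
                return pag->pagf_longest - delta;

        /* Otherwise report 1 if any free space exists at all, else 0,
         * the same boolean-as-length trick as the kernel code. */
        return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}

int main(void)
{
        struct perag_sketch pag = { .pagf_longest = 100, .pagf_flcount = 2 };

        assert(longest_free_extent(&pag, 6) == 96);     /* 100 - (6 - 2) */
        assert(longest_free_extent(&pag, 2) == 100);    /* freelist satisfied */
        return 0;
}
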
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 6c323f8a4cd1..afdc8911637d 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -155,7 +155,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
155 * minimum offset only needs to be the space required for 155 * minimum offset only needs to be the space required for
156 * the btree root. 156 * the btree root.
157 */ 157 */
158 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > mp->m_attroffset) 158 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
159 xfs_default_attroffset(dp))
159 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); 160 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
160 break; 161 break;
161 162
@@ -298,6 +299,26 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
298} 299}
299 300
300/* 301/*
302 * After the last attribute is removed revert to original inode format,
303 * making all literal area available to the data fork once more.
304 */
305STATIC void
306xfs_attr_fork_reset(
307 struct xfs_inode *ip,
308 struct xfs_trans *tp)
309{
310 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
311 ip->i_d.di_forkoff = 0;
312 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
313
314 ASSERT(ip->i_d.di_anextents == 0);
315 ASSERT(ip->i_afp == NULL);
316
317 ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
318 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
319}
320
321/*
301 * Remove an attribute from the shortform attribute list structure. 322 * Remove an attribute from the shortform attribute list structure.
302 */ 323 */
303int 324int
@@ -344,22 +365,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
344 */ 365 */
345 totsize -= size; 366 totsize -= size;
346 if (totsize == sizeof(xfs_attr_sf_hdr_t) && 367 if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
347 !(args->op_flags & XFS_DA_OP_ADDNAME) && 368 (mp->m_flags & XFS_MOUNT_ATTR2) &&
348 (mp->m_flags & XFS_MOUNT_ATTR2) && 369 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
349 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { 370 !(args->op_flags & XFS_DA_OP_ADDNAME)) {
350 /* 371 xfs_attr_fork_reset(dp, args->trans);
351 * Last attribute now removed, revert to original
352 * inode format making all literal area available
353 * to the data fork once more.
354 */
355 xfs_idestroy_fork(dp, XFS_ATTR_FORK);
356 dp->i_d.di_forkoff = 0;
357 dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
358 ASSERT(dp->i_d.di_anextents == 0);
359 ASSERT(dp->i_afp == NULL);
360 dp->i_df.if_ext_max =
361 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
362 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
363 } else { 372 } else {
364 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); 373 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
365 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); 374 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
@@ -786,20 +795,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
786 if (forkoff == -1) { 795 if (forkoff == -1) {
787 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); 796 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
788 ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); 797 ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
789 798 xfs_attr_fork_reset(dp, args->trans);
790 /*
791 * Last attribute was removed, revert to original
792 * inode format making all literal area available
793 * to the data fork once more.
794 */
795 xfs_idestroy_fork(dp, XFS_ATTR_FORK);
796 dp->i_d.di_forkoff = 0;
797 dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
798 ASSERT(dp->i_d.di_anextents == 0);
799 ASSERT(dp->i_afp == NULL);
800 dp->i_df.if_ext_max =
801 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
802 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
803 goto out; 799 goto out;
804 } 800 }
805 801
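
Both call sites above now share xfs_attr_fork_reset(), which tears down the attribute fork and returns the whole inode literal area to the data fork, recomputing how many inline extent records then fit. A toy model of that bookkeeping; the sizes below are invented for illustration, not real XFS on-disk values:

#include <assert.h>

#define LITINO          156     /* assumed literal area of a 256-byte inode */
#define BMBT_REC_SIZE    16     /* assumed sizeof(xfs_bmbt_rec_t) */

struct inode_sketch {
        unsigned char di_forkoff;       /* attr fork offset, 8-byte units */
        int           df_ext_max;       /* max inline data-fork extents */
};

static void attr_fork_reset(struct inode_sketch *ip)
{
        ip->di_forkoff = 0;             /* no attr fork any more */
        /* data fork now owns the full literal area */
        ip->df_ext_max = LITINO / BMBT_REC_SIZE;
}

int main(void)
{
        struct inode_sketch ip = { .di_forkoff = 15, .df_ext_max = 2 };

        attr_fork_reset(&ip);
        assert(ip.df_ext_max == 9);     /* 156 / 16, rounded down */
        return 0;
}
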
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c852cd65aaea..3a6ed426327a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2479,7 +2479,7 @@ xfs_bmap_adjacent(
2479 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); 2479 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
2480 /* 2480 /*
2481 * If allocating at eof, and there's a previous real block, 2481 * If allocating at eof, and there's a previous real block,
2482 * try to use it's last block as our starting point. 2482 * try to use its last block as our starting point.
2483 */ 2483 */
2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && 2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
2485 !isnullstartblock(ap->prevp->br_startblock) && 2485 !isnullstartblock(ap->prevp->br_startblock) &&
@@ -2712,9 +2712,6 @@ xfs_bmap_btalloc(
2712 xfs_agnumber_t startag; 2712 xfs_agnumber_t startag;
2713 xfs_alloc_arg_t args; 2713 xfs_alloc_arg_t args;
2714 xfs_extlen_t blen; 2714 xfs_extlen_t blen;
2715 xfs_extlen_t delta;
2716 xfs_extlen_t longest;
2717 xfs_extlen_t need;
2718 xfs_extlen_t nextminlen = 0; 2715 xfs_extlen_t nextminlen = 0;
2719 xfs_perag_t *pag; 2716 xfs_perag_t *pag;
2720 int nullfb; /* true if ap->firstblock isn't set */ 2717 int nullfb; /* true if ap->firstblock isn't set */
@@ -2796,13 +2793,8 @@ xfs_bmap_btalloc(
2796 * See xfs_alloc_fix_freelist... 2793 * See xfs_alloc_fix_freelist...
2797 */ 2794 */
2798 if (pag->pagf_init) { 2795 if (pag->pagf_init) {
2799 need = XFS_MIN_FREELIST_PAG(pag, mp); 2796 xfs_extlen_t longest;
2800 delta = need > pag->pagf_flcount ? 2797 longest = xfs_alloc_longest_free_extent(mp, pag);
2801 need - pag->pagf_flcount : 0;
2802 longest = (pag->pagf_longest > delta) ?
2803 (pag->pagf_longest - delta) :
2804 (pag->pagf_flcount > 0 ||
2805 pag->pagf_longest > 0);
2806 if (blen < longest) 2798 if (blen < longest)
2807 blen = longest; 2799 blen = longest;
2808 } else 2800 } else
@@ -3577,6 +3569,27 @@ xfs_bmap_extents_to_btree(
3577} 3569}
3578 3570
3579/* 3571/*
3572 * Calculate the default attribute fork offset for newly created inodes.
3573 */
3574uint
3575xfs_default_attroffset(
3576 struct xfs_inode *ip)
3577{
3578 struct xfs_mount *mp = ip->i_mount;
3579 uint offset;
3580
3581 if (mp->m_sb.sb_inodesize == 256) {
3582 offset = XFS_LITINO(mp) -
3583 XFS_BMDR_SPACE_CALC(MINABTPTRS);
3584 } else {
3585 offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
3586 }
3587
3588 ASSERT(offset < XFS_LITINO(mp));
3589 return offset;
3590}
3591
3592/*
3580 * Helper routine to reset inode di_forkoff field when switching 3593 * Helper routine to reset inode di_forkoff field when switching
3581 * attribute fork from local to extent format - we reset it where 3594 * attribute fork from local to extent format - we reset it where
3582 * possible to make space available for inline data fork extents. 3595 * possible to make space available for inline data fork extents.
@@ -3588,15 +3601,18 @@ xfs_bmap_forkoff_reset(
3588 int whichfork) 3601 int whichfork)
3589{ 3602{
3590 if (whichfork == XFS_ATTR_FORK && 3603 if (whichfork == XFS_ATTR_FORK &&
3591 (ip->i_d.di_format != XFS_DINODE_FMT_DEV) && 3604 ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
3592 (ip->i_d.di_format != XFS_DINODE_FMT_UUID) && 3605 ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
3593 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 3606 ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
3594 ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) { 3607 uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
3595 ip->i_d.di_forkoff = mp->m_attroffset >> 3; 3608
3596 ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / 3609 if (dfl_forkoff > ip->i_d.di_forkoff) {
3597 (uint)sizeof(xfs_bmbt_rec_t); 3610 ip->i_d.di_forkoff = dfl_forkoff;
3598 ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) / 3611 ip->i_df.if_ext_max =
3599 (uint)sizeof(xfs_bmbt_rec_t); 3612 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
3613 ip->i_afp->if_ext_max =
3614 XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
3615 }
3600 } 3616 }
3601} 3617}
3602 3618
@@ -4065,7 +4081,7 @@ xfs_bmap_add_attrfork(
4065 case XFS_DINODE_FMT_BTREE: 4081 case XFS_DINODE_FMT_BTREE:
4066 ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); 4082 ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
4067 if (!ip->i_d.di_forkoff) 4083 if (!ip->i_d.di_forkoff)
4068 ip->i_d.di_forkoff = mp->m_attroffset >> 3; 4084 ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
4069 else if (mp->m_flags & XFS_MOUNT_ATTR2) 4085 else if (mp->m_flags & XFS_MOUNT_ATTR2)
4070 version = 2; 4086 version = 2;
4071 break; 4087 break;
@@ -4212,12 +4228,12 @@ xfs_bmap_compute_maxlevels(
4212 * (a signed 16-bit number, xfs_aextnum_t). 4228 * (a signed 16-bit number, xfs_aextnum_t).
4213 * 4229 *
4214 * Note that we can no longer assume that if we are in ATTR1 that 4230 * Note that we can no longer assume that if we are in ATTR1 that
4215 * the fork offset of all the inodes will be (m_attroffset >> 3) 4231 * the fork offset of all the inodes will be
4216 * because we could have mounted with ATTR2 and then mounted back 4232 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
4217 * with ATTR1, keeping the di_forkoff's fixed but probably at 4233 * with ATTR2 and then mounted back with ATTR1, keeping the
4218 * various positions. Therefore, for both ATTR1 and ATTR2 4234 * di_forkoff's fixed but probably at various positions. Therefore,
4219 * we have to assume the worst case scenario of a minimum size 4235 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
4220 * available. 4236 * of a minimum size available.
4221 */ 4237 */
4222 if (whichfork == XFS_DATA_FORK) { 4238 if (whichfork == XFS_DATA_FORK) {
4223 maxleafents = MAXEXTNUM; 4239 maxleafents = MAXEXTNUM;
@@ -4804,7 +4820,7 @@ xfs_bmapi(
4804 xfs_extlen_t minlen; /* min allocation size */ 4820 xfs_extlen_t minlen; /* min allocation size */
4805 xfs_mount_t *mp; /* xfs mount structure */ 4821 xfs_mount_t *mp; /* xfs mount structure */
4806 int n; /* current extent index */ 4822 int n; /* current extent index */
4807 int nallocs; /* number of extents alloc\'d */ 4823 int nallocs; /* number of extents alloc'd */
4808 xfs_extnum_t nextents; /* number of extents in file */ 4824 xfs_extnum_t nextents; /* number of extents in file */
4809 xfs_fileoff_t obno; /* old block number (offset) */ 4825 xfs_fileoff_t obno; /* old block number (offset) */
4810 xfs_bmbt_irec_t prev; /* previous file extent record */ 4826 xfs_bmbt_irec_t prev; /* previous file extent record */
@@ -6204,7 +6220,7 @@ xfs_bmap_get_bp(
6204 return(bp); 6220 return(bp);
6205} 6221}
6206 6222
6207void 6223STATIC void
6208xfs_check_block( 6224xfs_check_block(
6209 struct xfs_btree_block *block, 6225 struct xfs_btree_block *block,
6210 xfs_mount_t *mp, 6226 xfs_mount_t *mp,
@@ -6494,7 +6510,7 @@ xfs_bmap_count_tree(
6494 block = XFS_BUF_TO_BLOCK(bp); 6510 block = XFS_BUF_TO_BLOCK(bp);
6495 6511
6496 if (--level) { 6512 if (--level) {
6497 /* Not at node above leafs, count this level of nodes */ 6513 /* Not at node above leaves, count this level of nodes */
6498 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 6514 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6499 while (nextbno != NULLFSBLOCK) { 6515 while (nextbno != NULLFSBLOCK) {
6500 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6516 if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
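
xfs_default_attroffset(), added above, replaces the cached m_attroffset: 256-byte inodes place the attr fork as late as possible (literal area minus a minimal attr-btree root), while larger inodes reserve a fixed six-pointer region. A standalone sketch of the policy; DINODE_CORE_SIZE and the BMDR_SPACE() formula below are invented stand-ins for sizeof(struct xfs_dinode) and XFS_BMDR_SPACE_CALC():

#include <assert.h>

#define DINODE_CORE_SIZE  100   /* assumed sizeof(struct xfs_dinode) */
#define LITINO(isize)     ((isize) - DINODE_CORE_SIZE)
#define BMDR_SPACE(nptrs) (8 + (nptrs) * 12)    /* fake header + records */
#define MINABTPTRS        2

static unsigned int default_attroffset(unsigned int inodesize)
{
        if (inodesize == 256)
                return LITINO(inodesize) - BMDR_SPACE(MINABTPTRS);
        return BMDR_SPACE(6 * MINABTPTRS);
}

int main(void)
{
        /* 256-byte inode: offset counted back from the end of the inode */
        assert(default_attroffset(256) == 156 - 32);
        /* bigger inodes: fixed offset into the literal area */
        assert(default_attroffset(512) == 8 + 12 * 12);
        return 0;
}
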
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index be2979d88d32..1b8ff9256bd0 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -125,7 +125,7 @@ typedef struct xfs_bmalloca {
125 struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ 125 struct xfs_bmbt_irec *gotp; /* extent after, or delayed */
126 xfs_extlen_t alen; /* i/o length asked/allocated */ 126 xfs_extlen_t alen; /* i/o length asked/allocated */
127 xfs_extlen_t total; /* total blocks needed for xaction */ 127 xfs_extlen_t total; /* total blocks needed for xaction */
128 xfs_extlen_t minlen; /* mininum allocation size (blocks) */ 128 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
129 xfs_extlen_t minleft; /* amount must be left after alloc */ 129 xfs_extlen_t minleft; /* amount must be left after alloc */
130 char eof; /* set if allocating past last extent */ 130 char eof; /* set if allocating past last extent */
131 char wasdel; /* replacing a delayed allocation */ 131 char wasdel; /* replacing a delayed allocation */
@@ -338,6 +338,10 @@ xfs_check_nostate_extents(
338 xfs_extnum_t idx, 338 xfs_extnum_t idx,
339 xfs_extnum_t num); 339 xfs_extnum_t num);
340 340
341uint
342xfs_default_attroffset(
343 struct xfs_inode *ip);
344
341#ifdef __KERNEL__ 345#ifdef __KERNEL__
342 346
343/* 347/*
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e73c332eb23f..e9df99574829 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -1883,7 +1883,7 @@ xfs_btree_lshift(
1883 1883
1884 /* 1884 /*
1885 * We add one entry to the left side and remove one for the right side. 1885 * We add one entry to the left side and remove one for the right side.
1886 * Accout for it here, the changes will be updated on disk and logged 1886 * Account for it here, the changes will be updated on disk and logged
1887 * later. 1887 * later.
1888 */ 1888 */
1889 lrecs++; 1889 lrecs++;
@@ -3535,7 +3535,7 @@ xfs_btree_delrec(
3535 XFS_BTREE_STATS_INC(cur, join); 3535 XFS_BTREE_STATS_INC(cur, join);
3536 3536
3537 /* 3537 /*
3538 * Fix up the the number of records and right block pointer in the 3538 * Fix up the number of records and right block pointer in the
3539 * surviving block, and log it. 3539 * surviving block, and log it.
3540 */ 3540 */
3541 xfs_btree_set_numrecs(left, lrecs + rrecs); 3541 xfs_btree_set_numrecs(left, lrecs + rrecs);
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 789fffdf8b2f..4f852b735b96 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -41,7 +41,7 @@ extern kmem_zone_t *xfs_btree_cur_zone;
41/* 41/*
42 * Generic btree header. 42 * Generic btree header.
43 * 43 *
44 * This is a comination of the actual format used on disk for short and long 44 * This is a combination of the actual format used on disk for short and long
45 * format btrees. The first three fields are shared by both format, but 45 * format btrees. The first three fields are shared by both format, but
46 * the pointers are different and should be used with care. 46 * the pointers are different and should be used with care.
47 * 47 *
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c45f74ff1a5b..9ff6e57a5075 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1503,7 +1503,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1503 * This is implemented with some source-level loop unrolling. 1503 * This is implemented with some source-level loop unrolling.
1504 */ 1504 */
1505xfs_dahash_t 1505xfs_dahash_t
1506xfs_da_hashname(const uchar_t *name, int namelen) 1506xfs_da_hashname(const __uint8_t *name, int namelen)
1507{ 1507{
1508 xfs_dahash_t hash; 1508 xfs_dahash_t hash;
1509 1509
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 70b710c1792d..8c536167bf75 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -91,9 +91,9 @@ enum xfs_dacmp {
91 * Structure to ease passing around component names. 91 * Structure to ease passing around component names.
92 */ 92 */
93typedef struct xfs_da_args { 93typedef struct xfs_da_args {
94 const uchar_t *name; /* string (maybe not NULL terminated) */ 94 const __uint8_t *name; /* string (maybe not NULL terminated) */
95 int namelen; /* length of string (maybe no NULL) */ 95 int namelen; /* length of string (maybe no NULL) */
96 uchar_t *value; /* set of bytes (maybe contain NULLs) */ 96 __uint8_t *value; /* set of bytes (maybe contain NULLs) */
97 int valuelen; /* length of value */ 97 int valuelen; /* length of value */
98 int flags; /* argument flags (eg: ATTR_NOCREATE) */ 98 int flags; /* argument flags (eg: ATTR_NOCREATE) */
99 xfs_dahash_t hashval; /* hash value of name */ 99 xfs_dahash_t hashval; /* hash value of name */
@@ -185,7 +185,7 @@ typedef struct xfs_da_state {
185 unsigned char inleaf; /* insert into 1->lf, 0->splf */ 185 unsigned char inleaf; /* insert into 1->lf, 0->splf */
186 unsigned char extravalid; /* T/F: extrablk is in use */ 186 unsigned char extravalid; /* T/F: extrablk is in use */
187 unsigned char extraafter; /* T/F: extrablk is after new */ 187 unsigned char extraafter; /* T/F: extrablk is after new */
188 xfs_da_state_blk_t extrablk; /* for double-splits on leafs */ 188 xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
189 /* for dirv2 extrablk is data */ 189 /* for dirv2 extrablk is data */
190} xfs_da_state_t; 190} xfs_da_state_t;
191 191
@@ -251,7 +251,7 @@ xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
251int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, 251int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
252 xfs_dabuf_t *dead_buf); 252 xfs_dabuf_t *dead_buf);
253 253
254uint xfs_da_hashname(const uchar_t *name_string, int name_length); 254uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
255enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, 255enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
256 const char *name, int len); 256 const char *name, int len);
257 257
@@ -268,5 +268,6 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
268 268
269extern struct kmem_zone *xfs_da_state_zone; 269extern struct kmem_zone *xfs_da_state_zone;
270extern struct kmem_zone *xfs_dabuf_zone; 270extern struct kmem_zone *xfs_dabuf_zone;
271extern const struct xfs_nameops xfs_default_nameops;
271 272
272#endif /* __XFS_DA_BTREE_H__ */ 273#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index f8278cfcc1d3..e6d839bddbf0 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -79,6 +79,12 @@ xfs_swapext(
79 goto out_put_target_file; 79 goto out_put_target_file;
80 } 80 }
81 81
82 if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
83 IS_SWAPFILE(target_file->f_path.dentry->d_inode)) {
84 error = XFS_ERROR(EINVAL);
85 goto out_put_target_file;
86 }
87
82 ip = XFS_I(file->f_path.dentry->d_inode); 88 ip = XFS_I(file->f_path.dentry->d_inode);
83 tip = XFS_I(target_file->f_path.dentry->d_inode); 89 tip = XFS_I(target_file->f_path.dentry->d_inode);
84 90
@@ -118,19 +124,17 @@ xfs_swap_extents(
118 xfs_bstat_t *sbp = &sxp->sx_stat; 124 xfs_bstat_t *sbp = &sxp->sx_stat;
119 xfs_ifork_t *tempifp, *ifp, *tifp; 125 xfs_ifork_t *tempifp, *ifp, *tifp;
120 int ilf_fields, tilf_fields; 126 int ilf_fields, tilf_fields;
121 static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
122 int error = 0; 127 int error = 0;
123 int aforkblks = 0; 128 int aforkblks = 0;
124 int taforkblks = 0; 129 int taforkblks = 0;
125 __uint64_t tmp; 130 __uint64_t tmp;
126 char locked = 0;
127 131
128 mp = ip->i_mount; 132 mp = ip->i_mount;
129 133
130 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 134 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
131 if (!tempifp) { 135 if (!tempifp) {
132 error = XFS_ERROR(ENOMEM); 136 error = XFS_ERROR(ENOMEM);
133 goto error0; 137 goto out;
134 } 138 }
135 139
136 sbp = &sxp->sx_stat; 140 sbp = &sxp->sx_stat;
@@ -143,25 +147,24 @@ xfs_swap_extents(
143 */ 147 */
144 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); 148 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
145 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 149 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
146 locked = 1;
147 150
148 /* Verify that both files have the same format */ 151 /* Verify that both files have the same format */
149 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { 152 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
150 error = XFS_ERROR(EINVAL); 153 error = XFS_ERROR(EINVAL);
151 goto error0; 154 goto out_unlock;
152 } 155 }
153 156
154 /* Verify both files are either real-time or non-realtime */ 157 /* Verify both files are either real-time or non-realtime */
155 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { 158 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
156 error = XFS_ERROR(EINVAL); 159 error = XFS_ERROR(EINVAL);
157 goto error0; 160 goto out_unlock;
158 } 161 }
159 162
160 /* Should never get a local format */ 163 /* Should never get a local format */
161 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || 164 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
162 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { 165 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
163 error = XFS_ERROR(EINVAL); 166 error = XFS_ERROR(EINVAL);
164 goto error0; 167 goto out_unlock;
165 } 168 }
166 169
167 if (VN_CACHED(VFS_I(tip)) != 0) { 170 if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -169,13 +172,13 @@ xfs_swap_extents(
169 error = xfs_flushinval_pages(tip, 0, -1, 172 error = xfs_flushinval_pages(tip, 0, -1,
170 FI_REMAPF_LOCKED); 173 FI_REMAPF_LOCKED);
171 if (error) 174 if (error)
172 goto error0; 175 goto out_unlock;
173 } 176 }
174 177
175 /* Verify O_DIRECT for ftmp */ 178 /* Verify O_DIRECT for ftmp */
176 if (VN_CACHED(VFS_I(tip)) != 0) { 179 if (VN_CACHED(VFS_I(tip)) != 0) {
177 error = XFS_ERROR(EINVAL); 180 error = XFS_ERROR(EINVAL);
178 goto error0; 181 goto out_unlock;
179 } 182 }
180 183
181 /* Verify all data are being swapped */ 184 /* Verify all data are being swapped */
@@ -183,7 +186,7 @@ xfs_swap_extents(
183 sxp->sx_length != ip->i_d.di_size || 186 sxp->sx_length != ip->i_d.di_size ||
184 sxp->sx_length != tip->i_d.di_size) { 187 sxp->sx_length != tip->i_d.di_size) {
185 error = XFS_ERROR(EFAULT); 188 error = XFS_ERROR(EFAULT);
186 goto error0; 189 goto out_unlock;
187 } 190 }
188 191
189 /* 192 /*
@@ -193,7 +196,7 @@ xfs_swap_extents(
193 */ 196 */
194 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { 197 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
195 error = XFS_ERROR(EINVAL); 198 error = XFS_ERROR(EINVAL);
196 goto error0; 199 goto out_unlock;
197 } 200 }
198 201
199 /* 202 /*
@@ -208,7 +211,7 @@ xfs_swap_extents(
208 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || 211 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
209 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { 212 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
210 error = XFS_ERROR(EBUSY); 213 error = XFS_ERROR(EBUSY);
211 goto error0; 214 goto out_unlock;
212 } 215 }
213 216
214 /* We need to fail if the file is memory mapped. Once we have tossed 217 /* We need to fail if the file is memory mapped. Once we have tossed
@@ -219,7 +222,7 @@ xfs_swap_extents(
219 */ 222 */
220 if (VN_MAPPED(VFS_I(ip))) { 223 if (VN_MAPPED(VFS_I(ip))) {
221 error = XFS_ERROR(EBUSY); 224 error = XFS_ERROR(EBUSY);
222 goto error0; 225 goto out_unlock;
223 } 226 }
224 227
225 xfs_iunlock(ip, XFS_ILOCK_EXCL); 228 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -242,8 +245,7 @@ xfs_swap_extents(
242 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 245 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
243 xfs_iunlock(tip, XFS_IOLOCK_EXCL); 246 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
244 xfs_trans_cancel(tp, 0); 247 xfs_trans_cancel(tp, 0);
245 locked = 0; 248 goto out;
246 goto error0;
247 } 249 }
248 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 250 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
249 251
@@ -253,19 +255,15 @@ xfs_swap_extents(
253 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && 255 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
254 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { 256 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
255 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); 257 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
256 if (error) { 258 if (error)
257 xfs_trans_cancel(tp, 0); 259 goto out_trans_cancel;
258 goto error0;
259 }
260 } 260 }
261 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && 261 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
262 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { 262 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
263 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, 263 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
264 &taforkblks); 264 &taforkblks);
265 if (error) { 265 if (error)
266 xfs_trans_cancel(tp, 0); 266 goto out_trans_cancel;
267 goto error0;
268 }
269 } 267 }
270 268
271 /* 269 /*
@@ -332,10 +330,10 @@ xfs_swap_extents(
332 330
333 331
334 IHOLD(ip); 332 IHOLD(ip);
335 xfs_trans_ijoin(tp, ip, lock_flags); 333 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
336 334
337 IHOLD(tip); 335 IHOLD(tip);
338 xfs_trans_ijoin(tp, tip, lock_flags); 336 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
339 337
340 xfs_trans_log_inode(tp, ip, ilf_fields); 338 xfs_trans_log_inode(tp, ip, ilf_fields);
341 xfs_trans_log_inode(tp, tip, tilf_fields); 339 xfs_trans_log_inode(tp, tip, tilf_fields);
@@ -344,19 +342,19 @@ xfs_swap_extents(
344 * If this is a synchronous mount, make sure that the 342 * If this is a synchronous mount, make sure that the
345 * transaction goes to disk before returning to the user. 343 * transaction goes to disk before returning to the user.
346 */ 344 */
347 if (mp->m_flags & XFS_MOUNT_WSYNC) { 345 if (mp->m_flags & XFS_MOUNT_WSYNC)
348 xfs_trans_set_sync(tp); 346 xfs_trans_set_sync(tp);
349 }
350 347
351 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); 348 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
352 locked = 0;
353 349
354 error0: 350out_unlock:
355 if (locked) { 351 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
356 xfs_iunlock(ip, lock_flags); 352 xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
357 xfs_iunlock(tip, lock_flags); 353out:
358 } 354 kmem_free(tempifp);
359 if (tempifp != NULL)
360 kmem_free(tempifp);
361 return error; 355 return error;
356
357out_trans_cancel:
358 xfs_trans_cancel(tp, 0);
359 goto out_unlock;
362} 360}
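
The xfs_swap_extents() rework above swaps a 'locked' flag tested at one error0 label for a goto ladder where each failure jumps to the label that undoes exactly what has been set up so far, with the transaction-cancel case rejoining the common unwind. A minimal userspace sketch of that structure (pthreads and malloc stand in for the inode locks and kmem_alloc; names are illustrative):

#include <stdlib.h>
#include <pthread.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;

static int swap_extents_sketch(int fail_after_lock, int fail_in_trans)
{
        int error = 0;
        void *tempifp = malloc(64);             /* kmem_alloc analogue */

        if (!tempifp)
                return -1;                      /* nothing to unwind yet */

        pthread_mutex_lock(&ilock);             /* lock_two_inodes analogue */

        if (fail_after_lock) {
                error = -1;
                goto out_unlock;                /* drop the lock, then free */
        }
        /* ... reserve a transaction here ... */
        if (fail_in_trans) {
                error = -1;
                goto out_trans_cancel;          /* extra undo step first */
        }

out_unlock:
        pthread_mutex_unlock(&ilock);
        free(tempifp);
        return error;

out_trans_cancel:
        /* cancel-the-transaction stand-in, then rejoin the common unwind */
        goto out_unlock;
}

int main(void)
{
        return swap_extents_sketch(0, 0);       /* success path returns 0 */
}
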
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 162e8726df5e..e5b153b2e6a3 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -103,7 +103,9 @@ typedef enum xfs_dinode_fmt {
103/* 103/*
104 * Inode size for given fs. 104 * Inode size for given fs.
105 */ 105 */
106#define XFS_LITINO(mp) ((mp)->m_litino) 106#define XFS_LITINO(mp) \
107 ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
108
107#define XFS_BROOT_SIZE_ADJ \ 109#define XFS_BROOT_SIZE_ADJ \
108 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) 110 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
109 111
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 1afb12278b8d..c657bec6d951 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -46,8 +46,6 @@
46 46
47struct xfs_name xfs_name_dotdot = {"..", 2}; 47struct xfs_name xfs_name_dotdot = {"..", 2};
48 48
49extern const struct xfs_nameops xfs_default_nameops;
50
51/* 49/*
52 * ASCII case-insensitive (ie. A-Z) support for directories that was 50 * ASCII case-insensitive (ie. A-Z) support for directories that was
53 * used in IRIX. 51 * used in IRIX.
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e1f0a06aaf04..ab52e9e1c1ee 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -448,7 +448,6 @@ xfs_dir2_block_getdents(
448 xfs_mount_t *mp; /* filesystem mount point */ 448 xfs_mount_t *mp; /* filesystem mount point */
449 char *ptr; /* current data entry */ 449 char *ptr; /* current data entry */
450 int wantoff; /* starting block offset */ 450 int wantoff; /* starting block offset */
451 xfs_ino_t ino;
452 xfs_off_t cook; 451 xfs_off_t cook;
453 452
454 mp = dp->i_mount; 453 mp = dp->i_mount;
@@ -509,16 +508,12 @@ xfs_dir2_block_getdents(
509 508
510 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, 509 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
511 (char *)dep - (char *)block); 510 (char *)dep - (char *)block);
512 ino = be64_to_cpu(dep->inumber);
513#if XFS_BIG_INUMS
514 ino += mp->m_inoadd;
515#endif
516 511
517 /* 512 /*
518 * If it didn't fit, set the final offset to here & return. 513 * If it didn't fit, set the final offset to here & return.
519 */ 514 */
520 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, 515 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
521 ino, DT_UNKNOWN)) { 516 be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
522 *offset = cook & 0x7fffffff; 517 *offset = cook & 0x7fffffff;
523 xfs_da_brelse(NULL, bp); 518 xfs_da_brelse(NULL, bp);
524 return 0; 519 return 0;
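
With the XFS_BIG_INUMS m_inoadd bias removed, the getdents path above passes the on-disk big-endian inode number straight through be64_to_cpu() to filldir. A tiny portable model of that byte-order conversion (be64_to_cpu_sketch is illustrative; the kernel uses its own be64_to_cpu()):

#include <assert.h>
#include <stdint.h>

static uint64_t be64_to_cpu_sketch(const uint8_t b[8])
{
        uint64_t v = 0;

        for (int i = 0; i < 8; i++)
                v = (v << 8) | b[i];    /* most significant byte first */
        return v;
}

int main(void)
{
        const uint8_t disk_inumber[8] = { 0, 0, 0, 0, 0, 0, 0x12, 0x34 };

        assert(be64_to_cpu_sketch(disk_inumber) == 0x1234);
        return 0;
}
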
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
index b816e0252739..efbc290c7fec 100644
--- a/fs/xfs/xfs_dir2_data.h
+++ b/fs/xfs/xfs_dir2_data.h
@@ -38,7 +38,7 @@ struct xfs_trans;
38 38
39/* 39/*
40 * Directory address space divided into sections, 40 * Directory address space divided into sections,
41 * spaces separated by 32gb. 41 * spaces separated by 32GB.
42 */ 42 */
43#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) 43#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
44#define XFS_DIR2_DATA_SPACE 0 44#define XFS_DIR2_DATA_SPACE 0
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ef805a374eec..fa913e459442 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -549,7 +549,7 @@ xfs_dir2_leaf_addname(
549 * Check the internal consistency of a leaf1 block. 549 * Check the internal consistency of a leaf1 block.
550 * Pop an assert if something is wrong. 550 * Pop an assert if something is wrong.
551 */ 551 */
552void 552STATIC void
553xfs_dir2_leaf_check( 553xfs_dir2_leaf_check(
554 xfs_inode_t *dp, /* incore directory inode */ 554 xfs_inode_t *dp, /* incore directory inode */
555 xfs_dabuf_t *bp) /* leaf's buffer */ 555 xfs_dabuf_t *bp) /* leaf's buffer */
@@ -780,7 +780,6 @@ xfs_dir2_leaf_getdents(
780 int ra_index; /* *map index for read-ahead */ 780 int ra_index; /* *map index for read-ahead */
781 int ra_offset; /* map entry offset for ra */ 781 int ra_offset; /* map entry offset for ra */
782 int ra_want; /* readahead count wanted */ 782 int ra_want; /* readahead count wanted */
783 xfs_ino_t ino;
784 783
785 /* 784 /*
786 * If the offset is at or past the largest allowed value, 785 * If the offset is at or past the largest allowed value,
@@ -1076,24 +1075,12 @@ xfs_dir2_leaf_getdents(
1076 continue; 1075 continue;
1077 } 1076 }
1078 1077
1079 /*
1080 * Copy the entry into the putargs, and try formatting it.
1081 */
1082 dep = (xfs_dir2_data_entry_t *)ptr; 1078 dep = (xfs_dir2_data_entry_t *)ptr;
1083
1084 length = xfs_dir2_data_entsize(dep->namelen); 1079 length = xfs_dir2_data_entsize(dep->namelen);
1085 1080
1086 ino = be64_to_cpu(dep->inumber);
1087#if XFS_BIG_INUMS
1088 ino += mp->m_inoadd;
1089#endif
1090
1091 /*
1092 * Won't fit. Return to caller.
1093 */
1094 if (filldir(dirent, dep->name, dep->namelen, 1081 if (filldir(dirent, dep->name, dep->namelen,
1095 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, 1082 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
1096 ino, DT_UNKNOWN)) 1083 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1097 break; 1084 break;
1098 1085
1099 /* 1086 /*
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index fa6c3a5ddbc6..5a81ccd1045b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1104,7 +1104,7 @@ xfs_dir2_leafn_remove(
1104 } 1104 }
1105 xfs_dir2_leafn_check(dp, bp); 1105 xfs_dir2_leafn_check(dp, bp);
1106 /* 1106 /*
1107 * Return indication of whether this leaf block is emtpy enough 1107 * Return indication of whether this leaf block is empty enough
1108 * to justify trying to join it with a neighbor. 1108 * to justify trying to join it with a neighbor.
1109 */ 1109 */
1110 *rval = 1110 *rval =
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index a8a8a6efad5b..e89734e84646 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -748,11 +748,7 @@ xfs_dir2_sf_getdents(
748 * Put . entry unless we're starting past it. 748 * Put . entry unless we're starting past it.
749 */ 749 */
750 if (*offset <= dot_offset) { 750 if (*offset <= dot_offset) {
751 ino = dp->i_ino; 751 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
752#if XFS_BIG_INUMS
753 ino += mp->m_inoadd;
754#endif
755 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) {
756 *offset = dot_offset & 0x7fffffff; 752 *offset = dot_offset & 0x7fffffff;
757 return 0; 753 return 0;
758 } 754 }
@@ -763,9 +759,6 @@ xfs_dir2_sf_getdents(
763 */ 759 */
764 if (*offset <= dotdot_offset) { 760 if (*offset <= dotdot_offset) {
765 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); 761 ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
766#if XFS_BIG_INUMS
767 ino += mp->m_inoadd;
768#endif
769 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { 762 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
770 *offset = dotdot_offset & 0x7fffffff; 763 *offset = dotdot_offset & 0x7fffffff;
771 return 0; 764 return 0;
@@ -786,10 +779,6 @@ xfs_dir2_sf_getdents(
786 } 779 }
787 780
788 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); 781 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
789#if XFS_BIG_INUMS
790 ino += mp->m_inoadd;
791#endif
792
793 if (filldir(dirent, sfep->name, sfep->namelen, 782 if (filldir(dirent, sfep->name, sfep->namelen,
794 off & 0x7fffffff, ino, DT_UNKNOWN)) { 783 off & 0x7fffffff, ino, DT_UNKNOWN)) {
795 *offset = off & 0x7fffffff; 784 *offset = off & 0x7fffffff;
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 2f049f63e85f..0d22c56fdf64 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -33,12 +33,10 @@ typedef struct xfs_extent {
33 * conversion routine. 33 * conversion routine.
34 */ 34 */
35 35
36#ifndef HAVE_FORMAT32
37typedef struct xfs_extent_32 { 36typedef struct xfs_extent_32 {
38 __uint64_t ext_start; 37 __uint64_t ext_start;
39 __uint32_t ext_len; 38 __uint32_t ext_len;
40} __attribute__((packed)) xfs_extent_32_t; 39} __attribute__((packed)) xfs_extent_32_t;
41#endif
42 40
43typedef struct xfs_extent_64 { 41typedef struct xfs_extent_64 {
44 __uint64_t ext_start; 42 __uint64_t ext_start;
@@ -59,7 +57,6 @@ typedef struct xfs_efi_log_format {
59 xfs_extent_t efi_extents[1]; /* array of extents to free */ 57 xfs_extent_t efi_extents[1]; /* array of extents to free */
60} xfs_efi_log_format_t; 58} xfs_efi_log_format_t;
61 59
62#ifndef HAVE_FORMAT32
63typedef struct xfs_efi_log_format_32 { 60typedef struct xfs_efi_log_format_32 {
64 __uint16_t efi_type; /* efi log item type */ 61 __uint16_t efi_type; /* efi log item type */
65 __uint16_t efi_size; /* size of this item */ 62 __uint16_t efi_size; /* size of this item */
@@ -67,7 +64,6 @@ typedef struct xfs_efi_log_format_32 {
67 __uint64_t efi_id; /* efi identifier */ 64 __uint64_t efi_id; /* efi identifier */
68 xfs_extent_32_t efi_extents[1]; /* array of extents to free */ 65 xfs_extent_32_t efi_extents[1]; /* array of extents to free */
69} __attribute__((packed)) xfs_efi_log_format_32_t; 66} __attribute__((packed)) xfs_efi_log_format_32_t;
70#endif
71 67
72typedef struct xfs_efi_log_format_64 { 68typedef struct xfs_efi_log_format_64 {
73 __uint16_t efi_type; /* efi log item type */ 69 __uint16_t efi_type; /* efi log item type */
@@ -90,7 +86,6 @@ typedef struct xfs_efd_log_format {
90 xfs_extent_t efd_extents[1]; /* array of extents freed */ 86 xfs_extent_t efd_extents[1]; /* array of extents freed */
91} xfs_efd_log_format_t; 87} xfs_efd_log_format_t;
92 88
93#ifndef HAVE_FORMAT32
94typedef struct xfs_efd_log_format_32 { 89typedef struct xfs_efd_log_format_32 {
95 __uint16_t efd_type; /* efd log item type */ 90 __uint16_t efd_type; /* efd log item type */
96 __uint16_t efd_size; /* size of this item */ 91 __uint16_t efd_size; /* size of this item */
@@ -98,7 +93,6 @@ typedef struct xfs_efd_log_format_32 {
98 __uint64_t efd_efi_id; /* id of corresponding efi */ 93 __uint64_t efd_efi_id; /* id of corresponding efi */
99 xfs_extent_32_t efd_extents[1]; /* array of extents freed */ 94 xfs_extent_32_t efd_extents[1]; /* array of extents freed */
100} __attribute__((packed)) xfs_efd_log_format_32_t; 95} __attribute__((packed)) xfs_efd_log_format_32_t;
101#endif
102 96
103typedef struct xfs_efd_log_format_64 { 97typedef struct xfs_efd_log_format_64 {
104 __uint16_t efd_type; /* efd log item type */ 98 __uint16_t efd_type; /* efd log item type */
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index f3bb75da384e..6c87c8f304ef 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -140,7 +140,7 @@ _xfs_filestream_pick_ag(
140 xfs_extlen_t minlen) 140 xfs_extlen_t minlen)
141{ 141{
142 int err, trylock, nscan; 142 int err, trylock, nscan;
143 xfs_extlen_t delta, longest, need, free, minfree, maxfree = 0; 143 xfs_extlen_t longest, free, minfree, maxfree = 0;
144 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 144 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
145 struct xfs_perag *pag; 145 struct xfs_perag *pag;
146 146
@@ -186,12 +186,7 @@ _xfs_filestream_pick_ag(
186 goto next_ag; 186 goto next_ag;
187 } 187 }
188 188
189 need = XFS_MIN_FREELIST_PAG(pag, mp); 189 longest = xfs_alloc_longest_free_extent(mp, pag);
190 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
191 longest = (pag->pagf_longest > delta) ?
192 (pag->pagf_longest - delta) :
193 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
194
195 if (((minlen && longest >= minlen) || 190 if (((minlen && longest >= minlen) ||
196 (!minlen && pag->pagf_freeblks >= minfree)) && 191 (!minlen && pag->pagf_freeblks >= minfree)) &&
197 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || 192 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 680d0e0ec932..8379e3bca26c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -576,7 +576,7 @@ out:
576 if (fdblks_delta) { 576 if (fdblks_delta) {
577 /* 577 /*
578 * If we are putting blocks back here, m_resblks_avail is 578 * If we are putting blocks back here, m_resblks_avail is
579 * already at it's max so this will put it in the free pool. 579 * already at its max so this will put it in the free pool.
580 * 580 *
581 * If we need space, we'll either succeed in getting it 581 * If we need space, we'll either succeed in getting it
582 * from the free block count or we'll get an enospc. If 582 * from the free block count or we'll get an enospc. If
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ab016e5ae7be..3120a3a5e20f 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -230,7 +230,7 @@ xfs_ialloc_ag_alloc(
230 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; 230 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
231 231
232 /* Allow space for the inode btree to split. */ 232 /* Allow space for the inode btree to split. */
233 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; 233 args.minleft = args.mp->m_in_maxlevels - 1;
234 if ((error = xfs_alloc_vextent(&args))) 234 if ((error = xfs_alloc_vextent(&args)))
235 return error; 235 return error;
236 } else 236 } else
@@ -270,7 +270,7 @@ xfs_ialloc_ag_alloc(
270 /* 270 /*
271 * Allow space for the inode btree to split. 271 * Allow space for the inode btree to split.
272 */ 272 */
273 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; 273 args.minleft = args.mp->m_in_maxlevels - 1;
274 if ((error = xfs_alloc_vextent(&args))) 274 if ((error = xfs_alloc_vextent(&args)))
275 return error; 275 return error;
276 } 276 }
@@ -349,7 +349,7 @@ xfs_ialloc_ag_alloc(
349 * Initialize all inodes in this buffer and then log them. 349 * Initialize all inodes in this buffer and then log them.
350 * 350 *
351 * XXX: It would be much better if we had just one transaction to 351 * XXX: It would be much better if we had just one transaction to
352 * log a whole cluster of inodes instead of all the indivdual 352 * log a whole cluster of inodes instead of all the individual
353 * transactions causing a lot of log traffic. 353 * transactions causing a lot of log traffic.
354 */ 354 */
355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); 355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
@@ -943,7 +943,7 @@ nextag:
943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % 943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
944 XFS_INODES_PER_CHUNK) == 0); 944 XFS_INODES_PER_CHUNK) == 0);
945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); 945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
946 XFS_INOBT_CLR_FREE(&rec, offset); 946 rec.ir_free &= ~XFS_INOBT_MASK(offset);
947 rec.ir_freecount--; 947 rec.ir_freecount--;
948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, 948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
949 rec.ir_free))) 949 rec.ir_free)))
@@ -1105,11 +1105,11 @@ xfs_difree(
1105 */ 1105 */
1106 off = agino - rec.ir_startino; 1106 off = agino - rec.ir_startino;
1107 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); 1107 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1108 ASSERT(!XFS_INOBT_IS_FREE(&rec, off)); 1108 ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
1109 /* 1109 /*
1110 * Mark the inode free & increment the count. 1110 * Mark the inode free & increment the count.
1111 */ 1111 */
1112 XFS_INOBT_SET_FREE(&rec, off); 1112 rec.ir_free |= XFS_INOBT_MASK(off);
1113 rec.ir_freecount++; 1113 rec.ir_freecount++;
1114 1114
1115 /* 1115 /*
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 99f2408e8d8e..c282a9af5393 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -164,7 +164,7 @@ xfs_inobt_init_rec_from_cur(
164} 164}
165 165
166/* 166/*
167 * intial value of ptr for lookup 167 * initial value of ptr for lookup
168 */ 168 */
169STATIC void 169STATIC void
170xfs_inobt_init_ptr_from_cur( 170xfs_inobt_init_ptr_from_cur(
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 5580e255ff06..f782ad0c4769 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -32,14 +32,14 @@ struct xfs_mount;
32#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ 32#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
33 33
34typedef __uint64_t xfs_inofree_t; 34typedef __uint64_t xfs_inofree_t;
35#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) 35#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) 36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) 37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
38#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
38 39
39static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) 40static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
40{ 41{
41 return (((n) >= XFS_INODES_PER_CHUNK ? \ 42 return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
42 (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i);
43} 43}
44 44
45/* 45/*
@@ -69,20 +69,6 @@ typedef struct xfs_inobt_key {
69typedef __be32 xfs_inobt_ptr_t; 69typedef __be32 xfs_inobt_ptr_t;
70 70
71/* 71/*
72 * Bit manipulations for ir_free.
73 */
74#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
75#define XFS_INOBT_IS_FREE(rp,i) \
76 (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
77#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i))
78#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
79
80/*
81 * Maximum number of inode btree levels.
82 */
83#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels)
84
85/*
86 * block numbers in the AG. 72 * block numbers in the AG.
87 */ 73 */
88#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) 74#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
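
The header change above keeps a single XFS_INOBT_MASK() and open-codes the set/clear/test helpers at their call sites, while the xfs_inobt_maskn() rewrite keeps the same guard against shifting a full 64 bits, just written more tersely. A userspace sketch of the bit manipulation, with the whole-chunk wraparound case exercised (simplified names):

#include <assert.h>
#include <stdint.h>

typedef uint64_t xfs_inofree_t;

#define INODES_PER_CHUNK 64
#define INOBT_MASK(i)    ((xfs_inofree_t)1 << (i))

static xfs_inofree_t inobt_maskn(int i, int n)
{
        /* n == 64 would shift out of range, so 0 - 1 wraps to all-ones */
        return ((n >= INODES_PER_CHUNK ? 0 : INOBT_MASK(n)) - 1) << i;
}

int main(void)
{
        xfs_inofree_t ir_free = ~0ULL;          /* whole chunk free */

        ir_free &= ~INOBT_MASK(5);              /* allocate inode 5 */
        assert(!(ir_free & INOBT_MASK(5)));

        ir_free |= INOBT_MASK(5);               /* free it again */
        assert(ir_free & INOBT_MASK(5));

        assert(inobt_maskn(4, 3) == 0x70);      /* bits 4, 5, 6 */
        assert(inobt_maskn(0, 64) == ~0ULL);    /* full chunk */
        return 0;
}
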
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e2fb6210d4c5..478e587087fe 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -246,9 +246,6 @@ xfs_iget_cache_miss(
 		goto out_destroy;
 	}
 
-	if (lock_flags)
-		xfs_ilock(ip, lock_flags);
-
 	/*
 	 * Preload the radix tree so we can insert safely under the
 	 * write spinlock. Note that we cannot sleep inside the preload
@@ -256,7 +253,16 @@ xfs_iget_cache_miss(
 	 */
 	if (radix_tree_preload(GFP_KERNEL)) {
 		error = EAGAIN;
-		goto out_unlock;
+		goto out_destroy;
+	}
+
+	/*
+	 * Because the inode hasn't been added to the radix-tree yet it can't
+	 * be found by another thread, so we can do the non-sleeping lock here.
+	 */
+	if (lock_flags) {
+		if (!xfs_ilock_nowait(ip, lock_flags))
+			BUG();
 	}
 
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
@@ -284,7 +290,6 @@ xfs_iget_cache_miss(
 out_preload_end:
 	write_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
-out_unlock:
 	if (lock_flags)
 		xfs_iunlock(ip, lock_flags);
 out_destroy:
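
The reordering above works because an inode that is not yet in the radix tree cannot be found by any other thread, so taking the lock degenerates to a trylock that must succeed, and no sleeping lock acquisition happens inside the preload section. A toy illustration of that publish-after-lock idea (plain C with pthreads; everything here is an illustrative stand-in, not XFS code):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

/* toy "cache": one published slot standing in for the radix tree */
struct obj {
	pthread_mutex_t lock;
	int key;
};

static struct obj *cache_slot;	/* NULL until the object is published */

static void cache_insert(struct obj *obj, int want_locked)
{
	if (want_locked) {
		/* cannot contend: obj is not reachable by anyone else yet */
		int ret = pthread_mutex_trylock(&obj->lock);
		assert(ret == 0);
		(void)ret;
	}
	/* publication point: only now can other threads find obj */
	__atomic_store_n(&cache_slot, obj, __ATOMIC_RELEASE);
}

int main(void)
{
	static struct obj o = { PTHREAD_MUTEX_INITIALIZER, 42 };

	cache_insert(&o, 1);
	printf("published key %d with lock held by inserter\n", cache_slot->key);
	pthread_mutex_unlock(&o.lock);
	return 0;
}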
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1f175fa34b22..f879c1bc4b96 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -122,7 +122,7 @@ typedef struct xfs_ictimestamp {
 
 /*
  * NOTE: This structure must be kept identical to struct xfs_dinode
- * in xfs_dinode.h except for the endianess annotations.
+ * in xfs_dinode.h except for the endianness annotations.
  */
 typedef struct xfs_icdinode {
 	__uint16_t	di_magic;	/* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9957d0602d54..a52ac125f055 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -40,7 +40,6 @@ typedef struct xfs_inode_log_format {
 	__int32_t		ilf_boffset;	/* off of inode in buffer */
 } xfs_inode_log_format_t;
 
-#ifndef HAVE_FORMAT32
 typedef struct xfs_inode_log_format_32 {
 	__uint16_t		ilf_type;	/* inode log item type */
 	__uint16_t		ilf_size;	/* size of this item */
@@ -56,7 +55,6 @@ typedef struct xfs_inode_log_format_32 {
 	__int32_t		ilf_len;	/* len of inode buffer */
 	__int32_t		ilf_boffset;	/* off of inode in buffer */
 } __attribute__((packed)) xfs_inode_log_format_32_t;
-#endif
 
 typedef struct xfs_inode_log_format_64 {
 	__uint16_t		ilf_type;	/* inode log item type */
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index ee1a0c134cc2..a1cc1322fc0f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -63,7 +63,7 @@ typedef enum {
  */
 
 typedef struct xfs_iomap {
-	xfs_daddr_t		iomap_bn;	/* first 512b blk of mapping */
+	xfs_daddr_t		iomap_bn;	/* first 512B blk of mapping */
 	xfs_buftarg_t		*iomap_target;
 	xfs_off_t		iomap_offset;	/* offset of mapping, bytes */
 	xfs_off_t		iomap_bsize;	/* size of mapping, bytes */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cf98a805ec90..aeb2d2221c7d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -83,7 +83,12 @@ xfs_bulkstat_one_iget(
 	buf->bs_uid = dic->di_uid;
 	buf->bs_gid = dic->di_gid;
 	buf->bs_size = dic->di_size;
-	vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime);
+	/*
+	 * We are reading the atime from the Linux inode because the
+	 * dinode might not be uptodate.
+	 */
+	buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
+	buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
 	buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
 	buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
 	buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
@@ -579,7 +584,7 @@ xfs_bulkstat(
 				 * first inode of the cluster.
 				 *
 				 * Careful with clustidx.   There can be
-				 * multple clusters per chunk, a single
+				 * multiple clusters per chunk, a single
 				 * cluster per chunk or a cluster that has
 				 * inodes represented from several different
 				 * chunks (if blocksize is large).
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f4726f702a9e..f76c6d7cea21 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -574,7 +574,7 @@ xfs_log_mount(
 	error = xfs_trans_ail_init(mp);
 	if (error) {
 		cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
-		goto error;
+		goto out_free_log;
 	}
 	mp->m_log->l_ailp = mp->m_ail;
 
@@ -594,20 +594,22 @@ xfs_log_mount(
 			mp->m_flags |= XFS_MOUNT_RDONLY;
 		if (error) {
 			cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
-			goto error;
+			goto out_destroy_ail;
 		}
 	}
 
 	/* Normal transactions can now occur */
 	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
 
-	/* End mounting message in xfs_log_mount_finish */
 	return 0;
-error:
-	xfs_log_unmount_dealloc(mp);
+
+out_destroy_ail:
+	xfs_trans_ail_destroy(mp);
+out_free_log:
+	xlog_dealloc_log(mp->m_log);
 out:
 	return error;
-}	/* xfs_log_mount */
+}
 
 /*
@@ -633,19 +635,6 @@ xfs_log_mount_finish(xfs_mount_t *mp)
 }
 
 /*
- * Unmount processing for the log.
- */
-int
-xfs_log_unmount(xfs_mount_t *mp)
-{
-	int		error;
-
-	error = xfs_log_unmount_write(mp);
-	xfs_log_unmount_dealloc(mp);
-	return error;
-}
-
-/*
  * Final log writes as part of unmount.
  *
  * Mark the filesystem clean as unmount happens.  Note that during relocation
@@ -795,7 +784,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
  * and deallocate the log as the aild references the log.
  */
 void
-xfs_log_unmount_dealloc(xfs_mount_t *mp)
+xfs_log_unmount(xfs_mount_t *mp)
 {
 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
@@ -1109,7 +1098,7 @@ xlog_bdstrat_cb(struct xfs_buf *bp)
 /*
  * Return size of each in-core log record buffer.
  *
- * All machines get 8 x 32KB buffers by default, unless tuned otherwise.
+ * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
  *
  * If the filesystem blocksize is too large, we may need to choose a
  * larger size since the directory code currently logs entire blocks.
@@ -1139,8 +1128,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
 	}
 
 	if (xfs_sb_version_haslogv2(&mp->m_sb)) {
-		/* # headers = size / 32K
-		 * one header holds cycles from 32K of data
+		/* # headers = size / 32k
+		 * one header holds cycles from 32k of data
 		 */
 
 		xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
@@ -1156,7 +1145,7 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
 		goto done;
 	}
 
-	/* All machines use 32KB buffers by default. */
+	/* All machines use 32kB buffers by default. */
 	log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
 	log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
 
@@ -1164,32 +1153,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
 	log->l_iclog_hsize = BBSIZE;
 	log->l_iclog_heads = 1;
 
-	/*
-	 * For 16KB, we use 3 32KB buffers.  For 32KB block sizes, we use
-	 * 4 32KB buffers.  For 64KB block sizes, we use 8 32KB buffers.
-	 */
-	if (mp->m_sb.sb_blocksize >= 16*1024) {
-		log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
-		log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
-		if (mp->m_logbufs <= 0) {
-			switch (mp->m_sb.sb_blocksize) {
-			case 16*1024:			/* 16 KB */
-				log->l_iclog_bufs = 3;
-				break;
-			case 32*1024:			/* 32 KB */
-				log->l_iclog_bufs = 4;
-				break;
-			case 64*1024:			/* 64 KB */
-				log->l_iclog_bufs = 8;
-				break;
-			default:
-				xlog_panic("XFS: Invalid blocksize");
-				break;
-			}
-		}
-	}
-
-done:	/* are we being asked to make the sizes selected above visible? */
+done:
+	/* are we being asked to make the sizes selected above visible? */
 	if (mp->m_logbufs == 0)
 		mp->m_logbufs = log->l_iclog_bufs;
 	if (mp->m_logbsize == 0)
@@ -3214,7 +3179,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
  */
 
 /*
- * Free a used ticket when it's refcount falls to zero.
+ * Free a used ticket when its refcount falls to zero.
  */
 void
 xfs_log_ticket_put(
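
The xfs_log_mount() rework above replaces a single catch-all "error:" label with a chain of labels, each undoing exactly the setup steps that had succeeded, in reverse order. A minimal sketch of that goto-unwind idiom (stand-alone C; setup_a/setup_b/teardown_a are placeholder names, not kernel functions):

#include <stdio.h>

static int setup_a(void) { return 0; }
static int setup_b(void) { return -1; }	/* simulate a failure */
static void teardown_a(void) { puts("teardown_a"); }

static int mount_like(void)
{
	int error;

	error = setup_a();
	if (error)
		goto out;		/* nothing to undo yet */

	error = setup_b();
	if (error)
		goto out_teardown_a;	/* undo only what succeeded */

	return 0;

 out_teardown_a:
	teardown_a();
 out:
	return error;
}

int main(void)
{
	printf("mount_like() = %d\n", mount_like());
	return 0;
}

The same shape recurs in the xfs_mountfs() changes later in this patch, where the old numbered error1..error4 labels become descriptive out_* labels.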
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 8a3e84e900a3..d0c9baa50b1a 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -170,9 +170,8 @@ int xfs_log_write(struct xfs_mount *mp,
 			     int		nentries,
 			     xfs_log_ticket_t	ticket,
 			     xfs_lsn_t		*start_lsn);
-int	  xfs_log_unmount(struct xfs_mount *mp);
 int	  xfs_log_unmount_write(struct xfs_mount *mp);
-void	  xfs_log_unmount_dealloc(struct xfs_mount *mp);
+void	  xfs_log_unmount(struct xfs_mount *mp);
 int	  xfs_log_force_umount(struct xfs_mount *mp, int logerror);
 int	  xfs_log_need_covered(struct xfs_mount *mp);
 
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 654167be0efb..bcad5f4c1fd1 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -359,7 +359,7 @@ typedef struct xlog_in_core {
359 int ic_size; 359 int ic_size;
360 int ic_offset; 360 int ic_offset;
361 int ic_bwritecnt; 361 int ic_bwritecnt;
362 ushort_t ic_state; 362 unsigned short ic_state;
363 char *ic_datap; /* pointer to iclog data */ 363 char *ic_datap; /* pointer to iclog data */
364#ifdef XFS_LOG_TRACE 364#ifdef XFS_LOG_TRACE
365 struct ktrace *ic_trace; 365 struct ktrace *ic_trace;
@@ -455,7 +455,6 @@ extern void xlog_recover_process_iunlinks(xlog_t *log);
455 455
456extern struct xfs_buf *xlog_get_bp(xlog_t *, int); 456extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
457extern void xlog_put_bp(struct xfs_buf *); 457extern void xlog_put_bp(struct xfs_buf *);
458extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
459 458
460extern kmem_zone_t *xfs_log_ticket_zone; 459extern kmem_zone_t *xfs_log_ticket_zone;
461 460
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b1047de2fffd..7ba450116d4f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -94,12 +94,30 @@ xlog_put_bp(
94 xfs_buf_free(bp); 94 xfs_buf_free(bp);
95} 95}
96 96
97STATIC xfs_caddr_t
98xlog_align(
99 xlog_t *log,
100 xfs_daddr_t blk_no,
101 int nbblks,
102 xfs_buf_t *bp)
103{
104 xfs_caddr_t ptr;
105
106 if (!log->l_sectbb_log)
107 return XFS_BUF_PTR(bp);
108
109 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
110 ASSERT(XFS_BUF_SIZE(bp) >=
111 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
112 return ptr;
113}
114
97 115
98/* 116/*
99 * nbblks should be uint, but oh well. Just want to catch that 32-bit length. 117 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
100 */ 118 */
101int 119STATIC int
102xlog_bread( 120xlog_bread_noalign(
103 xlog_t *log, 121 xlog_t *log,
104 xfs_daddr_t blk_no, 122 xfs_daddr_t blk_no,
105 int nbblks, 123 int nbblks,
@@ -137,6 +155,24 @@ xlog_bread(
137 return error; 155 return error;
138} 156}
139 157
158STATIC int
159xlog_bread(
160 xlog_t *log,
161 xfs_daddr_t blk_no,
162 int nbblks,
163 xfs_buf_t *bp,
164 xfs_caddr_t *offset)
165{
166 int error;
167
168 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
169 if (error)
170 return error;
171
172 *offset = xlog_align(log, blk_no, nbblks, bp);
173 return 0;
174}
175
140/* 176/*
141 * Write out the buffer at the given block for the given number of blocks. 177 * Write out the buffer at the given block for the given number of blocks.
142 * The buffer is kept locked across the write and is returned locked. 178 * The buffer is kept locked across the write and is returned locked.
@@ -180,24 +216,6 @@ xlog_bwrite(
180 return error; 216 return error;
181} 217}
182 218
183STATIC xfs_caddr_t
184xlog_align(
185 xlog_t *log,
186 xfs_daddr_t blk_no,
187 int nbblks,
188 xfs_buf_t *bp)
189{
190 xfs_caddr_t ptr;
191
192 if (!log->l_sectbb_log)
193 return XFS_BUF_PTR(bp);
194
195 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
196 ASSERT(XFS_BUF_SIZE(bp) >=
197 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
198 return ptr;
199}
200
201#ifdef DEBUG 219#ifdef DEBUG
202/* 220/*
203 * dump debug superblock and log record information 221 * dump debug superblock and log record information
@@ -211,11 +229,11 @@ xlog_header_check_dump(
211 229
212 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); 230 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__);
213 for (b = 0; b < 16; b++) 231 for (b = 0; b < 16; b++)
214 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); 232 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]);
215 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); 233 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
216 cmn_err(CE_DEBUG, " log : uuid = "); 234 cmn_err(CE_DEBUG, " log : uuid = ");
217 for (b = 0; b < 16; b++) 235 for (b = 0; b < 16; b++)
218 cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]); 236 cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]);
219 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); 237 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
220} 238}
221#else 239#else
@@ -321,9 +339,9 @@ xlog_find_cycle_start(
321 339
322 mid_blk = BLK_AVG(first_blk, *last_blk); 340 mid_blk = BLK_AVG(first_blk, *last_blk);
323 while (mid_blk != first_blk && mid_blk != *last_blk) { 341 while (mid_blk != first_blk && mid_blk != *last_blk) {
324 if ((error = xlog_bread(log, mid_blk, 1, bp))) 342 error = xlog_bread(log, mid_blk, 1, bp, &offset);
343 if (error)
325 return error; 344 return error;
326 offset = xlog_align(log, mid_blk, 1, bp);
327 mid_cycle = xlog_get_cycle(offset); 345 mid_cycle = xlog_get_cycle(offset);
328 if (mid_cycle == cycle) { 346 if (mid_cycle == cycle) {
329 *last_blk = mid_blk; 347 *last_blk = mid_blk;
@@ -379,10 +397,10 @@ xlog_find_verify_cycle(
379 397
380 bcount = min(bufblks, (start_blk + nbblks - i)); 398 bcount = min(bufblks, (start_blk + nbblks - i));
381 399
382 if ((error = xlog_bread(log, i, bcount, bp))) 400 error = xlog_bread(log, i, bcount, bp, &buf);
401 if (error)
383 goto out; 402 goto out;
384 403
385 buf = xlog_align(log, i, bcount, bp);
386 for (j = 0; j < bcount; j++) { 404 for (j = 0; j < bcount; j++) {
387 cycle = xlog_get_cycle(buf); 405 cycle = xlog_get_cycle(buf);
388 if (cycle == stop_on_cycle_no) { 406 if (cycle == stop_on_cycle_no) {
@@ -436,9 +454,9 @@ xlog_find_verify_log_record(
436 return ENOMEM; 454 return ENOMEM;
437 smallmem = 1; 455 smallmem = 1;
438 } else { 456 } else {
439 if ((error = xlog_bread(log, start_blk, num_blks, bp))) 457 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
458 if (error)
440 goto out; 459 goto out;
441 offset = xlog_align(log, start_blk, num_blks, bp);
442 offset += ((num_blks - 1) << BBSHIFT); 460 offset += ((num_blks - 1) << BBSHIFT);
443 } 461 }
444 462
@@ -453,9 +471,9 @@ xlog_find_verify_log_record(
453 } 471 }
454 472
455 if (smallmem) { 473 if (smallmem) {
456 if ((error = xlog_bread(log, i, 1, bp))) 474 error = xlog_bread(log, i, 1, bp, &offset);
475 if (error)
457 goto out; 476 goto out;
458 offset = xlog_align(log, i, 1, bp);
459 } 477 }
460 478
461 head = (xlog_rec_header_t *)offset; 479 head = (xlog_rec_header_t *)offset;
@@ -559,15 +577,18 @@ xlog_find_head(
559 bp = xlog_get_bp(log, 1); 577 bp = xlog_get_bp(log, 1);
560 if (!bp) 578 if (!bp)
561 return ENOMEM; 579 return ENOMEM;
562 if ((error = xlog_bread(log, 0, 1, bp))) 580
581 error = xlog_bread(log, 0, 1, bp, &offset);
582 if (error)
563 goto bp_err; 583 goto bp_err;
564 offset = xlog_align(log, 0, 1, bp); 584
565 first_half_cycle = xlog_get_cycle(offset); 585 first_half_cycle = xlog_get_cycle(offset);
566 586
567 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ 587 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
568 if ((error = xlog_bread(log, last_blk, 1, bp))) 588 error = xlog_bread(log, last_blk, 1, bp, &offset);
589 if (error)
569 goto bp_err; 590 goto bp_err;
570 offset = xlog_align(log, last_blk, 1, bp); 591
571 last_half_cycle = xlog_get_cycle(offset); 592 last_half_cycle = xlog_get_cycle(offset);
572 ASSERT(last_half_cycle != 0); 593 ASSERT(last_half_cycle != 0);
573 594
@@ -817,9 +838,10 @@ xlog_find_tail(
817 if (!bp) 838 if (!bp)
818 return ENOMEM; 839 return ENOMEM;
819 if (*head_blk == 0) { /* special case */ 840 if (*head_blk == 0) { /* special case */
820 if ((error = xlog_bread(log, 0, 1, bp))) 841 error = xlog_bread(log, 0, 1, bp, &offset);
842 if (error)
821 goto bread_err; 843 goto bread_err;
822 offset = xlog_align(log, 0, 1, bp); 844
823 if (xlog_get_cycle(offset) == 0) { 845 if (xlog_get_cycle(offset) == 0) {
824 *tail_blk = 0; 846 *tail_blk = 0;
825 /* leave all other log inited values alone */ 847 /* leave all other log inited values alone */
@@ -832,9 +854,10 @@ xlog_find_tail(
832 */ 854 */
833 ASSERT(*head_blk < INT_MAX); 855 ASSERT(*head_blk < INT_MAX);
834 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 856 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
835 if ((error = xlog_bread(log, i, 1, bp))) 857 error = xlog_bread(log, i, 1, bp, &offset);
858 if (error)
836 goto bread_err; 859 goto bread_err;
837 offset = xlog_align(log, i, 1, bp); 860
838 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 861 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
839 found = 1; 862 found = 1;
840 break; 863 break;
@@ -848,9 +871,10 @@ xlog_find_tail(
848 */ 871 */
849 if (!found) { 872 if (!found) {
850 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 873 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
851 if ((error = xlog_bread(log, i, 1, bp))) 874 error = xlog_bread(log, i, 1, bp, &offset);
875 if (error)
852 goto bread_err; 876 goto bread_err;
853 offset = xlog_align(log, i, 1, bp); 877
854 if (XLOG_HEADER_MAGIC_NUM == 878 if (XLOG_HEADER_MAGIC_NUM ==
855 be32_to_cpu(*(__be32 *)offset)) { 879 be32_to_cpu(*(__be32 *)offset)) {
856 found = 2; 880 found = 2;
@@ -922,10 +946,10 @@ xlog_find_tail(
922 if (*head_blk == after_umount_blk && 946 if (*head_blk == after_umount_blk &&
923 be32_to_cpu(rhead->h_num_logops) == 1) { 947 be32_to_cpu(rhead->h_num_logops) == 1) {
924 umount_data_blk = (i + hblks) % log->l_logBBsize; 948 umount_data_blk = (i + hblks) % log->l_logBBsize;
925 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { 949 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
950 if (error)
926 goto bread_err; 951 goto bread_err;
927 } 952
928 offset = xlog_align(log, umount_data_blk, 1, bp);
929 op_head = (xlog_op_header_t *)offset; 953 op_head = (xlog_op_header_t *)offset;
930 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 954 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
931 /* 955 /*
@@ -1017,9 +1041,10 @@ xlog_find_zeroed(
1017 bp = xlog_get_bp(log, 1); 1041 bp = xlog_get_bp(log, 1);
1018 if (!bp) 1042 if (!bp)
1019 return ENOMEM; 1043 return ENOMEM;
1020 if ((error = xlog_bread(log, 0, 1, bp))) 1044 error = xlog_bread(log, 0, 1, bp, &offset);
1045 if (error)
1021 goto bp_err; 1046 goto bp_err;
1022 offset = xlog_align(log, 0, 1, bp); 1047
1023 first_cycle = xlog_get_cycle(offset); 1048 first_cycle = xlog_get_cycle(offset);
1024 if (first_cycle == 0) { /* completely zeroed log */ 1049 if (first_cycle == 0) { /* completely zeroed log */
1025 *blk_no = 0; 1050 *blk_no = 0;
@@ -1028,9 +1053,10 @@ xlog_find_zeroed(
1028 } 1053 }
1029 1054
1030 /* check partially zeroed log */ 1055 /* check partially zeroed log */
1031 if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) 1056 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1057 if (error)
1032 goto bp_err; 1058 goto bp_err;
1033 offset = xlog_align(log, log_bbnum-1, 1, bp); 1059
1034 last_cycle = xlog_get_cycle(offset); 1060 last_cycle = xlog_get_cycle(offset);
1035 if (last_cycle != 0) { /* log completely written to */ 1061 if (last_cycle != 0) { /* log completely written to */
1036 xlog_put_bp(bp); 1062 xlog_put_bp(bp);
@@ -1152,10 +1178,10 @@ xlog_write_log_records(
1152 */ 1178 */
1153 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1179 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1154 if (balign != start_block) { 1180 if (balign != start_block) {
1155 if ((error = xlog_bread(log, start_block, 1, bp))) { 1181 error = xlog_bread_noalign(log, start_block, 1, bp);
1156 xlog_put_bp(bp); 1182 if (error)
1157 return error; 1183 goto out_put_bp;
1158 } 1184
1159 j = start_block - balign; 1185 j = start_block - balign;
1160 } 1186 }
1161 1187
@@ -1175,10 +1201,14 @@ xlog_write_log_records(
1175 balign = BBTOB(ealign - start_block); 1201 balign = BBTOB(ealign - start_block);
1176 error = XFS_BUF_SET_PTR(bp, offset + balign, 1202 error = XFS_BUF_SET_PTR(bp, offset + balign,
1177 BBTOB(sectbb)); 1203 BBTOB(sectbb));
1178 if (!error) 1204 if (error)
1179 error = xlog_bread(log, ealign, sectbb, bp); 1205 break;
1180 if (!error) 1206
1181 error = XFS_BUF_SET_PTR(bp, offset, bufblks); 1207 error = xlog_bread_noalign(log, ealign, sectbb, bp);
1208 if (error)
1209 break;
1210
1211 error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1182 if (error) 1212 if (error)
1183 break; 1213 break;
1184 } 1214 }
@@ -1195,6 +1225,8 @@ xlog_write_log_records(
1195 start_block += endcount; 1225 start_block += endcount;
1196 j = 0; 1226 j = 0;
1197 } 1227 }
1228
1229 out_put_bp:
1198 xlog_put_bp(bp); 1230 xlog_put_bp(bp);
1199 return error; 1231 return error;
1200} 1232}
@@ -1455,10 +1487,19 @@ xlog_recover_add_to_trans(
1455 item = item->ri_prev; 1487 item = item->ri_prev;
1456 1488
1457 if (item->ri_total == 0) { /* first region to be added */ 1489 if (item->ri_total == 0) { /* first region to be added */
1458 item->ri_total = in_f->ilf_size; 1490 if (in_f->ilf_size == 0 ||
1459 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); 1491 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1460 item->ri_buf = kmem_zalloc((item->ri_total * 1492 xlog_warn(
1461 sizeof(xfs_log_iovec_t)), KM_SLEEP); 1493 "XFS: bad number of regions (%d) in inode log format",
1494 in_f->ilf_size);
1495 ASSERT(0);
1496 return XFS_ERROR(EIO);
1497 }
1498
1499 item->ri_total = in_f->ilf_size;
1500 item->ri_buf =
1501 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1502 KM_SLEEP);
1462 } 1503 }
1463 ASSERT(item->ri_total > item->ri_cnt); 1504 ASSERT(item->ri_total > item->ri_cnt);
1464 /* Description region is ri_buf[0] */ 1505 /* Description region is ri_buf[0] */
@@ -2502,16 +2543,10 @@ xlog_recover_do_inode_trans(
2502 } 2543 }
2503 2544
2504write_inode_buffer: 2545write_inode_buffer:
2505 if (ITEM_TYPE(item) == XFS_LI_INODE) { 2546 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2506 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2547 bp->b_mount = mp;
2507 bp->b_mount = mp; 2548 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2508 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2549 xfs_bdwrite(mp, bp);
2509 xfs_bdwrite(mp, bp);
2510 } else {
2511 XFS_BUF_STALE(bp);
2512 error = xfs_bwrite(mp, bp);
2513 }
2514
2515error: 2550error:
2516 if (need_free) 2551 if (need_free)
2517 kmem_free(in_f); 2552 kmem_free(in_f);
@@ -2760,51 +2795,48 @@ xlog_recover_do_trans(
2760 int error = 0; 2795 int error = 0;
2761 xlog_recover_item_t *item, *first_item; 2796 xlog_recover_item_t *item, *first_item;
2762 2797
2763 if ((error = xlog_recover_reorder_trans(trans))) 2798 error = xlog_recover_reorder_trans(trans);
2799 if (error)
2764 return error; 2800 return error;
2801
2765 first_item = item = trans->r_itemq; 2802 first_item = item = trans->r_itemq;
2766 do { 2803 do {
2767 /* 2804 switch (ITEM_TYPE(item)) {
2768 * we don't need to worry about the block number being 2805 case XFS_LI_BUF:
2769 * truncated in > 1 TB buffers because in user-land, 2806 error = xlog_recover_do_buffer_trans(log, item, pass);
2770 * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so 2807 break;
2771 * the blknos will get through the user-mode buffer 2808 case XFS_LI_INODE:
2772 * cache properly. The only bad case is o32 kernels 2809 error = xlog_recover_do_inode_trans(log, item, pass);
2773 * where xfs_daddr_t is 32-bits but mount will warn us 2810 break;
2774 * off a > 1 TB filesystem before we get here. 2811 case XFS_LI_EFI:
2775 */ 2812 error = xlog_recover_do_efi_trans(log, item,
2776 if ((ITEM_TYPE(item) == XFS_LI_BUF)) { 2813 trans->r_lsn, pass);
2777 if ((error = xlog_recover_do_buffer_trans(log, item, 2814 break;
2778 pass))) 2815 case XFS_LI_EFD:
2779 break;
2780 } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
2781 if ((error = xlog_recover_do_inode_trans(log, item,
2782 pass)))
2783 break;
2784 } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2785 if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2786 pass)))
2787 break;
2788 } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2789 xlog_recover_do_efd_trans(log, item, pass); 2816 xlog_recover_do_efd_trans(log, item, pass);
2790 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) { 2817 error = 0;
2791 if ((error = xlog_recover_do_dquot_trans(log, item, 2818 break;
2792 pass))) 2819 case XFS_LI_DQUOT:
2793 break; 2820 error = xlog_recover_do_dquot_trans(log, item, pass);
2794 } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) { 2821 break;
2795 if ((error = xlog_recover_do_quotaoff_trans(log, item, 2822 case XFS_LI_QUOTAOFF:
2796 pass))) 2823 error = xlog_recover_do_quotaoff_trans(log, item,
2797 break; 2824 pass);
2798 } else { 2825 break;
2799 xlog_warn("XFS: xlog_recover_do_trans"); 2826 default:
2827 xlog_warn(
2828 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2800 ASSERT(0); 2829 ASSERT(0);
2801 error = XFS_ERROR(EIO); 2830 error = XFS_ERROR(EIO);
2802 break; 2831 break;
2803 } 2832 }
2833
2834 if (error)
2835 return error;
2804 item = item->ri_next; 2836 item = item->ri_next;
2805 } while (first_item != item); 2837 } while (first_item != item);
2806 2838
2807 return error; 2839 return 0;
2808} 2840}
2809 2841
2810/* 2842/*
@@ -3481,9 +3513,11 @@ xlog_do_recovery_pass(
3481 hbp = xlog_get_bp(log, 1); 3513 hbp = xlog_get_bp(log, 1);
3482 if (!hbp) 3514 if (!hbp)
3483 return ENOMEM; 3515 return ENOMEM;
3484 if ((error = xlog_bread(log, tail_blk, 1, hbp))) 3516
3517 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3518 if (error)
3485 goto bread_err1; 3519 goto bread_err1;
3486 offset = xlog_align(log, tail_blk, 1, hbp); 3520
3487 rhead = (xlog_rec_header_t *)offset; 3521 rhead = (xlog_rec_header_t *)offset;
3488 error = xlog_valid_rec_header(log, rhead, tail_blk); 3522 error = xlog_valid_rec_header(log, rhead, tail_blk);
3489 if (error) 3523 if (error)
@@ -3517,9 +3551,10 @@ xlog_do_recovery_pass(
3517 memset(rhash, 0, sizeof(rhash)); 3551 memset(rhash, 0, sizeof(rhash));
3518 if (tail_blk <= head_blk) { 3552 if (tail_blk <= head_blk) {
3519 for (blk_no = tail_blk; blk_no < head_blk; ) { 3553 for (blk_no = tail_blk; blk_no < head_blk; ) {
3520 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3554 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3555 if (error)
3521 goto bread_err2; 3556 goto bread_err2;
3522 offset = xlog_align(log, blk_no, hblks, hbp); 3557
3523 rhead = (xlog_rec_header_t *)offset; 3558 rhead = (xlog_rec_header_t *)offset;
3524 error = xlog_valid_rec_header(log, rhead, blk_no); 3559 error = xlog_valid_rec_header(log, rhead, blk_no);
3525 if (error) 3560 if (error)
@@ -3527,10 +3562,11 @@ xlog_do_recovery_pass(
3527 3562
3528 /* blocks in data section */ 3563 /* blocks in data section */
3529 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3564 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3530 error = xlog_bread(log, blk_no + hblks, bblks, dbp); 3565 error = xlog_bread(log, blk_no + hblks, bblks, dbp,
3566 &offset);
3531 if (error) 3567 if (error)
3532 goto bread_err2; 3568 goto bread_err2;
3533 offset = xlog_align(log, blk_no + hblks, bblks, dbp); 3569
3534 xlog_unpack_data(rhead, offset, log); 3570 xlog_unpack_data(rhead, offset, log);
3535 if ((error = xlog_recover_process_data(log, 3571 if ((error = xlog_recover_process_data(log,
3536 rhash, rhead, offset, pass))) 3572 rhash, rhead, offset, pass)))
@@ -3553,10 +3589,10 @@ xlog_do_recovery_pass(
3553 wrapped_hblks = 0; 3589 wrapped_hblks = 0;
3554 if (blk_no + hblks <= log->l_logBBsize) { 3590 if (blk_no + hblks <= log->l_logBBsize) {
3555 /* Read header in one read */ 3591 /* Read header in one read */
3556 error = xlog_bread(log, blk_no, hblks, hbp); 3592 error = xlog_bread(log, blk_no, hblks, hbp,
3593 &offset);
3557 if (error) 3594 if (error)
3558 goto bread_err2; 3595 goto bread_err2;
3559 offset = xlog_align(log, blk_no, hblks, hbp);
3560 } else { 3596 } else {
3561 /* This LR is split across physical log end */ 3597 /* This LR is split across physical log end */
3562 if (blk_no != log->l_logBBsize) { 3598 if (blk_no != log->l_logBBsize) {
@@ -3564,12 +3600,13 @@ xlog_do_recovery_pass(
3564 ASSERT(blk_no <= INT_MAX); 3600 ASSERT(blk_no <= INT_MAX);
3565 split_hblks = log->l_logBBsize - (int)blk_no; 3601 split_hblks = log->l_logBBsize - (int)blk_no;
3566 ASSERT(split_hblks > 0); 3602 ASSERT(split_hblks > 0);
3567 if ((error = xlog_bread(log, blk_no, 3603 error = xlog_bread(log, blk_no,
3568 split_hblks, hbp))) 3604 split_hblks, hbp,
3605 &offset);
3606 if (error)
3569 goto bread_err2; 3607 goto bread_err2;
3570 offset = xlog_align(log, blk_no,
3571 split_hblks, hbp);
3572 } 3608 }
3609
3573 /* 3610 /*
3574 * Note: this black magic still works with 3611 * Note: this black magic still works with
3575 * large sector sizes (non-512) only because: 3612 * large sector sizes (non-512) only because:
@@ -3587,14 +3624,19 @@ xlog_do_recovery_pass(
3587 error = XFS_BUF_SET_PTR(hbp, 3624 error = XFS_BUF_SET_PTR(hbp,
3588 bufaddr + BBTOB(split_hblks), 3625 bufaddr + BBTOB(split_hblks),
3589 BBTOB(hblks - split_hblks)); 3626 BBTOB(hblks - split_hblks));
3590 if (!error) 3627 if (error)
3591 error = xlog_bread(log, 0, 3628 goto bread_err2;
3592 wrapped_hblks, hbp); 3629
3593 if (!error) 3630 error = xlog_bread_noalign(log, 0,
3594 error = XFS_BUF_SET_PTR(hbp, bufaddr, 3631 wrapped_hblks, hbp);
3632 if (error)
3633 goto bread_err2;
3634
3635 error = XFS_BUF_SET_PTR(hbp, bufaddr,
3595 BBTOB(hblks)); 3636 BBTOB(hblks));
3596 if (error) 3637 if (error)
3597 goto bread_err2; 3638 goto bread_err2;
3639
3598 if (!offset) 3640 if (!offset)
3599 offset = xlog_align(log, 0, 3641 offset = xlog_align(log, 0,
3600 wrapped_hblks, hbp); 3642 wrapped_hblks, hbp);
@@ -3610,10 +3652,10 @@ xlog_do_recovery_pass(
3610 3652
3611 /* Read in data for log record */ 3653 /* Read in data for log record */
3612 if (blk_no + bblks <= log->l_logBBsize) { 3654 if (blk_no + bblks <= log->l_logBBsize) {
3613 error = xlog_bread(log, blk_no, bblks, dbp); 3655 error = xlog_bread(log, blk_no, bblks, dbp,
3656 &offset);
3614 if (error) 3657 if (error)
3615 goto bread_err2; 3658 goto bread_err2;
3616 offset = xlog_align(log, blk_no, bblks, dbp);
3617 } else { 3659 } else {
3618 /* This log record is split across the 3660 /* This log record is split across the
3619 * physical end of log */ 3661 * physical end of log */
@@ -3627,12 +3669,13 @@ xlog_do_recovery_pass(
3627 split_bblks = 3669 split_bblks =
3628 log->l_logBBsize - (int)blk_no; 3670 log->l_logBBsize - (int)blk_no;
3629 ASSERT(split_bblks > 0); 3671 ASSERT(split_bblks > 0);
3630 if ((error = xlog_bread(log, blk_no, 3672 error = xlog_bread(log, blk_no,
3631 split_bblks, dbp))) 3673 split_bblks, dbp,
3674 &offset);
3675 if (error)
3632 goto bread_err2; 3676 goto bread_err2;
3633 offset = xlog_align(log, blk_no,
3634 split_bblks, dbp);
3635 } 3677 }
3678
3636 /* 3679 /*
3637 * Note: this black magic still works with 3680 * Note: this black magic still works with
3638 * large sector sizes (non-512) only because: 3681 * large sector sizes (non-512) only because:
@@ -3649,15 +3692,19 @@ xlog_do_recovery_pass(
3649 error = XFS_BUF_SET_PTR(dbp, 3692 error = XFS_BUF_SET_PTR(dbp,
3650 bufaddr + BBTOB(split_bblks), 3693 bufaddr + BBTOB(split_bblks),
3651 BBTOB(bblks - split_bblks)); 3694 BBTOB(bblks - split_bblks));
3652 if (!error)
3653 error = xlog_bread(log, wrapped_hblks,
3654 bblks - split_bblks,
3655 dbp);
3656 if (!error)
3657 error = XFS_BUF_SET_PTR(dbp, bufaddr,
3658 h_size);
3659 if (error) 3695 if (error)
3660 goto bread_err2; 3696 goto bread_err2;
3697
3698 error = xlog_bread_noalign(log, wrapped_hblks,
3699 bblks - split_bblks,
3700 dbp);
3701 if (error)
3702 goto bread_err2;
3703
3704 error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
3705 if (error)
3706 goto bread_err2;
3707
3661 if (!offset) 3708 if (!offset)
3662 offset = xlog_align(log, wrapped_hblks, 3709 offset = xlog_align(log, wrapped_hblks,
3663 bblks - split_bblks, dbp); 3710 bblks - split_bblks, dbp);
@@ -3674,17 +3721,21 @@ xlog_do_recovery_pass(
3674 3721
3675 /* read first part of physical log */ 3722 /* read first part of physical log */
3676 while (blk_no < head_blk) { 3723 while (blk_no < head_blk) {
3677 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3724 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3725 if (error)
3678 goto bread_err2; 3726 goto bread_err2;
3679 offset = xlog_align(log, blk_no, hblks, hbp); 3727
3680 rhead = (xlog_rec_header_t *)offset; 3728 rhead = (xlog_rec_header_t *)offset;
3681 error = xlog_valid_rec_header(log, rhead, blk_no); 3729 error = xlog_valid_rec_header(log, rhead, blk_no);
3682 if (error) 3730 if (error)
3683 goto bread_err2; 3731 goto bread_err2;
3732
3684 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3733 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3685 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) 3734 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3735 &offset);
3736 if (error)
3686 goto bread_err2; 3737 goto bread_err2;
3687 offset = xlog_align(log, blk_no+hblks, bblks, dbp); 3738
3688 xlog_unpack_data(rhead, offset, log); 3739 xlog_unpack_data(rhead, offset, log);
3689 if ((error = xlog_recover_process_data(log, rhash, 3740 if ((error = xlog_recover_process_data(log, rhash,
3690 rhead, offset, pass))) 3741 rhead, offset, pass)))
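
The recurring change in this file is a wrapper pattern: xlog_bread() now composes the raw read (xlog_bread_noalign) with the sector-alignment fixup (xlog_align) and hands back the adjusted data pointer, so dozens of call sites stop repeating the two-step dance. A hedged stand-alone model of that composition (toy numbers and names, not the kernel implementation):

#include <stdio.h>
#include <string.h>

#define SECT_MASK 3	/* toy geometry: 4-block sectors, 512-byte blocks */

static char buffer[4096];

/* stand-in for the rounded-down, rounded-up raw device read */
static int read_noalign(int blk_no, int nbblks, char *bp)
{
	memset(bp, 0, (size_t)(nbblks + SECT_MASK) * 512);
	return 0;
}

/* adjust past the blocks read before the one actually requested */
static char *align(int blk_no, char *bp)
{
	return bp + (blk_no & SECT_MASK) * 512;
}

/* the composed helper: one call, error-checked, aligned pointer out */
static int bread(int blk_no, int nbblks, char *bp, char **offset)
{
	int error = read_noalign(blk_no, nbblks, bp);
	if (error)
		return error;
	*offset = align(blk_no, bp);
	return 0;
}

int main(void)
{
	char *offset;

	if (bread(6, 1, buffer, &offset) == 0)
		printf("data starts %ld bytes into the buffer\n",
		       (long)(offset - buffer));
	return 0;
}

The unaligned variant survives as xlog_bread_noalign() for the few callers (buffer-splicing across the physical end of the log) that manage the pointer themselves.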
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 35300250e86d..b101990df027 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,6 @@
45#include "xfs_fsops.h" 45#include "xfs_fsops.h"
46#include "xfs_utils.h" 46#include "xfs_utils.h"
47 47
48STATIC int xfs_uuid_mount(xfs_mount_t *);
49STATIC void xfs_unmountfs_wait(xfs_mount_t *); 48STATIC void xfs_unmountfs_wait(xfs_mount_t *);
50 49
51 50
@@ -121,6 +120,84 @@ static const struct {
121 { sizeof(xfs_sb_t), 0 } 120 { sizeof(xfs_sb_t), 0 }
122}; 121};
123 122
123static DEFINE_MUTEX(xfs_uuid_table_mutex);
124static int xfs_uuid_table_size;
125static uuid_t *xfs_uuid_table;
126
127/*
128 * See if the UUID is unique among mounted XFS filesystems.
129 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
130 */
131STATIC int
132xfs_uuid_mount(
133 struct xfs_mount *mp)
134{
135 uuid_t *uuid = &mp->m_sb.sb_uuid;
136 int hole, i;
137
138 if (mp->m_flags & XFS_MOUNT_NOUUID)
139 return 0;
140
141 if (uuid_is_nil(uuid)) {
142 cmn_err(CE_WARN,
143 "XFS: Filesystem %s has nil UUID - can't mount",
144 mp->m_fsname);
145 return XFS_ERROR(EINVAL);
146 }
147
148 mutex_lock(&xfs_uuid_table_mutex);
149 for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
150 if (uuid_is_nil(&xfs_uuid_table[i])) {
151 hole = i;
152 continue;
153 }
154 if (uuid_equal(uuid, &xfs_uuid_table[i]))
155 goto out_duplicate;
156 }
157
158 if (hole < 0) {
159 xfs_uuid_table = kmem_realloc(xfs_uuid_table,
160 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
161 xfs_uuid_table_size * sizeof(*xfs_uuid_table),
162 KM_SLEEP);
163 hole = xfs_uuid_table_size++;
164 }
165 xfs_uuid_table[hole] = *uuid;
166 mutex_unlock(&xfs_uuid_table_mutex);
167
168 return 0;
169
170 out_duplicate:
171 mutex_unlock(&xfs_uuid_table_mutex);
172 cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount",
173 mp->m_fsname);
174 return XFS_ERROR(EINVAL);
175}
176
177STATIC void
178xfs_uuid_unmount(
179 struct xfs_mount *mp)
180{
181 uuid_t *uuid = &mp->m_sb.sb_uuid;
182 int i;
183
184 if (mp->m_flags & XFS_MOUNT_NOUUID)
185 return;
186
187 mutex_lock(&xfs_uuid_table_mutex);
188 for (i = 0; i < xfs_uuid_table_size; i++) {
189 if (uuid_is_nil(&xfs_uuid_table[i]))
190 continue;
191 if (!uuid_equal(uuid, &xfs_uuid_table[i]))
192 continue;
193 memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
194 break;
195 }
196 ASSERT(i < xfs_uuid_table_size);
197 mutex_unlock(&xfs_uuid_table_mutex);
198}
199
200
124/* 201/*
125 * Free up the resources associated with a mount structure. Assume that 202 * Free up the resources associated with a mount structure. Assume that
126 * the structure was initially zeroed, so we can tell which fields got 203 * the structure was initially zeroed, so we can tell which fields got
@@ -256,6 +333,22 @@ xfs_mount_validate_sb(
 		return XFS_ERROR(ENOSYS);
 	}
 
+	/*
+	 * Currently only very few inode sizes are supported.
+	 */
+	switch (sbp->sb_inodesize) {
+	case 256:
+	case 512:
+	case 1024:
+	case 2048:
+		break;
+	default:
+		xfs_fs_mount_cmn_err(flags,
+			"inode size of %d bytes not supported",
+			sbp->sb_inodesize);
+		return XFS_ERROR(ENOSYS);
+	}
+
 	if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
 	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
 		xfs_fs_mount_cmn_err(flags,
@@ -574,32 +667,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
 	mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
 	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
 	mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
-	mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
 	mp->m_blockmask = sbp->sb_blocksize - 1;
 	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
 	mp->m_blockwmask = mp->m_blockwsize - 1;
 
-	/*
-	 * Setup for attributes, in case they get created.
-	 * This value is for inodes getting attributes for the first time,
-	 * the per-inode value is for old attribute values.
-	 */
-	ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
-	switch (sbp->sb_inodesize) {
-	case 256:
-		mp->m_attroffset = XFS_LITINO(mp) -
-				   XFS_BMDR_SPACE_CALC(MINABTPTRS);
-		break;
-	case 512:
-	case 1024:
-	case 2048:
-		mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
-		break;
-	default:
-		ASSERT(0);
-	}
-	ASSERT(mp->m_attroffset < XFS_LITINO(mp));
-
 	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
 	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
 	mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
@@ -645,7 +716,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
 	for (index = 0; index < agcount; index++) {
 		/*
 		 * read the agf, then the agi. This gets us
-		 * all the inforamtion we need and populates the
+		 * all the information we need and populates the
 		 * per-ag structures for us.
 		 */
 		error = xfs_alloc_pagf_init(mp, NULL, index, 0);
@@ -886,8 +957,6 @@ xfs_check_sizes(xfs_mount_t *mp)
 }
 
 /*
- * xfs_mountfs
- *
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
  *	- if we're a 32-bit kernel, do a size check on the superblock
@@ -905,7 +974,6 @@ xfs_mountfs(
 	xfs_inode_t	*rip;
 	__uint64_t	resblks;
 	uint		quotamount, quotaflags;
-	int		uuid_mounted = 0;
 	int		error = 0;
 
 	xfs_mount_common(mp, sbp);
@@ -960,7 +1028,7 @@
 	 */
 	error = xfs_update_alignment(mp);
 	if (error)
-		goto error1;
+		goto out;
 
 	xfs_alloc_compute_maxlevels(mp);
 	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
@@ -971,19 +1039,9 @@
 
 	mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
 
-	/*
-	 * XFS uses the uuid from the superblock as the unique
-	 * identifier for fsid.  We can not use the uuid from the volume
-	 * since a single partition filesystem is identical to a single
-	 * partition volume/filesystem.
-	 */
-	if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
-		if (xfs_uuid_mount(mp)) {
-			error = XFS_ERROR(EINVAL);
-			goto error1;
-		}
-		uuid_mounted=1;
-	}
+	error = xfs_uuid_mount(mp);
+	if (error)
+		goto out;
 
 	/*
 	 * Set the minimum read and write sizes
@@ -1007,7 +1065,7 @@
 	 */
 	error = xfs_check_sizes(mp);
 	if (error)
-		goto error1;
+		goto out_remove_uuid;
 
 	/*
 	 * Initialize realtime fields in the mount structure
@@ -1015,7 +1073,7 @@
 	error = xfs_rtmount_init(mp);
 	if (error) {
 		cmn_err(CE_WARN, "XFS: RT mount failed");
-		goto error1;
+		goto out_remove_uuid;
 	}
 
 	/*
@@ -1045,26 +1103,26 @@
 	mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
 				  KM_MAYFAIL);
 	if (!mp->m_perag)
-		goto error1;
+		goto out_remove_uuid;
 
 	mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
 
+	if (!sbp->sb_logblocks) {
+		cmn_err(CE_WARN, "XFS: no log defined");
+		XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
+		error = XFS_ERROR(EFSCORRUPTED);
+		goto out_free_perag;
+	}
+
 	/*
 	 * log's mount-time initialization. Perform 1st part recovery if needed
 	 */
-	if (likely(sbp->sb_logblocks > 0)) {	/* check for volume case */
-		error = xfs_log_mount(mp, mp->m_logdev_targp,
-				      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
-				      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
-		if (error) {
-			cmn_err(CE_WARN, "XFS: log mount failed");
-			goto error2;
-		}
-	} else {	/* No log has been defined */
-		cmn_err(CE_WARN, "XFS: no log defined");
-		XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
-		error = XFS_ERROR(EFSCORRUPTED);
-		goto error2;
+	error = xfs_log_mount(mp, mp->m_logdev_targp,
+			      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
+			      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
+	if (error) {
+		cmn_err(CE_WARN, "XFS: log mount failed");
+		goto out_free_perag;
 	}
 
 	/*
@@ -1086,15 +1144,14 @@
 	 * If we are currently making the filesystem, the initialisation will
 	 * fail as the perag data is in an undefined state.
 	 */
-
 	if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
 	    !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
 	     !mp->m_sb.sb_inprogress) {
 		error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
-		if (error) {
-			goto error2;
-		}
+		if (error)
+			goto out_free_perag;
 	}
+
 	/*
 	 * Get and sanity-check the root inode.
 	 * Save the pointer to it in the mount structure.
@@ -1102,7 +1159,7 @@
 	error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
 	if (error) {
 		cmn_err(CE_WARN, "XFS: failed to read root inode");
-		goto error3;
+		goto out_log_dealloc;
 	}
 
 	ASSERT(rip != NULL);
@@ -1116,7 +1173,7 @@
 		XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
 				 mp);
 		error = XFS_ERROR(EFSCORRUPTED);
-		goto error4;
+		goto out_rele_rip;
 	}
 	mp->m_rootip = rip;	/* save it */
 
@@ -1131,7 +1188,7 @@
 		 * Free up the root inode.
 		 */
 		cmn_err(CE_WARN, "XFS: failed to read RT inodes");
-		goto error4;
+		goto out_rele_rip;
 	}
 
 	/*
@@ -1143,7 +1200,7 @@
 		error = xfs_mount_log_sb(mp, mp->m_update_flags);
 		if (error) {
 			cmn_err(CE_WARN, "XFS: failed to write sb changes");
-			goto error4;
+			goto out_rtunmount;
 		}
 	}
 
@@ -1152,7 +1209,7 @@
 	 */
 	error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
 	if (error)
-		goto error4;
+		goto out_rtunmount;
 
 	/*
 	 * Finish recovering the file system.  This part needed to be
@@ -1162,7 +1219,7 @@
 	error = xfs_log_mount_finish(mp);
 	if (error) {
 		cmn_err(CE_WARN, "XFS: log mount finish failed");
-		goto error4;
+		goto out_rtunmount;
 	}
 
 	/*
@@ -1170,7 +1227,7 @@
 	 */
 	error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
 	if (error)
-		goto error4;
+		goto out_rtunmount;
 
 	/*
 	 * Now we are mounted, reserve a small amount of unused space for
@@ -1194,18 +1251,17 @@
 
 	return 0;
 
- error4:
-	/*
-	 * Free up the root inode.
-	 */
+ out_rtunmount:
+	xfs_rtunmount_inodes(mp);
+ out_rele_rip:
 	IRELE(rip);
- error3:
-	xfs_log_unmount_dealloc(mp);
- error2:
+ out_log_dealloc:
+	xfs_log_unmount(mp);
+ out_free_perag:
 	xfs_free_perag(mp);
- error1:
-	if (uuid_mounted)
-		uuid_table_remove(&mp->m_sb.sb_uuid);
+ out_remove_uuid:
+	xfs_uuid_unmount(mp);
+ out:
 	return error;
 }
 
@@ -1226,15 +1282,12 @@ xfs_unmountfs(
 	 */
 	XFS_QM_UNMOUNT(mp);
 
-	if (mp->m_rbmip)
-		IRELE(mp->m_rbmip);
-	if (mp->m_rsumip)
-		IRELE(mp->m_rsumip);
+	xfs_rtunmount_inodes(mp);
 	IRELE(mp->m_rootip);
 
 	/*
 	 * We can potentially deadlock here if we have an inode cluster
-	 * that has been freed has it's buffer still pinned in memory because
+	 * that has been freed has its buffer still pinned in memory because
 	 * the transaction is still sitting in a iclog. The stale inodes
 	 * on that buffer will have their flush locks held until the
 	 * transaction hits the disk and the callbacks run. the inode
@@ -1266,7 +1319,7 @@
 	 * Unreserve any blocks we have so that when we unmount we don't account
 	 * the reserved free space as used. This is really only necessary for
 	 * lazy superblock counting because it trusts the incore superblock
-	 * counters to be aboslutely correct on clean unmount.
+	 * counters to be absolutely correct on clean unmount.
 	 *
 	 * We don't bother correcting this elsewhere for lazy superblock
 	 * counting because on mount of an unclean filesystem we reconstruct the
@@ -1288,10 +1341,9 @@
 			"Freespace may not be correct on next mount.");
 	xfs_unmountfs_writesb(mp);
 	xfs_unmountfs_wait(mp); 		/* wait for async bufs */
-	xfs_log_unmount(mp);			/* Done! No more fs ops. */
-
-	if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
-		uuid_table_remove(&mp->m_sb.sb_uuid);
+	xfs_log_unmount_write(mp);
+	xfs_log_unmount(mp);
+	xfs_uuid_unmount(mp);
 
 #if defined(DEBUG)
 	xfs_errortag_clearall(mp, 0);
@@ -1793,29 +1845,6 @@ xfs_freesb(
 }
 
 /*
- * See if the UUID is unique among mounted XFS filesystems.
- * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
- */
-STATIC int
-xfs_uuid_mount(
-	xfs_mount_t	*mp)
-{
-	if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
-		cmn_err(CE_WARN,
-			"XFS: Filesystem %s has nil UUID - can't mount",
-			mp->m_fsname);
-		return -1;
-	}
-	if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
-		cmn_err(CE_WARN,
-			"XFS: Filesystem %s has duplicate UUID - can't mount",
-			mp->m_fsname);
-		return -1;
-	}
-	return 0;
-}
-
-/*
  * Used to log changes to the superblock unit and width fields which could
  * be altered by the mount options, as well as any potential sb_features2
  * fixup. Only the first superblock is updated.
@@ -1868,7 +1897,7 @@ xfs_mount_log_sb(
 * we disable the per-cpu counter and go through the slow path.
 *
 * The slow path is the current xfs_mod_incore_sb() function.  This means that
- * when we disable a per-cpu counter, we need to drain it's resources back to
+ * when we disable a per-cpu counter, we need to drain its resources back to
 * the global superblock. We do this after disabling the counter to prevent
 * more threads from queueing up on the counter.
 *
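
The new xfs_uuid_mount()/xfs_uuid_unmount() pair above replaces the old opaque uuid_table_insert()/uuid_table_remove() helpers with a mutex-guarded, grow-on-demand array in which nil entries are holes that later mounts reuse. A stand-alone sketch of that data structure (simplified types and names, userspace C, not the kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct { unsigned char b[16]; } my_uuid_t;

static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER;
static int table_size;
static my_uuid_t *table;

static int is_nil(const my_uuid_t *u)
{
	static const my_uuid_t nil;
	return !memcmp(u, &nil, sizeof(nil));
}

static int uuid_mount(const my_uuid_t *uuid)
{
	int i, hole = -1;

	pthread_mutex_lock(&table_mutex);
	for (i = 0; i < table_size; i++) {
		if (is_nil(&table[i])) {
			hole = i;	/* remember a reusable slot */
			continue;
		}
		if (!memcmp(uuid, &table[i], sizeof(*uuid))) {
			pthread_mutex_unlock(&table_mutex);
			return -1;	/* duplicate: refuse the mount */
		}
	}
	if (hole < 0) {			/* no hole: grow by one slot */
		table = realloc(table, (table_size + 1) * sizeof(*table));
		hole = table_size++;
	}
	table[hole] = *uuid;
	pthread_mutex_unlock(&table_mutex);
	return 0;
}

static void uuid_unmount(const my_uuid_t *uuid)
{
	int i;

	pthread_mutex_lock(&table_mutex);
	for (i = 0; i < table_size; i++) {
		if (!is_nil(&table[i]) &&
		    !memcmp(uuid, &table[i], sizeof(*uuid))) {
			memset(&table[i], 0, sizeof(table[i]));	/* punch a hole */
			break;
		}
	}
	pthread_mutex_unlock(&table_mutex);
}

int main(void)
{
	my_uuid_t u = { { 1 } };

	printf("first mount: %d\n", uuid_mount(&u));	/* 0 */
	printf("dup   mount: %d\n", uuid_mount(&u));	/* -1 */
	uuid_unmount(&u);
	printf("re-   mount: %d\n", uuid_mount(&u));	/* 0 again */
	return 0;
}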
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f5e9937f9bdb..7af44adffc8f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -136,7 +136,6 @@ typedef int	(*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
 			struct xfs_dquot *, struct xfs_dquot *, uint);
 typedef void	(*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
 typedef int	(*xfs_dqsync_t)(struct xfs_mount *, int flags);
-typedef int	(*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
 
 typedef struct xfs_qmops {
 	xfs_qminit_t		xfs_qminit;
@@ -154,7 +153,6 @@ typedef struct xfs_qmops {
 	xfs_dqvopchownresv_t	xfs_dqvopchownresv;
 	xfs_dqstatvfs_t		xfs_dqstatvfs;
 	xfs_dqsync_t		xfs_dqsync;
-	xfs_quotactl_t		xfs_quotactl;
 	struct xfs_dqtrxops	*xfs_dqtrxops;
 } xfs_qmops_t;
 
@@ -188,8 +186,6 @@ typedef struct xfs_qmops {
 	(*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
 #define XFS_QM_DQSYNC(mp, flags) \
 	(*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
-#define XFS_QM_QUOTACTL(mp, cmd, id, addr) \
-	(*(mp)->m_qm_ops->xfs_quotactl)(mp, cmd, id, addr)
 
 #ifdef HAVE_PERCPU_SB
 
@@ -273,19 +269,17 @@ typedef struct xfs_mount {
 	uint			m_inobt_mnr[2];	/* min inobt btree records */
 	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
 	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
-	uint			m_in_maxlevels;	/* XFS_IN_MAXLEVELS */
+	uint			m_in_maxlevels;	/* max inobt btree levels. */
 	struct xfs_perag	*m_perag;	/* per-ag accounting info */
 	struct rw_semaphore	m_peraglock;	/* lock for m_perag (pointer) */
 	struct mutex		m_growlock;	/* growfs mutex */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
 	__uint64_t		m_flags;	/* global mount flags */
-	uint			m_attroffset;	/* inode attribute offset */
 	uint			m_dir_node_ents; /* #entries in a dir danode */
 	uint			m_attr_node_ents; /* #entries in attr danode */
 	int			m_ialloc_inos;	/* inodes in inode allocation */
 	int			m_ialloc_blks;	/* blocks in inode allocation */
-	int			m_litino;	/* size of inode union area */
 	int			m_inoalign_mask;/* mask sb_inoalignmt if used */
 	uint			m_qflags;	/* quota status flags */
 	xfs_trans_reservations_t m_reservations;/* precomputed res values */
@@ -293,9 +287,6 @@ typedef struct xfs_mount {
 	__uint64_t		m_maxioffset;	/* maximum inode offset */
 	__uint64_t		m_resblks;	/* total reserved blocks */
 	__uint64_t		m_resblks_avail;/* available reserved blocks */
-#if XFS_BIG_INUMS
-	xfs_ino_t		m_inoadd;	/* add value for ino64_offset */
-#endif
 	int			m_dalign;	/* stripe unit */
 	int			m_swidth;	/* stripe width */
 	int			m_sinoalign;	/* stripe unit inode alignment */
@@ -337,7 +328,6 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_WSYNC		(1ULL << 0)	/* for nfs - all metadata ops
 						   must be synchronous except
 						   for space allocations */
-#define XFS_MOUNT_INO64		(1ULL << 1)
 #define XFS_MOUNT_DMAPI		(1ULL << 2)	/* dmapi is enabled */
 #define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
@@ -389,8 +379,8 @@ typedef struct xfs_mount {
  * Synchronous read and write sizes.  This should be
  * better for NFSv2 wsync filesystems.
  */
-#define	XFS_WSYNC_READIO_LOG	15	/* 32K */
-#define	XFS_WSYNC_WRITEIO_LOG	14	/* 16K */
+#define	XFS_WSYNC_READIO_LOG	15	/* 32k */
+#define	XFS_WSYNC_WRITEIO_LOG	14	/* 16k */
 
 /*
  * Allow large block sizes to be reported to userspace programs if the
@@ -500,9 +490,6 @@ typedef struct xfs_mod_sb {
 	int64_t		msb_delta;	/* Change to make to specified field */
 } xfs_mod_sb_t;
 
-#define	XFS_MOUNT_ILOCK(mp)	mutex_lock(&((mp)->m_ilock))
-#define	XFS_MOUNT_IUNLOCK(mp)	mutex_unlock(&((mp)->m_ilock))
-
 extern int	xfs_log_sbcount(xfs_mount_t *, uint);
 extern int	xfs_mountfs(xfs_mount_t *mp);
 extern void	xfs_mountfs_check_barriers(xfs_mount_t *mp);
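
The comment block above (in xfs_mount.c) describes the general disable-then-drain pattern for per-cpu counters: mark the counter disabled so new modifications take the locked slow path, then fold each CPU's residue back into the global count. A self-contained illustration of that idea only - hypothetical types and names, not the XFS implementation, and real per-cpu code would also need per-cpu synchronization:

#include <pthread.h>

#define NCPUS 4

struct pcpu_counter {
        long            global;         /* authoritative while disabled */
        long            percpu[NCPUS];  /* per-cpu deltas while enabled */
        int             disabled;       /* forces new updates to slow path */
        pthread_mutex_t lock;           /* protects global and disabled */
};

void pcpu_counter_disable(struct pcpu_counter *c)
{
        int cpu;

        pthread_mutex_lock(&c->lock);
        c->disabled = 1;                /* stop fast-path updates first... */
        for (cpu = 0; cpu < NCPUS; cpu++) {
                c->global += c->percpu[cpu];    /* ...then drain residue */
                c->percpu[cpu] = 0;
        }
        pthread_mutex_unlock(&c->lock);
}
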
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 27f80581520a..e101790ea8e7 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -126,7 +126,6 @@ static struct xfs_qmops xfs_qmcore_stub = {
 	.xfs_dqvopchownresv	= (xfs_dqvopchownresv_t) fs_noerr,
 	.xfs_dqstatvfs		= (xfs_dqstatvfs_t) fs_noval,
 	.xfs_dqsync		= (xfs_dqsync_t) fs_noerr,
-	.xfs_quotactl		= (xfs_quotactl_t) fs_nosys,
 };
 
 int
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 48965ecaa155..f5d1202dde25 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -18,6 +18,8 @@
 #ifndef __XFS_QUOTA_H__
 #define __XFS_QUOTA_H__
 
+struct xfs_trans;
+
 /*
  * The ondisk form of a dquot structure.
  */
@@ -185,7 +187,6 @@ typedef struct xfs_qoff_logformat {
  * to a single function. None of these XFS_QMOPT_* flags are meant to have
  * persistent values (ie. their values can and will change between versions)
  */
-#define XFS_QMOPT_DQLOCK	0x0000001 /* dqlock */
#define XFS_QMOPT_DQALLOC	0x0000002 /* alloc dquot ondisk if needed */
 #define XFS_QMOPT_UQUOTA	0x0000004 /* user dquot requested */
 #define XFS_QMOPT_PQUOTA	0x0000008 /* project dquot requested */
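
The `struct xfs_trans;` line added above is a plain forward declaration: it lets this header mention struct xfs_trans through pointers without including xfs_trans.h. The general pattern, with a hypothetical prototype for illustration:

struct xfs_trans;                               /* pointer use only */
int xfs_example_quota_op(struct xfs_trans *tp); /* hypothetical prototype */
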
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index c5bb86f3ec05..385f6dceba5d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2288,6 +2288,16 @@ xfs_rtmount_inodes(
 	return 0;
 }
 
+void
+xfs_rtunmount_inodes(
+	struct xfs_mount	*mp)
+{
+	if (mp->m_rbmip)
+		IRELE(mp->m_rbmip);
+	if (mp->m_rsumip)
+		IRELE(mp->m_rsumip);
+}
+
 /*
  * Pick an extent for allocation at the start of a new realtime file.
  * Use the sequence number stored in the atime field of the bitmap inode.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 8d8dcd215716..b2d67adb6a08 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,8 +23,8 @@ struct xfs_trans;
 
 /* Min and max rt extent sizes, specified in bytes */
 #define	XFS_MAX_RTEXTSIZE	(1024 * 1024 * 1024)	/* 1GB */
-#define	XFS_DFL_RTEXTSIZE	(64 * 1024)		/* 64KB */
-#define	XFS_MIN_RTEXTSIZE	(4 * 1024)		/* 4KB */
+#define	XFS_DFL_RTEXTSIZE	(64 * 1024)		/* 64kB */
+#define	XFS_MIN_RTEXTSIZE	(4 * 1024)		/* 4kB */
 
 /*
  * Constants for bit manipulations.
@@ -108,6 +108,9 @@ xfs_rtfree_extent(
 int					/* error */
 xfs_rtmount_init(
 	struct xfs_mount	*mp);	/* file system mount structure */
+void
+xfs_rtunmount_inodes(
+	struct xfs_mount	*mp);
 
 /*
  * Get the bitmap and summary inodes into the mount structure
@@ -146,6 +149,7 @@ xfs_growfs_rt(
 # define xfs_growfs_rt(mp,in)                           (ENOSYS)
 # define xfs_rtmount_init(m)    (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtunmount_inodes(m)
 #endif	/* CONFIG_XFS_RT */
 
 #endif	/* __KERNEL__ */
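
Note the !CONFIG_XFS_RT stub added above expands to nothing, so callers can invoke the new helper unconditionally and the realtime-specific code compiles away on kernels without rt support:

        /* in the unmount path; compiles with or without CONFIG_XFS_RT */
        xfs_rtunmount_inodes(mp);
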
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index d6fe4a88d79f..775249a54f6f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -292,7 +292,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
  * In a write transaction we can allocate a maximum of 2
  * extents.  This gives:
  *    the inode getting the new extents: inode size
- *    the inode\'s bmap btree: max depth * block size
+ *    the inode's bmap btree: max depth * block size
  *    the agfs of the ags from which the extents are allocated: 2 * sector
  *    the superblock free block counter: sector size
  *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
@@ -321,7 +321,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
  * In truncating a file we free up to two extents at once.  We can modify:
  *    the inode being truncated: inode size
- *    the inode\'s bmap btree: (max depth + 1) * block size
+ *    the inode's bmap btree: (max depth + 1) * block size
  * And the bmap_finish transaction can free the blocks and bmap blocks:
  *    the agf for each of the ags: 4 * sector size
  *    the agfl for each of the ags: 4 * sector size
@@ -343,7 +343,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
 	 (128 * 5) + \
 	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
 	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 
 #define	XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
@@ -431,8 +431,8 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
  *    the new inode: inode size
  *    the inode btree entry: 1 block
  *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode\'s bmap btree: (max depth + v2) * block size
- *    the blocks for the symlink: 1 KB
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ *    the blocks for the symlink: 1 kB
  * Or in the first xact we allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
  *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
@@ -449,9 +449,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	  (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
 	 (2 * (mp)->m_sb.sb_sectsize + \
 	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-	  XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+	  XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
 	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
 	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 
 #define	XFS_SYMLINK_LOG_RES(mp)	((mp)->m_reservations.tr_symlink)
@@ -463,7 +463,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
  *    the inode btree entry: block size
  *    the superblock for the nlink flag: sector size
  *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode\'s bmap btree: (max depth + v2) * block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
  * Or in the first xact we allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
  *    the superblock for the nlink flag: sector size
@@ -481,9 +481,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	  (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
 	 (3 * (mp)->m_sb.sb_sectsize + \
 	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-	  XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+	  XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
 	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
 	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 
 #define	XFS_CREATE_LOG_RES(mp)	((mp)->m_reservations.tr_create)
@@ -513,7 +513,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
 	 (128 * 5) + \
 	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
 	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
 
 
@@ -637,7 +637,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
  * Removing the attribute fork of a file
  *    the inode being truncated: inode size
- *    the inode\'s bmap btree: max depth * block size
+ *    the inode's bmap btree: max depth * block size
 * And the bmap_finish transaction can free the blocks and bmap blocks:
 *    the agf for each of the ags: 4 * sector size
 *    the agfl for each of the ags: 4 * sector size
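
These reservation macros estimate worst-case log space as the metadata bytes a transaction may dirty plus a fixed per-region overhead for log item headers; the change here only replaces the XFS_IN_MAXLEVELS(mp) wrapper with a direct (mp)->m_in_maxlevels read, leaving the arithmetic alone. The shape of the calculation, as an illustrative sketch only (the real constants and terms live in the macros above):

/* illustrative only: logged data plus ~128 bytes of overhead per region */
static unsigned int
example_log_res(unsigned int data_bytes, unsigned int nregions)
{
        return data_bytes + 128 * nregions;
}
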
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2d47f10f8bed..f31271c30de9 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -79,7 +79,7 @@ xfs_trans_ail_tail(
  * the push is run asynchronously in a separate thread, so we return the tail
  * of the log right now instead of the tail after the push. This means we will
  * either continue right away, or we will sleep waiting on the async thread to
- * do it's work.
+ * do its work.
  *
  * We do this unlocked - we only need to know whether there is anything in the
  * AIL at the time we are called. We don't need to access the contents of
@@ -160,7 +160,7 @@ xfs_trans_ail_cursor_next(
 /*
  * Now that the traversal is complete, we need to remove the cursor
  * from the list of traversing cursors. Avoid removing the embedded
- * push cursor, but use the fact it is alway present to make the
+ * push cursor, but use the fact it is always present to make the
  * list deletion simple.
  */
 void
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index e110bf57d7f4..eb3fc57f9eef 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -22,7 +22,7 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
-/* XXX: from here down needed until struct xfs_trans has it's own ailp */
+/* XXX: from here down needed until struct xfs_trans has its own ailp */
 #include "xfs_bit.h"
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 4ea2e5074bdd..7d2c920dfb9c 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -47,7 +47,7 @@
 #define	XFS_DIRREMOVE_SPACE_RES(mp)	\
 	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
 #define	XFS_IALLOC_SPACE_RES(mp)	\
-	(XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1)
+	(XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
 
 /*
  * Space reservation values for various transactions.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b2f724502f1b..d725428c9df6 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -21,14 +21,6 @@
 #ifdef __KERNEL__
 
 /*
- * POSIX Extensions
- */
-typedef unsigned char		uchar_t;
-typedef unsigned short		ushort_t;
-typedef unsigned int		uint_t;
-typedef unsigned long		ulong_t;
-
-/*
  * Additional type declarations for XFS
 */
 typedef signed char		__int8_t;
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index fcc2285d03ed..79b9e5ea5359 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -374,7 +374,7 @@ xfs_truncate_file(
 
 	/*
 	 * Follow the normal truncate locking protocol.  Since we
-	 * hold the inode in the transaction, we know that it's number
+	 * hold the inode in the transaction, we know that its number
 	 * of references will stay constant.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0e55c5d7db5f..7394c7af5de5 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1136,7 +1136,7 @@ xfs_inactive(
 	 * If the inode is already free, then there can be nothing
 	 * to clean up here.
 	 */
-	if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) {
+	if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
 		ASSERT(ip->i_df.if_real_bytes == 0);
 		ASSERT(ip->i_df.if_broot_bytes == 0);
 		return VN_INACTIVE_CACHE;
@@ -1387,23 +1387,28 @@ xfs_create(
 	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
-	xfs_mount_t		*mp = dp->i_mount;
-	xfs_inode_t		*ip;
-	xfs_trans_t		*tp;
+	int			is_dir = S_ISDIR(mode);
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_inode	*ip = NULL;
+	struct xfs_trans	*tp = NULL;
 	int			error;
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
 	boolean_t		unlock_dp_on_error = B_FALSE;
-	int			dm_event_sent = 0;
 	uint			cancel_flags;
 	int			committed;
 	xfs_prid_t		prid;
-	struct xfs_dquot	*udqp, *gdqp;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
 	uint			resblks;
+	uint			log_res;
+	uint			log_count;
 
-	ASSERT(!*ipp);
 	xfs_itrace_entry(dp);
 
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
 				dp, DM_RIGHT_NULL, NULL,
@@ -1412,84 +1417,97 @@ xfs_create(
 
 		if (error)
 			return error;
-		dm_event_sent = 1;
 	}
 
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	/* Return through std_return after this point. */
-
-	udqp = gdqp = NULL;
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 		prid = dp->i_d.di_projid;
 	else
-		prid = (xfs_prid_t)dfltprid;
+		prid = dfltprid;
 
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
 	error = XFS_QM_DQVOPALLOC(mp, dp,
 			current_fsuid(), current_fsgid(), prid,
-			XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
+			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		goto std_return;
 
-	ip = NULL;
+	if (is_dir) {
+		rdev = 0;
+		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+		log_res = XFS_MKDIR_LOG_RES(mp);
+		log_count = XFS_MKDIR_LOG_COUNT;
+		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+	} else {
+		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+		log_res = XFS_CREATE_LOG_RES(mp);
+		log_count = XFS_CREATE_LOG_COUNT;
+		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+	}
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+
 	/*
 	 * Initially assume that the file does not exist and
 	 * reserve the resources for that case.  If that is not
 	 * the case we'll drop the one we have and get a more
 	 * appropriate transaction later.
 	 */
-	error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+	error = xfs_trans_reserve(tp, resblks, log_res, 0,
+			XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
 		resblks = 0;
-		error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
-				XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+		error = xfs_trans_reserve(tp, 0, log_res, 0,
+				XFS_TRANS_PERM_LOG_RES, log_count);
 	}
 	if (error) {
 		cancel_flags = 0;
-		goto error_return;
+		goto out_trans_cancel;
 	}
 
 	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = B_TRUE;
 
-	xfs_bmap_init(&free_list, &first_block);
+	/*
+	 * Check for directory link count overflow.
+	 */
+	if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
+		error = XFS_ERROR(EMLINK);
+		goto out_trans_cancel;
+	}
 
-	ASSERT(ip == NULL);
+	xfs_bmap_init(&free_list, &first_block);
 
 	/*
 	 * Reserve disk quota and the inode.
 	 */
 	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
 	if (error)
-		goto error_return;
+		goto out_trans_cancel;
 
 	error = xfs_dir_canenter(tp, dp, name, resblks);
 	if (error)
-		goto error_return;
-	error = xfs_dir_ialloc(&tp, dp, mode, 1,
-			rdev, credp, prid, resblks > 0,
-			&ip, &committed);
+		goto out_trans_cancel;
+
+	/*
+	 * A newly created regular or special file just has one directory
+	 * entry pointing to them, but a directory also the "." entry
+	 * pointing to itself.
+	 */
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+			prid, resblks > 0, &ip, &committed);
 	if (error) {
 		if (error == ENOSPC)
-			goto error_return;
-		goto abort_return;
+			goto out_trans_cancel;
+		goto out_trans_abort;
 	}
-	xfs_itrace_ref(ip);
 
 	/*
 	 * At this point, we've gotten a newly allocated inode.
 	 * It is locked (and joined to the transaction).
 	 */
-
+	xfs_itrace_ref(ip);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	/*
@@ -1508,19 +1526,28 @@ xfs_create(
 				resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
 		ASSERT(error != ENOSPC);
-		goto abort_return;
+		goto out_trans_abort;
 	}
 	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
+	if (is_dir) {
+		error = xfs_dir_init(tp, ip, dp);
+		if (error)
+			goto out_bmap_cancel;
+
+		error = xfs_bumplink(tp, dp);
+		if (error)
+			goto out_bmap_cancel;
+	}
+
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * create transaction goes to disk before returning to
 	 * the user.
 	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
 		xfs_trans_set_sync(tp);
-	}
 
 	/*
 	 * Attach the dquot(s) to the inodes and modify them incore.
@@ -1537,16 +1564,13 @@ xfs_create(
 	IHOLD(ip);
 
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
-	if (error) {
-		xfs_bmap_cancel(&free_list);
-		goto abort_rele;
-	}
+	if (error)
+		goto out_abort_rele;
 
 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	if (error) {
 		IRELE(ip);
-		tp = NULL;
-		goto error_return;
+		goto out_dqrele;
 	}
 
 	XFS_QM_DQRELE(mp, udqp);
@@ -1555,26 +1579,22 @@ xfs_create(
 	*ipp = ip;
 
 	/* Fallthrough to std_return with error = 0  */
-
-std_return:
-	if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
-	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-			dp, DM_RIGHT_NULL,
-			*ipp ? ip : NULL,
-			DM_RIGHT_NULL, name->name, NULL,
-			mode, error, 0);
+ std_return:
+	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
+		XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
+				ip, DM_RIGHT_NULL, name->name, NULL, mode,
+				error, 0);
 	}
+
 	return error;
 
- abort_return:
+ out_bmap_cancel:
+	xfs_bmap_cancel(&free_list);
+ out_trans_abort:
 	cancel_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
-
- error_return:
-	if (tp != NULL)
-		xfs_trans_cancel(tp, cancel_flags);
-
+ out_trans_cancel:
+	xfs_trans_cancel(tp, cancel_flags);
+ out_dqrele:
 	XFS_QM_DQRELE(mp, udqp);
 	XFS_QM_DQRELE(mp, gdqp);
 
@@ -1583,20 +1603,18 @@ std_return:
 
 	goto std_return;
 
- abort_rele:
+ out_abort_rele:
 	/*
 	 * Wait until after the current transaction is aborted to
 	 * release the inode.  This prevents recursive transactions
 	 * and deadlocks from xfs_inactive.
 	 */
+	xfs_bmap_cancel(&free_list);
 	cancel_flags |= XFS_TRANS_ABORT;
 	xfs_trans_cancel(tp, cancel_flags);
 	IRELE(ip);
-
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
-
-	goto std_return;
+	unlock_dp_on_error = B_FALSE;
+	goto out_dqrele;
 }
 
 #ifdef DEBUG
@@ -2004,8 +2022,10 @@ xfs_link(
 	/* Return through std_return after this point. */
 
 	error = XFS_QM_DQATTACH(mp, sip, 0);
-	if (!error && sip != tdp)
-		error = XFS_QM_DQATTACH(mp, tdp, 0);
+	if (error)
+		goto std_return;
+
+	error = XFS_QM_DQATTACH(mp, tdp, 0);
 	if (error)
 		goto std_return;
 
@@ -2110,209 +2130,6 @@ std_return:
 	goto std_return;
 }
 
-
-int
-xfs_mkdir(
-	xfs_inode_t		*dp,
-	struct xfs_name		*dir_name,
-	mode_t			mode,
-	xfs_inode_t		**ipp,
-	cred_t			*credp)
-{
-	xfs_mount_t		*mp = dp->i_mount;
-	xfs_inode_t		*cdp;	/* inode of created dir */
-	xfs_trans_t		*tp;
-	int			cancel_flags;
-	int			error;
-	int			committed;
-	xfs_bmap_free_t		free_list;
-	xfs_fsblock_t		first_block;
-	boolean_t		unlock_dp_on_error = B_FALSE;
-	boolean_t		created = B_FALSE;
-	int			dm_event_sent = 0;
-	xfs_prid_t		prid;
-	struct xfs_dquot	*udqp, *gdqp;
-	uint			resblks;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	tp = NULL;
-
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-					dp, DM_RIGHT_NULL, NULL,
-					DM_RIGHT_NULL, dir_name->name, NULL,
-					mode, 0, 0);
-		if (error)
-			return error;
-		dm_event_sent = 1;
-	}
-
-	/* Return through std_return after this point. */
-
-	xfs_itrace_entry(dp);
-
-	mp = dp->i_mount;
-	udqp = gdqp = NULL;
-	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = dp->i_d.di_projid;
-	else
-		prid = (xfs_prid_t)dfltprid;
-
-	/*
-	 * Make sure that we have allocated dquot(s) on disk.
-	 */
-	error = XFS_QM_DQVOPALLOC(mp, dp,
-			current_fsuid(), current_fsgid(), prid,
-			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
-	if (error)
-		goto std_return;
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
-	error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
-				  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
-	if (error == ENOSPC) {
-		resblks = 0;
-		error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
-					  XFS_TRANS_PERM_LOG_RES,
-					  XFS_MKDIR_LOG_COUNT);
-	}
-	if (error) {
-		cancel_flags = 0;
-		goto error_return;
-	}
-
-	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-	unlock_dp_on_error = B_TRUE;
-
-	/*
-	 * Check for directory link count overflow.
-	 */
-	if (dp->i_d.di_nlink >= XFS_MAXLINK) {
-		error = XFS_ERROR(EMLINK);
-		goto error_return;
-	}
-
-	/*
-	 * Reserve disk quota and the inode.
-	 */
-	error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
-	if (error)
-		goto error_return;
-
-	error = xfs_dir_canenter(tp, dp, dir_name, resblks);
-	if (error)
-		goto error_return;
-	/*
-	 * create the directory inode.
-	 */
-	error = xfs_dir_ialloc(&tp, dp, mode, 2,
-			0, credp, prid, resblks > 0,
-			&cdp, NULL);
-	if (error) {
-		if (error == ENOSPC)
-			goto error_return;
-		goto abort_return;
-	}
-	xfs_itrace_ref(cdp);
-
-	/*
-	 * Now we add the directory inode to the transaction.
-	 * We waited until now since xfs_dir_ialloc might start
-	 * a new transaction.  Had we joined the transaction
-	 * earlier, the locks might have gotten released. An error
-	 * from here on will result in the transaction cancel
-	 * unlocking dp so don't do it explicitly in the error path.
-	 */
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-	unlock_dp_on_error = B_FALSE;
-
-	xfs_bmap_init(&free_list, &first_block);
-
-	error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
-					&first_block, &free_list, resblks ?
-					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
-	if (error) {
-		ASSERT(error != ENOSPC);
-		goto error1;
-	}
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-	error = xfs_dir_init(tp, cdp, dp);
-	if (error)
-		goto error2;
-
-	error = xfs_bumplink(tp, dp);
-	if (error)
-		goto error2;
-
-	created = B_TRUE;
-
-	*ipp = cdp;
-	IHOLD(cdp);
-
-	/*
-	 * Attach the dquots to the new inode and modify the icount incore.
-	 */
-	XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * mkdir transaction goes to disk before returning to
-	 * the user.
-	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-		xfs_trans_set_sync(tp);
-	}
-
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
-	if (error) {
-		IRELE(cdp);
-		goto error2;
-	}
-
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
-	if (error) {
-		IRELE(cdp);
-	}
-
-	/* Fall through to std_return with error = 0 or errno from
-	 * xfs_trans_commit. */
-
-std_return:
-	if ((created || (error != 0 && dm_event_sent != 0)) &&
-	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-					dp, DM_RIGHT_NULL,
-					created ? cdp : NULL,
-					DM_RIGHT_NULL,
-					dir_name->name, NULL,
-					mode, error, 0);
-	}
-	return error;
-
- error2:
- error1:
-	xfs_bmap_cancel(&free_list);
- abort_return:
-	cancel_flags |= XFS_TRANS_ABORT;
- error_return:
-	xfs_trans_cancel(tp, cancel_flags);
-	XFS_QM_DQRELE(mp, udqp);
-	XFS_QM_DQRELE(mp, gdqp);
-
-	if (unlock_dp_on_error)
-		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	goto std_return;
-}
-
 int
 xfs_symlink(
 	xfs_inode_t		*dp,
@@ -2587,51 +2404,6 @@ std_return:
 }
 
 int
-xfs_inode_flush(
-	xfs_inode_t	*ip,
-	int		flags)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	int		error = 0;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	/*
-	 * Bypass inodes which have already been cleaned by
-	 * the inode flush clustering code inside xfs_iflush
-	 */
-	if (xfs_inode_clean(ip))
-		return 0;
-
-	/*
-	 * We make this non-blocking if the inode is contended,
-	 * return EAGAIN to indicate to the caller that they
-	 * did not succeed. This prevents the flush path from
-	 * blocking on inodes inside another operation right
-	 * now, they get caught later by xfs_sync.
-	 */
-	if (flags & FLUSH_SYNC) {
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
-		xfs_iflock(ip);
-	} else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			return EAGAIN;
-		}
-	} else {
-		return EAGAIN;
-	}
-
-	error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
-					: XFS_IFLUSH_ASYNC_NOBLOCK);
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-	return error;
-}
-
-
-int
 xfs_set_dmattrs(
 	xfs_inode_t	*ip,
 	u_int		evmask,
@@ -2676,7 +2448,7 @@ xfs_reclaim(
 	ASSERT(!VN_MAPPED(VFS_I(ip)));
 
 	/* bad inode, get out here ASAP */
-	if (VN_BAD(VFS_I(ip))) {
+	if (is_bad_inode(VFS_I(ip))) {
 		xfs_ireclaim(ip);
 		return 0;
 	}
@@ -3090,7 +2862,7 @@ xfs_free_file_space(
 
 	/*
 	 * Need to zero the stuff we're not freeing, on disk.
-	 * If its a realtime file & can't use unwritten extents then we
+	 * If it's a realtime file & can't use unwritten extents then we
 	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
 	 * will take care of it for us.
 	 */
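
The net effect of the xfs_vnodeops.c changes above is that directory creation is folded into xfs_create(): a single S_ISDIR(mode) test selects the transaction type and reservations, and one out_*-style error ladder replaces the duplicated cleanup in the old xfs_mkdir(). The control-flow skeleton, condensed from the new code shown above:

        int     is_dir = S_ISDIR(mode);

        if (is_dir) {
                resblks   = XFS_MKDIR_SPACE_RES(mp, name->len);
                log_res   = XFS_MKDIR_LOG_RES(mp);
                log_count = XFS_MKDIR_LOG_COUNT;
                tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
        } else {
                resblks   = XFS_CREATE_SPACE_RES(mp, name->len);
                log_res   = XFS_CREATE_LOG_RES(mp);
                log_count = XFS_CREATE_LOG_COUNT;
                tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        }

        /* a directory starts with two links: the parent entry and "." */
        error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
                               prid, resblks > 0, &ip, &committed);
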
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 76df328c61b4..04373c6c61ff 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,14 +31,11 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 		struct xfs_name *target_name);
-int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
-		mode_t mode, struct xfs_inode **ipp, cred_t *credp);
 int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
 		const char *target_path, mode_t mode, struct xfs_inode **ipp,
 		cred_t *credp);
-int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_reclaim(struct xfs_inode *ip);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,